sys_generic.c revision 114216
164562Sgshapiro/*
264562Sgshapiro * Copyright (c) 1982, 1986, 1989, 1993
364562Sgshapiro *	The Regents of the University of California.  All rights reserved.
464562Sgshapiro * (c) UNIX System Laboratories, Inc.
564562Sgshapiro * All or some portions of this file are derived from material licensed
6111823Sgshapiro * to the University of California by American Telephone and Telegraph
764562Sgshapiro * Co. or Unix System Laboratories, Inc. and are reproduced herein with
864562Sgshapiro * the permission of UNIX System Laboratories, Inc.
964562Sgshapiro *
1064562Sgshapiro * Redistribution and use in source and binary forms, with or without
1164562Sgshapiro * modification, are permitted provided that the following conditions
1264562Sgshapiro * are met:
1364562Sgshapiro * 1. Redistributions of source code must retain the above copyright
1464562Sgshapiro *    notice, this list of conditions and the following disclaimer.
1564562Sgshapiro * 2. Redistributions in binary form must reproduce the above copyright
1664562Sgshapiro *    notice, this list of conditions and the following disclaimer in the
17159609Sgshapiro *    documentation and/or other materials provided with the distribution.
1864562Sgshapiro * 3. All advertising materials mentioning features or use of this software
1964562Sgshapiro *    must display the following acknowledgement:
2064562Sgshapiro *	This product includes software developed by the University of
2164562Sgshapiro *	California, Berkeley and its contributors.
2264562Sgshapiro * 4. Neither the name of the University nor the names of its contributors
2364562Sgshapiro *    may be used to endorse or promote products derived from this software
2464562Sgshapiro *    without specific prior written permission.
2564562Sgshapiro *
26159609Sgshapiro * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27159609Sgshapiro * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28159609Sgshapiro * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29159609Sgshapiro * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30159609Sgshapiro * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31159609Sgshapiro * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32159609Sgshapiro * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33159609Sgshapiro * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34159609Sgshapiro * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35159609Sgshapiro * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36159609Sgshapiro * SUCH DAMAGE.
37159609Sgshapiro *
38159609Sgshapiro *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39159609Sgshapiro * $FreeBSD: head/sys/kern/sys_generic.c 114216 2003-04-29 13:36:06Z kan $
40159609Sgshapiro */
41159609Sgshapiro
42159609Sgshapiro#include "opt_ktrace.h"
43159609Sgshapiro
44159609Sgshapiro#include <sys/param.h>
45159609Sgshapiro#include <sys/systm.h>
46159609Sgshapiro#include <sys/sysproto.h>
4780785Sgshapiro#include <sys/filedesc.h>
48159609Sgshapiro#include <sys/filio.h>
49159609Sgshapiro#include <sys/fcntl.h>
50159609Sgshapiro#include <sys/file.h>
51159609Sgshapiro#include <sys/proc.h>
52159609Sgshapiro#include <sys/signalvar.h>
53159609Sgshapiro#include <sys/socketvar.h>
54159609Sgshapiro#include <sys/uio.h>
55159609Sgshapiro#include <sys/kernel.h>
56159609Sgshapiro#include <sys/limits.h>
57159609Sgshapiro#include <sys/malloc.h>
58159609Sgshapiro#include <sys/poll.h>
59159609Sgshapiro#include <sys/resourcevar.h>
60159609Sgshapiro#include <sys/selinfo.h>
61159609Sgshapiro#include <sys/syscallsubr.h>
62159609Sgshapiro#include <sys/sysctl.h>
63159609Sgshapiro#include <sys/sysent.h>
64159609Sgshapiro#include <sys/bio.h>
65159609Sgshapiro#include <sys/buf.h>
6680785Sgshapiro#include <sys/condvar.h>
67159609Sgshapiro#ifdef KTRACE
68159609Sgshapiro#include <sys/ktrace.h>
69159609Sgshapiro#endif
70159609Sgshapiro#include <vm/vm.h>
71159609Sgshapiro#include <vm/vm_page.h>
72159609Sgshapiro
73159609Sgshapirostatic MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74159609Sgshapirostatic MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75159609SgshapiroMALLOC_DEFINE(M_IOV, "iov", "large iov's");
76159609Sgshapiro
77159609Sgshapirostatic int	pollscan(struct thread *, struct pollfd *, u_int);
78159609Sgshapirostatic int	selscan(struct thread *, fd_mask **, fd_mask **, int);
79159609Sgshapirostatic int	dofileread(struct thread *, struct file *, int, void *,
80159609Sgshapiro		    size_t, off_t, int);
81159609Sgshapirostatic int	dofilewrite(struct thread *, struct file *, int,
82159609Sgshapiro		    const void *, size_t, off_t, int);
83159609Sgshapiro
84159609Sgshapiro/*
85159609Sgshapiro * Read system call.
86159609Sgshapiro */
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument block for read().  Only declared here when <sys/sysproto.h>
 * has not already been seen.
 */
struct read_args {
	int	fd;		/* descriptor to read from */
	void	*buf;		/* user-space destination buffer */
	size_t	nbyte;		/* number of bytes requested */
};
#endif /* !_SYS_SYSPROTO_H_ */
94159609Sgshapiro/*
95159609Sgshapiro * MPSAFE
96159609Sgshapiro */
97159609Sgshapiroint
98159609Sgshapiroread(td, uap)
99159609Sgshapiro	struct thread *td;
100159609Sgshapiro	struct read_args *uap;
101159609Sgshapiro{
102159609Sgshapiro	struct file *fp;
103159609Sgshapiro	int error;
104159609Sgshapiro
105159609Sgshapiro	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
106159609Sgshapiro		error = dofileread(td, fp, uap->fd, uap->buf,
107159609Sgshapiro			    uap->nbyte, (off_t)-1, 0);
108159609Sgshapiro		fdrop(fp, td);
109159609Sgshapiro	}
110159609Sgshapiro	return(error);
111159609Sgshapiro}
112159609Sgshapiro
113159609Sgshapiro/*
114159609Sgshapiro * Pread system call
115159609Sgshapiro */
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument block for pread().  Only declared here when <sys/sysproto.h>
 * has not already been seen.
 */
struct pread_args {
	int	fd;		/* descriptor to read from */
	void	*buf;		/* user-space destination buffer */
	size_t	nbyte;		/* number of bytes requested */
	int	pad;		/* padding slot; not read by pread() */
	off_t	offset;		/* file offset to read at */
};
#endif /* !_SYS_SYSPROTO_H_ */
125159609Sgshapiro/*
126159609Sgshapiro * MPSAFE
127159609Sgshapiro */
128159609Sgshapiroint
129159609Sgshapiropread(td, uap)
130159609Sgshapiro	struct thread *td;
131159609Sgshapiro	struct pread_args *uap;
132159609Sgshapiro{
133159609Sgshapiro	struct file *fp;
134159609Sgshapiro	int error;
135159609Sgshapiro
136159609Sgshapiro	if ((error = fget_read(td, uap->fd, &fp)) != 0)
137159609Sgshapiro		return (error);
138159609Sgshapiro	if (fp->f_type != DTYPE_VNODE) {
139159609Sgshapiro		error = ESPIPE;
140159609Sgshapiro	} else {
141159609Sgshapiro		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
14264562Sgshapiro			    uap->offset, FOF_OFFSET);
14364562Sgshapiro	}
14464562Sgshapiro	fdrop(fp, td);
145249729Sgshapiro	return(error);
146249729Sgshapiro}
147249729Sgshapiro
148249729Sgshapiro/*
149249729Sgshapiro * Code common for read and pread
150249729Sgshapiro */
151249729Sgshapirostatic int
152249729Sgshapirodofileread(td, fp, fd, buf, nbyte, offset, flags)
153249729Sgshapiro	struct thread *td;
154249729Sgshapiro	struct file *fp;
155249729Sgshapiro	int fd, flags;
156249729Sgshapiro	void *buf;
157249729Sgshapiro	size_t nbyte;
158249729Sgshapiro	off_t offset;
159249729Sgshapiro{
160249729Sgshapiro	struct uio auio;
161249729Sgshapiro	struct iovec aiov;
162249729Sgshapiro	long cnt, error = 0;
163249729Sgshapiro#ifdef KTRACE
164249729Sgshapiro	struct iovec ktriov;
165249729Sgshapiro	struct uio ktruio;
166249729Sgshapiro	int didktr = 0;
167249729Sgshapiro#endif
168249729Sgshapiro
169249729Sgshapiro	aiov.iov_base = buf;
170249729Sgshapiro	aiov.iov_len = nbyte;
171249729Sgshapiro	auio.uio_iov = &aiov;
172249729Sgshapiro	auio.uio_iovcnt = 1;
173249729Sgshapiro	auio.uio_offset = offset;
174249729Sgshapiro	if (nbyte > INT_MAX)
175249729Sgshapiro		return (EINVAL);
176249729Sgshapiro	auio.uio_resid = nbyte;
177249729Sgshapiro	auio.uio_rw = UIO_READ;
178249729Sgshapiro	auio.uio_segflg = UIO_USERSPACE;
179249729Sgshapiro	auio.uio_td = td;
180249729Sgshapiro#ifdef KTRACE
181249729Sgshapiro	/*
182249729Sgshapiro	 * if tracing, save a copy of iovec
183249729Sgshapiro	 */
184249729Sgshapiro	if (KTRPOINT(td, KTR_GENIO)) {
185249729Sgshapiro		ktriov = aiov;
186249729Sgshapiro		ktruio = auio;
187249729Sgshapiro		didktr = 1;
188249729Sgshapiro	}
189249729Sgshapiro#endif
190249729Sgshapiro	cnt = nbyte;
191249729Sgshapiro
192249729Sgshapiro	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
193249729Sgshapiro		if (auio.uio_resid != cnt && (error == ERESTART ||
194249729Sgshapiro		    error == EINTR || error == EWOULDBLOCK))
195249729Sgshapiro			error = 0;
196249729Sgshapiro	}
197249729Sgshapiro	cnt -= auio.uio_resid;
198249729Sgshapiro#ifdef KTRACE
199249729Sgshapiro	if (didktr && error == 0) {
200249729Sgshapiro		ktruio.uio_iov = &ktriov;
201249729Sgshapiro		ktruio.uio_resid = cnt;
202249729Sgshapiro		ktrgenio(fd, UIO_READ, &ktruio, error);
203249729Sgshapiro	}
204249729Sgshapiro#endif
205249729Sgshapiro	td->td_retval[0] = cnt;
206249729Sgshapiro	return (error);
207249729Sgshapiro}
208249729Sgshapiro
209249729Sgshapiro/*
210249729Sgshapiro * Scatter read system call.
211249729Sgshapiro */
212249729Sgshapiro#ifndef _SYS_SYSPROTO_H_
213249729Sgshapirostruct readv_args {
214249729Sgshapiro	int	fd;
215249729Sgshapiro	struct	iovec *iovp;
216249729Sgshapiro	u_int	iovcnt;
217249729Sgshapiro};
218249729Sgshapiro#endif
219249729Sgshapiro/*
220249729Sgshapiro * MPSAFE
221249729Sgshapiro */
222249729Sgshapiroint
223249729Sgshapiroreadv(td, uap)
224249729Sgshapiro	struct thread *td;
225249729Sgshapiro	struct readv_args *uap;
226249729Sgshapiro{
227249729Sgshapiro	struct file *fp;
228249729Sgshapiro	struct uio auio;
229249729Sgshapiro	struct iovec *iov;
230249729Sgshapiro	struct iovec *needfree;
231249729Sgshapiro	struct iovec aiov[UIO_SMALLIOV];
232249729Sgshapiro	long i, cnt;
233249729Sgshapiro	int error;
234249729Sgshapiro	u_int iovlen;
235249729Sgshapiro#ifdef KTRACE
236249729Sgshapiro	struct iovec *ktriov = NULL;
237249729Sgshapiro	struct uio ktruio;
238249729Sgshapiro#endif
239249729Sgshapiro
240249729Sgshapiro	if ((error = fget_read(td, uap->fd, &fp)) != 0)
241249729Sgshapiro		return (error);
242249729Sgshapiro	needfree = NULL;
243249729Sgshapiro	/* note: can't use iovlen until iovcnt is validated */
244249729Sgshapiro	iovlen = uap->iovcnt * sizeof (struct iovec);
245249729Sgshapiro	if (uap->iovcnt > UIO_SMALLIOV) {
246249729Sgshapiro		if (uap->iovcnt > UIO_MAXIOV) {
247249729Sgshapiro			error = EINVAL;
248249729Sgshapiro			goto done;
249249729Sgshapiro		}
250249729Sgshapiro		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
251249729Sgshapiro		needfree = iov;
252249729Sgshapiro	} else
253249729Sgshapiro		iov = aiov;
254249729Sgshapiro	auio.uio_iov = iov;
255249729Sgshapiro	auio.uio_iovcnt = uap->iovcnt;
256249729Sgshapiro	auio.uio_rw = UIO_READ;
257249729Sgshapiro	auio.uio_segflg = UIO_USERSPACE;
258249729Sgshapiro	auio.uio_td = td;
259249729Sgshapiro	auio.uio_offset = -1;
260249729Sgshapiro	if ((error = copyin(uap->iovp, iov, iovlen)))
261249729Sgshapiro		goto done;
262249729Sgshapiro	auio.uio_resid = 0;
263249729Sgshapiro	for (i = 0; i < uap->iovcnt; i++) {
264249729Sgshapiro		if (iov->iov_len > INT_MAX - auio.uio_resid) {
265249729Sgshapiro			error = EINVAL;
266249729Sgshapiro			goto done;
267249729Sgshapiro		}
268249729Sgshapiro		auio.uio_resid += iov->iov_len;
269249729Sgshapiro		iov++;
270249729Sgshapiro	}
271249729Sgshapiro#ifdef KTRACE
272249729Sgshapiro	/*
273249729Sgshapiro	 * if tracing, save a copy of iovec
274249729Sgshapiro	 */
275249729Sgshapiro	if (KTRPOINT(td, KTR_GENIO))  {
276249729Sgshapiro		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
277249729Sgshapiro		bcopy(auio.uio_iov, ktriov, iovlen);
278249729Sgshapiro		ktruio = auio;
279249729Sgshapiro	}
280249729Sgshapiro#endif
281249729Sgshapiro	cnt = auio.uio_resid;
282244833Sgshapiro	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
283244833Sgshapiro		if (auio.uio_resid != cnt && (error == ERESTART ||
284244833Sgshapiro		    error == EINTR || error == EWOULDBLOCK))
285244833Sgshapiro			error = 0;
286244833Sgshapiro	}
287244833Sgshapiro	cnt -= auio.uio_resid;
288244833Sgshapiro#ifdef KTRACE
289244833Sgshapiro	if (ktriov != NULL) {
290244833Sgshapiro		if (error == 0) {
291244833Sgshapiro			ktruio.uio_iov = ktriov;
292244833Sgshapiro			ktruio.uio_resid = cnt;
293244833Sgshapiro			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
294244833Sgshapiro		}
295244833Sgshapiro		FREE(ktriov, M_TEMP);
296244833Sgshapiro	}
297244833Sgshapiro#endif
298244833Sgshapiro	td->td_retval[0] = cnt;
299244833Sgshapirodone:
300244833Sgshapiro	fdrop(fp, td);
301244833Sgshapiro	if (needfree)
302244833Sgshapiro		FREE(needfree, M_IOV);
303244833Sgshapiro	return (error);
304244833Sgshapiro}
305244833Sgshapiro
306244833Sgshapiro/*
307244833Sgshapiro * Write system call
308244833Sgshapiro */
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument block for write().  Only declared here when <sys/sysproto.h>
 * has not already been seen.
 */
struct write_args {
	int	fd;		/* descriptor to write to */
	const void *buf;	/* user-space source buffer */
	size_t	nbyte;		/* number of bytes to write */
};
#endif /* !_SYS_SYSPROTO_H_ */
316244833Sgshapiro/*
317244833Sgshapiro * MPSAFE
318244833Sgshapiro */
319244833Sgshapiroint
320244833Sgshapirowrite(td, uap)
321244833Sgshapiro	struct thread *td;
322244833Sgshapiro	struct write_args *uap;
323244833Sgshapiro{
324244833Sgshapiro	struct file *fp;
325244833Sgshapiro	int error;
326244833Sgshapiro
327244833Sgshapiro	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
328244833Sgshapiro		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
329244833Sgshapiro			    (off_t)-1, 0);
330244833Sgshapiro		fdrop(fp, td);
331244833Sgshapiro	} else {
332244833Sgshapiro		error = EBADF;	/* XXX this can't be right */
333244833Sgshapiro	}
334244833Sgshapiro	return(error);
335244833Sgshapiro}
336244833Sgshapiro
337244833Sgshapiro/*
338244833Sgshapiro * Pwrite system call
339244833Sgshapiro */
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument block for pwrite().  Only declared here when <sys/sysproto.h>
 * has not already been seen.
 */
struct pwrite_args {
	int	fd;		/* descriptor to write to */
	const void *buf;	/* user-space source buffer */
	size_t	nbyte;		/* number of bytes to write */
	int	pad;		/* padding slot; not read by pwrite() */
	off_t	offset;		/* file offset to write at */
};
#endif /* !_SYS_SYSPROTO_H_ */
349244833Sgshapiro/*
350244833Sgshapiro * MPSAFE
351244833Sgshapiro */
352244833Sgshapiroint
353244833Sgshapiropwrite(td, uap)
354244833Sgshapiro	struct thread *td;
355244833Sgshapiro	struct pwrite_args *uap;
356244833Sgshapiro{
357244833Sgshapiro	struct file *fp;
358244833Sgshapiro	int error;
359244833Sgshapiro
360244833Sgshapiro	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
361244833Sgshapiro		if (fp->f_type == DTYPE_VNODE) {
362244833Sgshapiro			error = dofilewrite(td, fp, uap->fd, uap->buf,
363244833Sgshapiro				    uap->nbyte, uap->offset, FOF_OFFSET);
364244833Sgshapiro		} else {
365244833Sgshapiro			error = ESPIPE;
366244833Sgshapiro		}
367244833Sgshapiro		fdrop(fp, td);
368244833Sgshapiro	} else {
369244833Sgshapiro		error = EBADF;	/* this can't be right */
370244833Sgshapiro	}
371244833Sgshapiro	return(error);
372244833Sgshapiro}
373244833Sgshapiro
374244833Sgshapirostatic int
375244833Sgshapirodofilewrite(td, fp, fd, buf, nbyte, offset, flags)
376244833Sgshapiro	struct thread *td;
377244833Sgshapiro	struct file *fp;
378244833Sgshapiro	int fd, flags;
379244833Sgshapiro	const void *buf;
380244833Sgshapiro	size_t nbyte;
381244833Sgshapiro	off_t offset;
382244833Sgshapiro{
383244833Sgshapiro	struct uio auio;
384244833Sgshapiro	struct iovec aiov;
385244833Sgshapiro	long cnt, error = 0;
386244833Sgshapiro#ifdef KTRACE
387244833Sgshapiro	struct iovec ktriov;
388244833Sgshapiro	struct uio ktruio;
389244833Sgshapiro	int didktr = 0;
390244833Sgshapiro#endif
391244833Sgshapiro
392244833Sgshapiro	aiov.iov_base = (void *)(uintptr_t)buf;
393244833Sgshapiro	aiov.iov_len = nbyte;
394244833Sgshapiro	auio.uio_iov = &aiov;
395244833Sgshapiro	auio.uio_iovcnt = 1;
396244833Sgshapiro	auio.uio_offset = offset;
397244833Sgshapiro	if (nbyte > INT_MAX)
398244833Sgshapiro		return (EINVAL);
399244833Sgshapiro	auio.uio_resid = nbyte;
400244833Sgshapiro	auio.uio_rw = UIO_WRITE;
401244833Sgshapiro	auio.uio_segflg = UIO_USERSPACE;
402244833Sgshapiro	auio.uio_td = td;
403244833Sgshapiro#ifdef KTRACE
404244833Sgshapiro	/*
405244833Sgshapiro	 * if tracing, save a copy of iovec and uio
406244833Sgshapiro	 */
407244833Sgshapiro	if (KTRPOINT(td, KTR_GENIO)) {
408244833Sgshapiro		ktriov = aiov;
409244833Sgshapiro		ktruio = auio;
410244833Sgshapiro		didktr = 1;
411244833Sgshapiro	}
412244833Sgshapiro#endif
413244833Sgshapiro	cnt = nbyte;
414244833Sgshapiro	if (fp->f_type == DTYPE_VNODE)
415244833Sgshapiro		bwillwrite();
416244833Sgshapiro	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
417244833Sgshapiro		if (auio.uio_resid != cnt && (error == ERESTART ||
418244833Sgshapiro		    error == EINTR || error == EWOULDBLOCK))
419244833Sgshapiro			error = 0;
420244833Sgshapiro		/* Socket layer is responsible for issuing SIGPIPE. */
421244833Sgshapiro		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
422244833Sgshapiro			PROC_LOCK(td->td_proc);
423244833Sgshapiro			psignal(td->td_proc, SIGPIPE);
424244833Sgshapiro			PROC_UNLOCK(td->td_proc);
425223067Sgshapiro		}
426223067Sgshapiro	}
427223067Sgshapiro	cnt -= auio.uio_resid;
428223067Sgshapiro#ifdef KTRACE
429223067Sgshapiro	if (didktr && error == 0) {
430223067Sgshapiro		ktruio.uio_iov = &ktriov;
431223067Sgshapiro		ktruio.uio_resid = cnt;
432223067Sgshapiro		ktrgenio(fd, UIO_WRITE, &ktruio, error);
433223067Sgshapiro	}
434223067Sgshapiro#endif
435223067Sgshapiro	td->td_retval[0] = cnt;
436223067Sgshapiro	return (error);
437223067Sgshapiro}
438223067Sgshapiro
439223067Sgshapiro/*
440223067Sgshapiro * Gather write system call
441223067Sgshapiro */
442223067Sgshapiro#ifndef _SYS_SYSPROTO_H_
443223067Sgshapirostruct writev_args {
444223067Sgshapiro	int	fd;
445223067Sgshapiro	struct	iovec *iovp;
446223067Sgshapiro	u_int	iovcnt;
447223067Sgshapiro};
448223067Sgshapiro#endif
449223067Sgshapiro/*
450223067Sgshapiro * MPSAFE
451223067Sgshapiro */
452223067Sgshapiroint
453223067Sgshapirowritev(td, uap)
454223067Sgshapiro	struct thread *td;
455223067Sgshapiro	register struct writev_args *uap;
456223067Sgshapiro{
457223067Sgshapiro	struct file *fp;
458223067Sgshapiro	struct uio auio;
459223067Sgshapiro	register struct iovec *iov;
460223067Sgshapiro	struct iovec *needfree;
461223067Sgshapiro	struct iovec aiov[UIO_SMALLIOV];
462223067Sgshapiro	long i, cnt, error = 0;
463223067Sgshapiro	u_int iovlen;
464223067Sgshapiro#ifdef KTRACE
465223067Sgshapiro	struct iovec *ktriov = NULL;
466223067Sgshapiro	struct uio ktruio;
467223067Sgshapiro#endif
468223067Sgshapiro
469223067Sgshapiro	mtx_lock(&Giant);
470223067Sgshapiro	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
471223067Sgshapiro		error = EBADF;
472223067Sgshapiro		goto done2;
473223067Sgshapiro	}
474223067Sgshapiro	/* note: can't use iovlen until iovcnt is validated */
475223067Sgshapiro	iovlen = uap->iovcnt * sizeof (struct iovec);
476223067Sgshapiro	if (uap->iovcnt > UIO_SMALLIOV) {
477223067Sgshapiro		if (uap->iovcnt > UIO_MAXIOV) {
478223067Sgshapiro			needfree = NULL;
479223067Sgshapiro			error = EINVAL;
480223067Sgshapiro			goto done;
481223067Sgshapiro		}
482223067Sgshapiro		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
483223067Sgshapiro		needfree = iov;
484223067Sgshapiro	} else {
485223067Sgshapiro		iov = aiov;
486223067Sgshapiro		needfree = NULL;
487223067Sgshapiro	}
488223067Sgshapiro	auio.uio_iov = iov;
489223067Sgshapiro	auio.uio_iovcnt = uap->iovcnt;
490223067Sgshapiro	auio.uio_rw = UIO_WRITE;
491223067Sgshapiro	auio.uio_segflg = UIO_USERSPACE;
492223067Sgshapiro	auio.uio_td = td;
493223067Sgshapiro	auio.uio_offset = -1;
494223067Sgshapiro	if ((error = copyin(uap->iovp, iov, iovlen)))
495223067Sgshapiro		goto done;
496223067Sgshapiro	auio.uio_resid = 0;
497223067Sgshapiro	for (i = 0; i < uap->iovcnt; i++) {
498223067Sgshapiro		if (iov->iov_len > INT_MAX - auio.uio_resid) {
499223067Sgshapiro			error = EINVAL;
500223067Sgshapiro			goto done;
501223067Sgshapiro		}
502223067Sgshapiro		auio.uio_resid += iov->iov_len;
503223067Sgshapiro		iov++;
504223067Sgshapiro	}
505223067Sgshapiro#ifdef KTRACE
506223067Sgshapiro	/*
507223067Sgshapiro	 * if tracing, save a copy of iovec and uio
508223067Sgshapiro	 */
509223067Sgshapiro	if (KTRPOINT(td, KTR_GENIO))  {
510223067Sgshapiro		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
511223067Sgshapiro		bcopy(auio.uio_iov, ktriov, iovlen);
512223067Sgshapiro		ktruio = auio;
513223067Sgshapiro	}
514223067Sgshapiro#endif
515223067Sgshapiro	cnt = auio.uio_resid;
516223067Sgshapiro	if (fp->f_type == DTYPE_VNODE)
517223067Sgshapiro		bwillwrite();
518223067Sgshapiro	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
519223067Sgshapiro		if (auio.uio_resid != cnt && (error == ERESTART ||
520223067Sgshapiro		    error == EINTR || error == EWOULDBLOCK))
521223067Sgshapiro			error = 0;
522223067Sgshapiro		if (error == EPIPE) {
523223067Sgshapiro			PROC_LOCK(td->td_proc);
524223067Sgshapiro			psignal(td->td_proc, SIGPIPE);
525223067Sgshapiro			PROC_UNLOCK(td->td_proc);
526223067Sgshapiro		}
527223067Sgshapiro	}
528223067Sgshapiro	cnt -= auio.uio_resid;
529223067Sgshapiro#ifdef KTRACE
530223067Sgshapiro	if (ktriov != NULL) {
531223067Sgshapiro		if (error == 0) {
532223067Sgshapiro			ktruio.uio_iov = ktriov;
533244833Sgshapiro			ktruio.uio_resid = cnt;
534223067Sgshapiro			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
535223067Sgshapiro		}
536223067Sgshapiro		FREE(ktriov, M_TEMP);
537223067Sgshapiro	}
538223067Sgshapiro#endif
539223067Sgshapiro	td->td_retval[0] = cnt;
540223067Sgshapirodone:
541223067Sgshapiro	fdrop(fp, td);
542223067Sgshapiro	if (needfree)
543223067Sgshapiro		FREE(needfree, M_IOV);
544223067Sgshapirodone2:
545223067Sgshapiro	mtx_unlock(&Giant);
546223067Sgshapiro	return (error);
547223067Sgshapiro}
548223067Sgshapiro
549223067Sgshapiro/*
550223067Sgshapiro * Ioctl system call
551223067Sgshapiro */
552223067Sgshapiro#ifndef _SYS_SYSPROTO_H_
553223067Sgshapirostruct ioctl_args {
554223067Sgshapiro	int	fd;
555223067Sgshapiro	u_long	com;
556223067Sgshapiro	caddr_t	data;
557223067Sgshapiro};
558223067Sgshapiro#endif
559223067Sgshapiro/*
560223067Sgshapiro * MPSAFE
561223067Sgshapiro */
562223067Sgshapiro/* ARGSUSED */
563223067Sgshapiroint
564223067Sgshapiroioctl(td, uap)
565223067Sgshapiro	struct thread *td;
566223067Sgshapiro	register struct ioctl_args *uap;
567223067Sgshapiro{
568223067Sgshapiro	struct file *fp;
569223067Sgshapiro	register struct filedesc *fdp;
570223067Sgshapiro	register u_long com;
571223067Sgshapiro	int error = 0;
572223067Sgshapiro	register u_int size;
573223067Sgshapiro	caddr_t data, memp;
574223067Sgshapiro	int tmp;
575223067Sgshapiro#define STK_PARAMS	128
576223067Sgshapiro	union {
577223067Sgshapiro	    char stkbuf[STK_PARAMS];
578223067Sgshapiro	    long align;
579223067Sgshapiro	} ubuf;
580223067Sgshapiro
581223067Sgshapiro	if ((error = fget(td, uap->fd, &fp)) != 0)
582223067Sgshapiro		return (error);
583223067Sgshapiro	mtx_lock(&Giant);
584223067Sgshapiro	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
585223067Sgshapiro		fdrop(fp, td);
586223067Sgshapiro		mtx_unlock(&Giant);
587223067Sgshapiro		return (EBADF);
588223067Sgshapiro	}
589223067Sgshapiro	fdp = td->td_proc->p_fd;
590223067Sgshapiro	switch (com = uap->com) {
591223067Sgshapiro	case FIONCLEX:
592223067Sgshapiro		FILEDESC_LOCK(fdp);
593223067Sgshapiro		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
594223067Sgshapiro		FILEDESC_UNLOCK(fdp);
595223067Sgshapiro		fdrop(fp, td);
596223067Sgshapiro		mtx_unlock(&Giant);
597223067Sgshapiro		return (0);
598223067Sgshapiro	case FIOCLEX:
599223067Sgshapiro		FILEDESC_LOCK(fdp);
600223067Sgshapiro		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
601223067Sgshapiro		FILEDESC_UNLOCK(fdp);
602223067Sgshapiro		fdrop(fp, td);
603223067Sgshapiro		mtx_unlock(&Giant);
604223067Sgshapiro		return (0);
605223067Sgshapiro	}
606223067Sgshapiro
607223067Sgshapiro	/*
608223067Sgshapiro	 * Interpret high order word to find amount of data to be
609223067Sgshapiro	 * copied to/from the user's address space.
610223067Sgshapiro	 */
611223067Sgshapiro	size = IOCPARM_LEN(com);
612223067Sgshapiro	if (size > IOCPARM_MAX) {
613223067Sgshapiro		fdrop(fp, td);
614223067Sgshapiro		mtx_unlock(&Giant);
615223067Sgshapiro		return (ENOTTY);
616223067Sgshapiro	}
617223067Sgshapiro
618223067Sgshapiro	memp = NULL;
619223067Sgshapiro	if (size > sizeof (ubuf.stkbuf)) {
620223067Sgshapiro		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
621223067Sgshapiro		data = memp;
622223067Sgshapiro	} else {
623223067Sgshapiro		data = ubuf.stkbuf;
624223067Sgshapiro	}
625223067Sgshapiro	if (com&IOC_IN) {
626223067Sgshapiro		if (size) {
627223067Sgshapiro			error = copyin(uap->data, data, (u_int)size);
628223067Sgshapiro			if (error) {
629223067Sgshapiro				if (memp)
630223067Sgshapiro					free(memp, M_IOCTLOPS);
631223067Sgshapiro				fdrop(fp, td);
632223067Sgshapiro				goto done;
633223067Sgshapiro			}
634223067Sgshapiro		} else {
635223067Sgshapiro			*(caddr_t *)data = uap->data;
636223067Sgshapiro		}
637223067Sgshapiro	} else if ((com&IOC_OUT) && size) {
638223067Sgshapiro		/*
639223067Sgshapiro		 * Zero the buffer so the user always
640223067Sgshapiro		 * gets back something deterministic.
641203004Sgshapiro		 */
642203004Sgshapiro		bzero(data, size);
643203004Sgshapiro	} else if (com&IOC_VOID) {
644203004Sgshapiro		*(caddr_t *)data = uap->data;
645203004Sgshapiro	}
646203004Sgshapiro
647203004Sgshapiro	switch (com) {
648203004Sgshapiro
649203004Sgshapiro	case FIONBIO:
650203004Sgshapiro		FILE_LOCK(fp);
651203004Sgshapiro		if ((tmp = *(int *)data))
652203004Sgshapiro			fp->f_flag |= FNONBLOCK;
653203004Sgshapiro		else
654203004Sgshapiro			fp->f_flag &= ~FNONBLOCK;
655203004Sgshapiro		FILE_UNLOCK(fp);
656203004Sgshapiro		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
657203004Sgshapiro		break;
658203004Sgshapiro
659203004Sgshapiro	case FIOASYNC:
660203004Sgshapiro		FILE_LOCK(fp);
661203004Sgshapiro		if ((tmp = *(int *)data))
662203004Sgshapiro			fp->f_flag |= FASYNC;
663203004Sgshapiro		else
664203004Sgshapiro			fp->f_flag &= ~FASYNC;
665203004Sgshapiro		FILE_UNLOCK(fp);
666203004Sgshapiro		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
667203004Sgshapiro		break;
668203004Sgshapiro
669203004Sgshapiro	default:
670203004Sgshapiro		error = fo_ioctl(fp, com, data, td->td_ucred, td);
671203004Sgshapiro		/*
672203004Sgshapiro		 * Copy any data to user, size was
673203004Sgshapiro		 * already set and checked above.
674203004Sgshapiro		 */
675203004Sgshapiro		if (error == 0 && (com&IOC_OUT) && size)
676203004Sgshapiro			error = copyout(data, uap->data, (u_int)size);
677203004Sgshapiro		break;
678203004Sgshapiro	}
679203004Sgshapiro	if (memp)
680203004Sgshapiro		free(memp, M_IOCTLOPS);
681203004Sgshapiro	fdrop(fp, td);
682203004Sgshapirodone:
683203004Sgshapiro	mtx_unlock(&Giant);
684203004Sgshapiro	return (error);
685203004Sgshapiro}
686203004Sgshapiro
687203004Sgshapiro/*
688203004Sgshapiro * sellock and selwait are initialized in selectinit() via SYSINIT.
689203004Sgshapiro */
690203004Sgshapirostruct mtx	sellock;
691203004Sgshapirostruct cv	selwait;
692203004Sgshapirou_int		nselcoll;	/* Select collisions since boot */
693203004SgshapiroSYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
694203004Sgshapiro
695203004Sgshapiro/*
696203004Sgshapiro * Select system call.
697203004Sgshapiro */
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument block for select().  Only declared here when <sys/sysproto.h>
 * has not already been seen.
 */
struct select_args {
	int	nd;			/* number of descriptors to examine */
	fd_set	*in, *ou, *ex;		/* user-space read/write/except sets */
	struct	timeval *tv;		/* user-space timeout, or NULL */
};
#endif /* !_SYS_SYSPROTO_H_ */
705203004Sgshapiro/*
706203004Sgshapiro * MPSAFE
707203004Sgshapiro */
708203004Sgshapiroint
709203004Sgshapiroselect(td, uap)
710203004Sgshapiro	register struct thread *td;
711203004Sgshapiro	register struct select_args *uap;
712203004Sgshapiro{
713203004Sgshapiro	struct timeval tv, *tvp;
714182352Sgshapiro	int error;
715182352Sgshapiro
716182352Sgshapiro	if (uap->tv != NULL) {
717182352Sgshapiro		error = copyin(uap->tv, &tv, sizeof(tv));
718182352Sgshapiro		if (error)
719182352Sgshapiro			return (error);
720182352Sgshapiro		tvp = &tv;
721182352Sgshapiro	} else
722182352Sgshapiro		tvp = NULL;
723182352Sgshapiro
724182352Sgshapiro	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
725182352Sgshapiro}
726182352Sgshapiro
727182352Sgshapiroint
728182352Sgshapirokern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
729182352Sgshapiro    fd_set *fd_ex, struct timeval *tvp)
730182352Sgshapiro{
731182352Sgshapiro	struct filedesc *fdp;
732182352Sgshapiro	/*
733182352Sgshapiro	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
734182352Sgshapiro	 * infds with the new FD_SETSIZE of 1024, and more than enough for
735182352Sgshapiro	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
736182352Sgshapiro	 * of 256.
737182352Sgshapiro	 */
738182352Sgshapiro	fd_mask s_selbits[howmany(2048, NFDBITS)];
739182352Sgshapiro	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
740182352Sgshapiro	struct timeval atv, rtv, ttv;
741182352Sgshapiro	int error, timo;
742182352Sgshapiro	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
743182352Sgshapiro
744182352Sgshapiro	if (nd < 0)
745182352Sgshapiro		return (EINVAL);
746182352Sgshapiro	fdp = td->td_proc->p_fd;
747182352Sgshapiro	mtx_lock(&Giant);
748182352Sgshapiro	FILEDESC_LOCK(fdp);
749182352Sgshapiro
750182352Sgshapiro	if (nd > td->td_proc->p_fd->fd_nfiles)
751182352Sgshapiro		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
752182352Sgshapiro	FILEDESC_UNLOCK(fdp);
753182352Sgshapiro
754182352Sgshapiro	/*
755182352Sgshapiro	 * Allocate just enough bits for the non-null fd_sets.  Use the
756182352Sgshapiro	 * preallocated auto buffer if possible.
757182352Sgshapiro	 */
758182352Sgshapiro	nfdbits = roundup(nd, NFDBITS);
759182352Sgshapiro	ncpbytes = nfdbits / NBBY;
760182352Sgshapiro	nbufbytes = 0;
761182352Sgshapiro	if (fd_in != NULL)
762182352Sgshapiro		nbufbytes += 2 * ncpbytes;
763182352Sgshapiro	if (fd_ou != NULL)
764182352Sgshapiro		nbufbytes += 2 * ncpbytes;
765182352Sgshapiro	if (fd_ex != NULL)
766182352Sgshapiro		nbufbytes += 2 * ncpbytes;
767182352Sgshapiro	if (nbufbytes <= sizeof s_selbits)
768182352Sgshapiro		selbits = &s_selbits[0];
769182352Sgshapiro	else
770182352Sgshapiro		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
771182352Sgshapiro
772182352Sgshapiro	/*
773182352Sgshapiro	 * Assign pointers into the bit buffers and fetch the input bits.
774182352Sgshapiro	 * Put the output buffers together so that they can be bzeroed
775182352Sgshapiro	 * together.
776182352Sgshapiro	 */
777168515Sgshapiro	sbp = selbits;
778168515Sgshapiro#define	getbits(name, x) \
779168515Sgshapiro	do {								\
780168515Sgshapiro		if (name == NULL)					\
781168515Sgshapiro			ibits[x] = NULL;				\
782168515Sgshapiro		else {							\
783168515Sgshapiro			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
784168515Sgshapiro			obits[x] = sbp;					\
785168515Sgshapiro			sbp += ncpbytes / sizeof *sbp;			\
786168515Sgshapiro			error = copyin(name, ibits[x], ncpbytes);	\
787168515Sgshapiro			if (error != 0)					\
788168515Sgshapiro				goto done_nosellock;			\
789168515Sgshapiro		}							\
790168515Sgshapiro	} while (0)
791168515Sgshapiro	getbits(fd_in, 0);
792168515Sgshapiro	getbits(fd_ou, 1);
793168515Sgshapiro	getbits(fd_ex, 2);
794168515Sgshapiro#undef	getbits
795168515Sgshapiro	if (nbufbytes != 0)
796168515Sgshapiro		bzero(selbits, nbufbytes / 2);
797168515Sgshapiro
798168515Sgshapiro	if (tvp != NULL) {
799168515Sgshapiro		atv = *tvp;
800168515Sgshapiro		if (itimerfix(&atv)) {
801168515Sgshapiro			error = EINVAL;
802168515Sgshapiro			goto done_nosellock;
803168515Sgshapiro		}
804168515Sgshapiro		getmicrouptime(&rtv);
805168515Sgshapiro		timevaladd(&atv, &rtv);
806168515Sgshapiro	} else {
807168515Sgshapiro		atv.tv_sec = 0;
808168515Sgshapiro		atv.tv_usec = 0;
809168515Sgshapiro	}
810168515Sgshapiro	timo = 0;
811168515Sgshapiro	TAILQ_INIT(&td->td_selq);
812168515Sgshapiro	mtx_lock(&sellock);
813168515Sgshapiroretry:
814168515Sgshapiro	ncoll = nselcoll;
815168515Sgshapiro	mtx_lock_spin(&sched_lock);
816168515Sgshapiro	td->td_flags |= TDF_SELECT;
817168515Sgshapiro	mtx_unlock_spin(&sched_lock);
818168515Sgshapiro	mtx_unlock(&sellock);
819168515Sgshapiro
820168515Sgshapiro	error = selscan(td, ibits, obits, nd);
821168515Sgshapiro	mtx_lock(&sellock);
822168515Sgshapiro	if (error || td->td_retval[0])
823168515Sgshapiro		goto done;
824168515Sgshapiro	if (atv.tv_sec || atv.tv_usec) {
825168515Sgshapiro		getmicrouptime(&rtv);
826168515Sgshapiro		if (timevalcmp(&rtv, &atv, >=))
827168515Sgshapiro			goto done;
828168515Sgshapiro		ttv = atv;
829168515Sgshapiro		timevalsub(&ttv, &rtv);
830168515Sgshapiro		timo = ttv.tv_sec > 24 * 60 * 60 ?
831168515Sgshapiro		    24 * 60 * 60 * hz : tvtohz(&ttv);
832168515Sgshapiro	}
833168515Sgshapiro
834168515Sgshapiro	/*
835168515Sgshapiro	 * An event of interest may occur while we do not hold
836168515Sgshapiro	 * sellock, so check TDF_SELECT and the number of
837168515Sgshapiro	 * collisions and rescan the file descriptors if
838168515Sgshapiro	 * necessary.
839168515Sgshapiro	 */
840168515Sgshapiro	mtx_lock_spin(&sched_lock);
841168515Sgshapiro	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
842168515Sgshapiro		mtx_unlock_spin(&sched_lock);
843168515Sgshapiro		goto retry;
844168515Sgshapiro	}
845168515Sgshapiro	mtx_unlock_spin(&sched_lock);
846168515Sgshapiro
847168515Sgshapiro	if (timo > 0)
848168515Sgshapiro		error = cv_timedwait_sig(&selwait, &sellock, timo);
849168515Sgshapiro	else
850168515Sgshapiro		error = cv_wait_sig(&selwait, &sellock);
851168515Sgshapiro
852168515Sgshapiro	if (error == 0)
853157001Sgshapiro		goto retry;
854157001Sgshapiro
855157001Sgshapirodone:
856157001Sgshapiro	clear_selinfo_list(td);
857157001Sgshapiro	mtx_lock_spin(&sched_lock);
858159609Sgshapiro	td->td_flags &= ~TDF_SELECT;
859157001Sgshapiro	mtx_unlock_spin(&sched_lock);
860157001Sgshapiro	mtx_unlock(&sellock);
861157001Sgshapiro
862157001Sgshapirodone_nosellock:
863157001Sgshapiro	/* select is not restarted after signals... */
864157001Sgshapiro	if (error == ERESTART)
865157001Sgshapiro		error = EINTR;
866157001Sgshapiro	if (error == EWOULDBLOCK)
867157001Sgshapiro		error = 0;
868157001Sgshapiro#define	putbits(name, x) \
869157001Sgshapiro	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
870157001Sgshapiro		error = error2;
871157001Sgshapiro	if (error == 0) {
872157001Sgshapiro		int error2;
873157001Sgshapiro
874157001Sgshapiro		putbits(fd_in, 0);
875157001Sgshapiro		putbits(fd_ou, 1);
876157001Sgshapiro		putbits(fd_ex, 2);
877157001Sgshapiro#undef putbits
878157001Sgshapiro	}
879157001Sgshapiro	if (selbits != &s_selbits[0])
880157001Sgshapiro		free(selbits, M_SELECT);
881157001Sgshapiro
882157001Sgshapiro	mtx_unlock(&Giant);
883157001Sgshapiro	return (error);
884157001Sgshapiro}
885157001Sgshapiro
886157001Sgshapirostatic int
887157001Sgshapiroselscan(td, ibits, obits, nfd)
888157001Sgshapiro	struct thread *td;
889157001Sgshapiro	fd_mask **ibits, **obits;
890157001Sgshapiro	int nfd;
891157001Sgshapiro{
892157001Sgshapiro	int msk, i, fd;
893157001Sgshapiro	fd_mask bits;
894157001Sgshapiro	struct file *fp;
895157001Sgshapiro	int n = 0;
896157001Sgshapiro	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
897157001Sgshapiro	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
898157001Sgshapiro	struct filedesc *fdp = td->td_proc->p_fd;
899157001Sgshapiro
900157001Sgshapiro	FILEDESC_LOCK(fdp);
901157001Sgshapiro	for (msk = 0; msk < 3; msk++) {
902157001Sgshapiro		if (ibits[msk] == NULL)
903157001Sgshapiro			continue;
904157001Sgshapiro		for (i = 0; i < nfd; i += NFDBITS) {
905157001Sgshapiro			bits = ibits[msk][i/NFDBITS];
906157001Sgshapiro			/* ffs(int mask) not portable, fd_mask is long */
907157001Sgshapiro			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
908157001Sgshapiro				if (!(bits & 1))
909157001Sgshapiro					continue;
910157001Sgshapiro				if ((fp = fget_locked(fdp, fd)) == NULL) {
911157001Sgshapiro					FILEDESC_UNLOCK(fdp);
912157001Sgshapiro					return (EBADF);
913157001Sgshapiro				}
914157001Sgshapiro				if (fo_poll(fp, flag[msk], td->td_ucred,
915157001Sgshapiro				    td)) {
916157001Sgshapiro					obits[msk][(fd)/NFDBITS] |=
917157001Sgshapiro					    ((fd_mask)1 << ((fd) % NFDBITS));
918157001Sgshapiro					n++;
919157001Sgshapiro				}
920157001Sgshapiro			}
921157001Sgshapiro		}
922157001Sgshapiro	}
923157001Sgshapiro	FILEDESC_UNLOCK(fdp);
924157001Sgshapiro	td->td_retval[0] = n;
925157001Sgshapiro	return (0);
926157001Sgshapiro}
927157001Sgshapiro
928157001Sgshapiro/*
929157001Sgshapiro * Poll system call.
930141858Sgshapiro */
931141858Sgshapiro#ifndef _SYS_SYSPROTO_H_
932141858Sgshapirostruct poll_args {
933141858Sgshapiro	struct pollfd *fds;
934141858Sgshapiro	u_int	nfds;
935159609Sgshapiro	int	timeout;
936141858Sgshapiro};
937141858Sgshapiro#endif
938141858Sgshapiro/*
939141858Sgshapiro * MPSAFE
940141858Sgshapiro */
941141858Sgshapiroint
942141858Sgshapiropoll(td, uap)
943141858Sgshapiro	struct thread *td;
944141858Sgshapiro	struct poll_args *uap;
945141858Sgshapiro{
946141858Sgshapiro	caddr_t bits;
947141858Sgshapiro	char smallbits[32 * sizeof(struct pollfd)];
948141858Sgshapiro	struct timeval atv, rtv, ttv;
949141858Sgshapiro	int error = 0, timo;
950141858Sgshapiro	u_int ncoll, nfds;
951141858Sgshapiro	size_t ni;
952141858Sgshapiro
953141858Sgshapiro	nfds = uap->nfds;
954141858Sgshapiro
955141858Sgshapiro	mtx_lock(&Giant);
956141858Sgshapiro	/*
957141858Sgshapiro	 * This is kinda bogus.  We have fd limits, but that is not
958141858Sgshapiro	 * really related to the size of the pollfd array.  Make sure
959141858Sgshapiro	 * we let the process use at least FD_SETSIZE entries and at
960141858Sgshapiro	 * least enough for the current limits.  We want to be reasonably
961141858Sgshapiro	 * safe, but not overly restrictive.
962141858Sgshapiro	 */
963141858Sgshapiro	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
964141858Sgshapiro	    (nfds > FD_SETSIZE)) {
965141858Sgshapiro		error = EINVAL;
966141858Sgshapiro		goto done2;
967141858Sgshapiro	}
968141858Sgshapiro	ni = nfds * sizeof(struct pollfd);
969141858Sgshapiro	if (ni > sizeof(smallbits))
970141858Sgshapiro		bits = malloc(ni, M_TEMP, M_WAITOK);
971141858Sgshapiro	else
972141858Sgshapiro		bits = smallbits;
973141858Sgshapiro	error = copyin(uap->fds, bits, ni);
974141858Sgshapiro	if (error)
975141858Sgshapiro		goto done_nosellock;
976141858Sgshapiro	if (uap->timeout != INFTIM) {
977141858Sgshapiro		atv.tv_sec = uap->timeout / 1000;
978141858Sgshapiro		atv.tv_usec = (uap->timeout % 1000) * 1000;
979141858Sgshapiro		if (itimerfix(&atv)) {
980141858Sgshapiro			error = EINVAL;
981141858Sgshapiro			goto done_nosellock;
982159609Sgshapiro		}
983159609Sgshapiro		getmicrouptime(&rtv);
984159609Sgshapiro		timevaladd(&atv, &rtv);
985159609Sgshapiro	} else {
986159609Sgshapiro		atv.tv_sec = 0;
987159609Sgshapiro		atv.tv_usec = 0;
988159609Sgshapiro	}
989159609Sgshapiro	timo = 0;
990159609Sgshapiro	TAILQ_INIT(&td->td_selq);
991159609Sgshapiro	mtx_lock(&sellock);
992159609Sgshapiroretry:
993159609Sgshapiro	ncoll = nselcoll;
994159609Sgshapiro	mtx_lock_spin(&sched_lock);
995159609Sgshapiro	td->td_flags |= TDF_SELECT;
996159609Sgshapiro	mtx_unlock_spin(&sched_lock);
997159609Sgshapiro	mtx_unlock(&sellock);
998159609Sgshapiro
999159609Sgshapiro	error = pollscan(td, (struct pollfd *)bits, nfds);
1000159609Sgshapiro	mtx_lock(&sellock);
1001159609Sgshapiro	if (error || td->td_retval[0])
1002159609Sgshapiro		goto done;
1003159609Sgshapiro	if (atv.tv_sec || atv.tv_usec) {
1004159609Sgshapiro		getmicrouptime(&rtv);
1005159609Sgshapiro		if (timevalcmp(&rtv, &atv, >=))
1006159609Sgshapiro			goto done;
1007159609Sgshapiro		ttv = atv;
1008159609Sgshapiro		timevalsub(&ttv, &rtv);
1009159609Sgshapiro		timo = ttv.tv_sec > 24 * 60 * 60 ?
1010159609Sgshapiro		    24 * 60 * 60 * hz : tvtohz(&ttv);
1011159609Sgshapiro	}
1012159609Sgshapiro	/*
1013159609Sgshapiro	 * An event of interest may occur while we do not hold
1014159609Sgshapiro	 * sellock, so check TDF_SELECT and the number of collisions
1015159609Sgshapiro	 * and rescan the file descriptors if necessary.
1016159609Sgshapiro	 */
1017159609Sgshapiro	mtx_lock_spin(&sched_lock);
1018159609Sgshapiro	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
1019159609Sgshapiro		mtx_unlock_spin(&sched_lock);
1020159609Sgshapiro		goto retry;
1021159609Sgshapiro	}
1022159609Sgshapiro	mtx_unlock_spin(&sched_lock);
1023141858Sgshapiro
1024141858Sgshapiro	if (timo > 0)
1025141858Sgshapiro		error = cv_timedwait_sig(&selwait, &sellock, timo);
1026125820Sgshapiro	else
1027125820Sgshapiro		error = cv_wait_sig(&selwait, &sellock);
1028125820Sgshapiro
1029125820Sgshapiro	if (error == 0)
1030125820Sgshapiro		goto retry;
1031159609Sgshapiro
1032125820Sgshapirodone:
1033125820Sgshapiro	clear_selinfo_list(td);
1034125820Sgshapiro	mtx_lock_spin(&sched_lock);
1035125820Sgshapiro	td->td_flags &= ~TDF_SELECT;
1036125820Sgshapiro	mtx_unlock_spin(&sched_lock);
1037125820Sgshapiro	mtx_unlock(&sellock);
1038125820Sgshapiro
1039125820Sgshapirodone_nosellock:
1040125820Sgshapiro	/* poll is not restarted after signals... */
1041125820Sgshapiro	if (error == ERESTART)
1042125820Sgshapiro		error = EINTR;
1043125820Sgshapiro	if (error == EWOULDBLOCK)
1044125820Sgshapiro		error = 0;
1045125820Sgshapiro	if (error == 0) {
1046125820Sgshapiro		error = copyout(bits, uap->fds, ni);
1047125820Sgshapiro		if (error)
1048125820Sgshapiro			goto out;
1049125820Sgshapiro	}
1050125820Sgshapiroout:
1051125820Sgshapiro	if (ni > sizeof(smallbits))
1052125820Sgshapiro		free(bits, M_TEMP);
1053125820Sgshapirodone2:
1054125820Sgshapiro	mtx_unlock(&Giant);
1055125820Sgshapiro	return (error);
1056125820Sgshapiro}
1057125820Sgshapiro
1058125820Sgshapirostatic int
1059125820Sgshapiropollscan(td, fds, nfd)
1060125820Sgshapiro	struct thread *td;
1061125820Sgshapiro	struct pollfd *fds;
1062125820Sgshapiro	u_int nfd;
1063125820Sgshapiro{
1064125820Sgshapiro	register struct filedesc *fdp = td->td_proc->p_fd;
1065125820Sgshapiro	int i;
1066125820Sgshapiro	struct file *fp;
1067125820Sgshapiro	int n = 0;
1068125820Sgshapiro
1069125820Sgshapiro	FILEDESC_LOCK(fdp);
1070125820Sgshapiro	for (i = 0; i < nfd; i++, fds++) {
1071125820Sgshapiro		if (fds->fd >= fdp->fd_nfiles) {
1072125820Sgshapiro			fds->revents = POLLNVAL;
1073125820Sgshapiro			n++;
1074125820Sgshapiro		} else if (fds->fd < 0) {
1075159609Sgshapiro			fds->revents = 0;
1076159609Sgshapiro		} else {
1077159609Sgshapiro			fp = fdp->fd_ofiles[fds->fd];
1078159609Sgshapiro			if (fp == NULL) {
1079159609Sgshapiro				fds->revents = POLLNVAL;
1080159609Sgshapiro				n++;
1081159609Sgshapiro			} else {
1082159609Sgshapiro				/*
1083159609Sgshapiro				 * Note: backend also returns POLLHUP and
1084159609Sgshapiro				 * POLLERR if appropriate.
1085159609Sgshapiro				 */
1086159609Sgshapiro				fds->revents = fo_poll(fp, fds->events,
1087159609Sgshapiro				    td->td_ucred, td);
1088159609Sgshapiro				if (fds->revents != 0)
1089125820Sgshapiro					n++;
1090159609Sgshapiro			}
1091159609Sgshapiro		}
1092159609Sgshapiro	}
1093159609Sgshapiro	FILEDESC_UNLOCK(fdp);
1094159609Sgshapiro	td->td_retval[0] = n;
1095159609Sgshapiro	return (0);
1096159609Sgshapiro}
1097159609Sgshapiro
1098159609Sgshapiro/*
1099159609Sgshapiro * OpenBSD poll system call.
1100159609Sgshapiro * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1101159609Sgshapiro */
1102159609Sgshapiro#ifndef _SYS_SYSPROTO_H_
1103159609Sgshapirostruct openbsd_poll_args {
1104159609Sgshapiro	struct pollfd *fds;
1105159609Sgshapiro	u_int	nfds;
1106159609Sgshapiro	int	timeout;
1107159609Sgshapiro};
1108159609Sgshapiro#endif
1109159609Sgshapiro/*
1110159609Sgshapiro * MPSAFE
1111159609Sgshapiro */
1112159609Sgshapiroint
1113159609Sgshapiroopenbsd_poll(td, uap)
1114125820Sgshapiro	register struct thread *td;
1115125820Sgshapiro	register struct openbsd_poll_args *uap;
1116125820Sgshapiro{
1117111823Sgshapiro	return (poll(td, (struct poll_args *)uap));
1118111823Sgshapiro}
1119111823Sgshapiro
1120111823Sgshapiro/*
1121111823Sgshapiro * Remove the references to the thread from all of the objects
1122159609Sgshapiro * we were polling.
1123111823Sgshapiro *
1124111823Sgshapiro * This code assumes that the underlying owner of the selinfo
1125111823Sgshapiro * structure will hold sellock before it changes it, and that
1126111823Sgshapiro * it will unlink itself from our list if it goes away.
1127111823Sgshapiro */
1128111823Sgshapirovoid
1129111823Sgshapiroclear_selinfo_list(td)
1130111823Sgshapiro	struct thread *td;
1131111823Sgshapiro{
1132111823Sgshapiro	struct selinfo *si;
1133111823Sgshapiro
1134111823Sgshapiro	mtx_assert(&sellock, MA_OWNED);
1135111823Sgshapiro	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1136111823Sgshapiro		si->si_thread = NULL;
1137111823Sgshapiro	TAILQ_INIT(&td->td_selq);
1138111823Sgshapiro}
1139111823Sgshapiro
1140111823Sgshapiro/*ARGSUSED*/
1141111823Sgshapiroint
1142111823Sgshapiroseltrue(dev, events, td)
1143111823Sgshapiro	dev_t dev;
1144111823Sgshapiro	int events;
1145111823Sgshapiro	struct thread *td;
1146111823Sgshapiro{
1147111823Sgshapiro
1148111823Sgshapiro	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1149111823Sgshapiro}
1150111823Sgshapiro
1151111823Sgshapiro/*
1152111823Sgshapiro * Record a select request.
1153111823Sgshapiro */
1154111823Sgshapirovoid
1155111823Sgshapiroselrecord(selector, sip)
1156111823Sgshapiro	struct thread *selector;
1157111823Sgshapiro	struct selinfo *sip;
1158111823Sgshapiro{
1159159609Sgshapiro
1160159609Sgshapiro	mtx_lock(&sellock);
1161159609Sgshapiro	/*
1162159609Sgshapiro	 * If the selinfo's thread pointer is NULL then take ownership of it.
1163159609Sgshapiro	 *
1164159609Sgshapiro	 * If the thread pointer is not NULL and it points to another
1165159609Sgshapiro	 * thread, then we have a collision.
1166159609Sgshapiro	 *
1167159609Sgshapiro	 * If the thread pointer is not NULL and points back to us then leave
1168159609Sgshapiro	 * it alone as we've already added pointed it at us and added it to
1169159609Sgshapiro	 * our list.
1170159609Sgshapiro	 */
1171159609Sgshapiro	if (sip->si_thread == NULL) {
1172159609Sgshapiro		sip->si_thread = selector;
1173159609Sgshapiro		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1174159609Sgshapiro	} else if (sip->si_thread != selector) {
1175159609Sgshapiro		sip->si_flags |= SI_COLL;
1176159609Sgshapiro	}
1177159609Sgshapiro
1178159609Sgshapiro	mtx_unlock(&sellock);
1179159609Sgshapiro}
1180159609Sgshapiro
1181159609Sgshapiro/*
1182159609Sgshapiro * Do a wakeup when a selectable event occurs.
1183159609Sgshapiro */
1184159609Sgshapirovoid
1185159609Sgshapiroselwakeup(sip)
1186159609Sgshapiro	struct selinfo *sip;
1187159609Sgshapiro{
1188159609Sgshapiro	struct thread *td;
1189159609Sgshapiro
1190159609Sgshapiro	mtx_lock(&sellock);
1191159609Sgshapiro	td = sip->si_thread;
1192159609Sgshapiro	if ((sip->si_flags & SI_COLL) != 0) {
1193159609Sgshapiro		nselcoll++;
1194159609Sgshapiro		sip->si_flags &= ~SI_COLL;
1195159609Sgshapiro		cv_broadcast(&selwait);
1196159609Sgshapiro	}
1197159609Sgshapiro	if (td == NULL) {
1198159609Sgshapiro		mtx_unlock(&sellock);
1199159609Sgshapiro		return;
1200159609Sgshapiro	}
1201159609Sgshapiro	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1202159609Sgshapiro	sip->si_thread = NULL;
1203159609Sgshapiro	mtx_lock_spin(&sched_lock);
1204159609Sgshapiro	if (td->td_wchan == &selwait) {
1205159609Sgshapiro		cv_waitq_remove(td);
1206159609Sgshapiro		TD_CLR_SLEEPING(td);
1207159609Sgshapiro		setrunnable(td);
1208159609Sgshapiro	} else
1209159609Sgshapiro		td->td_flags &= ~TDF_SELECT;
1210159609Sgshapiro	mtx_unlock_spin(&sched_lock);
1211159609Sgshapiro	mtx_unlock(&sellock);
1212159609Sgshapiro}
1213159609Sgshapiro
1214159609Sgshapirostatic void selectinit(void *);
1215159609SgshapiroSYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1216159609Sgshapiro
1217159609Sgshapiro/* ARGSUSED*/
1218159609Sgshapirostatic void
1219159609Sgshapiroselectinit(dummy)
1220159609Sgshapiro	void *dummy;
1221159609Sgshapiro{
1222159609Sgshapiro	cv_init(&selwait, "select");
1223159609Sgshapiro	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1224111823Sgshapiro}
1225111823Sgshapiro