Deleted Added
full compact
sys_generic.c (91972) sys_generic.c (92252)
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/sys_generic.c 91972 2002-03-09 22:44:37Z alfred $
39 * $FreeBSD: head/sys/kern/sys_generic.c 92252 2002-03-14 01:32:30Z alfred $
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/file.h>
51#include <sys/proc.h>
52#include <sys/signalvar.h>
53#include <sys/socketvar.h>
54#include <sys/uio.h>
55#include <sys/kernel.h>
56#include <sys/malloc.h>
57#include <sys/poll.h>
58#include <sys/resourcevar.h>
59#include <sys/selinfo.h>
60#include <sys/sysctl.h>
61#include <sys/sysent.h>
62#include <sys/bio.h>
63#include <sys/buf.h>
64#include <sys/condvar.h>
65#ifdef KTRACE
66#include <sys/ktrace.h>
67#endif
68#include <vm/vm.h>
69#include <vm/vm_page.h>
70
71#include <machine/limits.h>
72
73static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76
77static int pollscan(struct thread *, struct pollfd *, u_int);
78static int selscan(struct thread *, fd_mask **, fd_mask **, int);
79static int dofileread(struct thread *, struct file *, int, void *,
80 size_t, off_t, int);
81static int dofilewrite(struct thread *, struct file *, int,
82 const void *, size_t, off_t, int);
83
84/*
85 * Read system call.
86 */
87#ifndef _SYS_SYSPROTO_H_
88struct read_args {
89 int fd;
90 void *buf;
91 size_t nbyte;
92};
93#endif
94/*
95 * MPSAFE
96 */
97int
98read(td, uap)
99 struct thread *td;
100 struct read_args *uap;
101{
102 struct file *fp;
103 int error;
104
105 mtx_lock(&Giant);
106 if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 error = dofileread(td, fp, uap->fd, uap->buf,
108 uap->nbyte, (off_t)-1, 0);
109 fdrop(fp, td);
110 }
111 mtx_unlock(&Giant);
112 return(error);
113}
114
115/*
116 * Pread system call
117 */
118#ifndef _SYS_SYSPROTO_H_
119struct pread_args {
120 int fd;
121 void *buf;
122 size_t nbyte;
123 int pad;
124 off_t offset;
125};
126#endif
127/*
128 * MPSAFE
129 */
130int
131pread(td, uap)
132 struct thread *td;
133 struct pread_args *uap;
134{
135 struct file *fp;
136 int error;
137
138 if ((error = fget_read(td, uap->fd, &fp)) != 0)
139 return (error);
140 mtx_lock(&Giant);
141 if (fp->f_type != DTYPE_VNODE) {
142 error = ESPIPE;
143 } else {
144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 uap->offset, FOF_OFFSET);
146 }
147 fdrop(fp, td);
148 mtx_unlock(&Giant);
149 return(error);
150}
151
152/*
153 * Code common for read and pread
154 */
155int
156dofileread(td, fp, fd, buf, nbyte, offset, flags)
157 struct thread *td;
158 struct file *fp;
159 int fd, flags;
160 void *buf;
161 size_t nbyte;
162 off_t offset;
163{
164 struct uio auio;
165 struct iovec aiov;
166 long cnt, error = 0;
167#ifdef KTRACE
168 struct iovec ktriov;
169 struct uio ktruio;
170 int didktr = 0;
171#endif
172
173 aiov.iov_base = (caddr_t)buf;
174 aiov.iov_len = nbyte;
175 auio.uio_iov = &aiov;
176 auio.uio_iovcnt = 1;
177 auio.uio_offset = offset;
178 if (nbyte > INT_MAX)
179 return (EINVAL);
180 auio.uio_resid = nbyte;
181 auio.uio_rw = UIO_READ;
182 auio.uio_segflg = UIO_USERSPACE;
183 auio.uio_td = td;
184#ifdef KTRACE
185 /*
186 * if tracing, save a copy of iovec
187 */
188 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
189 ktriov = aiov;
190 ktruio = auio;
191 didktr = 1;
192 }
193#endif
194 cnt = nbyte;
195
196 if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
197 if (auio.uio_resid != cnt && (error == ERESTART ||
198 error == EINTR || error == EWOULDBLOCK))
199 error = 0;
200 }
201 cnt -= auio.uio_resid;
202#ifdef KTRACE
203 if (didktr && error == 0) {
204 ktruio.uio_iov = &ktriov;
205 ktruio.uio_resid = cnt;
206 ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
207 }
208#endif
209 td->td_retval[0] = cnt;
210 return (error);
211}
212
213/*
214 * Scatter read system call.
215 */
216#ifndef _SYS_SYSPROTO_H_
217struct readv_args {
218 int fd;
219 struct iovec *iovp;
220 u_int iovcnt;
221};
222#endif
223/*
224 * MPSAFE
225 */
226int
227readv(td, uap)
228 struct thread *td;
229 struct readv_args *uap;
230{
231 struct file *fp;
232 struct uio auio;
233 struct iovec *iov;
234 struct iovec *needfree;
235 struct iovec aiov[UIO_SMALLIOV];
236 long i, cnt, error = 0;
237 u_int iovlen;
238#ifdef KTRACE
239 struct iovec *ktriov = NULL;
240 struct uio ktruio;
241#endif
242 mtx_lock(&Giant);
243
244 if ((error = fget_read(td, uap->fd, &fp)) != 0)
245 goto done2;
246 /* note: can't use iovlen until iovcnt is validated */
247 iovlen = uap->iovcnt * sizeof (struct iovec);
248 if (uap->iovcnt > UIO_SMALLIOV) {
249 if (uap->iovcnt > UIO_MAXIOV) {
250 error = EINVAL;
251 goto done2;
252 }
253 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
254 needfree = iov;
255 } else {
256 iov = aiov;
257 needfree = NULL;
258 }
259 auio.uio_iov = iov;
260 auio.uio_iovcnt = uap->iovcnt;
261 auio.uio_rw = UIO_READ;
262 auio.uio_segflg = UIO_USERSPACE;
263 auio.uio_td = td;
264 auio.uio_offset = -1;
265 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
266 goto done;
267 auio.uio_resid = 0;
268 for (i = 0; i < uap->iovcnt; i++) {
269 if (iov->iov_len > INT_MAX - auio.uio_resid) {
270 error = EINVAL;
271 goto done;
272 }
273 auio.uio_resid += iov->iov_len;
274 iov++;
275 }
276#ifdef KTRACE
277 /*
278 * if tracing, save a copy of iovec
279 */
280 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
281 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
282 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
283 ktruio = auio;
284 }
285#endif
286 cnt = auio.uio_resid;
287 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
288 if (auio.uio_resid != cnt && (error == ERESTART ||
289 error == EINTR || error == EWOULDBLOCK))
290 error = 0;
291 }
292 cnt -= auio.uio_resid;
293#ifdef KTRACE
294 if (ktriov != NULL) {
295 if (error == 0) {
296 ktruio.uio_iov = ktriov;
297 ktruio.uio_resid = cnt;
298 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
299 error);
300 }
301 FREE(ktriov, M_TEMP);
302 }
303#endif
304 td->td_retval[0] = cnt;
305done:
306 fdrop(fp, td);
307 if (needfree)
308 FREE(needfree, M_IOV);
309done2:
310 mtx_unlock(&Giant);
311 return (error);
312}
313
314/*
315 * Write system call
316 */
317#ifndef _SYS_SYSPROTO_H_
318struct write_args {
319 int fd;
320 const void *buf;
321 size_t nbyte;
322};
323#endif
324/*
325 * MPSAFE
326 */
327int
328write(td, uap)
329 struct thread *td;
330 struct write_args *uap;
331{
332 struct file *fp;
333 int error;
334
335 mtx_lock(&Giant);
336 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
337 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
338 (off_t)-1, 0);
339 fdrop(fp, td);
340 } else {
341 error = EBADF; /* XXX this can't be right */
342 }
343 mtx_unlock(&Giant);
344 return(error);
345}
346
347/*
348 * Pwrite system call
349 */
350#ifndef _SYS_SYSPROTO_H_
351struct pwrite_args {
352 int fd;
353 const void *buf;
354 size_t nbyte;
355 int pad;
356 off_t offset;
357};
358#endif
359/*
360 * MPSAFE
361 */
362int
363pwrite(td, uap)
364 struct thread *td;
365 struct pwrite_args *uap;
366{
367 struct file *fp;
368 int error;
369
370 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
371 mtx_lock(&Giant);
372 if (fp->f_type == DTYPE_VNODE) {
373 error = dofilewrite(td, fp, uap->fd, uap->buf,
374 uap->nbyte, uap->offset, FOF_OFFSET);
375 } else {
376 error = ESPIPE;
377 }
378 fdrop(fp, td);
379 mtx_unlock(&Giant);
380 } else {
381 error = EBADF; /* this can't be right */
382 }
383 return(error);
384}
385
386static int
387dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
388 struct thread *td;
389 struct file *fp;
390 int fd, flags;
391 const void *buf;
392 size_t nbyte;
393 off_t offset;
394{
395 struct uio auio;
396 struct iovec aiov;
397 long cnt, error = 0;
398#ifdef KTRACE
399 struct iovec ktriov;
400 struct uio ktruio;
401 int didktr = 0;
402#endif
403
404 aiov.iov_base = (void *)(uintptr_t)buf;
405 aiov.iov_len = nbyte;
406 auio.uio_iov = &aiov;
407 auio.uio_iovcnt = 1;
408 auio.uio_offset = offset;
409 if (nbyte > INT_MAX)
410 return (EINVAL);
411 auio.uio_resid = nbyte;
412 auio.uio_rw = UIO_WRITE;
413 auio.uio_segflg = UIO_USERSPACE;
414 auio.uio_td = td;
415#ifdef KTRACE
416 /*
417 * if tracing, save a copy of iovec and uio
418 */
419 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
420 ktriov = aiov;
421 ktruio = auio;
422 didktr = 1;
423 }
424#endif
425 cnt = nbyte;
426 if (fp->f_type == DTYPE_VNODE)
427 bwillwrite();
428 if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
429 if (auio.uio_resid != cnt && (error == ERESTART ||
430 error == EINTR || error == EWOULDBLOCK))
431 error = 0;
432 if (error == EPIPE) {
433 PROC_LOCK(td->td_proc);
434 psignal(td->td_proc, SIGPIPE);
435 PROC_UNLOCK(td->td_proc);
436 }
437 }
438 cnt -= auio.uio_resid;
439#ifdef KTRACE
440 if (didktr && error == 0) {
441 ktruio.uio_iov = &ktriov;
442 ktruio.uio_resid = cnt;
443 ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
444 }
445#endif
446 td->td_retval[0] = cnt;
447 return (error);
448}
449
450/*
451 * Gather write system call
452 */
453#ifndef _SYS_SYSPROTO_H_
454struct writev_args {
455 int fd;
456 struct iovec *iovp;
457 u_int iovcnt;
458};
459#endif
460/*
461 * MPSAFE
462 */
463int
464writev(td, uap)
465 struct thread *td;
466 register struct writev_args *uap;
467{
468 struct file *fp;
469 struct uio auio;
470 register struct iovec *iov;
471 struct iovec *needfree;
472 struct iovec aiov[UIO_SMALLIOV];
473 long i, cnt, error = 0;
474 u_int iovlen;
475#ifdef KTRACE
476 struct iovec *ktriov = NULL;
477 struct uio ktruio;
478#endif
479
480 mtx_lock(&Giant);
481 if ((error = fget_write(td, uap->fd, &fp)) != 0) {
482 error = EBADF;
483 goto done2;
484 }
485 /* note: can't use iovlen until iovcnt is validated */
486 iovlen = uap->iovcnt * sizeof (struct iovec);
487 if (uap->iovcnt > UIO_SMALLIOV) {
488 if (uap->iovcnt > UIO_MAXIOV) {
489 needfree = NULL;
490 error = EINVAL;
491 goto done;
492 }
493 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
494 needfree = iov;
495 } else {
496 iov = aiov;
497 needfree = NULL;
498 }
499 auio.uio_iov = iov;
500 auio.uio_iovcnt = uap->iovcnt;
501 auio.uio_rw = UIO_WRITE;
502 auio.uio_segflg = UIO_USERSPACE;
503 auio.uio_td = td;
504 auio.uio_offset = -1;
505 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
506 goto done;
507 auio.uio_resid = 0;
508 for (i = 0; i < uap->iovcnt; i++) {
509 if (iov->iov_len > INT_MAX - auio.uio_resid) {
510 error = EINVAL;
511 goto done;
512 }
513 auio.uio_resid += iov->iov_len;
514 iov++;
515 }
516#ifdef KTRACE
517 /*
518 * if tracing, save a copy of iovec and uio
519 */
520 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
521 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
522 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
523 ktruio = auio;
524 }
525#endif
526 cnt = auio.uio_resid;
527 if (fp->f_type == DTYPE_VNODE)
528 bwillwrite();
529 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
530 if (auio.uio_resid != cnt && (error == ERESTART ||
531 error == EINTR || error == EWOULDBLOCK))
532 error = 0;
533 if (error == EPIPE) {
534 PROC_LOCK(td->td_proc);
535 psignal(td->td_proc, SIGPIPE);
536 PROC_UNLOCK(td->td_proc);
537 }
538 }
539 cnt -= auio.uio_resid;
540#ifdef KTRACE
541 if (ktriov != NULL) {
542 if (error == 0) {
543 ktruio.uio_iov = ktriov;
544 ktruio.uio_resid = cnt;
545 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
546 error);
547 }
548 FREE(ktriov, M_TEMP);
549 }
550#endif
551 td->td_retval[0] = cnt;
552done:
553 fdrop(fp, td);
554 if (needfree)
555 FREE(needfree, M_IOV);
556done2:
557 mtx_unlock(&Giant);
558 return (error);
559}
560
561/*
562 * Ioctl system call
563 */
564#ifndef _SYS_SYSPROTO_H_
565struct ioctl_args {
566 int fd;
567 u_long com;
568 caddr_t data;
569};
570#endif
571/*
572 * MPSAFE
573 */
574/* ARGSUSED */
575int
576ioctl(td, uap)
577 struct thread *td;
578 register struct ioctl_args *uap;
579{
580 struct file *fp;
581 register struct filedesc *fdp;
582 register u_long com;
583 int error = 0;
584 register u_int size;
585 caddr_t data, memp;
586 int tmp;
587#define STK_PARAMS 128
588 union {
589 char stkbuf[STK_PARAMS];
590 long align;
591 } ubuf;
592
593 if ((error = fget(td, uap->fd, &fp)) != 0)
594 return (error);
595 mtx_lock(&Giant);
596 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
597 fdrop(fp, td);
598 mtx_unlock(&Giant);
599 return (EBADF);
600 }
601 fdp = td->td_proc->p_fd;
602 switch (com = uap->com) {
603 case FIONCLEX:
604 FILEDESC_LOCK(fdp);
605 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
606 FILEDESC_UNLOCK(fdp);
607 fdrop(fp, td);
608 mtx_unlock(&Giant);
609 return (0);
610 case FIOCLEX:
611 FILEDESC_LOCK(fdp);
612 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
613 FILEDESC_UNLOCK(fdp);
614 fdrop(fp, td);
615 mtx_unlock(&Giant);
616 return (0);
617 }
618
619 /*
620 * Interpret high order word to find amount of data to be
621 * copied to/from the user's address space.
622 */
623 size = IOCPARM_LEN(com);
624 if (size > IOCPARM_MAX) {
625 fdrop(fp, td);
626 mtx_unlock(&Giant);
627 return (ENOTTY);
628 }
629
630 memp = NULL;
631 if (size > sizeof (ubuf.stkbuf)) {
632 memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
633 data = memp;
634 } else {
635 data = ubuf.stkbuf;
636 }
637 if (com&IOC_IN) {
638 if (size) {
639 error = copyin(uap->data, data, (u_int)size);
640 if (error) {
641 if (memp)
642 free(memp, M_IOCTLOPS);
643 fdrop(fp, td);
644 goto done;
645 }
646 } else {
647 *(caddr_t *)data = uap->data;
648 }
649 } else if ((com&IOC_OUT) && size) {
650 /*
651 * Zero the buffer so the user always
652 * gets back something deterministic.
653 */
654 bzero(data, size);
655 } else if (com&IOC_VOID) {
656 *(caddr_t *)data = uap->data;
657 }
658
659 switch (com) {
660
661 case FIONBIO:
662 FILE_LOCK(fp);
663 if ((tmp = *(int *)data))
664 fp->f_flag |= FNONBLOCK;
665 else
666 fp->f_flag &= ~FNONBLOCK;
667 FILE_UNLOCK(fp);
668 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
669 break;
670
671 case FIOASYNC:
672 FILE_LOCK(fp);
673 if ((tmp = *(int *)data))
674 fp->f_flag |= FASYNC;
675 else
676 fp->f_flag &= ~FASYNC;
677 FILE_UNLOCK(fp);
678 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
679 break;
680
681 default:
682 error = fo_ioctl(fp, com, data, td);
683 /*
684 * Copy any data to user, size was
685 * already set and checked above.
686 */
687 if (error == 0 && (com&IOC_OUT) && size)
688 error = copyout(data, uap->data, (u_int)size);
689 break;
690 }
691 if (memp)
692 free(memp, M_IOCTLOPS);
693 fdrop(fp, td);
694done:
695 mtx_unlock(&Giant);
696 return (error);
697}
698
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/file.h>
51#include <sys/proc.h>
52#include <sys/signalvar.h>
53#include <sys/socketvar.h>
54#include <sys/uio.h>
55#include <sys/kernel.h>
56#include <sys/malloc.h>
57#include <sys/poll.h>
58#include <sys/resourcevar.h>
59#include <sys/selinfo.h>
60#include <sys/sysctl.h>
61#include <sys/sysent.h>
62#include <sys/bio.h>
63#include <sys/buf.h>
64#include <sys/condvar.h>
65#ifdef KTRACE
66#include <sys/ktrace.h>
67#endif
68#include <vm/vm.h>
69#include <vm/vm_page.h>
70
71#include <machine/limits.h>
72
73static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76
77static int pollscan(struct thread *, struct pollfd *, u_int);
78static int selscan(struct thread *, fd_mask **, fd_mask **, int);
79static int dofileread(struct thread *, struct file *, int, void *,
80 size_t, off_t, int);
81static int dofilewrite(struct thread *, struct file *, int,
82 const void *, size_t, off_t, int);
83
84/*
85 * Read system call.
86 */
87#ifndef _SYS_SYSPROTO_H_
88struct read_args {
89 int fd;
90 void *buf;
91 size_t nbyte;
92};
93#endif
94/*
95 * MPSAFE
96 */
97int
98read(td, uap)
99 struct thread *td;
100 struct read_args *uap;
101{
102 struct file *fp;
103 int error;
104
105 mtx_lock(&Giant);
106 if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107 error = dofileread(td, fp, uap->fd, uap->buf,
108 uap->nbyte, (off_t)-1, 0);
109 fdrop(fp, td);
110 }
111 mtx_unlock(&Giant);
112 return(error);
113}
114
115/*
116 * Pread system call
117 */
118#ifndef _SYS_SYSPROTO_H_
119struct pread_args {
120 int fd;
121 void *buf;
122 size_t nbyte;
123 int pad;
124 off_t offset;
125};
126#endif
127/*
128 * MPSAFE
129 */
130int
131pread(td, uap)
132 struct thread *td;
133 struct pread_args *uap;
134{
135 struct file *fp;
136 int error;
137
138 if ((error = fget_read(td, uap->fd, &fp)) != 0)
139 return (error);
140 mtx_lock(&Giant);
141 if (fp->f_type != DTYPE_VNODE) {
142 error = ESPIPE;
143 } else {
144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145 uap->offset, FOF_OFFSET);
146 }
147 fdrop(fp, td);
148 mtx_unlock(&Giant);
149 return(error);
150}
151
152/*
153 * Code common for read and pread
154 */
155int
156dofileread(td, fp, fd, buf, nbyte, offset, flags)
157 struct thread *td;
158 struct file *fp;
159 int fd, flags;
160 void *buf;
161 size_t nbyte;
162 off_t offset;
163{
164 struct uio auio;
165 struct iovec aiov;
166 long cnt, error = 0;
167#ifdef KTRACE
168 struct iovec ktriov;
169 struct uio ktruio;
170 int didktr = 0;
171#endif
172
173 aiov.iov_base = (caddr_t)buf;
174 aiov.iov_len = nbyte;
175 auio.uio_iov = &aiov;
176 auio.uio_iovcnt = 1;
177 auio.uio_offset = offset;
178 if (nbyte > INT_MAX)
179 return (EINVAL);
180 auio.uio_resid = nbyte;
181 auio.uio_rw = UIO_READ;
182 auio.uio_segflg = UIO_USERSPACE;
183 auio.uio_td = td;
184#ifdef KTRACE
185 /*
186 * if tracing, save a copy of iovec
187 */
188 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
189 ktriov = aiov;
190 ktruio = auio;
191 didktr = 1;
192 }
193#endif
194 cnt = nbyte;
195
196 if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
197 if (auio.uio_resid != cnt && (error == ERESTART ||
198 error == EINTR || error == EWOULDBLOCK))
199 error = 0;
200 }
201 cnt -= auio.uio_resid;
202#ifdef KTRACE
203 if (didktr && error == 0) {
204 ktruio.uio_iov = &ktriov;
205 ktruio.uio_resid = cnt;
206 ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
207 }
208#endif
209 td->td_retval[0] = cnt;
210 return (error);
211}
212
213/*
214 * Scatter read system call.
215 */
216#ifndef _SYS_SYSPROTO_H_
217struct readv_args {
218 int fd;
219 struct iovec *iovp;
220 u_int iovcnt;
221};
222#endif
223/*
224 * MPSAFE
225 */
226int
227readv(td, uap)
228 struct thread *td;
229 struct readv_args *uap;
230{
231 struct file *fp;
232 struct uio auio;
233 struct iovec *iov;
234 struct iovec *needfree;
235 struct iovec aiov[UIO_SMALLIOV];
236 long i, cnt, error = 0;
237 u_int iovlen;
238#ifdef KTRACE
239 struct iovec *ktriov = NULL;
240 struct uio ktruio;
241#endif
242 mtx_lock(&Giant);
243
244 if ((error = fget_read(td, uap->fd, &fp)) != 0)
245 goto done2;
246 /* note: can't use iovlen until iovcnt is validated */
247 iovlen = uap->iovcnt * sizeof (struct iovec);
248 if (uap->iovcnt > UIO_SMALLIOV) {
249 if (uap->iovcnt > UIO_MAXIOV) {
250 error = EINVAL;
251 goto done2;
252 }
253 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
254 needfree = iov;
255 } else {
256 iov = aiov;
257 needfree = NULL;
258 }
259 auio.uio_iov = iov;
260 auio.uio_iovcnt = uap->iovcnt;
261 auio.uio_rw = UIO_READ;
262 auio.uio_segflg = UIO_USERSPACE;
263 auio.uio_td = td;
264 auio.uio_offset = -1;
265 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
266 goto done;
267 auio.uio_resid = 0;
268 for (i = 0; i < uap->iovcnt; i++) {
269 if (iov->iov_len > INT_MAX - auio.uio_resid) {
270 error = EINVAL;
271 goto done;
272 }
273 auio.uio_resid += iov->iov_len;
274 iov++;
275 }
276#ifdef KTRACE
277 /*
278 * if tracing, save a copy of iovec
279 */
280 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
281 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
282 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
283 ktruio = auio;
284 }
285#endif
286 cnt = auio.uio_resid;
287 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
288 if (auio.uio_resid != cnt && (error == ERESTART ||
289 error == EINTR || error == EWOULDBLOCK))
290 error = 0;
291 }
292 cnt -= auio.uio_resid;
293#ifdef KTRACE
294 if (ktriov != NULL) {
295 if (error == 0) {
296 ktruio.uio_iov = ktriov;
297 ktruio.uio_resid = cnt;
298 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
299 error);
300 }
301 FREE(ktriov, M_TEMP);
302 }
303#endif
304 td->td_retval[0] = cnt;
305done:
306 fdrop(fp, td);
307 if (needfree)
308 FREE(needfree, M_IOV);
309done2:
310 mtx_unlock(&Giant);
311 return (error);
312}
313
314/*
315 * Write system call
316 */
317#ifndef _SYS_SYSPROTO_H_
318struct write_args {
319 int fd;
320 const void *buf;
321 size_t nbyte;
322};
323#endif
324/*
325 * MPSAFE
326 */
327int
328write(td, uap)
329 struct thread *td;
330 struct write_args *uap;
331{
332 struct file *fp;
333 int error;
334
335 mtx_lock(&Giant);
336 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
337 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
338 (off_t)-1, 0);
339 fdrop(fp, td);
340 } else {
341 error = EBADF; /* XXX this can't be right */
342 }
343 mtx_unlock(&Giant);
344 return(error);
345}
346
347/*
348 * Pwrite system call
349 */
350#ifndef _SYS_SYSPROTO_H_
351struct pwrite_args {
352 int fd;
353 const void *buf;
354 size_t nbyte;
355 int pad;
356 off_t offset;
357};
358#endif
359/*
360 * MPSAFE
361 */
362int
363pwrite(td, uap)
364 struct thread *td;
365 struct pwrite_args *uap;
366{
367 struct file *fp;
368 int error;
369
370 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
371 mtx_lock(&Giant);
372 if (fp->f_type == DTYPE_VNODE) {
373 error = dofilewrite(td, fp, uap->fd, uap->buf,
374 uap->nbyte, uap->offset, FOF_OFFSET);
375 } else {
376 error = ESPIPE;
377 }
378 fdrop(fp, td);
379 mtx_unlock(&Giant);
380 } else {
381 error = EBADF; /* this can't be right */
382 }
383 return(error);
384}
385
386static int
387dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
388 struct thread *td;
389 struct file *fp;
390 int fd, flags;
391 const void *buf;
392 size_t nbyte;
393 off_t offset;
394{
395 struct uio auio;
396 struct iovec aiov;
397 long cnt, error = 0;
398#ifdef KTRACE
399 struct iovec ktriov;
400 struct uio ktruio;
401 int didktr = 0;
402#endif
403
404 aiov.iov_base = (void *)(uintptr_t)buf;
405 aiov.iov_len = nbyte;
406 auio.uio_iov = &aiov;
407 auio.uio_iovcnt = 1;
408 auio.uio_offset = offset;
409 if (nbyte > INT_MAX)
410 return (EINVAL);
411 auio.uio_resid = nbyte;
412 auio.uio_rw = UIO_WRITE;
413 auio.uio_segflg = UIO_USERSPACE;
414 auio.uio_td = td;
415#ifdef KTRACE
416 /*
417 * if tracing, save a copy of iovec and uio
418 */
419 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
420 ktriov = aiov;
421 ktruio = auio;
422 didktr = 1;
423 }
424#endif
425 cnt = nbyte;
426 if (fp->f_type == DTYPE_VNODE)
427 bwillwrite();
428 if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
429 if (auio.uio_resid != cnt && (error == ERESTART ||
430 error == EINTR || error == EWOULDBLOCK))
431 error = 0;
432 if (error == EPIPE) {
433 PROC_LOCK(td->td_proc);
434 psignal(td->td_proc, SIGPIPE);
435 PROC_UNLOCK(td->td_proc);
436 }
437 }
438 cnt -= auio.uio_resid;
439#ifdef KTRACE
440 if (didktr && error == 0) {
441 ktruio.uio_iov = &ktriov;
442 ktruio.uio_resid = cnt;
443 ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
444 }
445#endif
446 td->td_retval[0] = cnt;
447 return (error);
448}
449
450/*
451 * Gather write system call
452 */
453#ifndef _SYS_SYSPROTO_H_
454struct writev_args {
455 int fd;
456 struct iovec *iovp;
457 u_int iovcnt;
458};
459#endif
460/*
461 * MPSAFE
462 */
463int
464writev(td, uap)
465 struct thread *td;
466 register struct writev_args *uap;
467{
468 struct file *fp;
469 struct uio auio;
470 register struct iovec *iov;
471 struct iovec *needfree;
472 struct iovec aiov[UIO_SMALLIOV];
473 long i, cnt, error = 0;
474 u_int iovlen;
475#ifdef KTRACE
476 struct iovec *ktriov = NULL;
477 struct uio ktruio;
478#endif
479
480 mtx_lock(&Giant);
481 if ((error = fget_write(td, uap->fd, &fp)) != 0) {
482 error = EBADF;
483 goto done2;
484 }
485 /* note: can't use iovlen until iovcnt is validated */
486 iovlen = uap->iovcnt * sizeof (struct iovec);
487 if (uap->iovcnt > UIO_SMALLIOV) {
488 if (uap->iovcnt > UIO_MAXIOV) {
489 needfree = NULL;
490 error = EINVAL;
491 goto done;
492 }
493 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
494 needfree = iov;
495 } else {
496 iov = aiov;
497 needfree = NULL;
498 }
499 auio.uio_iov = iov;
500 auio.uio_iovcnt = uap->iovcnt;
501 auio.uio_rw = UIO_WRITE;
502 auio.uio_segflg = UIO_USERSPACE;
503 auio.uio_td = td;
504 auio.uio_offset = -1;
505 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
506 goto done;
507 auio.uio_resid = 0;
508 for (i = 0; i < uap->iovcnt; i++) {
509 if (iov->iov_len > INT_MAX - auio.uio_resid) {
510 error = EINVAL;
511 goto done;
512 }
513 auio.uio_resid += iov->iov_len;
514 iov++;
515 }
516#ifdef KTRACE
517 /*
518 * if tracing, save a copy of iovec and uio
519 */
520 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
521 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
522 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
523 ktruio = auio;
524 }
525#endif
526 cnt = auio.uio_resid;
527 if (fp->f_type == DTYPE_VNODE)
528 bwillwrite();
529 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
530 if (auio.uio_resid != cnt && (error == ERESTART ||
531 error == EINTR || error == EWOULDBLOCK))
532 error = 0;
533 if (error == EPIPE) {
534 PROC_LOCK(td->td_proc);
535 psignal(td->td_proc, SIGPIPE);
536 PROC_UNLOCK(td->td_proc);
537 }
538 }
539 cnt -= auio.uio_resid;
540#ifdef KTRACE
541 if (ktriov != NULL) {
542 if (error == 0) {
543 ktruio.uio_iov = ktriov;
544 ktruio.uio_resid = cnt;
545 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
546 error);
547 }
548 FREE(ktriov, M_TEMP);
549 }
550#endif
551 td->td_retval[0] = cnt;
552done:
553 fdrop(fp, td);
554 if (needfree)
555 FREE(needfree, M_IOV);
556done2:
557 mtx_unlock(&Giant);
558 return (error);
559}
560
561/*
562 * Ioctl system call
563 */
564#ifndef _SYS_SYSPROTO_H_
565struct ioctl_args {
566 int fd;
567 u_long com;
568 caddr_t data;
569};
570#endif
571/*
572 * MPSAFE
573 */
574/* ARGSUSED */
/*
 * ioctl(2): dispatch a device-control request to the file's fo_ioctl
 * backend, after copying any argument data in/out of user space.
 * Returns 0 on success or an errno value.  Holds Giant across the
 * whole operation; the file reference from fget() is dropped on
 * every exit path.
 */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Small argument buffers live on the stack; 'align' forces long alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* Descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX only toggle the close-on-exec flag in the
	 * file descriptor table; they never reach the file backend.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Spill to the heap only when the argument exceeds the stack buffer. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: pass the user pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Update f_flag under the file lock, then inform the backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
698
699static int nselcoll; /* Select collisions since boot */
699/*
700 * sellock and selwait are initialized in selectinit() via SYSINIT.
701 */
702struct mtx sellock;
700struct cv selwait;
703struct cv selwait;
704int nselcoll; /* Select collisions since boot */
701SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
702
703/*
704 * Select system call.
705 */
706#ifndef _SYS_SYSPROTO_H_
707struct select_args {
708 int nd;
709 fd_set *in, *ou, *ex;
710 struct timeval *tv;
711};
712#endif
713/*
714 * MPSAFE
715 */
716int
717select(td, uap)
718 register struct thread *td;
719 register struct select_args *uap;
720{
721 struct filedesc *fdp;
722 /*
723 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
724 * infds with the new FD_SETSIZE of 1024, and more than enough for
725 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
726 * of 256.
727 */
728 fd_mask s_selbits[howmany(2048, NFDBITS)];
729 fd_mask *ibits[3], *obits[3], *selbits, *sbp;
730 struct timeval atv, rtv, ttv;
731 int ncoll, error, timo;
732 u_int nbufbytes, ncpbytes, nfdbits;
733
734 if (uap->nd < 0)
735 return (EINVAL);
736 fdp = td->td_proc->p_fd;
737 mtx_lock(&Giant);
738 FILEDESC_LOCK(fdp);
739
740 if (uap->nd > td->td_proc->p_fd->fd_nfiles)
741 uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
742 FILEDESC_UNLOCK(fdp);
743
744 /*
745 * Allocate just enough bits for the non-null fd_sets. Use the
746 * preallocated auto buffer if possible.
747 */
748 nfdbits = roundup(uap->nd, NFDBITS);
749 ncpbytes = nfdbits / NBBY;
750 nbufbytes = 0;
751 if (uap->in != NULL)
752 nbufbytes += 2 * ncpbytes;
753 if (uap->ou != NULL)
754 nbufbytes += 2 * ncpbytes;
755 if (uap->ex != NULL)
756 nbufbytes += 2 * ncpbytes;
757 if (nbufbytes <= sizeof s_selbits)
758 selbits = &s_selbits[0];
759 else
760 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
761
762 /*
763 * Assign pointers into the bit buffers and fetch the input bits.
764 * Put the output buffers together so that they can be bzeroed
765 * together.
766 */
767 sbp = selbits;
768#define getbits(name, x) \
769 do { \
770 if (uap->name == NULL) \
771 ibits[x] = NULL; \
772 else { \
773 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
774 obits[x] = sbp; \
775 sbp += ncpbytes / sizeof *sbp; \
776 error = copyin(uap->name, ibits[x], ncpbytes); \
777 if (error != 0) \
705SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
706
707/*
708 * Select system call.
709 */
710#ifndef _SYS_SYSPROTO_H_
711struct select_args {
712 int nd;
713 fd_set *in, *ou, *ex;
714 struct timeval *tv;
715};
716#endif
717/*
718 * MPSAFE
719 */
720int
721select(td, uap)
722 register struct thread *td;
723 register struct select_args *uap;
724{
725 struct filedesc *fdp;
726 /*
727 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
728 * infds with the new FD_SETSIZE of 1024, and more than enough for
729 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
730 * of 256.
731 */
732 fd_mask s_selbits[howmany(2048, NFDBITS)];
733 fd_mask *ibits[3], *obits[3], *selbits, *sbp;
734 struct timeval atv, rtv, ttv;
735 int ncoll, error, timo;
736 u_int nbufbytes, ncpbytes, nfdbits;
737
738 if (uap->nd < 0)
739 return (EINVAL);
740 fdp = td->td_proc->p_fd;
741 mtx_lock(&Giant);
742 FILEDESC_LOCK(fdp);
743
744 if (uap->nd > td->td_proc->p_fd->fd_nfiles)
745 uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
746 FILEDESC_UNLOCK(fdp);
747
748 /*
749 * Allocate just enough bits for the non-null fd_sets. Use the
750 * preallocated auto buffer if possible.
751 */
752 nfdbits = roundup(uap->nd, NFDBITS);
753 ncpbytes = nfdbits / NBBY;
754 nbufbytes = 0;
755 if (uap->in != NULL)
756 nbufbytes += 2 * ncpbytes;
757 if (uap->ou != NULL)
758 nbufbytes += 2 * ncpbytes;
759 if (uap->ex != NULL)
760 nbufbytes += 2 * ncpbytes;
761 if (nbufbytes <= sizeof s_selbits)
762 selbits = &s_selbits[0];
763 else
764 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
765
766 /*
767 * Assign pointers into the bit buffers and fetch the input bits.
768 * Put the output buffers together so that they can be bzeroed
769 * together.
770 */
771 sbp = selbits;
772#define getbits(name, x) \
773 do { \
774 if (uap->name == NULL) \
775 ibits[x] = NULL; \
776 else { \
777 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
778 obits[x] = sbp; \
779 sbp += ncpbytes / sizeof *sbp; \
780 error = copyin(uap->name, ibits[x], ncpbytes); \
781 if (error != 0) \
778 goto done_noproclock; \
782 goto done_nosellock; \
779 } \
780 } while (0)
781 getbits(in, 0);
782 getbits(ou, 1);
783 getbits(ex, 2);
784#undef getbits
785 if (nbufbytes != 0)
786 bzero(selbits, nbufbytes / 2);
787
788 if (uap->tv) {
789 error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
790 sizeof (atv));
791 if (error)
783 } \
784 } while (0)
785 getbits(in, 0);
786 getbits(ou, 1);
787 getbits(ex, 2);
788#undef getbits
789 if (nbufbytes != 0)
790 bzero(selbits, nbufbytes / 2);
791
792 if (uap->tv) {
793 error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
794 sizeof (atv));
795 if (error)
792 goto done_noproclock;
796 goto done_nosellock;
793 if (itimerfix(&atv)) {
794 error = EINVAL;
797 if (itimerfix(&atv)) {
798 error = EINVAL;
795 goto done_noproclock;
799 goto done_nosellock;
796 }
797 getmicrouptime(&rtv);
798 timevaladd(&atv, &rtv);
799 } else {
800 atv.tv_sec = 0;
801 atv.tv_usec = 0;
802 }
803 timo = 0;
800 }
801 getmicrouptime(&rtv);
802 timevaladd(&atv, &rtv);
803 } else {
804 atv.tv_sec = 0;
805 atv.tv_usec = 0;
806 }
807 timo = 0;
804 PROC_LOCK(td->td_proc);
808 mtx_lock(&sellock);
805retry:
806 ncoll = nselcoll;
807 mtx_lock_spin(&sched_lock);
808 td->td_flags |= TDF_SELECT;
809 mtx_unlock_spin(&sched_lock);
809retry:
810 ncoll = nselcoll;
811 mtx_lock_spin(&sched_lock);
812 td->td_flags |= TDF_SELECT;
813 mtx_unlock_spin(&sched_lock);
810 PROC_UNLOCK(td->td_proc);
814 mtx_unlock(&sellock);
815
816 /* XXX Is there a better place for this? */
817 TAILQ_INIT(&td->td_selq);
811 error = selscan(td, ibits, obits, uap->nd);
818 error = selscan(td, ibits, obits, uap->nd);
812 PROC_LOCK(td->td_proc);
819 mtx_lock(&sellock);
813 if (error || td->td_retval[0])
814 goto done;
815 if (atv.tv_sec || atv.tv_usec) {
816 getmicrouptime(&rtv);
820 if (error || td->td_retval[0])
821 goto done;
822 if (atv.tv_sec || atv.tv_usec) {
823 getmicrouptime(&rtv);
817 if (timevalcmp(&rtv, &atv, >=)) {
818 /*
819 * An event of our interest may occur during locking a process.
820 * In order to avoid missing the event that occured during locking
821 * the process, test TDF_SELECT and rescan file descriptors if
822 * necessary.
823 */
824 mtx_lock_spin(&sched_lock);
825 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
826 ncoll = nselcoll;
827 td->td_flags |= TDF_SELECT;
828 mtx_unlock_spin(&sched_lock);
829 PROC_UNLOCK(td->td_proc);
830 error = selscan(td, ibits, obits, uap->nd);
831 PROC_LOCK(td->td_proc);
832 } else
833 mtx_unlock_spin(&sched_lock);
824 if (timevalcmp(&rtv, &atv, >=))
834 goto done;
825 goto done;
835 }
836 ttv = atv;
837 timevalsub(&ttv, &rtv);
838 timo = ttv.tv_sec > 24 * 60 * 60 ?
839 24 * 60 * 60 * hz : tvtohz(&ttv);
840 }
826 ttv = atv;
827 timevalsub(&ttv, &rtv);
828 timo = ttv.tv_sec > 24 * 60 * 60 ?
829 24 * 60 * 60 * hz : tvtohz(&ttv);
830 }
831
832 /*
833 * An event of interest may occur while we do not hold
834 * sellock, so check TDF_SELECT and the number of
835 * collisions and rescan the file descriptors if
836 * necessary.
837 */
841 mtx_lock_spin(&sched_lock);
838 mtx_lock_spin(&sched_lock);
842 td->td_flags &= ~TDF_SELECT;
839 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
840 mtx_unlock_spin(&sched_lock);
841 goto retry;
842 }
843 mtx_unlock_spin(&sched_lock);
844
845 if (timo > 0)
843 mtx_unlock_spin(&sched_lock);
844
845 if (timo > 0)
846 error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
846 error = cv_timedwait_sig(&selwait, &sellock, timo);
847 else
847 else
848 error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
848 error = cv_wait_sig(&selwait, &sellock);
849
850 if (error == 0)
851 goto retry;
852
853done:
849
850 if (error == 0)
851 goto retry;
852
853done:
854 clear_selinfo_list(td);
854 mtx_lock_spin(&sched_lock);
855 td->td_flags &= ~TDF_SELECT;
856 mtx_unlock_spin(&sched_lock);
855 mtx_lock_spin(&sched_lock);
856 td->td_flags &= ~TDF_SELECT;
857 mtx_unlock_spin(&sched_lock);
857 PROC_UNLOCK(td->td_proc);
858done_noproclock:
858 mtx_unlock(&sellock);
859
860done_nosellock:
859 /* select is not restarted after signals... */
860 if (error == ERESTART)
861 error = EINTR;
862 if (error == EWOULDBLOCK)
863 error = 0;
864#define putbits(name, x) \
865 if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
866 error = error2;
867 if (error == 0) {
868 int error2;
869
870 putbits(in, 0);
871 putbits(ou, 1);
872 putbits(ex, 2);
873#undef putbits
874 }
875 if (selbits != &s_selbits[0])
876 free(selbits, M_SELECT);
877
878 mtx_unlock(&Giant);
879 return (error);
880}
881
882static int
883selscan(td, ibits, obits, nfd)
884 struct thread *td;
885 fd_mask **ibits, **obits;
886 int nfd;
887{
888 int msk, i, fd;
889 fd_mask bits;
890 struct file *fp;
891 int n = 0;
892 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
893 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
894 struct filedesc *fdp = td->td_proc->p_fd;
895
896 FILEDESC_LOCK(fdp);
897 for (msk = 0; msk < 3; msk++) {
898 if (ibits[msk] == NULL)
899 continue;
900 for (i = 0; i < nfd; i += NFDBITS) {
901 bits = ibits[msk][i/NFDBITS];
902 /* ffs(int mask) not portable, fd_mask is long */
903 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
904 if (!(bits & 1))
905 continue;
906 if ((fp = fget_locked(fdp, fd)) == NULL) {
907 FILEDESC_UNLOCK(fdp);
908 return (EBADF);
909 }
910 if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
911 obits[msk][(fd)/NFDBITS] |=
912 ((fd_mask)1 << ((fd) % NFDBITS));
913 n++;
914 }
915 }
916 }
917 }
918 FILEDESC_UNLOCK(fdp);
919 td->td_retval[0] = n;
920 return (0);
921}
922
923/*
924 * Poll system call.
925 */
926#ifndef _SYS_SYSPROTO_H_
927struct poll_args {
928 struct pollfd *fds;
929 u_int nfds;
930 int timeout;
931};
932#endif
933/*
934 * MPSAFE
935 */
936int
937poll(td, uap)
938 struct thread *td;
939 struct poll_args *uap;
940{
941 caddr_t bits;
942 char smallbits[32 * sizeof(struct pollfd)];
943 struct timeval atv, rtv, ttv;
944 int ncoll, error = 0, timo;
945 u_int nfds;
946 size_t ni;
947
948 nfds = SCARG(uap, nfds);
949
950 mtx_lock(&Giant);
951 /*
952 * This is kinda bogus. We have fd limits, but that is not
953 * really related to the size of the pollfd array. Make sure
954 * we let the process use at least FD_SETSIZE entries and at
955 * least enough for the current limits. We want to be reasonably
956 * safe, but not overly restrictive.
957 */
958 if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
959 (nfds > FD_SETSIZE)) {
960 error = EINVAL;
961 goto done2;
962 }
963 ni = nfds * sizeof(struct pollfd);
964 if (ni > sizeof(smallbits))
965 bits = malloc(ni, M_TEMP, M_WAITOK);
966 else
967 bits = smallbits;
968 error = copyin(SCARG(uap, fds), bits, ni);
969 if (error)
861 /* select is not restarted after signals... */
862 if (error == ERESTART)
863 error = EINTR;
864 if (error == EWOULDBLOCK)
865 error = 0;
866#define putbits(name, x) \
867 if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
868 error = error2;
869 if (error == 0) {
870 int error2;
871
872 putbits(in, 0);
873 putbits(ou, 1);
874 putbits(ex, 2);
875#undef putbits
876 }
877 if (selbits != &s_selbits[0])
878 free(selbits, M_SELECT);
879
880 mtx_unlock(&Giant);
881 return (error);
882}
883
884static int
885selscan(td, ibits, obits, nfd)
886 struct thread *td;
887 fd_mask **ibits, **obits;
888 int nfd;
889{
890 int msk, i, fd;
891 fd_mask bits;
892 struct file *fp;
893 int n = 0;
894 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
895 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
896 struct filedesc *fdp = td->td_proc->p_fd;
897
898 FILEDESC_LOCK(fdp);
899 for (msk = 0; msk < 3; msk++) {
900 if (ibits[msk] == NULL)
901 continue;
902 for (i = 0; i < nfd; i += NFDBITS) {
903 bits = ibits[msk][i/NFDBITS];
904 /* ffs(int mask) not portable, fd_mask is long */
905 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
906 if (!(bits & 1))
907 continue;
908 if ((fp = fget_locked(fdp, fd)) == NULL) {
909 FILEDESC_UNLOCK(fdp);
910 return (EBADF);
911 }
912 if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
913 obits[msk][(fd)/NFDBITS] |=
914 ((fd_mask)1 << ((fd) % NFDBITS));
915 n++;
916 }
917 }
918 }
919 }
920 FILEDESC_UNLOCK(fdp);
921 td->td_retval[0] = n;
922 return (0);
923}
924
925/*
926 * Poll system call.
927 */
928#ifndef _SYS_SYSPROTO_H_
929struct poll_args {
930 struct pollfd *fds;
931 u_int nfds;
932 int timeout;
933};
934#endif
935/*
936 * MPSAFE
937 */
938int
939poll(td, uap)
940 struct thread *td;
941 struct poll_args *uap;
942{
943 caddr_t bits;
944 char smallbits[32 * sizeof(struct pollfd)];
945 struct timeval atv, rtv, ttv;
946 int ncoll, error = 0, timo;
947 u_int nfds;
948 size_t ni;
949
950 nfds = SCARG(uap, nfds);
951
952 mtx_lock(&Giant);
953 /*
954 * This is kinda bogus. We have fd limits, but that is not
955 * really related to the size of the pollfd array. Make sure
956 * we let the process use at least FD_SETSIZE entries and at
957 * least enough for the current limits. We want to be reasonably
958 * safe, but not overly restrictive.
959 */
960 if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
961 (nfds > FD_SETSIZE)) {
962 error = EINVAL;
963 goto done2;
964 }
965 ni = nfds * sizeof(struct pollfd);
966 if (ni > sizeof(smallbits))
967 bits = malloc(ni, M_TEMP, M_WAITOK);
968 else
969 bits = smallbits;
970 error = copyin(SCARG(uap, fds), bits, ni);
971 if (error)
970 goto done_noproclock;
972 goto done_nosellock;
971 if (SCARG(uap, timeout) != INFTIM) {
972 atv.tv_sec = SCARG(uap, timeout) / 1000;
973 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
974 if (itimerfix(&atv)) {
975 error = EINVAL;
973 if (SCARG(uap, timeout) != INFTIM) {
974 atv.tv_sec = SCARG(uap, timeout) / 1000;
975 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
976 if (itimerfix(&atv)) {
977 error = EINVAL;
976 goto done_noproclock;
978 goto done_nosellock;
977 }
978 getmicrouptime(&rtv);
979 timevaladd(&atv, &rtv);
980 } else {
981 atv.tv_sec = 0;
982 atv.tv_usec = 0;
983 }
984 timo = 0;
979 }
980 getmicrouptime(&rtv);
981 timevaladd(&atv, &rtv);
982 } else {
983 atv.tv_sec = 0;
984 atv.tv_usec = 0;
985 }
986 timo = 0;
985 PROC_LOCK(td->td_proc);
987 mtx_lock(&sellock);
986retry:
987 ncoll = nselcoll;
988 mtx_lock_spin(&sched_lock);
989 td->td_flags |= TDF_SELECT;
990 mtx_unlock_spin(&sched_lock);
988retry:
989 ncoll = nselcoll;
990 mtx_lock_spin(&sched_lock);
991 td->td_flags |= TDF_SELECT;
992 mtx_unlock_spin(&sched_lock);
991 PROC_UNLOCK(td->td_proc);
993 mtx_unlock(&sellock);
994
995 /* XXX Is there a better place for this? */
996 TAILQ_INIT(&td->td_selq);
992 error = pollscan(td, (struct pollfd *)bits, nfds);
997 error = pollscan(td, (struct pollfd *)bits, nfds);
993 PROC_LOCK(td->td_proc);
998 mtx_lock(&sellock);
994 if (error || td->td_retval[0])
995 goto done;
996 if (atv.tv_sec || atv.tv_usec) {
997 getmicrouptime(&rtv);
999 if (error || td->td_retval[0])
1000 goto done;
1001 if (atv.tv_sec || atv.tv_usec) {
1002 getmicrouptime(&rtv);
998 if (timevalcmp(&rtv, &atv, >=)) {
999 /*
1000 * An event of our interest may occur during locking a process.
1001 * In order to avoid missing the event that occured during locking
1002 * the process, test TDF_SELECT and rescan file descriptors if
1003 * necessary.
1004 */
1005 mtx_lock_spin(&sched_lock);
1006 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
1007 ncoll = nselcoll;
1008 td->td_flags |= TDF_SELECT;
1009 mtx_unlock_spin(&sched_lock);
1010 PROC_UNLOCK(td->td_proc);
1011 error = pollscan(td, (struct pollfd *)bits, nfds);
1012 PROC_LOCK(td->td_proc);
1013 } else
1014 mtx_unlock_spin(&sched_lock);
1003 if (timevalcmp(&rtv, &atv, >=))
1015 goto done;
1004 goto done;
1016 }
1017 ttv = atv;
1018 timevalsub(&ttv, &rtv);
1019 timo = ttv.tv_sec > 24 * 60 * 60 ?
1020 24 * 60 * 60 * hz : tvtohz(&ttv);
1021 }
1005 ttv = atv;
1006 timevalsub(&ttv, &rtv);
1007 timo = ttv.tv_sec > 24 * 60 * 60 ?
1008 24 * 60 * 60 * hz : tvtohz(&ttv);
1009 }
1010 /*
1011 * An event of interest may occur while we do not hold
1012 * sellock, so check TDF_SELECT and the number of collisions
1013 * and rescan the file descriptors if necessary.
1014 */
1022 mtx_lock_spin(&sched_lock);
1015 mtx_lock_spin(&sched_lock);
1023 td->td_flags &= ~TDF_SELECT;
1016 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
1017 mtx_unlock_spin(&sched_lock);
1018 goto retry;
1019 }
1024 mtx_unlock_spin(&sched_lock);
1020 mtx_unlock_spin(&sched_lock);
1021
1025 if (timo > 0)
1022 if (timo > 0)
1026 error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
1023 error = cv_timedwait_sig(&selwait, &sellock, timo);
1027 else
1024 else
1028 error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
1025 error = cv_wait_sig(&selwait, &sellock);
1026
1029 if (error == 0)
1030 goto retry;
1031
1032done:
1027 if (error == 0)
1028 goto retry;
1029
1030done:
1031 clear_selinfo_list(td);
1033 mtx_lock_spin(&sched_lock);
1034 td->td_flags &= ~TDF_SELECT;
1035 mtx_unlock_spin(&sched_lock);
1032 mtx_lock_spin(&sched_lock);
1033 td->td_flags &= ~TDF_SELECT;
1034 mtx_unlock_spin(&sched_lock);
1036 PROC_UNLOCK(td->td_proc);
1037done_noproclock:
1035 mtx_unlock(&sellock);
1036
1037done_nosellock:
1038 /* poll is not restarted after signals... */
1039 if (error == ERESTART)
1040 error = EINTR;
1041 if (error == EWOULDBLOCK)
1042 error = 0;
1043 if (error == 0) {
1044 error = copyout(bits, SCARG(uap, fds), ni);
1045 if (error)
1046 goto out;
1047 }
1048out:
1049 if (ni > sizeof(smallbits))
1050 free(bits, M_TEMP);
1051done2:
1052 mtx_unlock(&Giant);
1053 return (error);
1054}
1055
1056static int
1057pollscan(td, fds, nfd)
1058 struct thread *td;
1059 struct pollfd *fds;
1060 u_int nfd;
1061{
1062 register struct filedesc *fdp = td->td_proc->p_fd;
1063 int i;
1064 struct file *fp;
1065 int n = 0;
1066
1067 FILEDESC_LOCK(fdp);
1068 for (i = 0; i < nfd; i++, fds++) {
1069 if (fds->fd >= fdp->fd_nfiles) {
1070 fds->revents = POLLNVAL;
1071 n++;
1072 } else if (fds->fd < 0) {
1073 fds->revents = 0;
1074 } else {
1075 fp = fdp->fd_ofiles[fds->fd];
1076 if (fp == NULL) {
1077 fds->revents = POLLNVAL;
1078 n++;
1079 } else {
1080 /*
1081 * Note: backend also returns POLLHUP and
1082 * POLLERR if appropriate.
1083 */
1084 fds->revents = fo_poll(fp, fds->events,
1085 fp->f_cred, td);
1086 if (fds->revents != 0)
1087 n++;
1088 }
1089 }
1090 }
1091 FILEDESC_UNLOCK(fdp);
1092 td->td_retval[0] = n;
1093 return (0);
1094}
1095
1096/*
1097 * OpenBSD poll system call.
1098 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1099 */
1100#ifndef _SYS_SYSPROTO_H_
1101struct openbsd_poll_args {
1102 struct pollfd *fds;
1103 u_int nfds;
1104 int timeout;
1105};
1106#endif
1107/*
1108 * MPSAFE
1109 */
1110int
1111openbsd_poll(td, uap)
1112 register struct thread *td;
1113 register struct openbsd_poll_args *uap;
1114{
1115 return (poll(td, (struct poll_args *)uap));
1116}
1117
1038 /* poll is not restarted after signals... */
1039 if (error == ERESTART)
1040 error = EINTR;
1041 if (error == EWOULDBLOCK)
1042 error = 0;
1043 if (error == 0) {
1044 error = copyout(bits, SCARG(uap, fds), ni);
1045 if (error)
1046 goto out;
1047 }
1048out:
1049 if (ni > sizeof(smallbits))
1050 free(bits, M_TEMP);
1051done2:
1052 mtx_unlock(&Giant);
1053 return (error);
1054}
1055
1056static int
1057pollscan(td, fds, nfd)
1058 struct thread *td;
1059 struct pollfd *fds;
1060 u_int nfd;
1061{
1062 register struct filedesc *fdp = td->td_proc->p_fd;
1063 int i;
1064 struct file *fp;
1065 int n = 0;
1066
1067 FILEDESC_LOCK(fdp);
1068 for (i = 0; i < nfd; i++, fds++) {
1069 if (fds->fd >= fdp->fd_nfiles) {
1070 fds->revents = POLLNVAL;
1071 n++;
1072 } else if (fds->fd < 0) {
1073 fds->revents = 0;
1074 } else {
1075 fp = fdp->fd_ofiles[fds->fd];
1076 if (fp == NULL) {
1077 fds->revents = POLLNVAL;
1078 n++;
1079 } else {
1080 /*
1081 * Note: backend also returns POLLHUP and
1082 * POLLERR if appropriate.
1083 */
1084 fds->revents = fo_poll(fp, fds->events,
1085 fp->f_cred, td);
1086 if (fds->revents != 0)
1087 n++;
1088 }
1089 }
1090 }
1091 FILEDESC_UNLOCK(fdp);
1092 td->td_retval[0] = n;
1093 return (0);
1094}
1095
1096/*
1097 * OpenBSD poll system call.
1098 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1099 */
1100#ifndef _SYS_SYSPROTO_H_
1101struct openbsd_poll_args {
1102 struct pollfd *fds;
1103 u_int nfds;
1104 int timeout;
1105};
1106#endif
1107/*
1108 * MPSAFE
1109 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* Argument layouts are identical, so simply forward to poll(). */
	return (poll(td, (struct poll_args *)uap));
}
1117
1118/*
1119 * Remove the references to the thread from all of the objects
1120 * we were polling.
1121 *
1122 * This code assumes that the underlying owner of the selinfo
1123 * structure will hold sellock before it changes it, and that
1124 * it will unlink itself from our list if it goes away.
1125 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	/* Caller must hold sellock; si_thread is protected by it. */
	mtx_assert(&sellock, MA_OWNED);
	/* Break each selinfo's back-pointer to us, then empty our queue. */
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}
1137
1118/*ARGSUSED*/
1119int
1120seltrue(dev, events, td)
1121 dev_t dev;
1122 int events;
1123 struct thread *td;
1124{
1125
1126 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1127}
1128
/*ARGSUSED*/
int
seltrue(dev, events, td)
	dev_t dev;
	int events;
	struct thread *td;
{

	/* Always-ready backend: report any requested read/write events. */
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
1148
1129static int
1130find_thread_in_proc(struct proc *p, struct thread *td)
1131{
1132 struct thread *td2;
1133 FOREACH_THREAD_IN_PROC(p, td2) {
1134 if (td2 == td) {
1135 return (1);
1136 }
1137 }
1138 return (0);
1139}
1140
1141/*
1142 * Record a select request.
1143 */
1144void
1145selrecord(selector, sip)
1146 struct thread *selector;
1147 struct selinfo *sip;
1148{
1149/*
1150 * Record a select request.
1151 */
1152void
1153selrecord(selector, sip)
1154 struct thread *selector;
1155 struct selinfo *sip;
1156{
1149 struct proc *p;
1150 pid_t mypid;
1151
1157
1152 mypid = selector->td_proc->p_pid;
1153 if ((sip->si_pid == mypid) &&
1154 (sip->si_thread == selector)) { /* XXXKSE should be an ID? */
1155 return;
1158 mtx_lock(&sellock);
1159 /*
1160 * If the thread is NULL then take ownership of selinfo
1161 * however if the thread is not NULL and the thread points to
1162 * someone else, then we have a collision, otherwise leave it alone
1163 * as we've owned it in a previous selrecord on this selinfo.
1164 */
1165 if (sip->si_thread == NULL) {
1166 sip->si_thread = selector;
1167 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1168 } else if (sip->si_thread != selector) {
1169 sip->si_flags |= SI_COLL;
1156 }
1170 }
1157 if (sip->si_pid &&
1158 (p = pfind(sip->si_pid)) &&
1159 (find_thread_in_proc(p, sip->si_thread))) {
1160 mtx_lock_spin(&sched_lock);
1161 if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
1162 mtx_unlock_spin(&sched_lock);
1163 PROC_UNLOCK(p);
1164 sip->si_flags |= SI_COLL;
1165 return;
1166 }
1167 mtx_unlock_spin(&sched_lock);
1168 PROC_UNLOCK(p);
1169 }
1170 sip->si_pid = mypid;
1171 sip->si_thread = selector;
1171
1172 mtx_unlock(&sellock);
1172}
1173
1174/*
1175 * Do a wakeup when a selectable event occurs.
1176 */
1177void
1178selwakeup(sip)
1173}
1174
1175/*
1176 * Do a wakeup when a selectable event occurs.
1177 */
1178void
1179selwakeup(sip)
1179 register struct selinfo *sip;
1180 struct selinfo *sip;
1180{
1181 struct thread *td;
1181{
1182 struct thread *td;
1182 register struct proc *p;
1183
1183
1184 if (sip->si_pid == 0)
1185 return;
1186 if (sip->si_flags & SI_COLL) {
1184 mtx_lock(&sellock);
1185 td = sip->si_thread;
1186 if ((sip->si_flags & SI_COLL) != 0) {
1187 nselcoll++;
1188 sip->si_flags &= ~SI_COLL;
1189 cv_broadcast(&selwait);
1190 }
1187 nselcoll++;
1188 sip->si_flags &= ~SI_COLL;
1189 cv_broadcast(&selwait);
1190 }
1191 p = pfind(sip->si_pid);
1192 sip->si_pid = 0;
1193 td = sip->si_thread;
1194 if (p != NULL) {
1195 if (!find_thread_in_proc(p, td)) {
1196 PROC_UNLOCK(p); /* lock is in pfind() */;
1197 return;
1198 }
1199 mtx_lock_spin(&sched_lock);
1200 if (td->td_wchan == (caddr_t)&selwait) {
1201 if (td->td_proc->p_stat == SSLEEP)
1202 setrunnable(td);
1203 else
1204 cv_waitq_remove(td);
1205 } else
1206 td->td_flags &= ~TDF_SELECT;
1207 mtx_unlock_spin(&sched_lock);
1208 PROC_UNLOCK(p); /* Lock is in pfind() */
1191 if (td == NULL) {
1192 mtx_unlock(&sellock);
1193 return;
1209 }
1194 }
1195 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1196 sip->si_thread = NULL;
1197 mtx_lock_spin(&sched_lock);
1198 if (td->td_wchan == (caddr_t)&selwait) {
1199 if (td->td_proc->p_stat == SSLEEP)
1200 setrunnable(td);
1201 else
1202 cv_waitq_remove(td);
1203 } else
1204 td->td_flags &= ~TDF_SELECT;
1205 mtx_unlock_spin(&sched_lock);
1206 mtx_unlock(&sellock);
1210}
1211
1212static void selectinit __P((void *));
1213SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1214
1215/* ARGSUSED*/
1216static void
1217selectinit(dummy)
1218 void *dummy;
1219{
1220 cv_init(&selwait, "select");
1207}
1208
1209static void selectinit __P((void *));
1210SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1211
1212/* ARGSUSED*/
1213static void
1214selectinit(dummy)
1215 void *dummy;
1216{
1217 cv_init(&selwait, "select");
1218 mtx_init(&sellock, "sellck", MTX_DEF);
1221}
1219}