/*
 * (Web diff-viewer navigation residue removed; content below is
 * sys/kern/sys_generic.c.)
 */
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/sys_generic.c 89306 2002-01-13 11:58:06Z alfred $
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/file.h>
51#include <sys/proc.h>
52#include <sys/signalvar.h>
53#include <sys/socketvar.h>
54#include <sys/uio.h>
55#include <sys/kernel.h>
56#include <sys/malloc.h>
57#include <sys/poll.h>
58#include <sys/resourcevar.h>
59#include <sys/selinfo.h>
60#include <sys/sysctl.h>
61#include <sys/sysent.h>
62#include <sys/bio.h>
63#include <sys/buf.h>
64#include <sys/condvar.h>
65#ifdef KTRACE
66#include <sys/ktrace.h>
67#endif
68#include <vm/vm.h>
69#include <vm/vm_page.h>
70
71#include <machine/limits.h>
72
/* Malloc zones: ioctl argument staging and select() bitmap buffers. */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
/* Not static: shared with other files that copy in large iovec arrays. */
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* Forward declarations for the file-local helpers defined below. */
static int	pollscan __P((struct thread *, struct pollfd *, u_int));
static int	pollholddrop __P((struct thread *, struct pollfd *, u_int, int));
static int	selscan __P((struct thread *, fd_mask **, fd_mask **, int));
static int	selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int));
static int	dofileread __P((struct thread *, struct file *, int, void *,
		    size_t, off_t, int));
static int	dofilewrite __P((struct thread *, struct file *, int,
		    const void *, size_t, off_t, int));
/*
 * Look up descriptor 'fd' in table 'fdp', check that the file was opened
 * with at least the access bits given in 'flag' (FREAD/FWRITE), and return
 * it with an extra reference held.  Returns NULL on a bad descriptor or an
 * access-mode mismatch.  The caller must fdrop() the result.
 */
struct file*
holdfp(fdp, fd, flag)
	struct filedesc* fdp;
	int fd, flag;
{
	struct file* fp;

	FILEDESC_LOCK(fdp);
	/* The unsigned compare also rejects negative descriptors. */
	if (((u_int)fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (NULL);
	}
	/* Take the file lock before dropping the table lock so fp stays live. */
	FILE_LOCK(fp);
	FILEDESC_UNLOCK(fdp);
	if ((fp->f_flag & flag) == 0) {
		FILE_UNLOCK(fp);
		return (NULL);
	}
	/* Manual reference bump, equivalent to fhold() but under FILE_LOCK. */
	fp->f_count++;
	FILE_UNLOCK(fp);
	return (fp);
}
109
110/*
111 * Read system call.
112 */
113#ifndef _SYS_SYSPROTO_H_
114struct read_args {
115 int fd;
116 void *buf;
117 size_t nbyte;
118};
119#endif
120/*
121 * MPSAFE
122 */
123int
124read(td, uap)
125 struct thread *td;
126 struct read_args *uap;
127{
128 struct file *fp;
129 int error;
130
131 mtx_lock(&Giant);
132 if ((error = fget_read(td, uap->fd, &fp)) == 0) {
133 error = dofileread(td, fp, uap->fd, uap->buf,
134 uap->nbyte, (off_t)-1, 0);
135 fdrop(fp, td);
136 }
137 mtx_unlock(&Giant);
138 return(error);
139}
140
141/*
142 * Pread system call
143 */
144#ifndef _SYS_SYSPROTO_H_
145struct pread_args {
146 int fd;
147 void *buf;
148 size_t nbyte;
149 int pad;
150 off_t offset;
151};
152#endif
153/*
154 * MPSAFE
155 */
156int
157pread(td, uap)
158 struct thread *td;
159 struct pread_args *uap;
160{
161 struct file *fp;
162 int error;
163
164 fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD);
165 if (fp == NULL)
166 return (EBADF);
167 if (fp->f_type != DTYPE_VNODE) {
168 error = ESPIPE;
169 } else {
170 mtx_lock(&Giant);
171 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
172 uap->offset, FOF_OFFSET);
173 mtx_unlock(&Giant);
174 }
175 fdrop(fp, td);
176 return(error);
177}
178
179/*
180 * Code common for read and pread
181 */
182int
183dofileread(td, fp, fd, buf, nbyte, offset, flags)
184 struct thread *td;
185 struct file *fp;
186 int fd, flags;
187 void *buf;
188 size_t nbyte;
189 off_t offset;
190{
191 struct uio auio;
192 struct iovec aiov;
193 long cnt, error = 0;
194#ifdef KTRACE
195 struct iovec ktriov;
196 struct uio ktruio;
197 int didktr = 0;
198#endif
199
200 aiov.iov_base = (caddr_t)buf;
201 aiov.iov_len = nbyte;
202 auio.uio_iov = &aiov;
203 auio.uio_iovcnt = 1;
204 auio.uio_offset = offset;
205 if (nbyte > INT_MAX)
206 return (EINVAL);
207 auio.uio_resid = nbyte;
208 auio.uio_rw = UIO_READ;
209 auio.uio_segflg = UIO_USERSPACE;
210 auio.uio_td = td;
211#ifdef KTRACE
212 /*
213 * if tracing, save a copy of iovec
214 */
215 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
216 ktriov = aiov;
217 ktruio = auio;
218 didktr = 1;
219 }
220#endif
221 cnt = nbyte;
222
223 if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
224 if (auio.uio_resid != cnt && (error == ERESTART ||
225 error == EINTR || error == EWOULDBLOCK))
226 error = 0;
227 }
228 cnt -= auio.uio_resid;
229#ifdef KTRACE
230 if (didktr && error == 0) {
231 ktruio.uio_iov = &ktriov;
232 ktruio.uio_resid = cnt;
233 ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
234 }
235#endif
236 td->td_retval[0] = cnt;
237 return (error);
238}
239
240/*
241 * Scatter read system call.
242 */
243#ifndef _SYS_SYSPROTO_H_
244struct readv_args {
245 int fd;
246 struct iovec *iovp;
247 u_int iovcnt;
248};
249#endif
250/*
251 * MPSAFE
252 */
253int
254readv(td, uap)
255 struct thread *td;
256 struct readv_args *uap;
257{
258 struct file *fp;
259 struct uio auio;
260 struct iovec *iov;
261 struct iovec *needfree;
262 struct iovec aiov[UIO_SMALLIOV];
263 long i, cnt, error = 0;
264 u_int iovlen;
265#ifdef KTRACE
266 struct iovec *ktriov = NULL;
267 struct uio ktruio;
268#endif
269 mtx_lock(&Giant);
270
271 if ((error = fget_read(td, uap->fd, &fp)) != 0)
272 goto done2;
273 /* note: can't use iovlen until iovcnt is validated */
274 iovlen = uap->iovcnt * sizeof (struct iovec);
275 if (uap->iovcnt > UIO_SMALLIOV) {
276 if (uap->iovcnt > UIO_MAXIOV) {
277 error = EINVAL;
278 goto done2;
279 }
280 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
281 needfree = iov;
282 } else {
283 iov = aiov;
284 needfree = NULL;
285 }
286 auio.uio_iov = iov;
287 auio.uio_iovcnt = uap->iovcnt;
288 auio.uio_rw = UIO_READ;
289 auio.uio_segflg = UIO_USERSPACE;
290 auio.uio_td = td;
291 auio.uio_offset = -1;
292 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
293 goto done;
294 auio.uio_resid = 0;
295 for (i = 0; i < uap->iovcnt; i++) {
296 if (iov->iov_len > INT_MAX - auio.uio_resid) {
297 error = EINVAL;
298 goto done;
299 }
300 auio.uio_resid += iov->iov_len;
301 iov++;
302 }
303#ifdef KTRACE
304 /*
305 * if tracing, save a copy of iovec
306 */
307 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
308 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
309 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
310 ktruio = auio;
311 }
312#endif
313 cnt = auio.uio_resid;
314 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
315 if (auio.uio_resid != cnt && (error == ERESTART ||
316 error == EINTR || error == EWOULDBLOCK))
317 error = 0;
318 }
319 cnt -= auio.uio_resid;
320#ifdef KTRACE
321 if (ktriov != NULL) {
322 if (error == 0) {
323 ktruio.uio_iov = ktriov;
324 ktruio.uio_resid = cnt;
325 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
326 error);
327 }
328 FREE(ktriov, M_TEMP);
329 }
330#endif
331 td->td_retval[0] = cnt;
332done:
333 fdrop(fp, td);
334 if (needfree)
335 FREE(needfree, M_IOV);
336done2:
337 mtx_unlock(&Giant);
338 return (error);
339}
340
341/*
342 * Write system call
343 */
344#ifndef _SYS_SYSPROTO_H_
345struct write_args {
346 int fd;
347 const void *buf;
348 size_t nbyte;
349};
350#endif
351/*
352 * MPSAFE
353 */
354int
355write(td, uap)
356 struct thread *td;
357 struct write_args *uap;
358{
359 struct file *fp;
360 int error;
361
362 mtx_lock(&Giant);
363 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
364 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
365 (off_t)-1, 0);
366 fdrop(fp, td);
367 } else {
368 error = EBADF; /* XXX this can't be right */
369 }
370 mtx_unlock(&Giant);
371 return(error);
372}
373
374/*
375 * Pwrite system call
376 */
377#ifndef _SYS_SYSPROTO_H_
378struct pwrite_args {
379 int fd;
380 const void *buf;
381 size_t nbyte;
382 int pad;
383 off_t offset;
384};
385#endif
386/*
387 * MPSAFE
388 */
389int
390pwrite(td, uap)
391 struct thread *td;
392 struct pwrite_args *uap;
393{
394 struct file *fp;
395 int error;
396
397 mtx_lock(&Giant);
398 if ((error = fget_write(td, uap->fd, &fp)) == 0) {
399 if (fp->f_type == DTYPE_VNODE) {
400 error = dofilewrite(td, fp, uap->fd, uap->buf,
401 uap->nbyte, uap->offset, FOF_OFFSET);
402 } else {
403 error = ESPIPE;
404 }
405 fdrop(fp, td);
406 } else {
407 error = EBADF; /* this can't be right */
408 }
409 return(error);
410}
411
/*
 * Code common to write() and pwrite(): build a single-segment uio for the
 * user buffer and hand it to the file's fo_write method.  The byte count
 * transferred is returned in td->td_retval[0].  Posts SIGPIPE on EPIPE,
 * as POSIX requires for writes to broken pipes/sockets.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Double cast strips the const qualifier without a direct cast. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Reject transfers the signed resid accounting cannot represent. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache a chance to flush before dirtying more. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/* A partial transfer masks restart/interrupt errors. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;		/* bytes actually moved */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
475
476/*
477 * Gather write system call
478 */
479#ifndef _SYS_SYSPROTO_H_
480struct writev_args {
481 int fd;
482 struct iovec *iovp;
483 u_int iovcnt;
484};
485#endif
486/*
487 * MPSAFE
488 */
489int
490writev(td, uap)
491 struct thread *td;
492 register struct writev_args *uap;
493{
494 struct file *fp;
495 struct uio auio;
496 register struct iovec *iov;
497 struct iovec *needfree;
498 struct iovec aiov[UIO_SMALLIOV];
499 long i, cnt, error = 0;
500 u_int iovlen;
501#ifdef KTRACE
502 struct iovec *ktriov = NULL;
503 struct uio ktruio;
504#endif
505
506 mtx_lock(&Giant);
507 if ((error = fget_write(td, uap->fd, &fp)) != 0) {
508 error = EBADF;
509 goto done2;
510 }
511 /* note: can't use iovlen until iovcnt is validated */
512 iovlen = uap->iovcnt * sizeof (struct iovec);
513 if (uap->iovcnt > UIO_SMALLIOV) {
514 if (uap->iovcnt > UIO_MAXIOV) {
515 needfree = NULL;
516 error = EINVAL;
517 goto done;
518 }
519 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
520 needfree = iov;
521 } else {
522 iov = aiov;
523 needfree = NULL;
524 }
525 auio.uio_iov = iov;
526 auio.uio_iovcnt = uap->iovcnt;
527 auio.uio_rw = UIO_WRITE;
528 auio.uio_segflg = UIO_USERSPACE;
529 auio.uio_td = td;
530 auio.uio_offset = -1;
531 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
532 goto done;
533 auio.uio_resid = 0;
534 for (i = 0; i < uap->iovcnt; i++) {
535 if (iov->iov_len > INT_MAX - auio.uio_resid) {
536 error = EINVAL;
537 goto done;
538 }
539 auio.uio_resid += iov->iov_len;
540 iov++;
541 }
542#ifdef KTRACE
543 /*
544 * if tracing, save a copy of iovec and uio
545 */
546 if (KTRPOINT(td->td_proc, KTR_GENIO)) {
547 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
548 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
549 ktruio = auio;
550 }
551#endif
552 cnt = auio.uio_resid;
553 if (fp->f_type == DTYPE_VNODE)
554 bwillwrite();
555 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
556 if (auio.uio_resid != cnt && (error == ERESTART ||
557 error == EINTR || error == EWOULDBLOCK))
558 error = 0;
559 if (error == EPIPE) {
560 PROC_LOCK(td->td_proc);
561 psignal(td->td_proc, SIGPIPE);
562 PROC_UNLOCK(td->td_proc);
563 }
564 }
565 cnt -= auio.uio_resid;
566#ifdef KTRACE
567 if (ktriov != NULL) {
568 if (error == 0) {
569 ktruio.uio_iov = ktriov;
570 ktruio.uio_resid = cnt;
571 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
572 error);
573 }
574 FREE(ktriov, M_TEMP);
575 }
576#endif
577 td->td_retval[0] = cnt;
578done:
579 fdrop(fp, td);
580 if (needfree)
581 FREE(needfree, M_IOV);
582done2:
583 mtx_unlock(&Giant);
584 return (error);
585}
586
587/*
588 * Ioctl system call
589 */
590#ifndef _SYS_SYSPROTO_H_
591struct ioctl_args {
592 int fd;
593 u_long com;
594 caddr_t data;
595};
596#endif
597/*
598 * MPSAFE
599 */
600/* ARGSUSED */
601int
602ioctl(td, uap)
603 struct thread *td;
604 register struct ioctl_args *uap;
605{
606 register struct file *fp;
607 register struct filedesc *fdp;
608 register u_long com;
609 int error = 0;
610 register u_int size;
611 caddr_t data, memp;
612 int tmp;
613#define STK_PARAMS 128
614 union {
615 char stkbuf[STK_PARAMS];
616 long align;
617 } ubuf;
618
619 fp = ffind_hold(td, uap->fd);
620 if (fp == NULL)
621 return (EBADF);
622 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
623 fdrop(fp, td);
624 return (EBADF);
625 }
626 fdp = td->td_proc->p_fd;
627 switch (com = uap->com) {
628 case FIONCLEX:
629 FILEDESC_LOCK(fdp);
630 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
631 FILEDESC_UNLOCK(fdp);
632 fdrop(fp, td);
633 return (0);
634 case FIOCLEX:
635 FILEDESC_LOCK(fdp);
636 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
637 FILEDESC_UNLOCK(fdp);
638 fdrop(fp, td);
639 return (0);
640 }
641
642 /*
643 * Interpret high order word to find amount of data to be
644 * copied to/from the user's address space.
645 */
646 size = IOCPARM_LEN(com);
647 if (size > IOCPARM_MAX) {
648 fdrop(fp, td);
649 return (ENOTTY);
650 }
651
652 mtx_lock(&Giant);
653 memp = NULL;
654 if (size > sizeof (ubuf.stkbuf)) {
655 memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
656 data = memp;
657 } else {
658 data = ubuf.stkbuf;
659 }
660 if (com&IOC_IN) {
661 if (size) {
662 error = copyin(uap->data, data, (u_int)size);
663 if (error) {
664 if (memp)
665 free(memp, M_IOCTLOPS);
666 fdrop(fp, td);
667 goto done;
668 }
669 } else {
670 *(caddr_t *)data = uap->data;
671 }
672 } else if ((com&IOC_OUT) && size) {
673 /*
674 * Zero the buffer so the user always
675 * gets back something deterministic.
676 */
677 bzero(data, size);
678 } else if (com&IOC_VOID) {
679 *(caddr_t *)data = uap->data;
680 }
681
682 switch (com) {
683
684 case FIONBIO:
685 FILE_LOCK(fp);
686 if ((tmp = *(int *)data))
687 fp->f_flag |= FNONBLOCK;
688 else
689 fp->f_flag &= ~FNONBLOCK;
690 FILE_UNLOCK(fp);
691 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
692 break;
693
694 case FIOASYNC:
695 FILE_LOCK(fp);
696 if ((tmp = *(int *)data))
697 fp->f_flag |= FASYNC;
698 else
699 fp->f_flag &= ~FASYNC;
700 FILE_UNLOCK(fp);
701 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
702 break;
703
704 default:
705 error = fo_ioctl(fp, com, data, td);
706 /*
707 * Copy any data to user, size was
708 * already set and checked above.
709 */
710 if (error == 0 && (com&IOC_OUT) && size)
711 error = copyout(data, uap->data, (u_int)size);
712 break;
713 }
714 if (memp)
715 free(memp, M_IOCTLOPS);
716 fdrop(fp, td);
717done:
718 mtx_unlock(&Giant);
719 return (error);
720}
721
static int	nselcoll;	/* Select collisions since boot */
/* Single condition variable that all select()/poll() sleepers wait on. */
struct cv selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
725
726/*
727 * Select system call.
728 */
729#ifndef _SYS_SYSPROTO_H_
730struct select_args {
731 int nd;
732 fd_set *in, *ou, *ex;
733 struct timeval *tv;
734};
735#endif
736/*
737 * MPSAFE
738 */
739int
740select(td, uap)
741 register struct thread *td;
742 register struct select_args *uap;
743{
744 struct filedesc *fdp;
745 /*
746 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
747 * infds with the new FD_SETSIZE of 1024, and more than enough for
748 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
749 * of 256.
750 */
751 fd_mask s_selbits[howmany(2048, NFDBITS)];
752 fd_mask s_heldbits[howmany(2048, NFDBITS)];
753 fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits;
754 struct timeval atv, rtv, ttv;
755 int ncoll, error, timo, i;
756 u_int nbufbytes, ncpbytes, nfdbits;
757
758 if (uap->nd < 0)
759 return (EINVAL);
760 fdp = td->td_proc->p_fd;
761 mtx_lock(&Giant);
762 FILEDESC_LOCK(fdp);
763
764 if (uap->nd > td->td_proc->p_fd->fd_nfiles)
765 uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
766 FILEDESC_UNLOCK(fdp);
767
768 /*
769 * Allocate just enough bits for the non-null fd_sets. Use the
770 * preallocated auto buffer if possible.
771 */
772 nfdbits = roundup(uap->nd, NFDBITS);
773 ncpbytes = nfdbits / NBBY;
774 nbufbytes = 0;
775 if (uap->in != NULL)
776 nbufbytes += 2 * ncpbytes;
777 if (uap->ou != NULL)
778 nbufbytes += 2 * ncpbytes;
779 if (uap->ex != NULL)
780 nbufbytes += 2 * ncpbytes;
781 if (nbufbytes <= sizeof s_selbits)
782 selbits = &s_selbits[0];
783 else
784 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
785 if (2 * ncpbytes <= sizeof s_heldbits) {
786 bzero(s_heldbits, sizeof(s_heldbits));
787 heldbits = &s_heldbits[0];
788 } else
789 heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO);
790
791 /*
792 * Assign pointers into the bit buffers and fetch the input bits.
793 * Put the output buffers together so that they can be bzeroed
794 * together.
795 */
796 sbp = selbits;
797 hibits = heldbits + ncpbytes / sizeof *heldbits;
798 hobits = heldbits;
799#define getbits(name, x) \
800 do { \
801 if (uap->name == NULL) \
802 ibits[x] = NULL; \
803 else { \
804 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
805 obits[x] = sbp; \
806 sbp += ncpbytes / sizeof *sbp; \
807 error = copyin(uap->name, ibits[x], ncpbytes); \
808 if (error != 0) \
809 goto done_noproclock; \
810 for (i = 0; \
811 i < ncpbytes / sizeof ibits[i][0]; \
812 i++) \
813 hibits[i] |= ibits[x][i]; \
814 } \
815 } while (0)
816 getbits(in, 0);
817 getbits(ou, 1);
818 getbits(ex, 2);
819#undef getbits
820 if (nbufbytes != 0)
821 bzero(selbits, nbufbytes / 2);
822
823 if (uap->tv) {
824 error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
825 sizeof (atv));
826 if (error)
827 goto done_noproclock;
828 if (itimerfix(&atv)) {
829 error = EINVAL;
830 goto done_noproclock;
831 }
832 getmicrouptime(&rtv);
833 timevaladd(&atv, &rtv);
834 } else {
835 atv.tv_sec = 0;
836 atv.tv_usec = 0;
837 }
838 selholddrop(td, hibits, hobits, uap->nd, 1);
839 timo = 0;
840 PROC_LOCK(td->td_proc);
841retry:
842 ncoll = nselcoll;
843 mtx_lock_spin(&sched_lock);
844 td->td_flags |= TDF_SELECT;
845 mtx_unlock_spin(&sched_lock);
846 PROC_UNLOCK(td->td_proc);
847 error = selscan(td, ibits, obits, uap->nd);
848 PROC_LOCK(td->td_proc);
849 if (error || td->td_retval[0])
850 goto done;
851 if (atv.tv_sec || atv.tv_usec) {
852 getmicrouptime(&rtv);
853 if (timevalcmp(&rtv, &atv, >=)) {
854 /*
855 * An event of our interest may occur during locking a process.
856 * In order to avoid missing the event that occured during locking
857 * the process, test TDF_SELECT and rescan file descriptors if
858 * necessary.
859 */
860 mtx_lock_spin(&sched_lock);
861 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
862 ncoll = nselcoll;
863 td->td_flags |= TDF_SELECT;
864 mtx_unlock_spin(&sched_lock);
865 PROC_UNLOCK(td->td_proc);
866 error = selscan(td, ibits, obits, uap->nd);
867 PROC_LOCK(td->td_proc);
868 } else
869 mtx_unlock_spin(&sched_lock);
870 goto done;
871 }
872 ttv = atv;
873 timevalsub(&ttv, &rtv);
874 timo = ttv.tv_sec > 24 * 60 * 60 ?
875 24 * 60 * 60 * hz : tvtohz(&ttv);
876 }
877 mtx_lock_spin(&sched_lock);
878 td->td_flags &= ~TDF_SELECT;
879 mtx_unlock_spin(&sched_lock);
880
881 if (timo > 0)
882 error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
883 else
884 error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
885
886 if (error == 0)
887 goto retry;
888
889done:
890 mtx_lock_spin(&sched_lock);
891 td->td_flags &= ~TDF_SELECT;
892 mtx_unlock_spin(&sched_lock);
893 PROC_UNLOCK(td->td_proc);
894 selholddrop(td, hibits, hobits, uap->nd, 0);
895done_noproclock:
896 /* select is not restarted after signals... */
897 if (error == ERESTART)
898 error = EINTR;
899 if (error == EWOULDBLOCK)
900 error = 0;
901#define putbits(name, x) \
902 if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
903 error = error2;
904 if (error == 0) {
905 int error2;
906
907 putbits(in, 0);
908 putbits(ou, 1);
909 putbits(ex, 2);
910#undef putbits
911 }
912 if (selbits != &s_selbits[0])
913 free(selbits, M_SELECT);
914 if (heldbits != &s_heldbits[0])
915 free(heldbits, M_SELECT);
916
917 mtx_unlock(&Giant);
918 return (error);
919}
920
921/*
922 * Used to hold then release a group of fds for select(2).
923 * Hold (hold == 1) or release (hold == 0) a group of filedescriptors.
924 * if holding then use ibits setting the bits in obits, otherwise use obits.
925 */
926static int
927selholddrop(td, ibits, obits, nfd, hold)
928 struct thread *td;
929 fd_mask *ibits, *obits;
930 int nfd, hold;
931{
932 struct filedesc *fdp = td->td_proc->p_fd;
933 int i, fd;
934 fd_mask bits;
935 struct file *fp;
936
937 FILEDESC_LOCK(fdp);
938 for (i = 0; i < nfd; i += NFDBITS) {
939 if (hold)
940 bits = ibits[i/NFDBITS];
941 else
942 bits = obits[i/NFDBITS];
943 /* ffs(int mask) not portable, fd_mask is long */
944 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
945 if (!(bits & 1))
946 continue;
947 fp = fdp->fd_ofiles[fd];
948 if (fp == NULL) {
949 FILEDESC_UNLOCK(fdp);
950 return (EBADF);
951 }
952 if (hold) {
953 fhold(fp);
954 obits[(fd)/NFDBITS] |=
955 ((fd_mask)1 << ((fd) % NFDBITS));
956 } else {
957 /* XXX: optimize by making a special
958 * version of fdrop that only unlocks
959 * the filedesc if needed? This would
960 * redcuce the number of lock/unlock
961 * pairs by quite a bit.
962 */
963 FILEDESC_UNLOCK(fdp);
964 fdrop(fp, td);
965 FILEDESC_LOCK(fdp);
966 }
967 }
968 }
969 FILEDESC_UNLOCK(fdp);
970 return (0);
971}
972
/*
 * Poll every descriptor named in the three input bit sets; set the
 * corresponding bit in the output sets for each ready descriptor and
 * report the ready count via td->td_retval[0].  Returns EBADF if a
 * named descriptor has disappeared.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	/* msk: 0 = read set, 1 = write set, 2 = except set. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = ffind_hold(td, fd);
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
				fdrop(fp, td);
			}
		}
	}
	td->td_retval[0] = n;
	return (0);
}
1010
1011/*
1012 * Poll system call.
1013 */
1014#ifndef _SYS_SYSPROTO_H_
1015struct poll_args {
1016 struct pollfd *fds;
1017 u_int nfds;
1018 int timeout;
1019};
1020#endif
1021/*
1022 * MPSAFE
1023 */
1024int
1025poll(td, uap)
1026 struct thread *td;
1027 struct poll_args *uap;
1028{
1029 caddr_t bits;
1030 char smallbits[32 * sizeof(struct pollfd)];
1031 struct timeval atv, rtv, ttv;
1032 int ncoll, error = 0, timo;
1033 u_int nfds;
1034 size_t ni;
1035 struct pollfd p_heldbits[32];
1036 struct pollfd *heldbits;
1037
1038 nfds = SCARG(uap, nfds);
1039
1040 mtx_lock(&Giant);
1041 /*
1042 * This is kinda bogus. We have fd limits, but that is not
1043 * really related to the size of the pollfd array. Make sure
1044 * we let the process use at least FD_SETSIZE entries and at
1045 * least enough for the current limits. We want to be reasonably
1046 * safe, but not overly restrictive.
1047 */
1048 if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
1049 (nfds > FD_SETSIZE)) {
1050 error = EINVAL;
1051 goto done2;
1052 }
1053 ni = nfds * sizeof(struct pollfd);
1054 if (ni > sizeof(smallbits))
1055 bits = malloc(ni, M_TEMP, M_WAITOK);
1056 else
1057 bits = smallbits;
1058 if (ni > sizeof(p_heldbits))
1059 heldbits = malloc(ni, M_TEMP, M_WAITOK);
1060 else {
1061 bzero(p_heldbits, sizeof(p_heldbits));
1062 heldbits = p_heldbits;
1063 }
1064 error = copyin(SCARG(uap, fds), bits, ni);
1065 if (error)
1066 goto done_noproclock;
1067 bcopy(bits, heldbits, ni);
1068 if (SCARG(uap, timeout) != INFTIM) {
1069 atv.tv_sec = SCARG(uap, timeout) / 1000;
1070 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
1071 if (itimerfix(&atv)) {
1072 error = EINVAL;
1073 goto done_noproclock;
1074 }
1075 getmicrouptime(&rtv);
1076 timevaladd(&atv, &rtv);
1077 } else {
1078 atv.tv_sec = 0;
1079 atv.tv_usec = 0;
1080 }
1081 pollholddrop(td, heldbits, nfds, 1);
1082 timo = 0;
1083 PROC_LOCK(td->td_proc);
1084retry:
1085 ncoll = nselcoll;
1086 mtx_lock_spin(&sched_lock);
1087 td->td_flags |= TDF_SELECT;
1088 mtx_unlock_spin(&sched_lock);
1089 PROC_UNLOCK(td->td_proc);
1090 error = pollscan(td, (struct pollfd *)bits, nfds);
1091 PROC_LOCK(td->td_proc);
1092 if (error || td->td_retval[0])
1093 goto done;
1094 if (atv.tv_sec || atv.tv_usec) {
1095 getmicrouptime(&rtv);
1096 if (timevalcmp(&rtv, &atv, >=)) {
1097 /*
1098 * An event of our interest may occur during locking a process.
1099 * In order to avoid missing the event that occured during locking
1100 * the process, test TDF_SELECT and rescan file descriptors if
1101 * necessary.
1102 */
1103 mtx_lock_spin(&sched_lock);
1104 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
1105 ncoll = nselcoll;
1106 td->td_flags |= TDF_SELECT;
1107 mtx_unlock_spin(&sched_lock);
1108 PROC_UNLOCK(td->td_proc);
1109 error = pollscan(td, (struct pollfd *)bits, nfds);
1110 PROC_LOCK(td->td_proc);
1111 } else
1112 mtx_unlock_spin(&sched_lock);
1113 goto done;
1114 }
1115 ttv = atv;
1116 timevalsub(&ttv, &rtv);
1117 timo = ttv.tv_sec > 24 * 60 * 60 ?
1118 24 * 60 * 60 * hz : tvtohz(&ttv);
1119 }
1120 mtx_lock_spin(&sched_lock);
1121 td->td_flags &= ~TDF_SELECT;
1122 mtx_unlock_spin(&sched_lock);
1123 if (timo > 0)
1124 error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
1125 else
1126 error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
1127 if (error == 0)
1128 goto retry;
1129
1130done:
1131 mtx_lock_spin(&sched_lock);
1132 td->td_flags &= ~TDF_SELECT;
1133 mtx_unlock_spin(&sched_lock);
1134 PROC_UNLOCK(td->td_proc);
1135 pollholddrop(td, heldbits, nfds, 0);
1136done_noproclock:
1137 /* poll is not restarted after signals... */
1138 if (error == ERESTART)
1139 error = EINTR;
1140 if (error == EWOULDBLOCK)
1141 error = 0;
1142 if (error == 0) {
1143 error = copyout(bits, SCARG(uap, fds), ni);
1144 if (error)
1145 goto out;
1146 }
1147out:
1148 if (ni > sizeof(smallbits))
1149 free(bits, M_TEMP);
1150 if (ni > sizeof(p_heldbits))
1151 free(heldbits, M_TEMP);
1152done2:
1153 mtx_unlock(&Giant);
1154 return (error);
1155}
1156
/*
 * Hold (hold == 1) or release (hold == 0) references on the files named
 * by a pollfd array.  During the hold pass, fds->revents is reused as a
 * private "reference held" marker (1 = held, 0 = not), which the drop
 * pass consults; poll() later overwrites revents via pollscan() before
 * the array is copied out.
 */
static int
pollholddrop(td, fds, nfd, hold)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
	int hold;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) {
			fp = fdp->fd_ofiles[fds->fd];
			if (hold) {
				if (fp != NULL) {
					fhold(fp);
					fds->revents = 1;
				} else
					fds->revents = 0;
			} else if(fp != NULL && fds->revents) {
				/* fdrop_locked consumes the FILE_LOCK. */
				FILE_LOCK(fp);
				FILEDESC_UNLOCK(fdp);
				fdrop_locked(fp, td);
				FILEDESC_LOCK(fdp);
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	return (0);
}
1189
/*
 * Query each pollfd entry's backend via fo_poll, filling in revents and
 * counting ready entries in td->td_retval[0].  Invalid descriptors get
 * POLLNVAL (and count as ready, per POSIX); negative fds are ignored.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		FILEDESC_LOCK(fdp);
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
			FILEDESC_UNLOCK(fdp);
		} else if (fds->fd < 0) {
			/* Negative fd: skip but report no events. */
			fds->revents = 0;
			FILEDESC_UNLOCK(fdp);
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			FILEDESC_UNLOCK(fdp);
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	td->td_retval[0] = n;
	return (0);
}
1231
1232/*
1233 * OpenBSD poll system call.
1234 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1235 */
1236#ifndef _SYS_SYSPROTO_H_
1237struct openbsd_poll_args {
1238 struct pollfd *fds;
1239 u_int nfds;
1240 int timeout;
1241};
1242#endif
1243/*
1244 * MPSAFE
1245 */
1246int
1247openbsd_poll(td, uap)
1248 register struct thread *td;
1249 register struct openbsd_poll_args *uap;
1250{
1251 return (poll(td, (struct poll_args *)uap));
1252}
1253
1254/*ARGSUSED*/
1255int
1256seltrue(dev, events, td)
1257 dev_t dev;
1258 int events;
1259 struct thread *td;
1260{
1261
1262 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1263}
1264
1265static int
1266find_thread_in_proc(struct proc *p, struct thread *td)
1267{
1268 struct thread *td2;
1269 FOREACH_THREAD_IN_PROC(p, td2) {
1270 if (td2 == td) {
1271 return (1);
1272 }
1273 }
1274 return (0);
1275}
1276
1277/*
1278 * Record a select request.
1279 */
1280void
1281selrecord(selector, sip)
1282 struct thread *selector;
1283 struct selinfo *sip;
1284{
1285 struct proc *p;
1286 pid_t mypid;
1287
1288 mypid = selector->td_proc->p_pid;
1289 if ((sip->si_pid == mypid) &&
1290 (sip->si_thread == selector)) { /* XXXKSE should be an ID? */
1291 return;
1292 }
1293 if (sip->si_pid &&
1294 (p = pfind(sip->si_pid)) &&
1295 (find_thread_in_proc(p, sip->si_thread))) {
1296 mtx_lock_spin(&sched_lock);
1297 if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
1298 mtx_unlock_spin(&sched_lock);
1299 PROC_UNLOCK(p);
1300 sip->si_flags |= SI_COLL;
1301 return;
1302 }
1303 mtx_unlock_spin(&sched_lock);
1304 PROC_UNLOCK(p);
1305 }
1306 sip->si_pid = mypid;
1307 sip->si_thread = selector;
1308}
1309
1310/*
1311 * Do a wakeup when a selectable event occurs.
1312 */
1313void
1314selwakeup(sip)
1315 register struct selinfo *sip;
1316{
1317 struct thread *td;
1318 register struct proc *p;
1319
1320 if (sip->si_pid == 0)
1321 return;
1322 if (sip->si_flags & SI_COLL) {
1323 nselcoll++;
1324 sip->si_flags &= ~SI_COLL;
1325 cv_broadcast(&selwait);
1326 }
1327 p = pfind(sip->si_pid);
1328 sip->si_pid = 0;
1329 td = sip->si_thread;
1330 if (p != NULL) {
1331 if (!find_thread_in_proc(p, td)) {
1332 PROC_UNLOCK(p); /* lock is in pfind() */;
1333 return;
1334 }
1335 mtx_lock_spin(&sched_lock);
1336 if (td->td_wchan == (caddr_t)&selwait) {
1337 if (td->td_proc->p_stat == SSLEEP)
1338 setrunnable(td);
1339 else
1340 cv_waitq_remove(td);
1341 } else
1342 td->td_flags &= ~TDF_SELECT;
1343 mtx_unlock_spin(&sched_lock);
1344 PROC_UNLOCK(p); /* Lock is in pfind() */
1345 }
1346}
1347
/* Boot-time initialization of the global select/poll condition variable. */
static void selectinit __P((void *));
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
}