sys_generic.c revision 103216
1/* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD: head/sys/kern/sys_generic.c 103216 2002-09-11 08:13:56Z julian $ 40 */ 41 42#include "opt_ktrace.h" 43 44#include <sys/param.h> 45#include <sys/systm.h> 46#include <sys/sysproto.h> 47#include <sys/filedesc.h> 48#include <sys/filio.h> 49#include <sys/fcntl.h> 50#include <sys/file.h> 51#include <sys/proc.h> 52#include <sys/signalvar.h> 53#include <sys/socketvar.h> 54#include <sys/uio.h> 55#include <sys/kernel.h> 56#include <sys/malloc.h> 57#include <sys/poll.h> 58#include <sys/resourcevar.h> 59#include <sys/selinfo.h> 60#include <sys/syscallsubr.h> 61#include <sys/sysctl.h> 62#include <sys/sysent.h> 63#include <sys/bio.h> 64#include <sys/buf.h> 65#include <sys/condvar.h> 66#ifdef __alpha__ 67#include <sys/disklabel.h> 68#endif 69#ifdef KTRACE 70#include <sys/ktrace.h> 71#endif 72#include <vm/vm.h> 73#include <vm/vm_page.h> 74 75#include <machine/limits.h> 76 77static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 78static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 79MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 80 81static int pollscan(struct thread *, struct pollfd *, u_int); 82static int selscan(struct thread *, fd_mask **, fd_mask **, int); 83static int dofileread(struct thread *, struct file *, int, void *, 84 size_t, off_t, int); 85static int dofilewrite(struct thread *, struct file *, int, 86 const void *, size_t, off_t, int); 87 88/* 89 * Read system call. 90 */ 91#ifndef _SYS_SYSPROTO_H_ 92struct read_args { 93 int fd; 94 void *buf; 95 size_t nbyte; 96}; 97#endif 98/* 99 * MPSAFE 100 */ 101int 102read(td, uap) 103 struct thread *td; 104 struct read_args *uap; 105{ 106 struct file *fp; 107 int error; 108 109 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 110 error = dofileread(td, fp, uap->fd, uap->buf, 111 uap->nbyte, (off_t)-1, 0); 112 fdrop(fp, td); 113 } 114 return(error); 115} 116 117/* 118 * Pread system call 119 */ 120#ifndef _SYS_SYSPROTO_H_ 121struct pread_args { 122 int fd; 123 void *buf; 124 size_t nbyte; 125 int pad; 126 off_t offset; 127}; 128#endif 129/* 130 * MPSAFE 131 */ 132int 133pread(td, uap) 134 struct thread *td; 135 struct pread_args *uap; 136{ 137 struct file *fp; 138 int error; 139 140 if ((error = fget_read(td, uap->fd, &fp)) != 0) 141 return (error); 142 if (fp->f_type != DTYPE_VNODE) { 143 error = ESPIPE; 144 } else { 145 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 146 uap->offset, FOF_OFFSET); 147 } 148 fdrop(fp, td); 149 return(error); 150} 151 152/* 153 * Code common for read and pread 154 */ 155int 156dofileread(td, fp, fd, buf, nbyte, offset, flags) 157 struct thread *td; 158 struct file *fp; 159 int fd, flags; 160 void *buf; 161 size_t nbyte; 162 off_t offset; 163{ 164 struct uio auio; 165 struct iovec aiov; 166 long cnt, error = 0; 167#ifdef KTRACE 168 struct iovec ktriov; 169 struct uio ktruio; 170 int didktr = 0; 171#endif 172 173 aiov.iov_base = buf; 174 aiov.iov_len = nbyte; 175 auio.uio_iov = &aiov; 176 auio.uio_iovcnt = 1; 177 auio.uio_offset = offset; 178 if (nbyte > INT_MAX) 179 return (EINVAL); 180 auio.uio_resid = nbyte; 181 auio.uio_rw = UIO_READ; 182 auio.uio_segflg = UIO_USERSPACE; 183 auio.uio_td = td; 184#ifdef KTRACE 185 /* 186 * if tracing, save a copy of iovec 187 */ 188 if (KTRPOINT(td, KTR_GENIO)) { 189 ktriov = aiov; 190 ktruio = auio; 191 didktr = 1; 192 } 193#endif 194 cnt = nbyte; 195 196 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) { 197 if (auio.uio_resid != cnt && (error == ERESTART || 198 error == EINTR || error == EWOULDBLOCK)) 199 error = 0; 200 } 201 cnt -= auio.uio_resid; 202#ifdef KTRACE 203 if (didktr && error == 0) { 204 ktruio.uio_iov = &ktriov; 205 ktruio.uio_resid = cnt; 206 ktrgenio(fd, UIO_READ, &ktruio, error); 207 } 208#endif 209 td->td_retval[0] = cnt; 210 return (error); 211} 212 213/* 214 * Scatter read system call. 215 */ 216#ifndef _SYS_SYSPROTO_H_ 217struct readv_args { 218 int fd; 219 struct iovec *iovp; 220 u_int iovcnt; 221}; 222#endif 223/* 224 * MPSAFE 225 */ 226int 227readv(td, uap) 228 struct thread *td; 229 struct readv_args *uap; 230{ 231 struct file *fp; 232 struct uio auio; 233 struct iovec *iov; 234 struct iovec *needfree; 235 struct iovec aiov[UIO_SMALLIOV]; 236 long i, cnt; 237 int error; 238 u_int iovlen; 239#ifdef KTRACE 240 struct iovec *ktriov = NULL; 241 struct uio ktruio; 242#endif 243 244 if ((error = fget_read(td, uap->fd, &fp)) != 0) 245 return (error); 246 needfree = NULL; 247 /* note: can't use iovlen until iovcnt is validated */ 248 iovlen = uap->iovcnt * sizeof (struct iovec); 249 if (uap->iovcnt > UIO_SMALLIOV) { 250 if (uap->iovcnt > UIO_MAXIOV) { 251 error = EINVAL; 252 goto done; 253 } 254 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 255 needfree = iov; 256 } else 257 iov = aiov; 258 auio.uio_iov = iov; 259 auio.uio_iovcnt = uap->iovcnt; 260 auio.uio_rw = UIO_READ; 261 auio.uio_segflg = UIO_USERSPACE; 262 auio.uio_td = td; 263 auio.uio_offset = -1; 264 if ((error = copyin(uap->iovp, iov, iovlen))) 265 goto done; 266 auio.uio_resid = 0; 267 for (i = 0; i < uap->iovcnt; i++) { 268 if (iov->iov_len > INT_MAX - auio.uio_resid) { 269 error = EINVAL; 270 goto done; 271 } 272 auio.uio_resid += iov->iov_len; 273 iov++; 274 } 275#ifdef KTRACE 276 /* 277 * if tracing, save a copy of iovec 278 */ 279 if (KTRPOINT(td, KTR_GENIO)) { 280 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 281 bcopy(auio.uio_iov, ktriov, iovlen); 282 ktruio = auio; 283 } 284#endif 285 cnt = auio.uio_resid; 286 if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) { 287 if (auio.uio_resid != cnt && (error == ERESTART || 288 error == EINTR || error == EWOULDBLOCK)) 289 error = 0; 290 } 291 cnt -= auio.uio_resid; 292#ifdef KTRACE 293 if (ktriov != NULL) { 294 if (error == 0) { 295 ktruio.uio_iov = ktriov; 296 ktruio.uio_resid = cnt; 297 ktrgenio(uap->fd, UIO_READ, &ktruio, error); 298 } 299 FREE(ktriov, M_TEMP); 300 } 301#endif 302 td->td_retval[0] = cnt; 303done: 304 fdrop(fp, td); 305 if (needfree) 306 FREE(needfree, M_IOV); 307 return (error); 308} 309 310/* 311 * Write system call 312 */ 313#ifndef _SYS_SYSPROTO_H_ 314struct write_args { 315 int fd; 316 const void *buf; 317 size_t nbyte; 318}; 319#endif 320/* 321 * MPSAFE 322 */ 323int 324write(td, uap) 325 struct thread *td; 326 struct write_args *uap; 327{ 328 struct file *fp; 329 int error; 330 331 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 332 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 333 (off_t)-1, 0); 334 fdrop(fp, td); 335 } else { 336 error = EBADF; /* XXX this can't be right */ 337 } 338 return(error); 339} 340 341/* 342 * Pwrite system call 343 */ 344#ifndef _SYS_SYSPROTO_H_ 345struct pwrite_args { 346 int fd; 347 const void *buf; 348 size_t nbyte; 349 int pad; 350 off_t offset; 351}; 352#endif 353/* 354 * MPSAFE 355 */ 356int 357pwrite(td, uap) 358 struct thread *td; 359 struct pwrite_args *uap; 360{ 361 struct file *fp; 362 int error; 363 364 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 365 if (fp->f_type == DTYPE_VNODE) { 366 error = dofilewrite(td, fp, uap->fd, uap->buf, 367 uap->nbyte, uap->offset, FOF_OFFSET); 368 } else { 369 error = ESPIPE; 370 } 371 fdrop(fp, td); 372 } else { 373 error = EBADF; /* this can't be right */ 374 } 375 return(error); 376} 377 378static int 379dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 380 struct thread *td; 381 struct file *fp; 382 int fd, flags; 383 const void *buf; 384 size_t nbyte; 385 off_t offset; 386{ 387 struct uio auio; 388 struct iovec aiov; 389 long cnt, error = 0; 390#ifdef KTRACE 391 struct iovec ktriov; 392 struct uio ktruio; 393 int didktr = 0; 394#endif 395 396 aiov.iov_base = (void *)(uintptr_t)buf; 397 aiov.iov_len = nbyte; 398 auio.uio_iov = &aiov; 399 auio.uio_iovcnt = 1; 400 auio.uio_offset = offset; 401 if (nbyte > INT_MAX) 402 return (EINVAL); 403 auio.uio_resid = nbyte; 404 auio.uio_rw = UIO_WRITE; 405 auio.uio_segflg = UIO_USERSPACE; 406 auio.uio_td = td; 407#ifdef KTRACE 408 /* 409 * if tracing, save a copy of iovec and uio 410 */ 411 if (KTRPOINT(td, KTR_GENIO)) { 412 ktriov = aiov; 413 ktruio = auio; 414 didktr = 1; 415 } 416#endif 417 cnt = nbyte; 418 if (fp->f_type == DTYPE_VNODE) 419 bwillwrite(); 420 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 421 if (auio.uio_resid != cnt && (error == ERESTART || 422 error == EINTR || error == EWOULDBLOCK)) 423 error = 0; 424 /* Socket layer is responsible for issuing SIGPIPE. */ 425 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 426 PROC_LOCK(td->td_proc); 427 psignal(td->td_proc, SIGPIPE); 428 PROC_UNLOCK(td->td_proc); 429 } 430 } 431 cnt -= auio.uio_resid; 432#ifdef KTRACE 433 if (didktr && error == 0) { 434 ktruio.uio_iov = &ktriov; 435 ktruio.uio_resid = cnt; 436 ktrgenio(fd, UIO_WRITE, &ktruio, error); 437 } 438#endif 439 td->td_retval[0] = cnt; 440 return (error); 441} 442 443/* 444 * Gather write system call 445 */ 446#ifndef _SYS_SYSPROTO_H_ 447struct writev_args { 448 int fd; 449 struct iovec *iovp; 450 u_int iovcnt; 451}; 452#endif 453/* 454 * MPSAFE 455 */ 456int 457writev(td, uap) 458 struct thread *td; 459 register struct writev_args *uap; 460{ 461 struct file *fp; 462 struct uio auio; 463 register struct iovec *iov; 464 struct iovec *needfree; 465 struct iovec aiov[UIO_SMALLIOV]; 466 long i, cnt, error = 0; 467 u_int iovlen; 468#ifdef KTRACE 469 struct iovec *ktriov = NULL; 470 struct uio ktruio; 471#endif 472 473 mtx_lock(&Giant); 474 if ((error = fget_write(td, uap->fd, &fp)) != 0) { 475 error = EBADF; 476 goto done2; 477 } 478 /* note: can't use iovlen until iovcnt is validated */ 479 iovlen = uap->iovcnt * sizeof (struct iovec); 480 if (uap->iovcnt > UIO_SMALLIOV) { 481 if (uap->iovcnt > UIO_MAXIOV) { 482 needfree = NULL; 483 error = EINVAL; 484 goto done; 485 } 486 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 487 needfree = iov; 488 } else { 489 iov = aiov; 490 needfree = NULL; 491 } 492 auio.uio_iov = iov; 493 auio.uio_iovcnt = uap->iovcnt; 494 auio.uio_rw = UIO_WRITE; 495 auio.uio_segflg = UIO_USERSPACE; 496 auio.uio_td = td; 497 auio.uio_offset = -1; 498 if ((error = copyin(uap->iovp, iov, iovlen))) 499 goto done; 500 auio.uio_resid = 0; 501 for (i = 0; i < uap->iovcnt; i++) { 502 if (iov->iov_len > INT_MAX - auio.uio_resid) { 503 error = EINVAL; 504 goto done; 505 } 506 auio.uio_resid += iov->iov_len; 507 iov++; 508 } 509#ifdef KTRACE 510 /* 511 * if tracing, save a copy of iovec and uio 512 */ 513 if (KTRPOINT(td, KTR_GENIO)) { 514 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 515 bcopy(auio.uio_iov, ktriov, iovlen); 516 ktruio = auio; 517 } 518#endif 519 cnt = auio.uio_resid; 520 if (fp->f_type == DTYPE_VNODE) 521 bwillwrite(); 522 if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) { 523 if (auio.uio_resid != cnt && (error == ERESTART || 524 error == EINTR || error == EWOULDBLOCK)) 525 error = 0; 526 if (error == EPIPE) { 527 PROC_LOCK(td->td_proc); 528 psignal(td->td_proc, SIGPIPE); 529 PROC_UNLOCK(td->td_proc); 530 } 531 } 532 cnt -= auio.uio_resid; 533#ifdef KTRACE 534 if (ktriov != NULL) { 535 if (error == 0) { 536 ktruio.uio_iov = ktriov; 537 ktruio.uio_resid = cnt; 538 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error); 539 } 540 FREE(ktriov, M_TEMP); 541 } 542#endif 543 td->td_retval[0] = cnt; 544done: 545 fdrop(fp, td); 546 if (needfree) 547 FREE(needfree, M_IOV); 548done2: 549 mtx_unlock(&Giant); 550 return (error); 551} 552 553/* 554 * Ioctl system call 555 */ 556#ifndef _SYS_SYSPROTO_H_ 557struct ioctl_args { 558 int fd; 559 u_long com; 560 caddr_t data; 561}; 562#endif 563/* 564 * MPSAFE 565 */ 566/* ARGSUSED */ 567int 568ioctl(td, uap) 569 struct thread *td; 570 register struct ioctl_args *uap; 571{ 572 struct file *fp; 573 register struct filedesc *fdp; 574 register u_long com; 575 int error = 0; 576 register u_int size; 577 caddr_t data, memp; 578 int tmp; 579#define STK_PARAMS 128 580 union { 581 char stkbuf[STK_PARAMS]; 582 long align; 583 } ubuf; 584 585 if ((error = fget(td, uap->fd, &fp)) != 0) 586 return (error); 587 mtx_lock(&Giant); 588 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 589 fdrop(fp, td); 590 mtx_unlock(&Giant); 591 return (EBADF); 592 } 593 fdp = td->td_proc->p_fd; 594 switch (com = uap->com) { 595 case FIONCLEX: 596 FILEDESC_LOCK(fdp); 597 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 598 FILEDESC_UNLOCK(fdp); 599 fdrop(fp, td); 600 mtx_unlock(&Giant); 601 return (0); 602 case FIOCLEX: 603 FILEDESC_LOCK(fdp); 604 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 605 FILEDESC_UNLOCK(fdp); 606 fdrop(fp, td); 607 mtx_unlock(&Giant); 608 return (0); 609 } 610 611 /* 612 * Interpret high order word to find amount of data to be 613 * copied to/from the user's address space. 614 */ 615 size = IOCPARM_LEN(com); 616 if (size > IOCPARM_MAX) { 617 fdrop(fp, td); 618 mtx_unlock(&Giant); 619 return (ENOTTY); 620 } 621 622 memp = NULL; 623 if (size > sizeof (ubuf.stkbuf)) { 624 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 625 data = memp; 626 } else { 627 data = ubuf.stkbuf; 628 } 629 if (com&IOC_IN) { 630 if (size) { 631 error = copyin(uap->data, data, (u_int)size); 632 if (error) { 633 if (memp) 634 free(memp, M_IOCTLOPS); 635 fdrop(fp, td); 636 goto done; 637 } 638 } else { 639 *(caddr_t *)data = uap->data; 640 } 641 } else if ((com&IOC_OUT) && size) { 642 /* 643 * Zero the buffer so the user always 644 * gets back something deterministic. 645 */ 646 bzero(data, size); 647 } else if (com&IOC_VOID) { 648 *(caddr_t *)data = uap->data; 649 } 650 651 switch (com) { 652 653 case FIONBIO: 654 FILE_LOCK(fp); 655 if ((tmp = *(int *)data)) 656 fp->f_flag |= FNONBLOCK; 657 else 658 fp->f_flag &= ~FNONBLOCK; 659 FILE_UNLOCK(fp); 660 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 661 break; 662 663 case FIOASYNC: 664 FILE_LOCK(fp); 665 if ((tmp = *(int *)data)) 666 fp->f_flag |= FASYNC; 667 else 668 fp->f_flag &= ~FASYNC; 669 FILE_UNLOCK(fp); 670 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 671 break; 672 673 default: 674 error = fo_ioctl(fp, com, data, td->td_ucred, td); 675 /* 676 * Copy any data to user, size was 677 * already set and checked above. 678 */ 679 if (error == 0 && (com&IOC_OUT) && size) 680 error = copyout(data, uap->data, (u_int)size); 681 break; 682 } 683 if (memp) 684 free(memp, M_IOCTLOPS); 685 fdrop(fp, td); 686done: 687 mtx_unlock(&Giant); 688 return (error); 689} 690 691/* 692 * sellock and selwait are initialized in selectinit() via SYSINIT. 693 */ 694struct mtx sellock; 695struct cv selwait; 696u_int nselcoll; /* Select collisions since boot */ 697SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 698 699/* 700 * Select system call. 701 */ 702#ifndef _SYS_SYSPROTO_H_ 703struct select_args { 704 int nd; 705 fd_set *in, *ou, *ex; 706 struct timeval *tv; 707}; 708#endif 709/* 710 * MPSAFE 711 */ 712int 713select(td, uap) 714 register struct thread *td; 715 register struct select_args *uap; 716{ 717 struct timeval tv, *tvp; 718 int error; 719 720 if (uap->tv != NULL) { 721 error = copyin(uap->tv, &tv, sizeof(tv)); 722 if (error) 723 return (error); 724 tvp = &tv; 725 } else 726 tvp = NULL; 727 728 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp)); 729} 730 731int 732kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, 733 fd_set *fd_ex, struct timeval *tvp) 734{ 735 struct filedesc *fdp; 736 /* 737 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 738 * infds with the new FD_SETSIZE of 1024, and more than enough for 739 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 740 * of 256. 741 */ 742 fd_mask s_selbits[howmany(2048, NFDBITS)]; 743 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 744 struct timeval atv, rtv, ttv; 745 int error, timo; 746 u_int ncoll, nbufbytes, ncpbytes, nfdbits; 747 748 if (nd < 0) 749 return (EINVAL); 750 fdp = td->td_proc->p_fd; 751 mtx_lock(&Giant); 752 FILEDESC_LOCK(fdp); 753 754 if (nd > td->td_proc->p_fd->fd_nfiles) 755 nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 756 FILEDESC_UNLOCK(fdp); 757 758 /* 759 * Allocate just enough bits for the non-null fd_sets. Use the 760 * preallocated auto buffer if possible. 761 */ 762 nfdbits = roundup(nd, NFDBITS); 763 ncpbytes = nfdbits / NBBY; 764 nbufbytes = 0; 765 if (fd_in != NULL) 766 nbufbytes += 2 * ncpbytes; 767 if (fd_ou != NULL) 768 nbufbytes += 2 * ncpbytes; 769 if (fd_ex != NULL) 770 nbufbytes += 2 * ncpbytes; 771 if (nbufbytes <= sizeof s_selbits) 772 selbits = &s_selbits[0]; 773 else 774 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 775 776 /* 777 * Assign pointers into the bit buffers and fetch the input bits. 778 * Put the output buffers together so that they can be bzeroed 779 * together. 780 */ 781 sbp = selbits; 782#define getbits(name, x) \ 783 do { \ 784 if (name == NULL) \ 785 ibits[x] = NULL; \ 786 else { \ 787 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 788 obits[x] = sbp; \ 789 sbp += ncpbytes / sizeof *sbp; \ 790 error = copyin(name, ibits[x], ncpbytes); \ 791 if (error != 0) \ 792 goto done_nosellock; \ 793 } \ 794 } while (0) 795 getbits(fd_in, 0); 796 getbits(fd_ou, 1); 797 getbits(fd_ex, 2); 798#undef getbits 799 if (nbufbytes != 0) 800 bzero(selbits, nbufbytes / 2); 801 802 if (tvp != NULL) { 803 atv = *tvp; 804 if (itimerfix(&atv)) { 805 error = EINVAL; 806 goto done_nosellock; 807 } 808 getmicrouptime(&rtv); 809 timevaladd(&atv, &rtv); 810 } else { 811 atv.tv_sec = 0; 812 atv.tv_usec = 0; 813 } 814 timo = 0; 815 TAILQ_INIT(&td->td_selq); 816 mtx_lock(&sellock); 817retry: 818 ncoll = nselcoll; 819 mtx_lock_spin(&sched_lock); 820 td->td_flags |= TDF_SELECT; 821 mtx_unlock_spin(&sched_lock); 822 mtx_unlock(&sellock); 823 824 error = selscan(td, ibits, obits, nd); 825 mtx_lock(&sellock); 826 if (error || td->td_retval[0]) 827 goto done; 828 if (atv.tv_sec || atv.tv_usec) { 829 getmicrouptime(&rtv); 830 if (timevalcmp(&rtv, &atv, >=)) 831 goto done; 832 ttv = atv; 833 timevalsub(&ttv, &rtv); 834 timo = ttv.tv_sec > 24 * 60 * 60 ? 835 24 * 60 * 60 * hz : tvtohz(&ttv); 836 } 837 838 /* 839 * An event of interest may occur while we do not hold 840 * sellock, so check TDF_SELECT and the number of 841 * collisions and rescan the file descriptors if 842 * necessary. 843 */ 844 mtx_lock_spin(&sched_lock); 845 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 846 mtx_unlock_spin(&sched_lock); 847 goto retry; 848 } 849 mtx_unlock_spin(&sched_lock); 850 851 if (timo > 0) 852 error = cv_timedwait_sig(&selwait, &sellock, timo); 853 else 854 error = cv_wait_sig(&selwait, &sellock); 855 856 if (error == 0) 857 goto retry; 858 859done: 860 clear_selinfo_list(td); 861 mtx_lock_spin(&sched_lock); 862 td->td_flags &= ~TDF_SELECT; 863 mtx_unlock_spin(&sched_lock); 864 mtx_unlock(&sellock); 865 866done_nosellock: 867 /* select is not restarted after signals... */ 868 if (error == ERESTART) 869 error = EINTR; 870 if (error == EWOULDBLOCK) 871 error = 0; 872#define putbits(name, x) \ 873 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ 874 error = error2; 875 if (error == 0) { 876 int error2; 877 878 putbits(fd_in, 0); 879 putbits(fd_ou, 1); 880 putbits(fd_ex, 2); 881#undef putbits 882 } 883 if (selbits != &s_selbits[0]) 884 free(selbits, M_SELECT); 885 886 mtx_unlock(&Giant); 887 return (error); 888} 889 890static int 891selscan(td, ibits, obits, nfd) 892 struct thread *td; 893 fd_mask **ibits, **obits; 894 int nfd; 895{ 896 int msk, i, fd; 897 fd_mask bits; 898 struct file *fp; 899 int n = 0; 900 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ 901 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 902 struct filedesc *fdp = td->td_proc->p_fd; 903 904 FILEDESC_LOCK(fdp); 905 for (msk = 0; msk < 3; msk++) { 906 if (ibits[msk] == NULL) 907 continue; 908 for (i = 0; i < nfd; i += NFDBITS) { 909 bits = ibits[msk][i/NFDBITS]; 910 /* ffs(int mask) not portable, fd_mask is long */ 911 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 912 if (!(bits & 1)) 913 continue; 914 if ((fp = fget_locked(fdp, fd)) == NULL) { 915 FILEDESC_UNLOCK(fdp); 916 return (EBADF); 917 } 918 if (fo_poll(fp, flag[msk], td->td_ucred, 919 td)) { 920 obits[msk][(fd)/NFDBITS] |= 921 ((fd_mask)1 << ((fd) % NFDBITS)); 922 n++; 923 } 924 } 925 } 926 } 927 FILEDESC_UNLOCK(fdp); 928 td->td_retval[0] = n; 929 return (0); 930} 931 932/* 933 * Poll system call. 934 */ 935#ifndef _SYS_SYSPROTO_H_ 936struct poll_args { 937 struct pollfd *fds; 938 u_int nfds; 939 int timeout; 940}; 941#endif 942/* 943 * MPSAFE 944 */ 945int 946poll(td, uap) 947 struct thread *td; 948 struct poll_args *uap; 949{ 950 caddr_t bits; 951 char smallbits[32 * sizeof(struct pollfd)]; 952 struct timeval atv, rtv, ttv; 953 int error = 0, timo; 954 u_int ncoll, nfds; 955 size_t ni; 956 957 nfds = SCARG(uap, nfds); 958 959 mtx_lock(&Giant); 960 /* 961 * This is kinda bogus. We have fd limits, but that is not 962 * really related to the size of the pollfd array. Make sure 963 * we let the process use at least FD_SETSIZE entries and at 964 * least enough for the current limits. We want to be reasonably 965 * safe, but not overly restrictive. 966 */ 967 if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) && 968 (nfds > FD_SETSIZE)) { 969 error = EINVAL; 970 goto done2; 971 } 972 ni = nfds * sizeof(struct pollfd); 973 if (ni > sizeof(smallbits)) 974 bits = malloc(ni, M_TEMP, M_WAITOK); 975 else 976 bits = smallbits; 977 error = copyin(SCARG(uap, fds), bits, ni); 978 if (error) 979 goto done_nosellock; 980 if (SCARG(uap, timeout) != INFTIM) { 981 atv.tv_sec = SCARG(uap, timeout) / 1000; 982 atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000; 983 if (itimerfix(&atv)) { 984 error = EINVAL; 985 goto done_nosellock; 986 } 987 getmicrouptime(&rtv); 988 timevaladd(&atv, &rtv); 989 } else { 990 atv.tv_sec = 0; 991 atv.tv_usec = 0; 992 } 993 timo = 0; 994 TAILQ_INIT(&td->td_selq); 995 mtx_lock(&sellock); 996retry: 997 ncoll = nselcoll; 998 mtx_lock_spin(&sched_lock); 999 td->td_flags |= TDF_SELECT; 1000 mtx_unlock_spin(&sched_lock); 1001 mtx_unlock(&sellock); 1002 1003 error = pollscan(td, (struct pollfd *)bits, nfds); 1004 mtx_lock(&sellock); 1005 if (error || td->td_retval[0]) 1006 goto done; 1007 if (atv.tv_sec || atv.tv_usec) { 1008 getmicrouptime(&rtv); 1009 if (timevalcmp(&rtv, &atv, >=)) 1010 goto done; 1011 ttv = atv; 1012 timevalsub(&ttv, &rtv); 1013 timo = ttv.tv_sec > 24 * 60 * 60 ? 1014 24 * 60 * 60 * hz : tvtohz(&ttv); 1015 } 1016 /* 1017 * An event of interest may occur while we do not hold 1018 * sellock, so check TDF_SELECT and the number of collisions 1019 * and rescan the file descriptors if necessary. 1020 */ 1021 mtx_lock_spin(&sched_lock); 1022 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 1023 mtx_unlock_spin(&sched_lock); 1024 goto retry; 1025 } 1026 mtx_unlock_spin(&sched_lock); 1027 1028 if (timo > 0) 1029 error = cv_timedwait_sig(&selwait, &sellock, timo); 1030 else 1031 error = cv_wait_sig(&selwait, &sellock); 1032 1033 if (error == 0) 1034 goto retry; 1035 1036done: 1037 clear_selinfo_list(td); 1038 mtx_lock_spin(&sched_lock); 1039 td->td_flags &= ~TDF_SELECT; 1040 mtx_unlock_spin(&sched_lock); 1041 mtx_unlock(&sellock); 1042 1043done_nosellock: 1044 /* poll is not restarted after signals... */ 1045 if (error == ERESTART) 1046 error = EINTR; 1047 if (error == EWOULDBLOCK) 1048 error = 0; 1049 if (error == 0) { 1050 error = copyout(bits, SCARG(uap, fds), ni); 1051 if (error) 1052 goto out; 1053 } 1054out: 1055 if (ni > sizeof(smallbits)) 1056 free(bits, M_TEMP); 1057done2: 1058 mtx_unlock(&Giant); 1059 return (error); 1060} 1061 1062static int 1063pollscan(td, fds, nfd) 1064 struct thread *td; 1065 struct pollfd *fds; 1066 u_int nfd; 1067{ 1068 register struct filedesc *fdp = td->td_proc->p_fd; 1069 int i; 1070 struct file *fp; 1071 int n = 0; 1072 1073 FILEDESC_LOCK(fdp); 1074 for (i = 0; i < nfd; i++, fds++) { 1075 if (fds->fd >= fdp->fd_nfiles) { 1076 fds->revents = POLLNVAL; 1077 n++; 1078 } else if (fds->fd < 0) { 1079 fds->revents = 0; 1080 } else { 1081 fp = fdp->fd_ofiles[fds->fd]; 1082 if (fp == NULL) { 1083 fds->revents = POLLNVAL; 1084 n++; 1085 } else { 1086 /* 1087 * Note: backend also returns POLLHUP and 1088 * POLLERR if appropriate. 1089 */ 1090 fds->revents = fo_poll(fp, fds->events, 1091 td->td_ucred, td); 1092 if (fds->revents != 0) 1093 n++; 1094 } 1095 } 1096 } 1097 FILEDESC_UNLOCK(fdp); 1098 td->td_retval[0] = n; 1099 return (0); 1100} 1101 1102/* 1103 * OpenBSD poll system call. 1104 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1105 */ 1106#ifndef _SYS_SYSPROTO_H_ 1107struct openbsd_poll_args { 1108 struct pollfd *fds; 1109 u_int nfds; 1110 int timeout; 1111}; 1112#endif 1113/* 1114 * MPSAFE 1115 */ 1116int 1117openbsd_poll(td, uap) 1118 register struct thread *td; 1119 register struct openbsd_poll_args *uap; 1120{ 1121 return (poll(td, (struct poll_args *)uap)); 1122} 1123 1124/* 1125 * Remove the references to the thread from all of the objects 1126 * we were polling. 1127 * 1128 * This code assumes that the underlying owner of the selinfo 1129 * structure will hold sellock before it changes it, and that 1130 * it will unlink itself from our list if it goes away. 1131 */ 1132void 1133clear_selinfo_list(td) 1134 struct thread *td; 1135{ 1136 struct selinfo *si; 1137 1138 mtx_assert(&sellock, MA_OWNED); 1139 TAILQ_FOREACH(si, &td->td_selq, si_thrlist) 1140 si->si_thread = NULL; 1141 TAILQ_INIT(&td->td_selq); 1142} 1143 1144/*ARGSUSED*/ 1145int 1146seltrue(dev, events, td) 1147 dev_t dev; 1148 int events; 1149 struct thread *td; 1150{ 1151 1152 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1153} 1154 1155/* 1156 * Record a select request. 1157 */ 1158void 1159selrecord(selector, sip) 1160 struct thread *selector; 1161 struct selinfo *sip; 1162{ 1163 1164 mtx_lock(&sellock); 1165 /* 1166 * If the selinfo's thread pointer is NULL then take ownership of it. 1167 * 1168 * If the thread pointer is not NULL and it points to another 1169 * thread, then we have a collision. 1170 * 1171 * If the thread pointer is not NULL and points back to us then leave 1172 * it alone as we've already added pointed it at us and added it to 1173 * our list. 1174 */ 1175 if (sip->si_thread == NULL) { 1176 sip->si_thread = selector; 1177 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); 1178 } else if (sip->si_thread != selector) { 1179 sip->si_flags |= SI_COLL; 1180 } 1181 1182 mtx_unlock(&sellock); 1183} 1184 1185/* 1186 * Do a wakeup when a selectable event occurs. 1187 */ 1188void 1189selwakeup(sip) 1190 struct selinfo *sip; 1191{ 1192 struct thread *td; 1193 1194 mtx_lock(&sellock); 1195 td = sip->si_thread; 1196 if ((sip->si_flags & SI_COLL) != 0) { 1197 nselcoll++; 1198 sip->si_flags &= ~SI_COLL; 1199 cv_broadcast(&selwait); 1200 } 1201 if (td == NULL) { 1202 mtx_unlock(&sellock); 1203 return; 1204 } 1205 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); 1206 sip->si_thread = NULL; 1207 mtx_lock_spin(&sched_lock); 1208 if (td->td_wchan == &selwait) { 1209 cv_waitq_remove(td); 1210 TD_CLR_SLEEPING(td); 1211 setrunnable(td); 1212 } else 1213 td->td_flags &= ~TDF_SELECT; 1214 mtx_unlock_spin(&sched_lock); 1215 mtx_unlock(&sellock); 1216} 1217 1218static void selectinit(void *); 1219SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) 1220 1221/* ARGSUSED*/ 1222static void 1223selectinit(dummy) 1224 void *dummy; 1225{ 1226 cv_init(&selwait, "select"); 1227 mtx_init(&sellock, "sellck", NULL, MTX_DEF); 1228} 1229