/* sys_generic.c — FreeBSD revision 137806 */
1/* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37#include <sys/cdefs.h> 38__FBSDID("$FreeBSD: head/sys/kern/sys_generic.c 137806 2004-11-17 09:09:55Z phk $"); 39 40#include "opt_ktrace.h" 41 42#include <sys/param.h> 43#include <sys/systm.h> 44#include <sys/sysproto.h> 45#include <sys/filedesc.h> 46#include <sys/filio.h> 47#include <sys/fcntl.h> 48#include <sys/file.h> 49#include <sys/proc.h> 50#include <sys/signalvar.h> 51#include <sys/socketvar.h> 52#include <sys/uio.h> 53#include <sys/kernel.h> 54#include <sys/limits.h> 55#include <sys/malloc.h> 56#include <sys/poll.h> 57#include <sys/resourcevar.h> 58#include <sys/selinfo.h> 59#include <sys/sleepqueue.h> 60#include <sys/syscallsubr.h> 61#include <sys/sysctl.h> 62#include <sys/sysent.h> 63#include <sys/vnode.h> 64#include <sys/bio.h> 65#include <sys/buf.h> 66#include <sys/condvar.h> 67#ifdef KTRACE 68#include <sys/ktrace.h> 69#endif 70#include <vm/vm.h> 71#include <vm/vm_page.h> 72 73static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77static int pollscan(struct thread *, struct pollfd *, u_int); 78static int selscan(struct thread *, fd_mask **, fd_mask **, int); 79static int dofileread(struct thread *, struct file *, int, void *, 80 size_t, off_t, int); 81static int dofilewrite(struct thread *, struct file *, int, 82 const void *, size_t, off_t, int); 83static void doselwakeup(struct selinfo *, int); 84 85/* 86 * Read system call. 
87 */ 88#ifndef _SYS_SYSPROTO_H_ 89struct read_args { 90 int fd; 91 void *buf; 92 size_t nbyte; 93}; 94#endif 95/* 96 * MPSAFE 97 */ 98int 99read(td, uap) 100 struct thread *td; 101 struct read_args *uap; 102{ 103 struct file *fp; 104 int error; 105 106 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 107 error = dofileread(td, fp, uap->fd, uap->buf, 108 uap->nbyte, (off_t)-1, 0); 109 fdrop(fp, td); 110 } 111 return(error); 112} 113 114/* 115 * Pread system call 116 */ 117#ifndef _SYS_SYSPROTO_H_ 118struct pread_args { 119 int fd; 120 void *buf; 121 size_t nbyte; 122 int pad; 123 off_t offset; 124}; 125#endif 126/* 127 * MPSAFE 128 */ 129int 130pread(td, uap) 131 struct thread *td; 132 struct pread_args *uap; 133{ 134 struct file *fp; 135 int error; 136 137 if ((error = fget_read(td, uap->fd, &fp)) != 0) 138 return (error); 139 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 140 error = ESPIPE; 141 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 142 error = EINVAL; 143 else { 144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 145 uap->offset, FOF_OFFSET); 146 } 147 fdrop(fp, td); 148 return(error); 149} 150 151/* 152 * Code common for read and pread 153 */ 154static int 155dofileread(td, fp, fd, buf, nbyte, offset, flags) 156 struct thread *td; 157 struct file *fp; 158 int fd, flags; 159 void *buf; 160 size_t nbyte; 161 off_t offset; 162{ 163 struct uio auio; 164 struct iovec aiov; 165 long cnt, error = 0; 166#ifdef KTRACE 167 struct uio *ktruio = NULL; 168#endif 169 170 aiov.iov_base = buf; 171 aiov.iov_len = nbyte; 172 auio.uio_iov = &aiov; 173 auio.uio_iovcnt = 1; 174 auio.uio_offset = offset; 175 if (nbyte > INT_MAX) 176 return (EINVAL); 177 auio.uio_resid = nbyte; 178 auio.uio_rw = UIO_READ; 179 auio.uio_segflg = UIO_USERSPACE; 180 auio.uio_td = td; 181#ifdef KTRACE 182 if (KTRPOINT(td, KTR_GENIO)) 183 ktruio = cloneuio(&auio); 184#endif 185 cnt = nbyte; 186 187 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) { 188 if 
(auio.uio_resid != cnt && (error == ERESTART || 189 error == EINTR || error == EWOULDBLOCK)) 190 error = 0; 191 } 192 cnt -= auio.uio_resid; 193#ifdef KTRACE 194 if (ktruio != NULL) { 195 ktruio->uio_resid = cnt; 196 ktrgenio(fd, UIO_READ, ktruio, error); 197 } 198#endif 199 td->td_retval[0] = cnt; 200 return (error); 201} 202 203/* 204 * Scatter read system call. 205 */ 206#ifndef _SYS_SYSPROTO_H_ 207struct readv_args { 208 int fd; 209 struct iovec *iovp; 210 u_int iovcnt; 211}; 212#endif 213/* 214 * MPSAFE 215 */ 216int 217readv(struct thread *td, struct readv_args *uap) 218{ 219 struct file *fp; 220 struct uio *auio = NULL; 221 long cnt; 222 int error; 223#ifdef KTRACE 224 struct uio *ktruio = NULL; 225#endif 226 227 error = fget_read(td, uap->fd, &fp); 228 if (error) 229 return (error); 230 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 231 if (error) { 232 fdrop(fp, td); 233 return (error); 234 } 235 auio->uio_rw = UIO_READ; 236 auio->uio_td = td; 237#ifdef KTRACE 238 if (KTRPOINT(td, KTR_GENIO)) 239 ktruio = cloneuio(auio); 240#endif 241 cnt = auio->uio_resid; 242 if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) { 243 if (auio->uio_resid != cnt && (error == ERESTART || 244 error == EINTR || error == EWOULDBLOCK)) 245 error = 0; 246 } 247 cnt -= auio->uio_resid; 248#ifdef KTRACE 249 if (ktruio != NULL) { 250 ktruio->uio_resid = cnt; 251 ktrgenio(uap->fd, UIO_READ, ktruio, error); 252 } 253#endif 254 td->td_retval[0] = cnt; 255 free(auio, M_IOV); 256 fdrop(fp, td); 257 return (error); 258} 259 260/* 261 * Write system call 262 */ 263#ifndef _SYS_SYSPROTO_H_ 264struct write_args { 265 int fd; 266 const void *buf; 267 size_t nbyte; 268}; 269#endif 270/* 271 * MPSAFE 272 */ 273int 274write(td, uap) 275 struct thread *td; 276 struct write_args *uap; 277{ 278 struct file *fp; 279 int error; 280 281 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 282 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 283 (off_t)-1, 0); 284 fdrop(fp, td); 285 } else 
{ 286 error = EBADF; /* XXX this can't be right */ 287 } 288 return(error); 289} 290 291/* 292 * Pwrite system call 293 */ 294#ifndef _SYS_SYSPROTO_H_ 295struct pwrite_args { 296 int fd; 297 const void *buf; 298 size_t nbyte; 299 int pad; 300 off_t offset; 301}; 302#endif 303/* 304 * MPSAFE 305 */ 306int 307pwrite(td, uap) 308 struct thread *td; 309 struct pwrite_args *uap; 310{ 311 struct file *fp; 312 int error; 313 314 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 315 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 316 error = ESPIPE; 317 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 318 error = EINVAL; 319 else { 320 error = dofilewrite(td, fp, uap->fd, uap->buf, 321 uap->nbyte, uap->offset, FOF_OFFSET); 322 } 323 fdrop(fp, td); 324 } else { 325 error = EBADF; /* this can't be right */ 326 } 327 return(error); 328} 329 330static int 331dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 332 struct thread *td; 333 struct file *fp; 334 int fd, flags; 335 const void *buf; 336 size_t nbyte; 337 off_t offset; 338{ 339 struct uio auio; 340 struct iovec aiov; 341 long cnt, error = 0; 342#ifdef KTRACE 343 struct uio *ktruio = NULL; 344#endif 345 346 aiov.iov_base = (void *)(uintptr_t)buf; 347 aiov.iov_len = nbyte; 348 auio.uio_iov = &aiov; 349 auio.uio_iovcnt = 1; 350 auio.uio_offset = offset; 351 if (nbyte > INT_MAX) 352 return (EINVAL); 353 auio.uio_resid = nbyte; 354 auio.uio_rw = UIO_WRITE; 355 auio.uio_segflg = UIO_USERSPACE; 356 auio.uio_td = td; 357#ifdef KTRACE 358 if (KTRPOINT(td, KTR_GENIO)) 359 ktruio = cloneuio(&auio); 360#endif 361 cnt = nbyte; 362 if (fp->f_type == DTYPE_VNODE) 363 bwillwrite(); 364 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 365 if (auio.uio_resid != cnt && (error == ERESTART || 366 error == EINTR || error == EWOULDBLOCK)) 367 error = 0; 368 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 369 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 370 PROC_LOCK(td->td_proc); 371 psignal(td->td_proc, SIGPIPE); 372 PROC_UNLOCK(td->td_proc); 373 } 374 } 375 cnt -= auio.uio_resid; 376#ifdef KTRACE 377 if (ktruio != NULL) { 378 ktruio->uio_resid = cnt; 379 ktrgenio(fd, UIO_WRITE, ktruio, error); 380 } 381#endif 382 td->td_retval[0] = cnt; 383 return (error); 384} 385 386/* 387 * Gather write system call 388 */ 389#ifndef _SYS_SYSPROTO_H_ 390struct writev_args { 391 int fd; 392 struct iovec *iovp; 393 u_int iovcnt; 394}; 395#endif 396/* 397 * MPSAFE 398 */ 399int 400writev(struct thread *td, struct writev_args *uap) 401{ 402 struct file *fp; 403 struct uio *auio = NULL; 404 long cnt; 405 int error; 406#ifdef KTRACE 407 struct uio *ktruio = NULL; 408#endif 409 410 error = fget_write(td, uap->fd, &fp); 411 if (error) 412 return (EBADF); 413 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 414 if (error) { 415 fdrop(fp, td); 416 return (error); 417 } 418 auio->uio_rw = UIO_WRITE; 419 auio->uio_td = td; 420#ifdef KTRACE 421 if (KTRPOINT(td, KTR_GENIO)) 422 ktruio = cloneuio(auio); 423#endif 424 cnt = auio->uio_resid; 425 if (fp->f_type == DTYPE_VNODE) 426 bwillwrite(); 427 if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) { 428 if (auio->uio_resid != cnt && (error == ERESTART || 429 error == EINTR || error == EWOULDBLOCK)) 430 error = 0; 431 if (error == EPIPE) { 432 PROC_LOCK(td->td_proc); 433 psignal(td->td_proc, SIGPIPE); 434 PROC_UNLOCK(td->td_proc); 435 } 436 } 437 cnt -= auio->uio_resid; 438#ifdef KTRACE 439 if (ktruio != NULL) { 440 ktruio->uio_resid = cnt; 441 ktrgenio(uap->fd, UIO_WRITE, ktruio, error); 442 } 443#endif 444 td->td_retval[0] = cnt; 445 fdrop(fp, td); 446 free(auio, M_IOV); 447 return (error); 448} 449 450/* 451 * Ioctl system call 452 */ 453#ifndef _SYS_SYSPROTO_H_ 454struct ioctl_args { 455 int fd; 456 u_long com; 457 caddr_t data; 458}; 459#endif 460/* 461 * MPSAFE 462 */ 463/* ARGSUSED */ 464int 465ioctl(struct thread *td, 
struct ioctl_args *uap) 466{ 467 struct file *fp; 468 struct filedesc *fdp; 469 u_long com; 470 int error = 0; 471 u_int size; 472 caddr_t data, memp; 473 int tmp; 474 475 if ((error = fget(td, uap->fd, &fp)) != 0) 476 return (error); 477 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 478 fdrop(fp, td); 479 return (EBADF); 480 } 481 fdp = td->td_proc->p_fd; 482 switch (com = uap->com) { 483 case FIONCLEX: 484 FILEDESC_LOCK_FAST(fdp); 485 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 486 FILEDESC_UNLOCK_FAST(fdp); 487 fdrop(fp, td); 488 return (0); 489 case FIOCLEX: 490 FILEDESC_LOCK_FAST(fdp); 491 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 492 FILEDESC_UNLOCK_FAST(fdp); 493 fdrop(fp, td); 494 return (0); 495 } 496 497 /* 498 * Interpret high order word to find amount of data to be 499 * copied to/from the user's address space. 500 */ 501 size = IOCPARM_LEN(com); 502 if ((size > IOCPARM_MAX) || 503 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 504 ((com & IOC_VOID) && size > 0) || 505 ((com & (IOC_IN | IOC_OUT)) && size == 0)) { 506 fdrop(fp, td); 507 return (ENOTTY); 508 } 509 510 if (size > 0) { 511 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 512 data = memp; 513 } else { 514 memp = NULL; 515 data = (void *)&uap->data; 516 } 517 if (com & IOC_IN) { 518 error = copyin(uap->data, data, (u_int)size); 519 if (error) { 520 free(memp, M_IOCTLOPS); 521 fdrop(fp, td); 522 return (error); 523 } 524 } else if (com & IOC_OUT) { 525 /* 526 * Zero the buffer so the user always 527 * gets back something deterministic. 
528 */ 529 bzero(data, size); 530 } 531 532 if (com == FIONBIO) { 533 FILE_LOCK(fp); 534 if ((tmp = *(int *)data)) 535 fp->f_flag |= FNONBLOCK; 536 else 537 fp->f_flag &= ~FNONBLOCK; 538 FILE_UNLOCK(fp); 539 data = (void *)&tmp; 540 } else if (com == FIOASYNC) { 541 FILE_LOCK(fp); 542 if ((tmp = *(int *)data)) 543 fp->f_flag |= FASYNC; 544 else 545 fp->f_flag &= ~FASYNC; 546 FILE_UNLOCK(fp); 547 data = (void *)&tmp; 548 } 549 550 error = fo_ioctl(fp, com, data, td->td_ucred, td); 551 552 if (error == 0 && (com & IOC_OUT)) 553 error = copyout(data, uap->data, (u_int)size); 554 555 if (memp != NULL) 556 free(memp, M_IOCTLOPS); 557 fdrop(fp, td); 558 return (error); 559} 560 561/* 562 * sellock and selwait are initialized in selectinit() via SYSINIT. 563 */ 564struct mtx sellock; 565struct cv selwait; 566u_int nselcoll; /* Select collisions since boot */ 567SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 568 569/* 570 * Select system call. 571 */ 572#ifndef _SYS_SYSPROTO_H_ 573struct select_args { 574 int nd; 575 fd_set *in, *ou, *ex; 576 struct timeval *tv; 577}; 578#endif 579/* 580 * MPSAFE 581 */ 582int 583select(td, uap) 584 register struct thread *td; 585 register struct select_args *uap; 586{ 587 struct timeval tv, *tvp; 588 int error; 589 590 if (uap->tv != NULL) { 591 error = copyin(uap->tv, &tv, sizeof(tv)); 592 if (error) 593 return (error); 594 tvp = &tv; 595 } else 596 tvp = NULL; 597 598 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp)); 599} 600 601int 602kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, 603 fd_set *fd_ex, struct timeval *tvp) 604{ 605 struct filedesc *fdp; 606 /* 607 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 608 * infds with the new FD_SETSIZE of 1024, and more than enough for 609 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 610 * of 256. 
611 */ 612 fd_mask s_selbits[howmany(2048, NFDBITS)]; 613 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 614 struct timeval atv, rtv, ttv; 615 int error, timo; 616 u_int ncoll, nbufbytes, ncpbytes, nfdbits; 617 618 if (nd < 0) 619 return (EINVAL); 620 fdp = td->td_proc->p_fd; 621 622 FILEDESC_LOCK_FAST(fdp); 623 624 if (nd > td->td_proc->p_fd->fd_nfiles) 625 nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 626 FILEDESC_UNLOCK_FAST(fdp); 627 628 /* 629 * Allocate just enough bits for the non-null fd_sets. Use the 630 * preallocated auto buffer if possible. 631 */ 632 nfdbits = roundup(nd, NFDBITS); 633 ncpbytes = nfdbits / NBBY; 634 nbufbytes = 0; 635 if (fd_in != NULL) 636 nbufbytes += 2 * ncpbytes; 637 if (fd_ou != NULL) 638 nbufbytes += 2 * ncpbytes; 639 if (fd_ex != NULL) 640 nbufbytes += 2 * ncpbytes; 641 if (nbufbytes <= sizeof s_selbits) 642 selbits = &s_selbits[0]; 643 else 644 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 645 646 /* 647 * Assign pointers into the bit buffers and fetch the input bits. 648 * Put the output buffers together so that they can be bzeroed 649 * together. 
650 */ 651 sbp = selbits; 652#define getbits(name, x) \ 653 do { \ 654 if (name == NULL) \ 655 ibits[x] = NULL; \ 656 else { \ 657 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 658 obits[x] = sbp; \ 659 sbp += ncpbytes / sizeof *sbp; \ 660 error = copyin(name, ibits[x], ncpbytes); \ 661 if (error != 0) \ 662 goto done_nosellock; \ 663 } \ 664 } while (0) 665 getbits(fd_in, 0); 666 getbits(fd_ou, 1); 667 getbits(fd_ex, 2); 668#undef getbits 669 if (nbufbytes != 0) 670 bzero(selbits, nbufbytes / 2); 671 672 if (tvp != NULL) { 673 atv = *tvp; 674 if (itimerfix(&atv)) { 675 error = EINVAL; 676 goto done_nosellock; 677 } 678 getmicrouptime(&rtv); 679 timevaladd(&atv, &rtv); 680 } else { 681 atv.tv_sec = 0; 682 atv.tv_usec = 0; 683 } 684 timo = 0; 685 TAILQ_INIT(&td->td_selq); 686 mtx_lock(&sellock); 687retry: 688 ncoll = nselcoll; 689 mtx_lock_spin(&sched_lock); 690 td->td_flags |= TDF_SELECT; 691 mtx_unlock_spin(&sched_lock); 692 mtx_unlock(&sellock); 693 694 error = selscan(td, ibits, obits, nd); 695 mtx_lock(&sellock); 696 if (error || td->td_retval[0]) 697 goto done; 698 if (atv.tv_sec || atv.tv_usec) { 699 getmicrouptime(&rtv); 700 if (timevalcmp(&rtv, &atv, >=)) 701 goto done; 702 ttv = atv; 703 timevalsub(&ttv, &rtv); 704 timo = ttv.tv_sec > 24 * 60 * 60 ? 705 24 * 60 * 60 * hz : tvtohz(&ttv); 706 } 707 708 /* 709 * An event of interest may occur while we do not hold 710 * sellock, so check TDF_SELECT and the number of 711 * collisions and rescan the file descriptors if 712 * necessary. 
713 */ 714 mtx_lock_spin(&sched_lock); 715 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 716 mtx_unlock_spin(&sched_lock); 717 goto retry; 718 } 719 mtx_unlock_spin(&sched_lock); 720 721 if (timo > 0) 722 error = cv_timedwait_sig(&selwait, &sellock, timo); 723 else 724 error = cv_wait_sig(&selwait, &sellock); 725 726 if (error == 0) 727 goto retry; 728 729done: 730 clear_selinfo_list(td); 731 mtx_lock_spin(&sched_lock); 732 td->td_flags &= ~TDF_SELECT; 733 mtx_unlock_spin(&sched_lock); 734 mtx_unlock(&sellock); 735 736done_nosellock: 737 /* select is not restarted after signals... */ 738 if (error == ERESTART) 739 error = EINTR; 740 if (error == EWOULDBLOCK) 741 error = 0; 742#define putbits(name, x) \ 743 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ 744 error = error2; 745 if (error == 0) { 746 int error2; 747 748 putbits(fd_in, 0); 749 putbits(fd_ou, 1); 750 putbits(fd_ex, 2); 751#undef putbits 752 } 753 if (selbits != &s_selbits[0]) 754 free(selbits, M_SELECT); 755 756 return (error); 757} 758 759static int 760selscan(td, ibits, obits, nfd) 761 struct thread *td; 762 fd_mask **ibits, **obits; 763 int nfd; 764{ 765 int msk, i, fd; 766 fd_mask bits; 767 struct file *fp; 768 int n = 0; 769 /* Note: backend also returns POLLHUP/POLLERR if appropriate. 
*/ 770 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 771 struct filedesc *fdp = td->td_proc->p_fd; 772 773 FILEDESC_LOCK(fdp); 774 for (msk = 0; msk < 3; msk++) { 775 if (ibits[msk] == NULL) 776 continue; 777 for (i = 0; i < nfd; i += NFDBITS) { 778 bits = ibits[msk][i/NFDBITS]; 779 /* ffs(int mask) not portable, fd_mask is long */ 780 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 781 if (!(bits & 1)) 782 continue; 783 if ((fp = fget_locked(fdp, fd)) == NULL) { 784 FILEDESC_UNLOCK(fdp); 785 return (EBADF); 786 } 787 if (fo_poll(fp, flag[msk], td->td_ucred, 788 td)) { 789 obits[msk][(fd)/NFDBITS] |= 790 ((fd_mask)1 << ((fd) % NFDBITS)); 791 n++; 792 } 793 } 794 } 795 } 796 FILEDESC_UNLOCK(fdp); 797 td->td_retval[0] = n; 798 return (0); 799} 800 801/* 802 * Poll system call. 803 */ 804#ifndef _SYS_SYSPROTO_H_ 805struct poll_args { 806 struct pollfd *fds; 807 u_int nfds; 808 int timeout; 809}; 810#endif 811/* 812 * MPSAFE 813 */ 814int 815poll(td, uap) 816 struct thread *td; 817 struct poll_args *uap; 818{ 819 struct pollfd *bits; 820 struct pollfd smallbits[32]; 821 struct timeval atv, rtv, ttv; 822 int error = 0, timo; 823 u_int ncoll, nfds; 824 size_t ni; 825 826 nfds = uap->nfds; 827 828 /* 829 * This is kinda bogus. We have fd limits, but that is not 830 * really related to the size of the pollfd array. Make sure 831 * we let the process use at least FD_SETSIZE entries and at 832 * least enough for the current limits. We want to be reasonably 833 * safe, but not overly restrictive. 
834 */ 835 PROC_LOCK(td->td_proc); 836 if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) && 837 (nfds > FD_SETSIZE)) { 838 PROC_UNLOCK(td->td_proc); 839 error = EINVAL; 840 goto done2; 841 } 842 PROC_UNLOCK(td->td_proc); 843 ni = nfds * sizeof(struct pollfd); 844 if (ni > sizeof(smallbits)) 845 bits = malloc(ni, M_TEMP, M_WAITOK); 846 else 847 bits = smallbits; 848 error = copyin(uap->fds, bits, ni); 849 if (error) 850 goto done_nosellock; 851 if (uap->timeout != INFTIM) { 852 atv.tv_sec = uap->timeout / 1000; 853 atv.tv_usec = (uap->timeout % 1000) * 1000; 854 if (itimerfix(&atv)) { 855 error = EINVAL; 856 goto done_nosellock; 857 } 858 getmicrouptime(&rtv); 859 timevaladd(&atv, &rtv); 860 } else { 861 atv.tv_sec = 0; 862 atv.tv_usec = 0; 863 } 864 timo = 0; 865 TAILQ_INIT(&td->td_selq); 866 mtx_lock(&sellock); 867retry: 868 ncoll = nselcoll; 869 mtx_lock_spin(&sched_lock); 870 td->td_flags |= TDF_SELECT; 871 mtx_unlock_spin(&sched_lock); 872 mtx_unlock(&sellock); 873 874 error = pollscan(td, bits, nfds); 875 mtx_lock(&sellock); 876 if (error || td->td_retval[0]) 877 goto done; 878 if (atv.tv_sec || atv.tv_usec) { 879 getmicrouptime(&rtv); 880 if (timevalcmp(&rtv, &atv, >=)) 881 goto done; 882 ttv = atv; 883 timevalsub(&ttv, &rtv); 884 timo = ttv.tv_sec > 24 * 60 * 60 ? 885 24 * 60 * 60 * hz : tvtohz(&ttv); 886 } 887 /* 888 * An event of interest may occur while we do not hold 889 * sellock, so check TDF_SELECT and the number of collisions 890 * and rescan the file descriptors if necessary. 
891 */ 892 mtx_lock_spin(&sched_lock); 893 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 894 mtx_unlock_spin(&sched_lock); 895 goto retry; 896 } 897 mtx_unlock_spin(&sched_lock); 898 899 if (timo > 0) 900 error = cv_timedwait_sig(&selwait, &sellock, timo); 901 else 902 error = cv_wait_sig(&selwait, &sellock); 903 904 if (error == 0) 905 goto retry; 906 907done: 908 clear_selinfo_list(td); 909 mtx_lock_spin(&sched_lock); 910 td->td_flags &= ~TDF_SELECT; 911 mtx_unlock_spin(&sched_lock); 912 mtx_unlock(&sellock); 913 914done_nosellock: 915 /* poll is not restarted after signals... */ 916 if (error == ERESTART) 917 error = EINTR; 918 if (error == EWOULDBLOCK) 919 error = 0; 920 if (error == 0) { 921 error = copyout(bits, uap->fds, ni); 922 if (error) 923 goto out; 924 } 925out: 926 if (ni > sizeof(smallbits)) 927 free(bits, M_TEMP); 928done2: 929 return (error); 930} 931 932static int 933pollscan(td, fds, nfd) 934 struct thread *td; 935 struct pollfd *fds; 936 u_int nfd; 937{ 938 register struct filedesc *fdp = td->td_proc->p_fd; 939 int i; 940 struct file *fp; 941 int n = 0; 942 943 FILEDESC_LOCK(fdp); 944 for (i = 0; i < nfd; i++, fds++) { 945 if (fds->fd >= fdp->fd_nfiles) { 946 fds->revents = POLLNVAL; 947 n++; 948 } else if (fds->fd < 0) { 949 fds->revents = 0; 950 } else { 951 fp = fdp->fd_ofiles[fds->fd]; 952 if (fp == NULL) { 953 fds->revents = POLLNVAL; 954 n++; 955 } else { 956 /* 957 * Note: backend also returns POLLHUP and 958 * POLLERR if appropriate. 959 */ 960 fds->revents = fo_poll(fp, fds->events, 961 td->td_ucred, td); 962 if (fds->revents != 0) 963 n++; 964 } 965 } 966 } 967 FILEDESC_UNLOCK(fdp); 968 td->td_retval[0] = n; 969 return (0); 970} 971 972/* 973 * OpenBSD poll system call. 974 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 
975 */ 976#ifndef _SYS_SYSPROTO_H_ 977struct openbsd_poll_args { 978 struct pollfd *fds; 979 u_int nfds; 980 int timeout; 981}; 982#endif 983/* 984 * MPSAFE 985 */ 986int 987openbsd_poll(td, uap) 988 register struct thread *td; 989 register struct openbsd_poll_args *uap; 990{ 991 return (poll(td, (struct poll_args *)uap)); 992} 993 994/* 995 * Remove the references to the thread from all of the objects 996 * we were polling. 997 * 998 * This code assumes that the underlying owner of the selinfo 999 * structure will hold sellock before it changes it, and that 1000 * it will unlink itself from our list if it goes away. 1001 */ 1002void 1003clear_selinfo_list(td) 1004 struct thread *td; 1005{ 1006 struct selinfo *si; 1007 1008 mtx_assert(&sellock, MA_OWNED); 1009 TAILQ_FOREACH(si, &td->td_selq, si_thrlist) 1010 si->si_thread = NULL; 1011 TAILQ_INIT(&td->td_selq); 1012} 1013 1014/* 1015 * Record a select request. 1016 */ 1017void 1018selrecord(selector, sip) 1019 struct thread *selector; 1020 struct selinfo *sip; 1021{ 1022 1023 mtx_lock(&sellock); 1024 /* 1025 * If the selinfo's thread pointer is NULL then take ownership of it. 1026 * 1027 * If the thread pointer is not NULL and it points to another 1028 * thread, then we have a collision. 1029 * 1030 * If the thread pointer is not NULL and points back to us then leave 1031 * it alone as we've already added pointed it at us and added it to 1032 * our list. 1033 */ 1034 if (sip->si_thread == NULL) { 1035 sip->si_thread = selector; 1036 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); 1037 } else if (sip->si_thread != selector) { 1038 sip->si_flags |= SI_COLL; 1039 } 1040 1041 mtx_unlock(&sellock); 1042} 1043 1044/* Wake up a selecting thread. */ 1045void 1046selwakeup(sip) 1047 struct selinfo *sip; 1048{ 1049 doselwakeup(sip, -1); 1050} 1051 1052/* Wake up a selecting thread, and set its priority. 
*/ 1053void 1054selwakeuppri(sip, pri) 1055 struct selinfo *sip; 1056 int pri; 1057{ 1058 doselwakeup(sip, pri); 1059} 1060 1061/* 1062 * Do a wakeup when a selectable event occurs. 1063 */ 1064static void 1065doselwakeup(sip, pri) 1066 struct selinfo *sip; 1067 int pri; 1068{ 1069 struct thread *td; 1070 1071 mtx_lock(&sellock); 1072 td = sip->si_thread; 1073 if ((sip->si_flags & SI_COLL) != 0) { 1074 nselcoll++; 1075 sip->si_flags &= ~SI_COLL; 1076 cv_broadcastpri(&selwait, pri); 1077 } 1078 if (td == NULL) { 1079 mtx_unlock(&sellock); 1080 return; 1081 } 1082 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); 1083 sip->si_thread = NULL; 1084 mtx_lock_spin(&sched_lock); 1085 td->td_flags &= ~TDF_SELECT; 1086 mtx_unlock_spin(&sched_lock); 1087 sleepq_remove(td, &selwait); 1088 mtx_unlock(&sellock); 1089} 1090 1091static void selectinit(void *); 1092SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) 1093 1094/* ARGSUSED*/ 1095static void 1096selectinit(dummy) 1097 void *dummy; 1098{ 1099 cv_init(&selwait, "select"); 1100 mtx_init(&sellock, "sellck", NULL, MTX_DEF); 1101} 1102