/* sys_generic.c — FreeBSD sys/kern, revision 144445 */
1/*- 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37#include <sys/cdefs.h> 38__FBSDID("$FreeBSD: head/sys/kern/sys_generic.c 144445 2005-03-31 22:51:18Z jhb $"); 39 40#include "opt_ktrace.h" 41 42#include <sys/param.h> 43#include <sys/systm.h> 44#include <sys/sysproto.h> 45#include <sys/filedesc.h> 46#include <sys/filio.h> 47#include <sys/fcntl.h> 48#include <sys/file.h> 49#include <sys/proc.h> 50#include <sys/signalvar.h> 51#include <sys/socketvar.h> 52#include <sys/uio.h> 53#include <sys/kernel.h> 54#include <sys/limits.h> 55#include <sys/malloc.h> 56#include <sys/poll.h> 57#include <sys/resourcevar.h> 58#include <sys/selinfo.h> 59#include <sys/sleepqueue.h> 60#include <sys/syscallsubr.h> 61#include <sys/sysctl.h> 62#include <sys/sysent.h> 63#include <sys/vnode.h> 64#include <sys/bio.h> 65#include <sys/buf.h> 66#include <sys/condvar.h> 67#ifdef KTRACE 68#include <sys/ktrace.h> 69#endif 70#include <vm/vm.h> 71#include <vm/vm_page.h> 72 73static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77static int pollscan(struct thread *, struct pollfd *, u_int); 78static int selscan(struct thread *, fd_mask **, fd_mask **, int); 79static int dofileread(struct thread *, struct file *, int, void *, 80 size_t, off_t, int); 81static int dofilewrite(struct thread *, struct file *, int, 82 const void *, size_t, off_t, int); 83static void doselwakeup(struct selinfo *, int); 84 85/* 86 * Read system call. 
87 */ 88#ifndef _SYS_SYSPROTO_H_ 89struct read_args { 90 int fd; 91 void *buf; 92 size_t nbyte; 93}; 94#endif 95/* 96 * MPSAFE 97 */ 98int 99read(td, uap) 100 struct thread *td; 101 struct read_args *uap; 102{ 103 struct file *fp; 104 int error; 105 106 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 107 error = dofileread(td, fp, uap->fd, uap->buf, 108 uap->nbyte, (off_t)-1, 0); 109 fdrop(fp, td); 110 } 111 return(error); 112} 113 114/* 115 * Pread system call 116 */ 117#ifndef _SYS_SYSPROTO_H_ 118struct pread_args { 119 int fd; 120 void *buf; 121 size_t nbyte; 122 int pad; 123 off_t offset; 124}; 125#endif 126/* 127 * MPSAFE 128 */ 129int 130pread(td, uap) 131 struct thread *td; 132 struct pread_args *uap; 133{ 134 struct file *fp; 135 int error; 136 137 if ((error = fget_read(td, uap->fd, &fp)) != 0) 138 return (error); 139 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 140 error = ESPIPE; 141 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 142 error = EINVAL; 143 else { 144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 145 uap->offset, FOF_OFFSET); 146 } 147 fdrop(fp, td); 148 return(error); 149} 150 151/* 152 * Code common for read and pread 153 */ 154static int 155dofileread(td, fp, fd, buf, nbyte, offset, flags) 156 struct thread *td; 157 struct file *fp; 158 int fd, flags; 159 void *buf; 160 size_t nbyte; 161 off_t offset; 162{ 163 struct uio auio; 164 struct iovec aiov; 165 ssize_t cnt; 166 long error = 0; 167#ifdef KTRACE 168 struct uio *ktruio = NULL; 169#endif 170 171 /* Finish zero length reads right here */ 172 if (nbyte == 0) { 173 td->td_retval[0] = 0; 174 return(0); 175 } 176 aiov.iov_base = buf; 177 aiov.iov_len = nbyte; 178 auio.uio_iov = &aiov; 179 auio.uio_iovcnt = 1; 180 auio.uio_offset = offset; 181 if (nbyte > INT_MAX) 182 return (EINVAL); 183 auio.uio_resid = nbyte; 184 auio.uio_rw = UIO_READ; 185 auio.uio_segflg = UIO_USERSPACE; 186 auio.uio_td = td; 187#ifdef KTRACE 188 if (KTRPOINT(td, KTR_GENIO)) 189 ktruio = 
cloneuio(&auio); 190#endif 191 cnt = nbyte; 192 193 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) { 194 if (auio.uio_resid != cnt && (error == ERESTART || 195 error == EINTR || error == EWOULDBLOCK)) 196 error = 0; 197 } 198 cnt -= auio.uio_resid; 199#ifdef KTRACE 200 if (ktruio != NULL) { 201 ktruio->uio_resid = cnt; 202 ktrgenio(fd, UIO_READ, ktruio, error); 203 } 204#endif 205 td->td_retval[0] = cnt; 206 return (error); 207} 208 209/* 210 * Scatter read system call. 211 */ 212#ifndef _SYS_SYSPROTO_H_ 213struct readv_args { 214 int fd; 215 struct iovec *iovp; 216 u_int iovcnt; 217}; 218#endif 219/* 220 * MPSAFE 221 */ 222int 223readv(struct thread *td, struct readv_args *uap) 224{ 225 struct uio *auio; 226 int error; 227 228 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 229 if (error) 230 return (error); 231 error = kern_readv(td, uap->fd, auio); 232 free(auio, M_IOV); 233 return (error); 234} 235 236int 237kern_readv(struct thread *td, int fd, struct uio *auio) 238{ 239 struct file *fp; 240 long cnt; 241 int error; 242#ifdef KTRACE 243 struct uio *ktruio = NULL; 244#endif 245 246 error = fget_read(td, fd, &fp); 247 if (error) 248 return (error); 249 /* Finish zero length reads right here */ 250 if (auio->uio_resid == 0) { 251 td->td_retval[0] = 0; 252 fdrop(fp, td); 253 return(0); 254 } 255 auio->uio_rw = UIO_READ; 256 auio->uio_td = td; 257#ifdef KTRACE 258 if (KTRPOINT(td, KTR_GENIO)) 259 ktruio = cloneuio(auio); 260#endif 261 cnt = auio->uio_resid; 262 if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) { 263 if (auio->uio_resid != cnt && (error == ERESTART || 264 error == EINTR || error == EWOULDBLOCK)) 265 error = 0; 266 } 267 cnt -= auio->uio_resid; 268#ifdef KTRACE 269 if (ktruio != NULL) { 270 ktruio->uio_resid = cnt; 271 ktrgenio(fd, UIO_READ, ktruio, error); 272 } 273#endif 274 td->td_retval[0] = cnt; 275 fdrop(fp, td); 276 return (error); 277} 278 279/* 280 * Write system call 281 */ 282#ifndef _SYS_SYSPROTO_H_ 283struct write_args 
{ 284 int fd; 285 const void *buf; 286 size_t nbyte; 287}; 288#endif 289/* 290 * MPSAFE 291 */ 292int 293write(td, uap) 294 struct thread *td; 295 struct write_args *uap; 296{ 297 struct file *fp; 298 int error; 299 300 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 301 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 302 (off_t)-1, 0); 303 fdrop(fp, td); 304 } else { 305 error = EBADF; /* XXX this can't be right */ 306 } 307 return(error); 308} 309 310/* 311 * Pwrite system call 312 */ 313#ifndef _SYS_SYSPROTO_H_ 314struct pwrite_args { 315 int fd; 316 const void *buf; 317 size_t nbyte; 318 int pad; 319 off_t offset; 320}; 321#endif 322/* 323 * MPSAFE 324 */ 325int 326pwrite(td, uap) 327 struct thread *td; 328 struct pwrite_args *uap; 329{ 330 struct file *fp; 331 int error; 332 333 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 334 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 335 error = ESPIPE; 336 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 337 error = EINVAL; 338 else { 339 error = dofilewrite(td, fp, uap->fd, uap->buf, 340 uap->nbyte, uap->offset, FOF_OFFSET); 341 } 342 fdrop(fp, td); 343 } else { 344 error = EBADF; /* this can't be right */ 345 } 346 return(error); 347} 348 349static int 350dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 351 struct thread *td; 352 struct file *fp; 353 int fd, flags; 354 const void *buf; 355 size_t nbyte; 356 off_t offset; 357{ 358 struct uio auio; 359 struct iovec aiov; 360 ssize_t cnt; 361 long error = 0; 362#ifdef KTRACE 363 struct uio *ktruio = NULL; 364#endif 365 366 aiov.iov_base = (void *)(uintptr_t)buf; 367 aiov.iov_len = nbyte; 368 auio.uio_iov = &aiov; 369 auio.uio_iovcnt = 1; 370 auio.uio_offset = offset; 371 if (nbyte > INT_MAX) 372 return (EINVAL); 373 auio.uio_resid = nbyte; 374 auio.uio_rw = UIO_WRITE; 375 auio.uio_segflg = UIO_USERSPACE; 376 auio.uio_td = td; 377#ifdef KTRACE 378 if (KTRPOINT(td, KTR_GENIO)) 379 ktruio = cloneuio(&auio); 380#endif 381 cnt = nbyte; 382 if 
(fp->f_type == DTYPE_VNODE) 383 bwillwrite(); 384 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 385 if (auio.uio_resid != cnt && (error == ERESTART || 386 error == EINTR || error == EWOULDBLOCK)) 387 error = 0; 388 /* Socket layer is responsible for issuing SIGPIPE. */ 389 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 390 PROC_LOCK(td->td_proc); 391 psignal(td->td_proc, SIGPIPE); 392 PROC_UNLOCK(td->td_proc); 393 } 394 } 395 cnt -= auio.uio_resid; 396#ifdef KTRACE 397 if (ktruio != NULL) { 398 ktruio->uio_resid = cnt; 399 ktrgenio(fd, UIO_WRITE, ktruio, error); 400 } 401#endif 402 td->td_retval[0] = cnt; 403 return (error); 404} 405 406/* 407 * Gather write system call 408 */ 409#ifndef _SYS_SYSPROTO_H_ 410struct writev_args { 411 int fd; 412 struct iovec *iovp; 413 u_int iovcnt; 414}; 415#endif 416/* 417 * MPSAFE 418 */ 419int 420writev(struct thread *td, struct writev_args *uap) 421{ 422 struct uio *auio; 423 int error; 424 425 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 426 if (error) 427 return (error); 428 error = kern_writev(td, uap->fd, auio); 429 free(auio, M_IOV); 430 return (error); 431} 432 433int 434kern_writev(struct thread *td, int fd, struct uio *auio) 435{ 436 struct file *fp; 437 long cnt; 438 int error; 439#ifdef KTRACE 440 struct uio *ktruio = NULL; 441#endif 442 443 error = fget_write(td, fd, &fp); 444 if (error) 445 return (EBADF); 446 auio->uio_rw = UIO_WRITE; 447 auio->uio_td = td; 448#ifdef KTRACE 449 if (KTRPOINT(td, KTR_GENIO)) 450 ktruio = cloneuio(auio); 451#endif 452 cnt = auio->uio_resid; 453 if (fp->f_type == DTYPE_VNODE) 454 bwillwrite(); 455 if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) { 456 if (auio->uio_resid != cnt && (error == ERESTART || 457 error == EINTR || error == EWOULDBLOCK)) 458 error = 0; 459 if (error == EPIPE) { 460 PROC_LOCK(td->td_proc); 461 psignal(td->td_proc, SIGPIPE); 462 PROC_UNLOCK(td->td_proc); 463 } 464 } 465 cnt -= auio->uio_resid; 466#ifdef KTRACE 467 if (ktruio != 
NULL) { 468 ktruio->uio_resid = cnt; 469 ktrgenio(fd, UIO_WRITE, ktruio, error); 470 } 471#endif 472 td->td_retval[0] = cnt; 473 fdrop(fp, td); 474 return (error); 475} 476 477/* 478 * Ioctl system call 479 */ 480#ifndef _SYS_SYSPROTO_H_ 481struct ioctl_args { 482 int fd; 483 u_long com; 484 caddr_t data; 485}; 486#endif 487/* 488 * MPSAFE 489 */ 490/* ARGSUSED */ 491int 492ioctl(struct thread *td, struct ioctl_args *uap) 493{ 494 struct file *fp; 495 struct filedesc *fdp; 496 u_long com; 497 int error = 0; 498 u_int size; 499 caddr_t data, memp; 500 int tmp; 501 502 if (uap->com > 0xffffffff) { 503 printf( 504 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", 505 td->td_proc->p_pid, td->td_proc->p_comm, uap->com); 506 uap->com &= 0xffffffff; 507 } 508 if ((error = fget(td, uap->fd, &fp)) != 0) 509 return (error); 510 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 511 fdrop(fp, td); 512 return (EBADF); 513 } 514 fdp = td->td_proc->p_fd; 515 switch (com = uap->com) { 516 case FIONCLEX: 517 FILEDESC_LOCK_FAST(fdp); 518 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 519 FILEDESC_UNLOCK_FAST(fdp); 520 fdrop(fp, td); 521 return (0); 522 case FIOCLEX: 523 FILEDESC_LOCK_FAST(fdp); 524 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 525 FILEDESC_UNLOCK_FAST(fdp); 526 fdrop(fp, td); 527 return (0); 528 } 529 530 /* 531 * Interpret high order word to find amount of data to be 532 * copied to/from the user's address space. 
533 */ 534 size = IOCPARM_LEN(com); 535 if ((size > IOCPARM_MAX) || 536 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 537 ((com & IOC_VOID) && size > 0) || 538 ((com & (IOC_IN | IOC_OUT)) && size == 0)) { 539 fdrop(fp, td); 540 return (ENOTTY); 541 } 542 543 if (size > 0) { 544 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 545 data = memp; 546 } else { 547 memp = NULL; 548 data = (void *)&uap->data; 549 } 550 if (com & IOC_IN) { 551 error = copyin(uap->data, data, (u_int)size); 552 if (error) { 553 free(memp, M_IOCTLOPS); 554 fdrop(fp, td); 555 return (error); 556 } 557 } else if (com & IOC_OUT) { 558 /* 559 * Zero the buffer so the user always 560 * gets back something deterministic. 561 */ 562 bzero(data, size); 563 } 564 565 if (com == FIONBIO) { 566 FILE_LOCK(fp); 567 if ((tmp = *(int *)data)) 568 fp->f_flag |= FNONBLOCK; 569 else 570 fp->f_flag &= ~FNONBLOCK; 571 FILE_UNLOCK(fp); 572 data = (void *)&tmp; 573 } else if (com == FIOASYNC) { 574 FILE_LOCK(fp); 575 if ((tmp = *(int *)data)) 576 fp->f_flag |= FASYNC; 577 else 578 fp->f_flag &= ~FASYNC; 579 FILE_UNLOCK(fp); 580 data = (void *)&tmp; 581 } 582 583 error = fo_ioctl(fp, com, data, td->td_ucred, td); 584 585 if (error == 0 && (com & IOC_OUT)) 586 error = copyout(data, uap->data, (u_int)size); 587 588 if (memp != NULL) 589 free(memp, M_IOCTLOPS); 590 fdrop(fp, td); 591 return (error); 592} 593 594/* 595 * sellock and selwait are initialized in selectinit() via SYSINIT. 596 */ 597struct mtx sellock; 598struct cv selwait; 599u_int nselcoll; /* Select collisions since boot */ 600SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 601 602/* 603 * Select system call. 
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 *
 * Copies in the optional timeout and forwards to kern_select().
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

/*
 * Kernel side of select().
 *
 * Copies the caller's in/out/except fd_sets into a bit buffer (stack
 * buffer when it fits, M_SELECT allocation otherwise), then loops:
 * scan the descriptors with selscan(), and if nothing is ready sleep on
 * selwait until a wakeup, timeout, or signal.  The TDF_SELECT flag plus
 * the nselcoll snapshot detect events that fire while sellock is
 * dropped for the scan; when either changes, the scan is retried.
 * ERESTART is converted to EINTR (select is not restartable) and
 * EWOULDBLOCK (timeout) to success with empty sets.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  Input halves live in the upper half of the buffer,
	 * output halves in the lower half.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout to an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24 hours to avoid tvtohz() overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

/*
 * One scan pass over the three select bit sets.
 *
 * For each bit set in ibits[msk], polls the descriptor for the matching
 * event class and sets the corresponding bit in obits[msk] if ready.
 * Returns EBADF if a named descriptor is no longer open; otherwise the
 * ready count is left in td->td_retval[0].  Runs under FILEDESC_LOCK.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 *
 * Same scan/sleep/retry structure as kern_select(), but driven by a
 * copied-in pollfd array and a millisecond timeout (INFTIM = forever).
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout to an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24 hours to avoid tvtohz() overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy revents back to the user's pollfd array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}

/*
 * One scan pass over the pollfd array.
 *
 * Out-of-range descriptors get POLLNVAL, negative descriptors are
 * skipped with revents 0 (per POSIX), and open descriptors are polled
 * via fo_poll.  The ready count goes in td->td_retval[0]; always
 * returns 0.  Runs under FILEDESC_LOCK.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 *
 * The argument layout is identical to native poll(), so just forward.
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 *
 * Called by fo_poll backends to register the selecting thread with the
 * object's selinfo so a later selwakeup() can find it.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone as we've already added pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 *
 * On a collision (more than one thread selected on this object) every
 * waiter is broadcast awake and nselcoll is bumped so in-flight scans
 * retry.  The owning thread, if any, is unlinked from the selinfo,
 * has TDF_SELECT cleared, and is pulled off the selwait sleep queue.
 * pri < 0 means "do not adjust priority" — see selwakeup() above.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * One-time initialization of the global select lock and condition
 * variable, run via SYSINIT at lock-setup time.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}