sys_generic.c revision 89306
1/* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD: head/sys/kern/sys_generic.c 89306 2002-01-13 11:58:06Z alfred $ 40 */ 41 42#include "opt_ktrace.h" 43 44#include <sys/param.h> 45#include <sys/systm.h> 46#include <sys/sysproto.h> 47#include <sys/filedesc.h> 48#include <sys/filio.h> 49#include <sys/fcntl.h> 50#include <sys/file.h> 51#include <sys/proc.h> 52#include <sys/signalvar.h> 53#include <sys/socketvar.h> 54#include <sys/uio.h> 55#include <sys/kernel.h> 56#include <sys/malloc.h> 57#include <sys/poll.h> 58#include <sys/resourcevar.h> 59#include <sys/selinfo.h> 60#include <sys/sysctl.h> 61#include <sys/sysent.h> 62#include <sys/bio.h> 63#include <sys/buf.h> 64#include <sys/condvar.h> 65#ifdef KTRACE 66#include <sys/ktrace.h> 67#endif 68#include <vm/vm.h> 69#include <vm/vm_page.h> 70 71#include <machine/limits.h> 72 73static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77static int pollscan __P((struct thread *, struct pollfd *, u_int)); 78static int pollholddrop __P((struct thread *, struct pollfd *, u_int, int)); 79static int selscan __P((struct thread *, fd_mask **, fd_mask **, int)); 80static int selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int)); 81static int dofileread __P((struct thread *, struct file *, int, void *, 82 size_t, off_t, int)); 83static int dofilewrite 
__P((struct thread *, struct file *, int,
	    const void *, size_t, off_t, int));

/*
 * Look up file descriptor 'fd' in 'fdp' and return it with an extra
 * reference (f_count) held, or NULL if the descriptor is out of range,
 * unused, or does not have all of the access bits in 'flag' set.
 * The caller is responsible for releasing the reference with fdrop().
 */
struct file*
holdfp(fdp, fd, flag)
	struct filedesc* fdp;
	int fd, flag;
{
	struct file* fp;

	FILEDESC_LOCK(fdp);
	/* Range-check before indexing fd_ofiles; cast rejects negative fds. */
	if (((u_int)fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (NULL);
	}
	/* Take the file lock before dropping the table lock so fp stays valid. */
	FILE_LOCK(fp);
	FILEDESC_UNLOCK(fdp);
	if ((fp->f_flag & flag) == 0) {
		FILE_UNLOCK(fp);
		return (NULL);
	}
	fp->f_count++;
	FILE_UNLOCK(fp);
	return (fp);
}

/*
 * Read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int fd;
	void *buf;
	size_t nbyte;
};
#endif
/*
 * MPSAFE
 */
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct file *fp;
	int error;

	mtx_lock(&Giant);
	/* Offset of -1 with flags 0 means "use the file's current offset". */
	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
		error = dofileread(td, fp, uap->fd, uap->buf,
			    uap->nbyte, (off_t)-1, 0);
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return(error);
}

/*
 * Pread system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int fd;
	void *buf;
	size_t nbyte;
	int pad;
	off_t offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct file *fp;
	int error;

	fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	/* pread() on a non-seekable object is rejected, per POSIX. */
	if (fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		mtx_lock(&Giant);
		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
		    uap->offset, FOF_OFFSET);
		mtx_unlock(&Giant);
	}
	fdrop(fp, td);
	return(error);
}

/*
 * Code common for read and pread.
 * Builds a single-element uio over the user buffer and hands it to
 * fo_read(); the byte count actually transferred is returned in
 * td->td_retval[0].
 */
int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is signed int sized in places; cap the request. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
		/*
		 * If some data was transferred before the interruption,
		 * report the partial transfer instead of the error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Scatter read system call.
242 */ 243#ifndef _SYS_SYSPROTO_H_ 244struct readv_args { 245 int fd; 246 struct iovec *iovp; 247 u_int iovcnt; 248}; 249#endif 250/* 251 * MPSAFE 252 */ 253int 254readv(td, uap) 255 struct thread *td; 256 struct readv_args *uap; 257{ 258 struct file *fp; 259 struct uio auio; 260 struct iovec *iov; 261 struct iovec *needfree; 262 struct iovec aiov[UIO_SMALLIOV]; 263 long i, cnt, error = 0; 264 u_int iovlen; 265#ifdef KTRACE 266 struct iovec *ktriov = NULL; 267 struct uio ktruio; 268#endif 269 mtx_lock(&Giant); 270 271 if ((error = fget_read(td, uap->fd, &fp)) != 0) 272 goto done2; 273 /* note: can't use iovlen until iovcnt is validated */ 274 iovlen = uap->iovcnt * sizeof (struct iovec); 275 if (uap->iovcnt > UIO_SMALLIOV) { 276 if (uap->iovcnt > UIO_MAXIOV) { 277 error = EINVAL; 278 goto done2; 279 } 280 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 281 needfree = iov; 282 } else { 283 iov = aiov; 284 needfree = NULL; 285 } 286 auio.uio_iov = iov; 287 auio.uio_iovcnt = uap->iovcnt; 288 auio.uio_rw = UIO_READ; 289 auio.uio_segflg = UIO_USERSPACE; 290 auio.uio_td = td; 291 auio.uio_offset = -1; 292 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 293 goto done; 294 auio.uio_resid = 0; 295 for (i = 0; i < uap->iovcnt; i++) { 296 if (iov->iov_len > INT_MAX - auio.uio_resid) { 297 error = EINVAL; 298 goto done; 299 } 300 auio.uio_resid += iov->iov_len; 301 iov++; 302 } 303#ifdef KTRACE 304 /* 305 * if tracing, save a copy of iovec 306 */ 307 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 308 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 309 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 310 ktruio = auio; 311 } 312#endif 313 cnt = auio.uio_resid; 314 if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { 315 if (auio.uio_resid != cnt && (error == ERESTART || 316 error == EINTR || error == EWOULDBLOCK)) 317 error = 0; 318 } 319 cnt -= auio.uio_resid; 320#ifdef KTRACE 321 if (ktriov != NULL) { 322 if (error == 0) { 323 ktruio.uio_iov = 
ktriov; 324 ktruio.uio_resid = cnt; 325 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio, 326 error); 327 } 328 FREE(ktriov, M_TEMP); 329 } 330#endif 331 td->td_retval[0] = cnt; 332done: 333 fdrop(fp, td); 334 if (needfree) 335 FREE(needfree, M_IOV); 336done2: 337 mtx_unlock(&Giant); 338 return (error); 339} 340 341/* 342 * Write system call 343 */ 344#ifndef _SYS_SYSPROTO_H_ 345struct write_args { 346 int fd; 347 const void *buf; 348 size_t nbyte; 349}; 350#endif 351/* 352 * MPSAFE 353 */ 354int 355write(td, uap) 356 struct thread *td; 357 struct write_args *uap; 358{ 359 struct file *fp; 360 int error; 361 362 mtx_lock(&Giant); 363 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 364 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 365 (off_t)-1, 0); 366 fdrop(fp, td); 367 } else { 368 error = EBADF; /* XXX this can't be right */ 369 } 370 mtx_unlock(&Giant); 371 return(error); 372} 373 374/* 375 * Pwrite system call 376 */ 377#ifndef _SYS_SYSPROTO_H_ 378struct pwrite_args { 379 int fd; 380 const void *buf; 381 size_t nbyte; 382 int pad; 383 off_t offset; 384}; 385#endif 386/* 387 * MPSAFE 388 */ 389int 390pwrite(td, uap) 391 struct thread *td; 392 struct pwrite_args *uap; 393{ 394 struct file *fp; 395 int error; 396 397 mtx_lock(&Giant); 398 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 399 if (fp->f_type == DTYPE_VNODE) { 400 error = dofilewrite(td, fp, uap->fd, uap->buf, 401 uap->nbyte, uap->offset, FOF_OFFSET); 402 } else { 403 error = ESPIPE; 404 } 405 fdrop(fp, td); 406 } else { 407 error = EBADF; /* this can't be right */ 408 } 409 return(error); 410} 411 412static int 413dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 414 struct thread *td; 415 struct file *fp; 416 int fd, flags; 417 const void *buf; 418 size_t nbyte; 419 off_t offset; 420{ 421 struct uio auio; 422 struct iovec aiov; 423 long cnt, error = 0; 424#ifdef KTRACE 425 struct iovec ktriov; 426 struct uio ktruio; 427 int didktr = 0; 428#endif 429 430 aiov.iov_base = 
(void *)(uintptr_t)buf; 431 aiov.iov_len = nbyte; 432 auio.uio_iov = &aiov; 433 auio.uio_iovcnt = 1; 434 auio.uio_offset = offset; 435 if (nbyte > INT_MAX) 436 return (EINVAL); 437 auio.uio_resid = nbyte; 438 auio.uio_rw = UIO_WRITE; 439 auio.uio_segflg = UIO_USERSPACE; 440 auio.uio_td = td; 441#ifdef KTRACE 442 /* 443 * if tracing, save a copy of iovec and uio 444 */ 445 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 446 ktriov = aiov; 447 ktruio = auio; 448 didktr = 1; 449 } 450#endif 451 cnt = nbyte; 452 if (fp->f_type == DTYPE_VNODE) 453 bwillwrite(); 454 if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { 455 if (auio.uio_resid != cnt && (error == ERESTART || 456 error == EINTR || error == EWOULDBLOCK)) 457 error = 0; 458 if (error == EPIPE) { 459 PROC_LOCK(td->td_proc); 460 psignal(td->td_proc, SIGPIPE); 461 PROC_UNLOCK(td->td_proc); 462 } 463 } 464 cnt -= auio.uio_resid; 465#ifdef KTRACE 466 if (didktr && error == 0) { 467 ktruio.uio_iov = &ktriov; 468 ktruio.uio_resid = cnt; 469 ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error); 470 } 471#endif 472 td->td_retval[0] = cnt; 473 return (error); 474} 475 476/* 477 * Gather write system call 478 */ 479#ifndef _SYS_SYSPROTO_H_ 480struct writev_args { 481 int fd; 482 struct iovec *iovp; 483 u_int iovcnt; 484}; 485#endif 486/* 487 * MPSAFE 488 */ 489int 490writev(td, uap) 491 struct thread *td; 492 register struct writev_args *uap; 493{ 494 struct file *fp; 495 struct uio auio; 496 register struct iovec *iov; 497 struct iovec *needfree; 498 struct iovec aiov[UIO_SMALLIOV]; 499 long i, cnt, error = 0; 500 u_int iovlen; 501#ifdef KTRACE 502 struct iovec *ktriov = NULL; 503 struct uio ktruio; 504#endif 505 506 mtx_lock(&Giant); 507 if ((error = fget_write(td, uap->fd, &fp)) != 0) { 508 error = EBADF; 509 goto done2; 510 } 511 /* note: can't use iovlen until iovcnt is validated */ 512 iovlen = uap->iovcnt * sizeof (struct iovec); 513 if (uap->iovcnt > UIO_SMALLIOV) { 514 if (uap->iovcnt > UIO_MAXIOV) { 
515 needfree = NULL; 516 error = EINVAL; 517 goto done; 518 } 519 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 520 needfree = iov; 521 } else { 522 iov = aiov; 523 needfree = NULL; 524 } 525 auio.uio_iov = iov; 526 auio.uio_iovcnt = uap->iovcnt; 527 auio.uio_rw = UIO_WRITE; 528 auio.uio_segflg = UIO_USERSPACE; 529 auio.uio_td = td; 530 auio.uio_offset = -1; 531 if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) 532 goto done; 533 auio.uio_resid = 0; 534 for (i = 0; i < uap->iovcnt; i++) { 535 if (iov->iov_len > INT_MAX - auio.uio_resid) { 536 error = EINVAL; 537 goto done; 538 } 539 auio.uio_resid += iov->iov_len; 540 iov++; 541 } 542#ifdef KTRACE 543 /* 544 * if tracing, save a copy of iovec and uio 545 */ 546 if (KTRPOINT(td->td_proc, KTR_GENIO)) { 547 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 548 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 549 ktruio = auio; 550 } 551#endif 552 cnt = auio.uio_resid; 553 if (fp->f_type == DTYPE_VNODE) 554 bwillwrite(); 555 if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { 556 if (auio.uio_resid != cnt && (error == ERESTART || 557 error == EINTR || error == EWOULDBLOCK)) 558 error = 0; 559 if (error == EPIPE) { 560 PROC_LOCK(td->td_proc); 561 psignal(td->td_proc, SIGPIPE); 562 PROC_UNLOCK(td->td_proc); 563 } 564 } 565 cnt -= auio.uio_resid; 566#ifdef KTRACE 567 if (ktriov != NULL) { 568 if (error == 0) { 569 ktruio.uio_iov = ktriov; 570 ktruio.uio_resid = cnt; 571 ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio, 572 error); 573 } 574 FREE(ktriov, M_TEMP); 575 } 576#endif 577 td->td_retval[0] = cnt; 578done: 579 fdrop(fp, td); 580 if (needfree) 581 FREE(needfree, M_IOV); 582done2: 583 mtx_unlock(&Giant); 584 return (error); 585} 586 587/* 588 * Ioctl system call 589 */ 590#ifndef _SYS_SYSPROTO_H_ 591struct ioctl_args { 592 int fd; 593 u_long com; 594 caddr_t data; 595}; 596#endif 597/* 598 * MPSAFE 599 */ 600/* ARGSUSED */ 601int 602ioctl(td, uap) 603 struct thread 
*td;
	register struct ioctl_args *uap;
{
	register struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Small ioctl arguments are staged on the stack; 'align' forces
	 * long alignment of the buffer. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	fp = ffind_hold(td, uap->fd);
	if (fp == NULL)
		return (EBADF);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/* FIONCLEX/FIOCLEX only touch the descriptor table; handle them
	 * here without calling into the file's ioctl method. */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		return (ENOTTY);
	}

	mtx_lock(&Giant);
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: the argument is the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}

static int	nselcoll;	/* Select collisions since boot */
struct cv selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int nd;
	fd_set *in, *ou, *ex;
	struct timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask s_heldbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo, i;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each supplied set needs an input copy and an output copy. */
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
	if (2 * ncpbytes <= sizeof s_heldbits) {
		bzero(s_heldbits, sizeof(s_heldbits));
		heldbits = &s_heldbits[0];
	} else
		heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
	hibits = heldbits + ncpbytes / sizeof *heldbits;
	hobits = heldbits;
	/* getbits: copy in one fd_set, and OR it into the union of all
	 * requested fds (hibits) used for the hold/drop pass below. */
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_noproclock;			\
			for (i = 0;					\
			     i < ncpbytes / sizeof ibits[i][0];		\
			     i++)					\
				hibits[i] |= ibits[x][i];		\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_noproclock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/* Hold references on all polled fds; hobits records what was held. */
	selholddrop(td, hibits, hobits, uap->nd, 1);
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = selscan(td, ibits, obits, uap->nd);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = selscan(td, ibits, obits, uap->nd);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the condvar timeout to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);

	/* Sleep on the global selwait condvar until selwakeup() or timeout. */
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);

	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	/* Release the fd references taken before the scan loop. */
	selholddrop(td, hibits, hobits, uap->nd, 0);
done_noproclock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);
	if (heldbits != &s_heldbits[0])
		free(heldbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}

/*
 * Used to hold then release a group of fds for select(2).
 * Hold (hold == 1) or release (hold == 0) a group of filedescriptors.
 * if holding then use ibits setting the bits in obits, otherwise use obits.
 */
static int
selholddrop(td, ibits, obits, nfd, hold)
	struct thread *td;
	fd_mask *ibits, *obits;
	int nfd, hold;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int i, fd;
	fd_mask bits;
	struct file *fp;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i += NFDBITS) {
		if (hold)
			bits = ibits[i/NFDBITS];
		else
			bits = obits[i/NFDBITS];
		/* ffs(int mask) not portable, fd_mask is long */
		for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
			if (!(bits & 1))
				continue;
			fp = fdp->fd_ofiles[fd];
			/*
			 * NOTE(review): returning EBADF here abandons the
			 * walk; callers (select) ignore the return value and
			 * rely on obits recording which fds were actually
			 * held.  In drop mode a NULL entry means the fd was
			 * closed after the hold pass, and the reference taken
			 * by fhold() below cannot be found again — presumably
			 * a reference leak; fixing it needs the held fp
			 * pointers to be remembered, not just the fd bits.
			 * TODO confirm against later revisions.
			 */
			if (fp == NULL) {
				FILEDESC_UNLOCK(fdp);
				return (EBADF);
			}
			if (hold) {
				fhold(fp);
				/* Record the successful hold in obits. */
				obits[(fd)/NFDBITS] |=
					((fd_mask)1 << ((fd) % NFDBITS));
			} else {
				/* XXX: optimize by making a special
				 * version of fdrop that only unlocks
				 * the filedesc if needed?  This would
				 * redcuce the number of lock/unlock
				 * pairs by quite a bit.
				 */
				FILEDESC_UNLOCK(fdp);
				fdrop(fp, td);
				FILEDESC_LOCK(fdp);
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	return (0);
}

/*
 * Scan the fd sets for ready descriptors: poll each marked fd via
 * fo_poll() and set the corresponding bit in obits when it reports
 * ready.  The count of ready fds is returned in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = ffind_hold(td, fd);
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
				fdrop(fp, td);
			}
		}
	}
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int nfds;
	int timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct pollfd p_heldbits[32];
	struct pollfd *heldbits;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	if (ni > sizeof(p_heldbits))
		heldbits = malloc(ni, M_TEMP, M_WAITOK);
	else {
		bzero(p_heldbits, sizeof(p_heldbits));
		heldbits = p_heldbits;
	}
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_noproclock;
	/* heldbits gets its own copy of the array for the hold/drop pass. */
	bcopy(bits, heldbits, ni);
	if (SCARG(uap, timeout) != INFTIM) {
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/* Hold references on all polled fds across the scan/sleep loop. */
	pollholddrop(td, heldbits, nfds, 1);
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = pollscan(td, (struct pollfd *)bits, nfds);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the condvar timeout to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	/* Sleep on the global selwait condvar until selwakeup() or timeout. */
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	/* Release the fd references taken before the scan loop. */
	pollholddrop(td, heldbits, nfds, 0);
done_noproclock:
	/* poll is not restarted after signals...
*/ 1138 if (error == ERESTART) 1139 error = EINTR; 1140 if (error == EWOULDBLOCK) 1141 error = 0; 1142 if (error == 0) { 1143 error = copyout(bits, SCARG(uap, fds), ni); 1144 if (error) 1145 goto out; 1146 } 1147out: 1148 if (ni > sizeof(smallbits)) 1149 free(bits, M_TEMP); 1150 if (ni > sizeof(p_heldbits)) 1151 free(heldbits, M_TEMP); 1152done2: 1153 mtx_unlock(&Giant); 1154 return (error); 1155} 1156 1157static int 1158pollholddrop(td, fds, nfd, hold) 1159 struct thread *td; 1160 struct pollfd *fds; 1161 u_int nfd; 1162 int hold; 1163{ 1164 register struct filedesc *fdp = td->td_proc->p_fd; 1165 int i; 1166 struct file *fp; 1167 1168 FILEDESC_LOCK(fdp); 1169 for (i = 0; i < nfd; i++, fds++) { 1170 if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) { 1171 fp = fdp->fd_ofiles[fds->fd]; 1172 if (hold) { 1173 if (fp != NULL) { 1174 fhold(fp); 1175 fds->revents = 1; 1176 } else 1177 fds->revents = 0; 1178 } else if(fp != NULL && fds->revents) { 1179 FILE_LOCK(fp); 1180 FILEDESC_UNLOCK(fdp); 1181 fdrop_locked(fp, td); 1182 FILEDESC_LOCK(fdp); 1183 } 1184 } 1185 } 1186 FILEDESC_UNLOCK(fdp); 1187 return (0); 1188} 1189 1190static int 1191pollscan(td, fds, nfd) 1192 struct thread *td; 1193 struct pollfd *fds; 1194 u_int nfd; 1195{ 1196 register struct filedesc *fdp = td->td_proc->p_fd; 1197 int i; 1198 struct file *fp; 1199 int n = 0; 1200 1201 for (i = 0; i < nfd; i++, fds++) { 1202 FILEDESC_LOCK(fdp); 1203 if (fds->fd >= fdp->fd_nfiles) { 1204 fds->revents = POLLNVAL; 1205 n++; 1206 FILEDESC_UNLOCK(fdp); 1207 } else if (fds->fd < 0) { 1208 fds->revents = 0; 1209 FILEDESC_UNLOCK(fdp); 1210 } else { 1211 fp = fdp->fd_ofiles[fds->fd]; 1212 FILEDESC_UNLOCK(fdp); 1213 if (fp == NULL) { 1214 fds->revents = POLLNVAL; 1215 n++; 1216 } else { 1217 /* 1218 * Note: backend also returns POLLHUP and 1219 * POLLERR if appropriate. 
1220 */ 1221 fds->revents = fo_poll(fp, fds->events, 1222 fp->f_cred, td); 1223 if (fds->revents != 0) 1224 n++; 1225 } 1226 } 1227 } 1228 td->td_retval[0] = n; 1229 return (0); 1230} 1231 1232/* 1233 * OpenBSD poll system call. 1234 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1235 */ 1236#ifndef _SYS_SYSPROTO_H_ 1237struct openbsd_poll_args { 1238 struct pollfd *fds; 1239 u_int nfds; 1240 int timeout; 1241}; 1242#endif 1243/* 1244 * MPSAFE 1245 */ 1246int 1247openbsd_poll(td, uap) 1248 register struct thread *td; 1249 register struct openbsd_poll_args *uap; 1250{ 1251 return (poll(td, (struct poll_args *)uap)); 1252} 1253 1254/*ARGSUSED*/ 1255int 1256seltrue(dev, events, td) 1257 dev_t dev; 1258 int events; 1259 struct thread *td; 1260{ 1261 1262 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1263} 1264 1265static int 1266find_thread_in_proc(struct proc *p, struct thread *td) 1267{ 1268 struct thread *td2; 1269 FOREACH_THREAD_IN_PROC(p, td2) { 1270 if (td2 == td) { 1271 return (1); 1272 } 1273 } 1274 return (0); 1275} 1276 1277/* 1278 * Record a select request. 1279 */ 1280void 1281selrecord(selector, sip) 1282 struct thread *selector; 1283 struct selinfo *sip; 1284{ 1285 struct proc *p; 1286 pid_t mypid; 1287 1288 mypid = selector->td_proc->p_pid; 1289 if ((sip->si_pid == mypid) && 1290 (sip->si_thread == selector)) { /* XXXKSE should be an ID? */ 1291 return; 1292 } 1293 if (sip->si_pid && 1294 (p = pfind(sip->si_pid)) && 1295 (find_thread_in_proc(p, sip->si_thread))) { 1296 mtx_lock_spin(&sched_lock); 1297 if (sip->si_thread->td_wchan == (caddr_t)&selwait) { 1298 mtx_unlock_spin(&sched_lock); 1299 PROC_UNLOCK(p); 1300 sip->si_flags |= SI_COLL; 1301 return; 1302 } 1303 mtx_unlock_spin(&sched_lock); 1304 PROC_UNLOCK(p); 1305 } 1306 sip->si_pid = mypid; 1307 sip->si_thread = selector; 1308} 1309 1310/* 1311 * Do a wakeup when a selectable event occurs. 
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	struct thread *td;
	register struct proc *p;

	/* si_pid == 0 means no select request is recorded. */
	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/* Multiple waiters collided on this selinfo: wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	td = sip->si_thread;
	if (p != NULL) {
		if (!find_thread_in_proc(p, td)) {
			PROC_UNLOCK(p); /* lock is in pfind() */;
			return;
		}
		mtx_lock_spin(&sched_lock);
		if (td->td_wchan == (caddr_t)&selwait) {
			/* Thread is blocked on the selwait condvar: wake it. */
			if (td->td_proc->p_stat == SSLEEP)
				setrunnable(td);
			else
				cv_waitq_remove(td);
		} else
			/* Not asleep yet: clear TDF_SELECT so it rescans. */
			td->td_flags &= ~TDF_SELECT;
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p); /* Lock is in pfind() */
	}
}

static void selectinit __P((void *));
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * Initialize the global select/poll wait condvar at boot.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
}