sys_pipe.c revision 133741
1/* 2 * Copyright (c) 1996 John S. Dyson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice immediately at the beginning of the file, without modification, 10 * this list of conditions, and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. Absolutely no warranty of function or purpose is made by the author 15 * John S. Dyson. 16 * 4. Modifications may be freely made to this file if the above conditions 17 * are met. 18 */ 19 20/* 21 * This file contains a high-performance replacement for the socket-based 22 * pipes scheme originally used in FreeBSD/4.4Lite. It does not support 23 * all features of sockets, but does do everything that pipes normally 24 * do. 25 */ 26 27/* 28 * This code has two modes of operation, a small write mode and a large 29 * write mode. The small write mode acts like conventional pipes with 30 * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the 31 * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT 32 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and 33 * the receiving process can copy it directly from the pages in the sending 34 * process. 35 * 36 * If the sending process receives a signal, it is possible that it will 37 * go away, and certainly its address space can change, because control 38 * is returned back to the user-mode side. In that case, the pipe code 39 * arranges to copy the buffer supplied by the user process, to a pageable 40 * kernel buffer, and the receiving process will grab the data from the 41 * pageable kernel buffer. 
Since signals don't happen all that often, 42 * the copy operation is normally eliminated. 43 * 44 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will 45 * happen for small transfers so that the system will not spend all of 46 * its time context switching. 47 * 48 * In order to limit the resource use of pipes, two sysctls exist: 49 * 50 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable 51 * address space available to us in pipe_map. Whenever the amount in use 52 * exceeds half of this value, all new pipes will be created with size 53 * SMALL_PIPE_SIZE, rather than PIPE_SIZE. Big pipe creation will be limited 54 * as well. This value is loader tunable only. 55 * 56 * These values are autotuned in subr_param.c. 57 * 58 * Memory usage may be monitored through the sysctls 59 * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired. 60 * 61 * 62 * Locking rules: There are two locks present here: A mutex, used via 63 * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via 64 * the flag, as mutexes can not persist over uiomove. The mutex 65 * exists only to guard access to the flag, and is not in itself a 66 * locking mechanism. 67 * 68 * As pipelock() may have to sleep before it can acquire the flag, it 69 * is important to reread all data after a call to pipelock(); everything 70 * in the structure may have changed. 
71 */ 72 73#include <sys/cdefs.h> 74__FBSDID("$FreeBSD: head/sys/kern/sys_pipe.c 133741 2004-08-15 06:24:42Z jmg $"); 75 76#include "opt_mac.h" 77 78#include <sys/param.h> 79#include <sys/systm.h> 80#include <sys/fcntl.h> 81#include <sys/file.h> 82#include <sys/filedesc.h> 83#include <sys/filio.h> 84#include <sys/kernel.h> 85#include <sys/lock.h> 86#include <sys/mac.h> 87#include <sys/mutex.h> 88#include <sys/ttycom.h> 89#include <sys/stat.h> 90#include <sys/malloc.h> 91#include <sys/poll.h> 92#include <sys/selinfo.h> 93#include <sys/signalvar.h> 94#include <sys/sysctl.h> 95#include <sys/sysproto.h> 96#include <sys/pipe.h> 97#include <sys/proc.h> 98#include <sys/vnode.h> 99#include <sys/uio.h> 100#include <sys/event.h> 101 102#include <vm/vm.h> 103#include <vm/vm_param.h> 104#include <vm/vm_object.h> 105#include <vm/vm_kern.h> 106#include <vm/vm_extern.h> 107#include <vm/pmap.h> 108#include <vm/vm_map.h> 109#include <vm/vm_page.h> 110#include <vm/uma.h> 111 112/* 113 * Use this define if you want to disable *fancy* VM things. Expect an 114 * approx 30% decrease in transfer rate. This could be useful for 115 * NetBSD or OpenBSD. 
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
/* File-operation handlers installed on every pipe descriptor. */
static fo_rdwr_t	pipe_read;
static fo_rdwr_t	pipe_write;
static fo_ioctl_t	pipe_ioctl;
static fo_poll_t	pipe_poll;
static fo_kqfilter_t	pipe_kqfilter;
static fo_stat_t	pipe_stat;
static fo_close_t	pipe_close;

static struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_flags = DFLAG_PASSABLE
};

/* kqueue(2) filter support: one detach routine, per-direction event tests. */
static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int nbigpipe;

/* Global accounting, updated with atomics; exported via sysctl below. */
static int amountpipes;
static int amountpipekva;

SYSCTL_DECL(_kern_ipc);

/* maxpipekva itself is defined/autotuned in subr_param.c. */
SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
	   &amountpipes, 0, "Current # of pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
	   &nbigpipe, 0, "Current # of big pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
static __inline void pipeselwakeup(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static void	pipe_zone_dtor(void *mem, int size, void *arg);
static int	pipe_zone_init(void *mem, int size, int flags);
static void	pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

/*
 * SYSINIT hook: create the UMA zone backing all pipe pairs.  Each zone
 * element is a struct pipepair holding both endpoints plus their mutex.
 */
static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
}

/*
 * UMA constructor: runs each time a pipepair is handed out by the zone.
 * Zeroes and cross-links the two endpoints; bumps the pipe count.
 */
static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	vfs_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.  When both are free'd, then the whole pair
	 * is released.
	 */
	rpipe->pipe_present = 1;
	wpipe->pipe_present = 1;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	atomic_add_int(&amountpipes, 2);
	return (0);
}

/*
 * UMA destructor: pair is going back to the zone; drop it from the
 * pipe accounting.
 */
static void
pipe_zone_dtor(void *mem, int size, void *arg)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));

	pp = (struct pipepair *)mem;

	atomic_subtract_int(&amountpipes, 2);
}

/*
 * UMA init: set up the per-pair mutex.  Unlike ctor/dtor, init/fini run
 * only when the backing memory enters/leaves the zone, so the mutex
 * survives across reuse of the pair.
 */
static int
pipe_zone_init(void *mem, int size, int flags)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
	return (0);
}

/* UMA fini: tear down the per-pair mutex created in pipe_zone_init(). */
static void
pipe_zone_fini(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_destroy(&pp->pp_mtx);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
 * let the zone pick up the pieces via pipeclose().
 */

/* ARGSUSED */
int
pipe(td, uap)
	struct thread *td;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *rf, *wf;
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_init_pipe() and mac_create_pipe() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_init_pipe(pp);
	mac_create_pipe(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;

	if (pipe_create(rpipe) || pipe_create(wpipe)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(td, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `rf' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	FILE_LOCK(rf);
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;
	FILE_UNLOCK(rf);
	error = falloc(td, &wf, &fd);
	if (error) {
		/*
		 * Undo the read-side fd by hand, but only if nobody raced
		 * us for the slot; either way we still own falloc()'s
		 * extra reference, dropped below.
		 */
		FILEDESC_LOCK(fdp);
		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
			fdp->fd_ofiles[td->td_retval[0]] = NULL;
			fdunused(fdp, td->td_retval[0]);
			FILEDESC_UNLOCK(fdp);
			/* Drop the descriptor table's reference. */
			fdrop(rf, td);
		} else {
			FILEDESC_UNLOCK(fdp);
		}
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `wf' has been held for us by falloc(). */
	FILE_LOCK(wf);
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;
	FILE_UNLOCK(wf);
	fdrop(wf, td);
	td->td_retval[1] = fd;
	fdrop(rf, td);

	return (0);
}

/*
 * Allocate kva for pipe circular buffer, the space is pageable
 * This routine will 'realloc' the size of a pipe safely, if it fails
 * it will retain the old buffer.
 * If it fails it will return ENOMEM.
 */
static int
pipespace_new(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
	int error;
	static int curfail = 0;
	static struct timeval lastfail;

	/* vm_map_find() may sleep, so the pipe mutex must not be held. */
	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));

	size = round_page(size);
	/*
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	buffer = (caddr_t) vm_map_min(pipe_map);

	/*
	 * The map entry is, by default, pageable.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	/* find_space == 1: let the VM system pick a free address range. */
	error = vm_map_find(pipe_map, NULL, 0,
		(vm_offset_t *) &buffer, size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error != KERN_SUCCESS) {
		/* Rate-limited complaint so a KVA shortage doesn't spam. */
		if (ppsratecheck(&lastfail, &curfail, 1))
			printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		return (ENOMEM);
	}

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}

/*
 * Wrapper for pipespace_new() that performs locking assertions.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{

	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
		("Unlocked pipe passed to pipespace"));
	return (pipespace_new(cpipe, size));
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	while (cpipe->pipe_state & PIPE_LOCKFL) {
		cpipe->pipe_state |= PIPE_LWANT;
		/* catch != 0 makes the sleep interruptible by signals. */
		error = msleep(cpipe, PIPE_MTX(cpipe),
		    catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
		("Unlocked pipe passed to pipeunlock"));
	cpipe->pipe_state &= ~PIPE_LOCKFL;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

/*
 * Notify everything waiting for readiness on this pipe: select/poll
 * sleepers, SIGIO consumers, and kqueue knotes.
 */
static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeuppri(&cpipe->pipe_sel, PSOCK);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}

/*
 * Initialize and allocate VM and memory for pipe.  The structure
 * will start out zero'd from the ctor, so we just manage the kmem.
 */
static int
pipe_create(pipe)
	struct pipe *pipe;
{
	int error;

	/*
	 * Reduce to 1/4th pipe size if we're over our global max.
	 */
	if (amountpipekva > maxpipekva / 2)
		error = pipespace_new(pipe, SMALL_PIPE_SIZE);
	else
		error = pipespace_new(pipe, PIPE_SIZE);
	knlist_init(&pipe->pipe_sel.si_note, PIPE_MTX(pipe));
	return (error);
}

/*
 * Pipe read: drain the kernel buffer first, then any pending direct
 * (page-mapped) write; otherwise block, or return EAGAIN for
 * non-blocking descriptors.
 */
/* ARGSUSED */
static int
pipe_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct pipe *rpipe = fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			/* Clamp to contiguous data, then to caller's need. */
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			/* uiomove() may fault; drop the mutex around it. */
			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			/* Copy straight out of the writer's held pages. */
			PIPE_UNLOCK(rpipe);
			error = uiomove_fromphys(rpipe->pipe_map.ms,
			    rpipe->pipe_map.pos, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				/* Direct write consumed; release the writer. */
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
				    PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
#ifdef MAC
locked_error:
#endif
	pipeunlock(rpipe);

	/* XXX: should probably do this before getting any locks. */
	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	/* Room for at least an atomic write: tell select/poll/kqueue. */
	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	PIPE_UNLOCK(rpipe);
	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	pmap_t pmap;
	u_int size;
	int i, j;
	vm_offset_t addr, endaddr;

	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	pmap = vmspace_pmap(curproc->p_vmspace);
	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		/*
		 * vm_fault_quick() can sleep.  Consequently,
		 * vm_page_lock_queue() and vm_page_unlock_queue()
		 * should not be performed outside of this loop.
		 */
	race:
		/* Fault the page in; on failure undo all holds so far. */
		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
			vm_page_lock_queues();
			for (j = 0; j < i; j++)
				vm_page_unhold(wpipe->pipe_map.ms[j]);
			vm_page_unlock_queues();
			return (EFAULT);
		}
		/*
		 * The page may have been reclaimed between the fault and
		 * the hold attempt; if so, retry the fault for this page.
		 */
		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
		    VM_PROT_READ);
		if (wpipe->pipe_map.ms[i] == NULL)
			goto race;
	}

/*
 * set up the control block
 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

/*
 * and update the uio data
 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	vm_page_lock_queues();
	for (i = 0; i < wpipe->pipe_map.npages; i++) {
		vm_page_unhold(wpipe->pipe_map.ms[i]);
	}
	vm_page_unlock_queues();
	wpipe->pipe_map.npages = 0;
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	struct uio uio;
	struct iovec iov;
	int size;
	int pos;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	/* Build a kernel-space uio targeting the pipe's own buffer. */
	PIPE_UNLOCK(wpipe);
	iov.iov_base = wpipe->pipe_buffer.buffer;
	iov.iov_len = size;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = size;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;
	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
	PIPE_LOCK(wpipe);
	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	error = pipelock(wpipe, 1);
	if (wpipe->pipe_state & PIPE_EOF)
		error = EPIPE;
	if (error) {
		pipeunlock(wpipe);
		goto error1;
	}
	/* Wait for any previous direct write to be fully consumed. */
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		else
			goto retry;
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	/* Defer the direct write until the kernel buffer is drained. */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		else
			goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	/* Page wiring may fault/sleep; do it without the pipe mutex. */
	PIPE_UNLOCK(wpipe);
	error = pipe_build_write_buffer(wpipe, uio);
	PIPE_LOCK(wpipe);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		pipeunlock(wpipe);
		goto error1;
	}

	error = 0;
	/* Wait for the reader to consume the mapped pages. */
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipe_destroy_write_buffer(wpipe);
			pipeselwakeup(wpipe);
			pipeunlock(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
		    "pipdwt", 0);
		/* catch == 0: this pipelock cannot be signal-interrupted. */
		pipelock(wpipe, 0);
	}

	if (wpipe->pipe_state & PIPE_EOF)
		error = EPIPE;
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif

/*
 * Pipe write: use the direct (page-mapped) path for large blocking
 * writes, otherwise copy into the kernel buffer, honouring PIPE_BUF
 * atomicity for small writes.
 */
static int
pipe_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = fp->f_data;
	wpipe = rpipe->pipe_peer;

	/* Both endpoints share one mutex, so locking rpipe locks wpipe. */
	PIPE_LOCK(rpipe);
	error = pipelock(wpipe, 1);
	if (error) {
		PIPE_UNLOCK(rpipe);
		return (error);
	}
	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (EPIPE);
	}
#ifdef MAC
	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
	if (error) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (error);
	}
#endif
	++wpipe->pipe_busy;

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
		(amountpipekva < maxpipekva / 2) &&
		(nbigpipe < LIMITBIGPIPES) &&
		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		(wpipe->pipe_buffer.cnt == 0)) {

		PIPE_UNLOCK(wpipe);
		if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
			atomic_add_int(&nbigpipe, 1);
		PIPE_LOCK(wpipe);
	}

	pipeunlock(wpipe);

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

		pipelock(wpipe, 0);
		if (wpipe->pipe_state & PIPE_EOF) {
			pipeunlock(wpipe);
			error = EPIPE;
			break;
		}
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0) {
			pipeunlock(wpipe);
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
		if (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
			    "pipbww", 0);
			if (error)
				break;
			else
				continue;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			PIPE_UNLOCK(rpipe);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			PIPE_LOCK(rpipe);

			if (error == 0 && segsize < size) {
				KASSERT(wpipe->pipe_buffer.in + segsize ==
					wpipe->pipe_buffer.size,
					("Pipe buffer wraparound disappeared"));
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */

				PIPE_UNLOCK(rpipe);
				error = uiomove(
				    &wpipe->pipe_buffer.buffer[0],
				    size - segsize, uio);
				PIPE_LOCK(rpipe);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
					KASSERT(wpipe->pipe_buffer.in ==
						size - segsize +
						wpipe->pipe_buffer.size,
						("Expected wraparound bad"));
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
				KASSERT(wpipe->pipe_buffer.cnt <=
					wpipe->pipe_buffer.size,
					("Pipe buffer overflow"));
			}
			pipeunlock(wpipe);
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				pipeunlock(wpipe);
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe),
			    PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
		}
	}

	pipelock(wpipe, 0);
	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	pipeunlock(wpipe);
	PIPE_UNLOCK(rpipe);
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
1176 */ 1177static int 1178pipe_ioctl(fp, cmd, data, active_cred, td) 1179 struct file *fp; 1180 u_long cmd; 1181 void *data; 1182 struct ucred *active_cred; 1183 struct thread *td; 1184{ 1185 struct pipe *mpipe = fp->f_data; 1186#ifdef MAC 1187 int error; 1188#endif 1189 1190 PIPE_LOCK(mpipe); 1191 1192#ifdef MAC 1193 error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data); 1194 if (error) { 1195 PIPE_UNLOCK(mpipe); 1196 return (error); 1197 } 1198#endif 1199 1200 switch (cmd) { 1201 1202 case FIONBIO: 1203 PIPE_UNLOCK(mpipe); 1204 return (0); 1205 1206 case FIOASYNC: 1207 if (*(int *)data) { 1208 mpipe->pipe_state |= PIPE_ASYNC; 1209 } else { 1210 mpipe->pipe_state &= ~PIPE_ASYNC; 1211 } 1212 PIPE_UNLOCK(mpipe); 1213 return (0); 1214 1215 case FIONREAD: 1216 if (mpipe->pipe_state & PIPE_DIRECTW) 1217 *(int *)data = mpipe->pipe_map.cnt; 1218 else 1219 *(int *)data = mpipe->pipe_buffer.cnt; 1220 PIPE_UNLOCK(mpipe); 1221 return (0); 1222 1223 case FIOSETOWN: 1224 PIPE_UNLOCK(mpipe); 1225 return (fsetown(*(int *)data, &mpipe->pipe_sigio)); 1226 1227 case FIOGETOWN: 1228 PIPE_UNLOCK(mpipe); 1229 *(int *)data = fgetown(&mpipe->pipe_sigio); 1230 return (0); 1231 1232 /* This is deprecated, FIOSETOWN should be used instead. */ 1233 case TIOCSPGRP: 1234 PIPE_UNLOCK(mpipe); 1235 return (fsetown(-(*(int *)data), &mpipe->pipe_sigio)); 1236 1237 /* This is deprecated, FIOGETOWN should be used instead. 
*/ 1238 case TIOCGPGRP: 1239 PIPE_UNLOCK(mpipe); 1240 *(int *)data = -fgetown(&mpipe->pipe_sigio); 1241 return (0); 1242 1243 } 1244 PIPE_UNLOCK(mpipe); 1245 return (ENOTTY); 1246} 1247 1248static int 1249pipe_poll(fp, events, active_cred, td) 1250 struct file *fp; 1251 int events; 1252 struct ucred *active_cred; 1253 struct thread *td; 1254{ 1255 struct pipe *rpipe = fp->f_data; 1256 struct pipe *wpipe; 1257 int revents = 0; 1258#ifdef MAC 1259 int error; 1260#endif 1261 1262 wpipe = rpipe->pipe_peer; 1263 PIPE_LOCK(rpipe); 1264#ifdef MAC 1265 error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair); 1266 if (error) 1267 goto locked_error; 1268#endif 1269 if (events & (POLLIN | POLLRDNORM)) 1270 if ((rpipe->pipe_state & PIPE_DIRECTW) || 1271 (rpipe->pipe_buffer.cnt > 0) || 1272 (rpipe->pipe_state & PIPE_EOF)) 1273 revents |= events & (POLLIN | POLLRDNORM); 1274 1275 if (events & (POLLOUT | POLLWRNORM)) 1276 if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) || 1277 (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && 1278 (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) 1279 revents |= events & (POLLOUT | POLLWRNORM); 1280 1281 if ((rpipe->pipe_state & PIPE_EOF) || 1282 (!wpipe->pipe_present) || 1283 (wpipe->pipe_state & PIPE_EOF)) 1284 revents |= POLLHUP; 1285 1286 if (revents == 0) { 1287 if (events & (POLLIN | POLLRDNORM)) { 1288 selrecord(td, &rpipe->pipe_sel); 1289 rpipe->pipe_state |= PIPE_SEL; 1290 } 1291 1292 if (events & (POLLOUT | POLLWRNORM)) { 1293 selrecord(td, &wpipe->pipe_sel); 1294 wpipe->pipe_state |= PIPE_SEL; 1295 } 1296 } 1297#ifdef MAC 1298locked_error: 1299#endif 1300 PIPE_UNLOCK(rpipe); 1301 1302 return (revents); 1303} 1304 1305/* 1306 * We shouldn't need locks here as we're doing a read and this should 1307 * be a natural race. 
1308 */ 1309static int 1310pipe_stat(fp, ub, active_cred, td) 1311 struct file *fp; 1312 struct stat *ub; 1313 struct ucred *active_cred; 1314 struct thread *td; 1315{ 1316 struct pipe *pipe = fp->f_data; 1317#ifdef MAC 1318 int error; 1319 1320 PIPE_LOCK(pipe); 1321 error = mac_check_pipe_stat(active_cred, pipe->pipe_pair); 1322 PIPE_UNLOCK(pipe); 1323 if (error) 1324 return (error); 1325#endif 1326 bzero(ub, sizeof(*ub)); 1327 ub->st_mode = S_IFIFO; 1328 ub->st_blksize = pipe->pipe_buffer.size; 1329 if (pipe->pipe_state & PIPE_DIRECTW) 1330 ub->st_size = pipe->pipe_map.cnt; 1331 else 1332 ub->st_size = pipe->pipe_buffer.cnt; 1333 ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; 1334 ub->st_atimespec = pipe->pipe_atime; 1335 ub->st_mtimespec = pipe->pipe_mtime; 1336 ub->st_ctimespec = pipe->pipe_ctime; 1337 ub->st_uid = fp->f_cred->cr_uid; 1338 ub->st_gid = fp->f_cred->cr_gid; 1339 /* 1340 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. 1341 * XXX (st_dev, st_ino) should be unique. 
1342 */ 1343 return (0); 1344} 1345 1346/* ARGSUSED */ 1347static int 1348pipe_close(fp, td) 1349 struct file *fp; 1350 struct thread *td; 1351{ 1352 struct pipe *cpipe = fp->f_data; 1353 1354 fp->f_ops = &badfileops; 1355 fp->f_data = NULL; 1356 funsetown(&cpipe->pipe_sigio); 1357 pipeclose(cpipe); 1358 return (0); 1359} 1360 1361static void 1362pipe_free_kmem(cpipe) 1363 struct pipe *cpipe; 1364{ 1365 1366 KASSERT(!mtx_owned(PIPE_MTX(cpipe)), 1367 ("pipe_free_kmem: pipe mutex locked")); 1368 1369 if (cpipe->pipe_buffer.buffer != NULL) { 1370 if (cpipe->pipe_buffer.size > PIPE_SIZE) 1371 atomic_subtract_int(&nbigpipe, 1); 1372 atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size); 1373 vm_map_remove(pipe_map, 1374 (vm_offset_t)cpipe->pipe_buffer.buffer, 1375 (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); 1376 cpipe->pipe_buffer.buffer = NULL; 1377 } 1378#ifndef PIPE_NODIRECT 1379 { 1380 cpipe->pipe_map.cnt = 0; 1381 cpipe->pipe_map.pos = 0; 1382 cpipe->pipe_map.npages = 0; 1383 } 1384#endif 1385} 1386 1387/* 1388 * shutdown the pipe 1389 */ 1390static void 1391pipeclose(cpipe) 1392 struct pipe *cpipe; 1393{ 1394 struct pipepair *pp; 1395 struct pipe *ppipe; 1396 1397 KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); 1398 1399 PIPE_LOCK(cpipe); 1400 pipelock(cpipe, 0); 1401 pp = cpipe->pipe_pair; 1402 1403 pipeselwakeup(cpipe); 1404 1405 /* 1406 * If the other side is blocked, wake it up saying that 1407 * we want to close it down. 1408 */ 1409 cpipe->pipe_state |= PIPE_EOF; 1410 while (cpipe->pipe_busy) { 1411 wakeup(cpipe); 1412 cpipe->pipe_state |= PIPE_WANT; 1413 pipeunlock(cpipe); 1414 msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); 1415 pipelock(cpipe, 0); 1416 } 1417 1418 1419 /* 1420 * Disconnect from peer, if any. 
1421 */ 1422 ppipe = cpipe->pipe_peer; 1423 if (ppipe->pipe_present != 0) { 1424 pipeselwakeup(ppipe); 1425 1426 ppipe->pipe_state |= PIPE_EOF; 1427 wakeup(ppipe); 1428 KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); 1429 } 1430 1431 /* 1432 * Mark this endpoint as free. Release kmem resources. We 1433 * don't mark this endpoint as unused until we've finished 1434 * doing that, or the pipe might disappear out from under 1435 * us. 1436 */ 1437 PIPE_UNLOCK(cpipe); 1438 pipe_free_kmem(cpipe); 1439 PIPE_LOCK(cpipe); 1440 cpipe->pipe_present = 0; 1441 pipeunlock(cpipe); 1442 knlist_clear(&cpipe->pipe_sel.si_note, 1); 1443 knlist_destroy(&cpipe->pipe_sel.si_note); 1444 1445 /* 1446 * If both endpoints are now closed, release the memory for the 1447 * pipe pair. If not, unlock. 1448 */ 1449 if (ppipe->pipe_present == 0) { 1450 PIPE_UNLOCK(cpipe); 1451#ifdef MAC 1452 mac_destroy_pipe(pp); 1453#endif 1454 uma_zfree(pipe_zone, cpipe->pipe_pair); 1455 } else 1456 PIPE_UNLOCK(cpipe); 1457} 1458 1459/*ARGSUSED*/ 1460static int 1461pipe_kqfilter(struct file *fp, struct knote *kn) 1462{ 1463 struct pipe *cpipe; 1464 1465 cpipe = kn->kn_fp->f_data; 1466 PIPE_LOCK(cpipe); 1467 switch (kn->kn_filter) { 1468 case EVFILT_READ: 1469 kn->kn_fop = &pipe_rfiltops; 1470 break; 1471 case EVFILT_WRITE: 1472 kn->kn_fop = &pipe_wfiltops; 1473 if (!cpipe->pipe_peer->pipe_present) { 1474 /* other end of pipe has been closed */ 1475 PIPE_UNLOCK(cpipe); 1476 return (EPIPE); 1477 } 1478 cpipe = cpipe->pipe_peer; 1479 break; 1480 default: 1481 PIPE_UNLOCK(cpipe); 1482 return (EINVAL); 1483 } 1484 1485 knlist_add(&cpipe->pipe_sel.si_note, kn, 1); 1486 PIPE_UNLOCK(cpipe); 1487 return (0); 1488} 1489 1490static void 1491filt_pipedetach(struct knote *kn) 1492{ 1493 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; 1494 1495 PIPE_LOCK(cpipe); 1496 if (kn->kn_filter == EVFILT_WRITE) { 1497 if (!cpipe->pipe_peer->pipe_present) { 1498 PIPE_UNLOCK(cpipe); 1499 return; 1500 } 1501 cpipe = cpipe->pipe_peer; 
1502 } 1503 knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); 1504 PIPE_UNLOCK(cpipe); 1505} 1506 1507/*ARGSUSED*/ 1508static int 1509filt_piperead(struct knote *kn, long hint) 1510{ 1511 struct pipe *rpipe = kn->kn_fp->f_data; 1512 struct pipe *wpipe = rpipe->pipe_peer; 1513 int ret; 1514 1515 PIPE_LOCK(rpipe); 1516 kn->kn_data = rpipe->pipe_buffer.cnt; 1517 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) 1518 kn->kn_data = rpipe->pipe_map.cnt; 1519 1520 if ((rpipe->pipe_state & PIPE_EOF) || 1521 (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) { 1522 kn->kn_flags |= EV_EOF; 1523 PIPE_UNLOCK(rpipe); 1524 return (1); 1525 } 1526 ret = kn->kn_data > 0; 1527 PIPE_UNLOCK(rpipe); 1528 return ret; 1529} 1530 1531/*ARGSUSED*/ 1532static int 1533filt_pipewrite(struct knote *kn, long hint) 1534{ 1535 struct pipe *rpipe = kn->kn_fp->f_data; 1536 struct pipe *wpipe = rpipe->pipe_peer; 1537 1538 PIPE_LOCK(rpipe); 1539 if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) { 1540 kn->kn_data = 0; 1541 kn->kn_flags |= EV_EOF; 1542 PIPE_UNLOCK(rpipe); 1543 return (1); 1544 } 1545 kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 1546 if (wpipe->pipe_state & PIPE_DIRECTW) 1547 kn->kn_data = 0; 1548 1549 PIPE_UNLOCK(rpipe); 1550 return (kn->kn_data >= PIPE_BUF); 1551} 1552