/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size.  Default is 1 MiB.
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX.
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

/*
 * Take the pipe's inode mutex with the given lockdep subclass.
 * Pipes with no backing inode (pipe->inode == NULL) have no mutex
 * to take, so this is a no-op for them.
 */
static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
}
EXPORT_SYMBOL(pipe_unlock);

/*
 * Lock two pipes at once.  Locks are always taken in address order so
 * that two concurrent double-lockers cannot deadlock against each
 * other (consistent global ordering).
 */
void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/*
 * Drop the inode semaphore and wait for a pipe event, atomically.
 * Caller must hold the pipe lock; it is dropped while sleeping and
 * re-acquired before return.
 */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}

/*
 * Copy 'len' bytes from the user iovec into 'to', advancing the iovec
 * entries in place as data is consumed.  With 'atomic' set, the caller
 * is in atomic (kmap_atomic) context and the pages should have been
 * pre-faulted; returns -EFAULT on any failed copy.
 *
 * NOTE(review): on failure the iovec may already have been partially
 * advanced, and the segment walk is bounded only by 'len', not by the
 * caller's nr_segs.  Callers that retry after a failed atomic copy
 * (the redo labels in pipe_read/pipe_write) can therefore re-copy into
 * an already-advanced iovec — the class of bug behind CVE-2015-1805,
 * fixed upstream by converting pipes to iov_iter.  Verify carefully
 * before reusing this helper.
 */
static int
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
			int atomic)
{
	unsigned long copy;

	while (len > 0) {
		/* Skip exhausted (zero-length) segments. */
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(to, iov->iov_base, copy))
				return -EFAULT;
		}
		to += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

/*
 * Copy 'len' bytes from 'from' out to the user iovec, advancing the
 * iovec entries in place.  Same contract and same retry hazard as
 * pipe_iov_copy_from_user() above.
 */
static int
pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
		      int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base, from, copy))
				return -EFAULT;
		}
		from += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in (0 means everything was
 * made writable).
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		if (fault_in_pages_writeable(iov->iov_base, this_len))
			break;

		len -= this_len;
		iov++;
	}

	return len;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 * Best-effort: failures are ignored here and show up later as a failed
 * atomic copy.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_map - virtually map a pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be mapped
 * @atomic:	whether to use an atomic map
 *
 * Description:
 *	This function returns a kernel virtual address mapping for the
 *	pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
 *	and the caller has to be careful not to fault before calling
 *	the unmap function.
 *
 *	Note that this function occupies KM_USER0 if @atomic != 0.
 */
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf, int atomic)
{
	if (atomic) {
		/* Remember the map type so ->unmap() can undo it correctly. */
		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
		return kmap_atomic(buf->page, KM_USER0);
	}

	return kmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_map);

/**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be unmapped
 * @map_data:	the data that the mapping function returned
 *
 * Description:
 *	This function undoes the mapping that ->map() provided.
 */
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, void *map_data)
{
	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
		kunmap_atomic(map_data, KM_USER0);
	} else
		kunmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_unmap);

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	/* Page is shared; stealing it would be unsafe. */
	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

/* Buffer operations for plain anonymous pipes created with pipe(2). */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/*
 * aio_read entry point for pipes and FIFOs.  Drains whole pipe buffers
 * into the user iovec under the inode mutex, sleeping (unless
 * O_NONBLOCK or data already read) when the pipe is empty but writers
 * remain.  Returns bytes read, 0 on EOF (no writers), or a -errno.
 */
static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	  unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int do_wakeup;
	ssize_t ret;
	/* _iov is a kernel-side copy owned by this call; safe to advance. */
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			void *addr;
			size_t chars = buf->len;
			int error, atomic;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			/* Use the atomic fast path only if the user pages
			 * could be faulted in up front. */
			atomic = !iov_fault_in_pages_write(iov, chars);
redo:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
			ops->unmap(pipe, buf, addr);
			if (unlikely(error)) {
				/*
				 * Just retry with the slow path if we failed.
				 *
				 * NOTE(review): the failed atomic attempt may
				 * already have advanced 'iov'; re-copying the
				 * full 'chars' here can overrun the iovec
				 * (CVE-2015-1805 class bug) — confirm against
				 * the upstream iov_iter conversion.
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;
			if (!buf->len) {
				/* Buffer fully consumed: release the page and
				 * advance the (power-of-2 sized) ring. */
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			/* Wake sleeping writers before we sleep ourselves. */
			wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	mutex_unlock(&inode->i_mutex);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

/*
 * aio_write entry point for pipes and FIFOs.  First tries to append
 * small writes to the last partially-filled buffer (merging), then
 * fills fresh pages.  Raises SIGPIPE/-EPIPE when no readers remain.
 */
static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	ssize_t ret;
	int do_wakeup;
	/* _iov is a kernel-side copy owned by this call; safe to advance. */
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds.
 */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_from_user(offset + addr, iov,
							chars, atomic);
			ops->unmap(pipe, buf, addr);
			ret = error;
			do_wakeup = 1;
			if (error) {
				/*
				 * NOTE(review): as in pipe_read, the failed
				 * atomic copy may already have advanced 'iov';
				 * the non-atomic retry re-reads 'chars' bytes
				 * from the wrong position (CVE-2015-1805
				 * class) — confirm against the upstream
				 * iov_iter conversion.
				 */
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			/* Reuse the one-deep cached page if we have one. */
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
redo2:
			if (atomic)
				src = kmap_atomic(page, KM_USER0);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, iov, chars,
							atomic);
			if (atomic)
				kunmap_atomic(src, KM_USER0);
			else
				kunmap(page);

			if (unlikely(error)) {
				/* Same retry-after-advance hazard as redo1. */
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			/* Let readers drain before we sleep for room. */
			wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		/* Advertise a sleeping writer so readers keep waiting for
		 * our data (syscall merging, see pipe_read). */
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (do_wakeup) {
		wake_up_interruptible_sync(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0)
		file_update_time(filp);
	return ret;
}

/* ->read stub for the write-only end of a pipe. */
static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	return -EBADF;
}

/* ->write stub for the read-only end of a pipe. */
static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
{
	return -EBADF;
}

/*
 * Only FIONREAD is supported: report the number of bytes currently
 * queued in the pipe.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int count, buf, nrbufs;

	switch (cmd) {
		case FIONREAD:
			mutex_lock(&inode->i_mutex);
			pipe = inode->i_pipe;
			count = 0;
			buf = pipe->curbuf;
			nrbufs = pipe->nrbufs;
			while (--nrbufs >= 0) {
				count += pipe->bufs[buf].len;
				buf = (buf+1) & (pipe->buffers - 1);
			}
			mutex_unlock(&inode->i_mutex);

			return put_user(count, (int __user *)arg);
		default:
			return -EINVAL;
	}
}

/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}

/*
 * Common release path: drop 'decr' reader and 'decw' writer references.
 * Frees the pipe when the last reference of either kind goes away,
 * otherwise wakes the other side so it can notice the hangup.
 */
static int
pipe_release(struct inode *inode, int decr, int decw)
{
	struct pipe_inode_info *pipe;

	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;

	if (!pipe->readers && !pipe->writers) {
		free_pipe_info(inode);
	} else {
		wake_up_interruptible_sync(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&inode->i_mutex);

	return 0;
}

static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0) /* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}


static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 1, 0);
}

static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 0, 1);
}

static int
pipe_rdwr_release(struct inode *inode, struct file *filp)
{
	int decr, decw;

	decr = (filp->f_mode & FMODE_READ) != 0;
	decw = (filp->f_mode & FMODE_WRITE) != 0;
	return pipe_release(inode, decr, decw);
}

/* -ENOENT below means the FIFO's pipe info has already been torn down. */
static int
pipe_read_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->readers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_write_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		if (filp->f_mode & FMODE_READ)
			inode->i_pipe->readers++;
		if (filp->f_mode & FMODE_WRITE)
			inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
const struct file_operations read_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= bad_pipe_w,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};

const struct file_operations write_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};

const struct file_operations rdwr_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};

/*
 * Allocate and initialize a pipe_inode_info with the default
 * PIPE_DEF_BUFFERS-entry buffer ring.  Returns NULL on allocation
 * failure.  @inode may be NULL for pipes not backed by an inode.
 */
struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->inode = inode;
			pipe->buffers = PIPE_DEF_BUFFERS;
			return pipe;
		}
		kfree(pipe);
	}

	return NULL;
}

/*
 * Release every still-held buffer, the cached tmp_page, the ring and
 * the pipe_inode_info itself.
 */
void __free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

void free_pipe_info(struct inode *inode)
{
	__free_pipe_info(inode->i_pipe);
	inode->i_pipe = NULL;
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

/*
 * Allocate a fresh pipefs inode with an attached pipe_inode_info,
 * pre-counted as one reader and one writer.  Returns NULL on failure.
 */
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	pipe = alloc_pipe_info(inode);
	if (!pipe)
		goto fail_iput;
	inode->i_pipe = pipe;

	pipe->readers = pipe->writers = 1;
	inode->i_fop = &rdwr_pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

/*
 * Create the write-side struct file for a new pipe.  @flags may carry
 * O_NONBLOCK.  Returns the file or an ERR_PTR.
 */
struct file *create_write_pipe(int flags)
{
	int err;
	struct inode *inode;
	struct file *f;
	struct path path;
	struct qstr name = { .name = "" };

	err = -ENFILE;
	inode = get_pipe_inode();
	if (!inode)
		goto err;

	err = -ENOMEM;
	path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	path.dentry->d_op = &pipefs_dentry_operations;
	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
	if (!f)
		goto err_dentry;
	f->f_mapping = inode->i_mapping;

	f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
	f->f_version = 0;

	return f;

 err_dentry:
	free_pipe_info(inode);
	path_put(&path);
	return ERR_PTR(err);

 err_inode:
	free_pipe_info(inode);
	iput(inode);
 err:
	return ERR_PTR(err);
}

void free_write_pipe(struct file *f)
{
	free_pipe_info(f->f_dentry->d_inode);
	path_put(&f->f_path);
	put_filp(f);
}

/*
 * Create the read-side struct file sharing the writer's path (and thus
 * the same pipefs inode).  Returns the file or an ERR_PTR.
 */
struct file *create_read_pipe(struct file *wrf, int flags)
{
	/* Grab pipe from the writer */
	struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
				    &read_pipefifo_fops);
	if (!f)
		return ERR_PTR(-ENFILE);

	path_get(&wrf->f_path);
	f->f_flags = O_RDONLY | (flags & O_NONBLOCK);

	return f;
}

/*
 * Core of pipe(2)/pipe2(2): build both ends and install them into the
 * caller's fd table.  Only O_CLOEXEC and O_NONBLOCK are accepted in
 * @flags.  On success fd[0] is the read end, fd[1] the write end.
 */
int do_pipe_flags(int *fd, int flags)
{
	struct file *fw, *fr;
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK))
		return -EINVAL;

	fw = create_write_pipe(flags);
	if (IS_ERR(fw))
		return PTR_ERR(fw);
	fr = create_read_pipe(fw, flags);
	error = PTR_ERR(fr);
	if (IS_ERR(fr))
		goto err_write_pipe;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd_install(fdr, fr);
	fd_install(fdw, fw);
	fd[0] = fdr;
	fd[1] = fdw;

	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	path_put(&fr->f_path);
	put_filp(fr);
 err_write_pipe:
	free_write_pipe(fw);
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	int fd[2];
	int error;

	error = do_pipe_flags(fd, flags);
	if (!error) {
		/* If we can't return the fds to userspace, undo both. */
		if (copy_to_user(fildes, fd, sizeof(fd))) {
			sys_close(fd[0]);
			sys_close(fd[1]);
			error = -EFAULT;
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or return -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		/* 'tail' = number of occupied slots that wrapped past the
		 * end of the old ring; 'head' = slots before the wrap. */
		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	/* Round up to whole pages, then to the next power of two. */
	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	/* Normalize the sysctl-written limit to a power-of-2 page count. */
	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	struct inode *i = file->f_path.dentry->d_inode;

	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
}

/*
 * fcntl(2) handler for F_SETPIPE_SZ / F_GETPIPE_SZ.  Returns the pipe
 * capacity in bytes on success, or a -errno.
 */
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->inode->i_mutex);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		/*
		 * NOTE(review): round_pipe_size() takes an unsigned int, so
		 * a 64-bit 'arg' is silently truncated here, and the
		 * power-of-2 rounding itself can overflow for huge values —
		 * verify against the later upstream overflow checks.
		 */
		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		/* Only privileged users may exceed the sysctl limit. */
		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static int pipefs_get_sb(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data,
			 struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.get_sb		= pipefs_get_sb,
	.kill_sb	= kill_anon_super,
};

/*
 * Register pipefs and keep a kernel-internal mount of it around for
 * pipe inode allocation.  Unregisters again if the mount fails.
 */
static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

static void __exit exit_pipe_fs(void)
{
	unregister_filesystem(&pipe_fs_type);
	mntput(pipe_mnt);
}

fs_initcall(init_pipe_fs);
module_exit(exit_pipe_fs);