/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>

/*
 * A byte range within one page: 'offset' bytes in, 'len' bytes long.
 * Used so that a pages[] map can carry non-contiguous ranges.
 */
struct partial_page {
	unsigned int offset;
	unsigned int len;
};

/*
 * Passed to splice_to_pipe
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	const struct pipe_buf_operations *ops;/* ops associated with output pipe */
};

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache.  Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		/*
		 * Try dropping fs-private state (e.g. buffer heads) so that
		 * remove_mapping() below has a chance to succeed. The return
		 * value is deliberately ignored here; if the release fails,
		 * remove_mapping() is expected to fail as well.
		 */
		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good. Note: on success the page is returned
		 * still locked.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

/*
 * Drop the pipe's reference on a page-cache backed buffer page.
 */
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Make sure the page behind this buffer is uptodate before the data is
 * consumed. Returns 0 if ok, -ENODATA if the page got truncated away,
 * or -EIO on a disk read error.
 */
static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
				   struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok afterall, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = page_cache_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/*
 * Stealing a user page is only allowed if the page was gifted to the
 * pipe (vmsplice with SPLICE_F_GIFT).
 */
static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = generic_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/*
 * Pipe output worker. This sets up our pipe format with the page cache
 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
 */
static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
			      struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			/*
			 * Hand the next page (and its partial range) over to
			 * the pipe; the pipe now owns the page reference.
			 */
			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * Pipe is full: wake any sleeping readers before we go to
		 * sleep waiting for room.
		 */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	/*
	 * Drop references on any pages that were not consumed by the pipe
	 * ('page_nr' is how far we got, 'spd_pages' is how many we had).
	 */
	while (page_nr < spd_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}

/*
 * Fill a splice_pipe_desc with up to PIPE_BUFFERS pagecache pages covering
 * [*ppos, *ppos + len), starting IO where needed, and push them into the
 * pipe. Returns number of bytes spliced, 0, or a negative error.
 */
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Don't try to 2nd guess the read-ahead logic, call into
	 * page_cache_readahead() like the page cache reads would do.
	 */
	page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Now fill in the holes:
	 */
	error = 0;

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest.
	 */
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				/* -EEXIST: someone beat us to it, retry */
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then dont block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (TestSetPageLocked(page))
					break;
			} else
				lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * lets just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will read pages from given file and fill them into a pipe.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;
	loff_t isize, left;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	/*
	 * Never splice past end of file; clamp the request.
	 */
	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = 0;
	spliced = 0;
	while (len) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
			/*
			 * NOTE(review): 0 bytes in blocking mode with
			 * nothing spliced yet retries with unchanged
			 * arguments -- presumably the next call makes
			 * progress; verify this cannot busy-loop.
			 */
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	int ret, more;

	/*
	 * Make sure the buffer contents are uptodate before handing
	 * the page to the network layer.
	 */
	ret = buf->ops->pin(pipe, buf);
	if (!ret) {
		/*
		 * Hint "more data coming" if the caller asked for it, or
		 * if this buffer does not complete the total request.
		 */
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option that
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
556 * - Destination page does not exist, we can add the pipe page to 557 * the page cache and avoid the copy. 558 * 559 * If asked to move pages to the output file (SPLICE_F_MOVE is set in 560 * sd->flags), we attempt to migrate pages from the pipe to the output 561 * file address space page cache. This is possible if no one else has 562 * the pipe page referenced outside of the pipe and page cache. If 563 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 564 * a new page in the output file page cache and fill/dirty that. 565 */ 566static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 567 struct splice_desc *sd) 568{ 569 struct file *file = sd->file; 570 struct address_space *mapping = file->f_mapping; 571 unsigned int offset, this_len; 572 struct page *page; 573 pgoff_t index; 574 int ret; 575 576 /* 577 * make sure the data in this buffer is uptodate 578 */ 579 ret = buf->ops->pin(pipe, buf); 580 if (unlikely(ret)) 581 return ret; 582 583 index = sd->pos >> PAGE_CACHE_SHIFT; 584 offset = sd->pos & ~PAGE_CACHE_MASK; 585 586 this_len = sd->len; 587 if (this_len + offset > PAGE_CACHE_SIZE) 588 this_len = PAGE_CACHE_SIZE - offset; 589 590find_page: 591 page = find_lock_page(mapping, index); 592 if (!page) { 593 ret = -ENOMEM; 594 page = page_cache_alloc_cold(mapping); 595 if (unlikely(!page)) 596 goto out_ret; 597 598 /* 599 * This will also lock the page 600 */ 601 ret = add_to_page_cache_lru(page, mapping, index, 602 GFP_KERNEL); 603 if (unlikely(ret)) 604 goto out; 605 } 606 607 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); 608 if (unlikely(ret)) { 609 loff_t isize = i_size_read(mapping->host); 610 611 if (ret != AOP_TRUNCATED_PAGE) 612 unlock_page(page); 613 page_cache_release(page); 614 if (ret == AOP_TRUNCATED_PAGE) 615 goto find_page; 616 617 /* 618 * prepare_write() may have instantiated a few blocks 619 * outside i_size. Trim these off again. 
620 */ 621 if (sd->pos + this_len > isize) 622 vmtruncate(mapping->host, isize); 623 624 goto out_ret; 625 } 626 627 if (buf->page != page) { 628 /* 629 * Careful, ->map() uses KM_USER0! 630 */ 631 char *src = buf->ops->map(pipe, buf, 1); 632 char *dst = kmap_atomic(page, KM_USER1); 633 634 memcpy(dst + offset, src + buf->offset, this_len); 635 flush_dcache_page(page); 636 kunmap_atomic(dst, KM_USER1); 637 buf->ops->unmap(pipe, buf, src); 638 } 639 640 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); 641 if (ret) { 642 if (ret == AOP_TRUNCATED_PAGE) { 643 page_cache_release(page); 644 goto find_page; 645 } 646 if (ret < 0) 647 goto out; 648 /* 649 * Partial write has happened, so 'ret' already initialized by 650 * number of bytes written, Where is nothing we have to do here. 651 */ 652 } else 653 ret = this_len; 654 /* 655 * Return the number of bytes written and mark page as 656 * accessed, we are now done! 657 */ 658 mark_page_accessed(page); 659out: 660 page_cache_release(page); 661 unlock_page(page); 662out_ret: 663 return ret; 664} 665 666/* 667 * Pipe input worker. Most of this logic works like a regular pipe, the 668 * key here is the 'actor' worker passed in that actually moves the data 669 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
			   struct file *out, loff_t *ppos, size_t len,
			   unsigned int flags, splice_actor *actor)
{
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = *ppos;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(pipe, buf, &sd);
			if (err <= 0) {
				/* -ENODATA is a benign "nothing there", not an error */
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			/*
			 * Advance the buffer by however much the actor
			 * consumed; the buffer may only be partially used.
			 */
			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd.len -= err;
			sd.pos += err;
			sd.total_len -= err;
			if (sd.len)
				continue;

			/*
			 * Buffer fully consumed: release it and advance the
			 * pipe ring.
			 */
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd.total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * Pipe drained: wake sleeping writers before we go to
		 * sleep waiting for more data.
		 */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/*
 * Locked wrapper around __splice_from_pipe(): takes the relevant inode
 * mutexes in a stable order before running the actor.
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquiry here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}

/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file. The caller is responsible
 * for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	/* Drop suid/sgid bits before writing, as a regular write would */
	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	/*
	 * Only take i_mutex for the suid removal if it is actually
	 * needed; the cheap check runs lockless first.
	 */
	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe inode
 * @out:	socket to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will send @len bytes from the pipe to a network socket. No data copying
 * is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

/*
 * Splice directly from 'in' to 'out' by pumping the data through a
 * per-task internal pipe (cached in current->splice_pipe).
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	loff_t out_off;
	umode_t i_mode;
	int i;

	/*
	 * We require the input being a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	out_off = 0;

	while (len) {
		size_t read_len, max_read_len;

		/*
		 * Do at most PIPE_BUFFERS pages worth of transfer:
		 */
		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));

		ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
		if (unlikely(ret < 0))
			goto out_release;

		read_len = ret;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = do_splice_from(pipe, out, &out_off, read_len,
				     flags & ~SPLICE_F_NONBLOCK);
		if (unlikely(ret < 0))
			goto out_release;

		bytes += ret;
		len -= ret;

		/*
		 * In nonblocking mode, if we got back a short read then
		 * that was due to either an IO error or due to the
		 * pagecache entry not being there. In the IO error case
		 * the _next_ splice attempt will produce a clean IO error
		 * return value (not a short read), so in both cases it's
		 * correct to break out of the loop here:
		 */
		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
			break;
	}

	pipe->nrbufs = pipe->curbuf = 0;

	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	/*
	 * Input side is a pipe: splice from pipe to file. An offset may
	 * only be given for the non-pipe side.
	 */
	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/*
	 * Output side is a pipe: splice from file to pipe.
	 */
	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* Neither side is a pipe */
	return -EINVAL;
}

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	/* Partial success still counts: report buffers mapped so far */
	if (buffers)
		return buffers;

	return error;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
 * not the other way around. Splicing from user memory is a simple operation
 * that can be supported without any funky alignment restrictions or nasty
 * vm tricks. We simply map in the user memory and fill them into a pipe.
 * The reverse isn't quite as easy, though. There are two possible solutions
 * for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restriction limitations on both ends of the pipe).
 *
 * Alas, it isn't here.
1269 * 1270 */ 1271static long do_vmsplice(struct file *file, const struct iovec __user *iov, 1272 unsigned long nr_segs, unsigned int flags) 1273{ 1274 struct pipe_inode_info *pipe; 1275 struct page *pages[PIPE_BUFFERS]; 1276 struct partial_page partial[PIPE_BUFFERS]; 1277 struct splice_pipe_desc spd = { 1278 .pages = pages, 1279 .partial = partial, 1280 .flags = flags, 1281 .ops = &user_page_pipe_buf_ops, 1282 }; 1283 1284 pipe = pipe_info(file->f_path.dentry->d_inode); 1285 if (!pipe) 1286 return -EBADF; 1287 if (unlikely(nr_segs > UIO_MAXIOV)) 1288 return -EINVAL; 1289 else if (unlikely(!nr_segs)) 1290 return 0; 1291 1292 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, 1293 flags & SPLICE_F_GIFT); 1294 if (spd.nr_pages <= 0) 1295 return spd.nr_pages; 1296 1297 return splice_to_pipe(pipe, &spd); 1298} 1299 1300asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, 1301 unsigned long nr_segs, unsigned int flags) 1302{ 1303 struct file *file; 1304 long error; 1305 int fput; 1306 1307 error = -EBADF; 1308 file = fget_light(fd, &fput); 1309 if (file) { 1310 if (file->f_mode & FMODE_WRITE) 1311 error = do_vmsplice(file, iov, nr_segs, flags); 1312 1313 fput_light(file, fput); 1314 } 1315 1316 return error; 1317} 1318 1319asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 1320 int fd_out, loff_t __user *off_out, 1321 size_t len, unsigned int flags) 1322{ 1323 long error; 1324 struct file *in, *out; 1325 int fput_in, fput_out; 1326 1327 if (unlikely(!len)) 1328 return 0; 1329 1330 error = -EBADF; 1331 in = fget_light(fd_in, &fput_in); 1332 if (in) { 1333 if (in->f_mode & FMODE_READ) { 1334 out = fget_light(fd_out, &fput_out); 1335 if (out) { 1336 if (out->f_mode & FMODE_WRITE) 1337 error = do_splice(in, off_in, 1338 out, off_out, 1339 len, flags); 1340 fput_light(out, fput_out); 1341 } 1342 } 1343 1344 fput_light(in, fput_in); 1345 } 1346 1347 return error; 1348} 1349 1350/* 1351 * Make sure there's data to read. 
Wait for input if we can, otherwise 1352 * return an appropriate error. 1353 */ 1354static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1355{ 1356 int ret; 1357 1358 /* 1359 * Check ->nrbufs without the inode lock first. This function 1360 * is speculative anyways, so missing one is ok. 1361 */ 1362 if (pipe->nrbufs) 1363 return 0; 1364 1365 ret = 0; 1366 mutex_lock(&pipe->inode->i_mutex); 1367 1368 while (!pipe->nrbufs) { 1369 if (signal_pending(current)) { 1370 ret = -ERESTARTSYS; 1371 break; 1372 } 1373 if (!pipe->writers) 1374 break; 1375 if (!pipe->waiting_writers) { 1376 if (flags & SPLICE_F_NONBLOCK) { 1377 ret = -EAGAIN; 1378 break; 1379 } 1380 } 1381 pipe_wait(pipe); 1382 } 1383 1384 mutex_unlock(&pipe->inode->i_mutex); 1385 return ret; 1386} 1387 1388/* 1389 * Make sure there's writeable room. Wait for room if we can, otherwise 1390 * return an appropriate error. 1391 */ 1392static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1393{ 1394 int ret; 1395 1396 /* 1397 * Check ->nrbufs without the inode lock first. This function 1398 * is speculative anyways, so missing one is ok. 1399 */ 1400 if (pipe->nrbufs < PIPE_BUFFERS) 1401 return 0; 1402 1403 ret = 0; 1404 mutex_lock(&pipe->inode->i_mutex); 1405 1406 while (pipe->nrbufs >= PIPE_BUFFERS) { 1407 if (!pipe->readers) { 1408 send_sig(SIGPIPE, current, 0); 1409 ret = -EPIPE; 1410 break; 1411 } 1412 if (flags & SPLICE_F_NONBLOCK) { 1413 ret = -EAGAIN; 1414 break; 1415 } 1416 if (signal_pending(current)) { 1417 ret = -ERESTARTSYS; 1418 break; 1419 } 1420 pipe->waiting_writers++; 1421 pipe_wait(pipe); 1422 pipe->waiting_writers--; 1423 } 1424 1425 mutex_unlock(&pipe->inode->i_mutex); 1426 return ret; 1427} 1428 1429/* 1430 * Link contents of ipipe to opipe. 
1431 */ 1432static int link_pipe(struct pipe_inode_info *ipipe, 1433 struct pipe_inode_info *opipe, 1434 size_t len, unsigned int flags) 1435{ 1436 struct pipe_buffer *ibuf, *obuf; 1437 int ret = 0, i = 0, nbuf; 1438 1439 inode_double_lock(ipipe->inode, opipe->inode); 1440 1441 do { 1442 if (!opipe->readers) { 1443 send_sig(SIGPIPE, current, 0); 1444 if (!ret) 1445 ret = -EPIPE; 1446 break; 1447 } 1448 1449 /* 1450 * If we have iterated all input buffers or ran out of 1451 * output room, break. 1452 */ 1453 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1454 break; 1455 1456 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1457 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1458 1459 /* 1460 * Get a reference to this pipe buffer, 1461 * so we can copy the contents over. 1462 */ 1463 ibuf->ops->get(ipipe, ibuf); 1464 1465 obuf = opipe->bufs + nbuf; 1466 *obuf = *ibuf; 1467 1468 /* 1469 * Don't inherit the gift flag, we need to 1470 * prevent multiple steals of this page. 1471 */ 1472 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1473 1474 if (obuf->len > len) 1475 obuf->len = len; 1476 1477 opipe->nrbufs++; 1478 ret += obuf->len; 1479 len -= obuf->len; 1480 i++; 1481 } while (len); 1482 1483 inode_double_unlock(ipipe->inode, opipe->inode); 1484 1485 /* 1486 * If we put data in the output pipe, wakeup any potential readers. 1487 */ 1488 if (ret > 0) { 1489 smp_mb(); 1490 if (waitqueue_active(&opipe->wait)) 1491 wake_up_interruptible(&opipe->wait); 1492 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); 1493 } 1494 1495 return ret; 1496} 1497 1498/* 1499 * This is a tee(1) implementation that works on pipes. It doesn't copy 1500 * any data, it simply references the 'in' pages on the 'out' pipe. 1501 * The 'flags' used are the SPLICE_F_* variants, currently the only 1502 * applicable one is SPLICE_F_NONBLOCK. 
1503 */ 1504static long do_tee(struct file *in, struct file *out, size_t len, 1505 unsigned int flags) 1506{ 1507 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 1508 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 1509 int ret = -EINVAL; 1510 1511 /* 1512 * Duplicate the contents of ipipe to opipe without actually 1513 * copying the data. 1514 */ 1515 if (ipipe && opipe && ipipe != opipe) { 1516 /* 1517 * Keep going, unless we encounter an error. The ipipe/opipe 1518 * ordering doesn't really matter. 1519 */ 1520 ret = link_ipipe_prep(ipipe, flags); 1521 if (!ret) { 1522 ret = link_opipe_prep(opipe, flags); 1523 if (!ret) { 1524 ret = link_pipe(ipipe, opipe, len, flags); 1525 if (!ret && (flags & SPLICE_F_NONBLOCK)) 1526 ret = -EAGAIN; 1527 } 1528 } 1529 } 1530 1531 return ret; 1532} 1533 1534asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) 1535{ 1536 struct file *in; 1537 int error, fput_in; 1538 1539 if (unlikely(!len)) 1540 return 0; 1541 1542 error = -EBADF; 1543 in = fget_light(fdin, &fput_in); 1544 if (in) { 1545 if (in->f_mode & FMODE_READ) { 1546 int fput_out; 1547 struct file *out = fget_light(fdout, &fput_out); 1548 1549 if (out) { 1550 if (out->f_mode & FMODE_WRITE) 1551 error = do_tee(in, out, len, flags); 1552 fput_light(out, fput_out); 1553 } 1554 } 1555 fput_light(in, fput_in); 1556 } 1557 1558 return error; 1559} 1560