/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc., and
 * to fix lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
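
/*
 * For reference, a minimal userspace sketch of the intended usage
 * (illustrative only, not part of this file; error handling elided):
 * data is spliced from a file into a pipe, then from the pipe to a
 * socket, without ever being copied to user space.
 *
 *	int pfd[2];
 *
 *	pipe(pfd);
 *	for (;;) {
 *		ssize_t n = splice(file_fd, NULL, pfd[1], NULL,
 *				   65536, SPLICE_F_MORE);
 *		if (n <= 0)
 *			break;
 *		while (n > 0) {
 *			ssize_t m = splice(pfd[0], NULL, sock_fd, NULL,
 *					   n, SPLICE_F_MORE);
 *			if (m <= 0)
 *				break;
 *			n -= m;
 *		}
 *	}
 */
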
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>

struct partial_page {
	unsigned int offset;
	unsigned int len;
};

/*
 * Passed to splice_to_pipe
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	const struct pipe_buf_operations *ops;	/* ops associated with output pipe */
};
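
/*
 * A minimal sketch of how a caller fills in the descriptor; see
 * __generic_file_splice_read() below for the real thing:
 *
 *	struct page *pages[PIPE_BUFFERS];
 *	struct partial_page partial[PIPE_BUFFERS];
 *	struct splice_pipe_desc spd = {
 *		.pages		= pages,
 *		.partial	= partial,
 *		.flags		= flags,
 *		.ops		= &page_cache_pipe_buf_ops,
 *	};
 */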

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache.  Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
				   struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = page_cache_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = generic_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/*
 * Pipe output worker. This sets up our pipe format with the page cache
 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
 */
static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
			      struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	while (page_nr < spd_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Don't try to second-guess the read-ahead logic, call into
	 * page_cache_readahead() like the page cache reads would do.
	 */
	page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Now fill in the holes:
	 */
	error = 0;

	/*
	 * Look up the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest.
	 */
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * Page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
					      GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start IO on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on
			 * waiting for an in-flight IO page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (TestSetPageLocked(page))
					break;
			} else
				lock_page(page);

			/*
			 * Page was truncated, stop here. If this isn't the
			 * first page, we'll just complete what we already
			 * added.
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * Page was already under IO and is now done, great.
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * Need to read in the page.
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * If this is the last page, see if we need to shrink
		 * the length and stop.
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * Force quit after adding this page.
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in to read from, updated on return
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will read pages from given file and fill them into a pipe.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;
	loff_t isize, left;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = 0;
	spliced = 0;
	while (len) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->pin(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->pin(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page on success.
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret)) {
			/*
			 * On failure the page is *not* locked, so we must
			 * not fall through to the unlock below. Retry on
			 * -EEXIST, someone else added the page meanwhile.
			 */
			page_cache_release(page);
			if (ret == -EEXIST)
				goto find_page;
			goto out_ret;
		}
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size.  Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret) {
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			goto find_page;
		}
		if (ret < 0)
			goto out;
		/*
		 * Partial write has happened, so 'ret' is already
		 * initialized to the number of bytes written; there is
		 * nothing we have to do here.
		 */
	} else
		ret = this_len;
	/*
	 * Return the number of bytes written and mark page as
	 * accessed, we are now done!
	 */
	mark_page_accessed(page);
out:
	unlock_page(page);
	page_cache_release(page);
out_ret:
	return ret;
}

/*
 * Pipe input worker. Most of this logic works like a regular pipe; the
 * key here is the 'actor' worker passed in, which actually moves the data
 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
			   struct file *out, loff_t *ppos, size_t len,
			   unsigned int flags, splice_actor *actor)
{
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = *ppos;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(pipe, buf, &sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd.len -= err;
			sd.pos += err;
			sd.total_len -= err;
			if (sd.len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd.total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquisition here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}
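
/*
 * A hedged sketch of how a custom actor plugs into splice_from_pipe();
 * 'my_actor' and 'my_splice_write' are illustrative names, not part of
 * this file. The actor consumes up to sd->len bytes from 'buf' and
 * returns the number of bytes handled (or a negative error):
 *
 *	static int my_actor(struct pipe_inode_info *pipe,
 *			    struct pipe_buffer *buf, struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->pin(pipe, buf);
 *		if (unlikely(ret))
 *			return ret;
 *		... consume sd->len bytes at buf->offset in buf->page ...
 *		return sd->len;
 *	}
 *
 *	static ssize_t my_splice_write(struct pipe_inode_info *pipe,
 *				       struct file *out, loff_t *ppos,
 *				       size_t len, unsigned int flags)
 *	{
 *		return splice_from_pipe(pipe, out, ppos, len, flags,
 *					my_actor);
 *	}
 */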

/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out to write to, updated on return
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file. The caller is responsible
 * for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out to write to, updated on return
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will send @len bytes from the pipe to a network socket. No data copying
 * is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
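
/*
 * A filesystem or driver typically wires these helpers straight into
 * its file_operations. A hedged sketch ('my_file_operations' is an
 * illustrative name, not a real instance in the tree):
 *
 *	static const struct file_operations my_file_operations = {
 *		.read		= do_sync_read,
 *		.write		= do_sync_write,
 *		.splice_read	= generic_file_splice_read,
 *		.splice_write	= generic_file_splice_write,
 *	};
 */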

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	loff_t out_off;
	umode_t i_mode;
	int i;

	/*
	 * We require the input being a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * Neither in nor out is a pipe, so set up an internal pipe attached
	 * to 'out' and transfer the wanted data from 'in' to 'out' through
	 * that.
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	out_off = 0;

	while (len) {
		size_t read_len, max_read_len;

		/*
		 * Do at most PIPE_BUFFERS pages worth of transfer:
		 */
		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));

		ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
		if (unlikely(ret < 0))
			goto out_release;

		read_len = ret;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = do_splice_from(pipe, out, &out_off, read_len,
				     flags & ~SPLICE_F_NONBLOCK);
		if (unlikely(ret < 0))
			goto out_release;

		bytes += ret;
		len -= ret;

		/*
		 * In nonblocking mode, if we got back a short read then
		 * that was due to either an IO error or due to the
		 * pagecache entry not being there. In the IO error case
		 * the _next_ splice attempt will produce a clean IO error
		 * return value (not a short read), so in both cases it's
		 * correct to break out of the loop here:
		 */
		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
			break;
	}

	pipe->nrbufs = pipe->curbuf = 0;

	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}
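
/*
 * A hedged sketch of an in-kernel caller (illustrative only): move
 * 'count' bytes from the start of 'in_file' to 'out_file' through the
 * per-task internal pipe:
 *
 *	loff_t pos = 0;
 *	long n;
 *
 *	n = do_splice_direct(in_file, &pos, out_file, count, 0);
 *	if (n < 0)
 *		return n;
 */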

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
 * not the other way around. Splicing from user memory is a simple operation
 * that can be supported without any funky alignment restrictions or nasty
 * vm tricks. We simply map in the user memory and fill it into a pipe.
 * The reverse isn't quite as easy, though. There are two possible solutions
 * for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (they
 *	  impose restrictions on both ends of the pipe).
 *
 * Alas, it isn't here.
 *
 */
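
/*
 * A minimal userspace sketch (illustrative only, error handling elided):
 * gift a buffer to the write end of a pipe. With SPLICE_F_GIFT the
 * buffer must be page-aligned and a multiple of the page size, and the
 * kernel may later steal the pages instead of copying them:
 *
 *	struct iovec iov = {
 *		.iov_base	= buf,
 *		.iov_len	= buf_len,
 *	};
 *	ssize_t n = vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT);
 */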
static long do_vmsplice(struct file *file, const struct iovec __user *iov,
			unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;
	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = do_vmsplice(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret) {
				ret = link_pipe(ipipe, opipe, len, flags);
				if (!ret && (flags & SPLICE_F_NONBLOCK))
					ret = -EAGAIN;
			}
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
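
/*
 * A minimal userspace sketch for tee(2) (illustrative only): duplicate
 * whatever sits in pipe 'a' into pipe 'b' without consuming it, then
 * drain 'a' to a file with splice():
 *
 *	ssize_t n = tee(a[0], b[1], INT_MAX, SPLICE_F_NONBLOCK);
 *	if (n > 0)
 *		splice(a[0], NULL, file_fd, NULL, n, 0);
 */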