1// SPDX-License-Identifier: GPL-2.0
2/*
3 *  linux/fs/read_write.c
4 *
5 *  Copyright (C) 1991, 1992  Linus Torvalds
6 */
7
8#include <linux/slab.h>
9#include <linux/stat.h>
10#include <linux/sched/xacct.h>
11#include <linux/fcntl.h>
12#include <linux/file.h>
13#include <linux/uio.h>
14#include <linux/fsnotify.h>
15#include <linux/security.h>
16#include <linux/export.h>
17#include <linux/syscalls.h>
18#include <linux/pagemap.h>
19#include <linux/splice.h>
20#include <linux/compat.h>
21#include <linux/mount.h>
22#include <linux/fs.h>
23#include "internal.h"
24
25#include <linux/uaccess.h>
26#include <asm/unistd.h>
27
28const struct file_operations generic_ro_fops = {
29	.llseek		= generic_file_llseek,
30	.read_iter	= generic_file_read_iter,
31	.mmap		= generic_file_readonly_mmap,
32	.splice_read	= filemap_splice_read,
33};
34
35EXPORT_SYMBOL(generic_ro_fops);
36
37static inline bool unsigned_offsets(struct file *file)
38{
39	return file->f_mode & FMODE_UNSIGNED_OFFSET;
40}
41
42/**
43 * vfs_setpos - update the file offset for lseek
44 * @file:	file structure in question
45 * @offset:	file offset to seek to
46 * @maxsize:	maximum file size
47 *
48 * This is a low-level filesystem helper for updating the file offset to
49 * the value specified by @offset if the given offset is valid and it is
50 * not equal to the current file offset.
51 *
52 * Return the specified offset on success and -EINVAL on invalid offset.
53 */
54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
55{
56	if (offset < 0 && !unsigned_offsets(file))
57		return -EINVAL;
58	if (offset > maxsize)
59		return -EINVAL;
60
61	if (offset != file->f_pos) {
62		file->f_pos = offset;
63		file->f_version = 0;
64	}
65	return offset;
66}
67EXPORT_SYMBOL(vfs_setpos);
68
69/**
70 * generic_file_llseek_size - generic llseek implementation for regular files
71 * @file:	file structure to seek on
72 * @offset:	file offset to seek to
73 * @whence:	type of seek
74 * @maxsize:	max size of this file in file system
75 * @eof:	offset used for SEEK_END position
76 *
77 * This is a variant of generic_file_llseek that allows passing in a custom
78 * maximum file size and a custom EOF position, for e.g. hashed directories
79 *
80 * Synchronization:
81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
83 * read/writes behave like SEEK_SET against seeks.
84 */
85loff_t
86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
87		loff_t maxsize, loff_t eof)
88{
89	switch (whence) {
90	case SEEK_END:
91		offset += eof;
92		break;
93	case SEEK_CUR:
94		/*
95		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
96		 * position-querying operation.  Avoid rewriting the "same"
97		 * f_pos value back to the file because a concurrent read(),
98		 * write() or lseek() might have altered it
99		 */
100		if (offset == 0)
101			return file->f_pos;
102		/*
103		 * f_lock protects against read/modify/write race with other
104		 * SEEK_CURs. Note that parallel writes and reads behave
105		 * like SEEK_SET.
106		 */
107		spin_lock(&file->f_lock);
108		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
109		spin_unlock(&file->f_lock);
110		return offset;
111	case SEEK_DATA:
112		/*
113		 * In the generic case the entire file is data, so as long as
114		 * offset isn't at the end of the file then the offset is data.
115		 */
116		if ((unsigned long long)offset >= eof)
117			return -ENXIO;
118		break;
119	case SEEK_HOLE:
120		/*
121		 * There is a virtual hole at the end of the file, so as long as
122		 * offset isn't i_size or larger, return i_size.
123		 */
124		if ((unsigned long long)offset >= eof)
125			return -ENXIO;
126		offset = eof;
127		break;
128	}
129
130	return vfs_setpos(file, offset, maxsize);
131}
132EXPORT_SYMBOL(generic_file_llseek_size);
133
134/**
135 * generic_file_llseek - generic llseek implementation for regular files
136 * @file:	file structure to seek on
137 * @offset:	file offset to seek to
138 * @whence:	type of seek
139 *
140 * This is a generic implemenation of ->llseek useable for all normal local
141 * filesystems.  It just updates the file offset to the value specified by
142 * @offset and @whence.
143 */
144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145{
146	struct inode *inode = file->f_mapping->host;
147
148	return generic_file_llseek_size(file, offset, whence,
149					inode->i_sb->s_maxbytes,
150					i_size_read(inode));
151}
152EXPORT_SYMBOL(generic_file_llseek);
153
154/**
155 * fixed_size_llseek - llseek implementation for fixed-sized devices
156 * @file:	file structure to seek on
157 * @offset:	file offset to seek to
158 * @whence:	type of seek
159 * @size:	size of the file
160 *
161 */
162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
163{
164	switch (whence) {
165	case SEEK_SET: case SEEK_CUR: case SEEK_END:
166		return generic_file_llseek_size(file, offset, whence,
167						size, size);
168	default:
169		return -EINVAL;
170	}
171}
172EXPORT_SYMBOL(fixed_size_llseek);
173
174/**
175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
176 * @file:	file structure to seek on
177 * @offset:	file offset to seek to
178 * @whence:	type of seek
179 *
180 */
181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
182{
183	switch (whence) {
184	case SEEK_SET: case SEEK_CUR:
185		return generic_file_llseek_size(file, offset, whence,
186						OFFSET_MAX, 0);
187	default:
188		return -EINVAL;
189	}
190}
191EXPORT_SYMBOL(no_seek_end_llseek);
192
193/**
194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
195 * @file:	file structure to seek on
196 * @offset:	file offset to seek to
197 * @whence:	type of seek
198 * @size:	maximal offset allowed
199 *
200 */
201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
202{
203	switch (whence) {
204	case SEEK_SET: case SEEK_CUR:
205		return generic_file_llseek_size(file, offset, whence,
206						size, 0);
207	default:
208		return -EINVAL;
209	}
210}
211EXPORT_SYMBOL(no_seek_end_llseek_size);
212
213/**
214 * noop_llseek - No Operation Performed llseek implementation
215 * @file:	file structure to seek on
216 * @offset:	file offset to seek to
217 * @whence:	type of seek
218 *
219 * This is an implementation of ->llseek useable for the rare special case when
220 * userspace expects the seek to succeed but the (device) file is actually not
221 * able to perform the seek. In this case you use noop_llseek() instead of
222 * falling back to the default implementation of ->llseek.
223 */
224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
225{
226	return file->f_pos;
227}
228EXPORT_SYMBOL(noop_llseek);
229
230loff_t default_llseek(struct file *file, loff_t offset, int whence)
231{
232	struct inode *inode = file_inode(file);
233	loff_t retval;
234
235	inode_lock(inode);
236	switch (whence) {
237		case SEEK_END:
238			offset += i_size_read(inode);
239			break;
240		case SEEK_CUR:
241			if (offset == 0) {
242				retval = file->f_pos;
243				goto out;
244			}
245			offset += file->f_pos;
246			break;
247		case SEEK_DATA:
248			/*
249			 * In the generic case the entire file is data, so as
250			 * long as offset isn't at the end of the file then the
251			 * offset is data.
252			 */
253			if (offset >= inode->i_size) {
254				retval = -ENXIO;
255				goto out;
256			}
257			break;
258		case SEEK_HOLE:
259			/*
260			 * There is a virtual hole at the end of the file, so
261			 * as long as offset isn't i_size or larger, return
262			 * i_size.
263			 */
264			if (offset >= inode->i_size) {
265				retval = -ENXIO;
266				goto out;
267			}
268			offset = inode->i_size;
269			break;
270	}
271	retval = -EINVAL;
272	if (offset >= 0 || unsigned_offsets(file)) {
273		if (offset != file->f_pos) {
274			file->f_pos = offset;
275			file->f_version = 0;
276		}
277		retval = offset;
278	}
279out:
280	inode_unlock(inode);
281	return retval;
282}
283EXPORT_SYMBOL(default_llseek);
284
285loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
286{
287	if (!(file->f_mode & FMODE_LSEEK))
288		return -ESPIPE;
289	return file->f_op->llseek(file, offset, whence);
290}
291EXPORT_SYMBOL(vfs_llseek);
292
293static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
294{
295	off_t retval;
296	struct fd f = fdget_pos(fd);
297	if (!f.file)
298		return -EBADF;
299
300	retval = -EINVAL;
301	if (whence <= SEEK_MAX) {
302		loff_t res = vfs_llseek(f.file, offset, whence);
303		retval = res;
304		if (res != (loff_t)retval)
305			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
306	}
307	fdput_pos(f);
308	return retval;
309}
310
/* lseek system call: forwards its arguments unchanged to ksys_lseek(). */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
315
#ifdef CONFIG_COMPAT
/*
 * Compat lseek system call: takes the offset as compat_off_t and hands it
 * to the same ksys_lseek() helper as the native entry point.
 */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif
322
323#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
324	defined(__ARCH_WANT_SYS_LLSEEK)
325SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
326		unsigned long, offset_low, loff_t __user *, result,
327		unsigned int, whence)
328{
329	int retval;
330	struct fd f = fdget_pos(fd);
331	loff_t offset;
332
333	if (!f.file)
334		return -EBADF;
335
336	retval = -EINVAL;
337	if (whence > SEEK_MAX)
338		goto out_putf;
339
340	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
341			whence);
342
343	retval = (int)offset;
344	if (offset >= 0) {
345		retval = -EFAULT;
346		if (!copy_to_user(result, &offset, sizeof(offset)))
347			retval = 0;
348	}
349out_putf:
350	fdput_pos(f);
351	return retval;
352}
353#endif
354
/*
 * rw_verify_area - sanity and permission checks before a read or write
 * @read_write:	READ selects MAY_READ, anything else selects MAY_WRITE
 * @file:	file about to be accessed
 * @ppos:	position of the access, or NULL to skip the position checks
 * @count:	number of bytes about to be transferred
 *
 * Returns -EINVAL or -EOVERFLOW when @count or the position range is not
 * acceptable, a non-zero value from security_file_permission() when that
 * hook refuses, and otherwise the result of fsnotify_file_area_perm().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	int mask = read_write == READ ? MAY_READ : MAY_WRITE;
	int ret;

	/* A count that is negative when viewed as ssize_t is never valid. */
	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (ppos) {
		loff_t pos = *ppos;

		if (unlikely(pos < 0)) {
			/* Negative positions need FMODE_UNSIGNED_OFFSET. */
			if (!unsigned_offsets(file))
				return -EINVAL;
			if (count >= -pos) /* both values are in 0..LLONG_MAX */
				return -EOVERFLOW;
		} else if (unlikely((loff_t) (pos + count) < 0)) {
			/*
			 * The end of the range turns negative as a loff_t;
			 * that is only tolerated with FMODE_UNSIGNED_OFFSET.
			 */
			if (!unsigned_offsets(file))
				return -EINVAL;
		}
	}

	ret = security_file_permission(file, mask);
	if (ret)
		return ret;

	return fsnotify_file_area_perm(file, mask, ppos, count);
}
EXPORT_SYMBOL(rw_verify_area);
384
385static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
386{
387	struct kiocb kiocb;
388	struct iov_iter iter;
389	ssize_t ret;
390
391	init_sync_kiocb(&kiocb, filp);
392	kiocb.ki_pos = (ppos ? *ppos : 0);
393	iov_iter_ubuf(&iter, ITER_DEST, buf, len);
394
395	ret = call_read_iter(filp, &kiocb, &iter);
396	BUG_ON(ret == -EIOCBQUEUED);
397	if (ppos)
398		*ppos = kiocb.ki_pos;
399	return ret;
400}
401
402static int warn_unsupported(struct file *file, const char *op)
403{
404	pr_warn_ratelimited(
405		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
406		op, file, current->pid, current->comm);
407	return -EINVAL;
408}
409
410ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
411{
412	struct kvec iov = {
413		.iov_base	= buf,
414		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
415	};
416	struct kiocb kiocb;
417	struct iov_iter iter;
418	ssize_t ret;
419
420	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
421		return -EINVAL;
422	if (!(file->f_mode & FMODE_CAN_READ))
423		return -EINVAL;
424	/*
425	 * Also fail if ->read_iter and ->read are both wired up as that
426	 * implies very convoluted semantics.
427	 */
428	if (unlikely(!file->f_op->read_iter || file->f_op->read))
429		return warn_unsupported(file, "read");
430
431	init_sync_kiocb(&kiocb, file);
432	kiocb.ki_pos = pos ? *pos : 0;
433	iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
434	ret = file->f_op->read_iter(&kiocb, &iter);
435	if (ret > 0) {
436		if (pos)
437			*pos = kiocb.ki_pos;
438		fsnotify_access(file);
439		add_rchar(current, ret);
440	}
441	inc_syscr(current);
442	return ret;
443}
444
445ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
446{
447	ssize_t ret;
448
449	ret = rw_verify_area(READ, file, pos, count);
450	if (ret)
451		return ret;
452	return __kernel_read(file, buf, count, pos);
453}
454EXPORT_SYMBOL(kernel_read);
455
456ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
457{
458	ssize_t ret;
459
460	if (!(file->f_mode & FMODE_READ))
461		return -EBADF;
462	if (!(file->f_mode & FMODE_CAN_READ))
463		return -EINVAL;
464	if (unlikely(!access_ok(buf, count)))
465		return -EFAULT;
466
467	ret = rw_verify_area(READ, file, pos, count);
468	if (ret)
469		return ret;
470	if (count > MAX_RW_COUNT)
471		count =  MAX_RW_COUNT;
472
473	if (file->f_op->read)
474		ret = file->f_op->read(file, buf, count, pos);
475	else if (file->f_op->read_iter)
476		ret = new_sync_read(file, buf, count, pos);
477	else
478		ret = -EINVAL;
479	if (ret > 0) {
480		fsnotify_access(file);
481		add_rchar(current, ret);
482	}
483	inc_syscr(current);
484	return ret;
485}
486
487static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
488{
489	struct kiocb kiocb;
490	struct iov_iter iter;
491	ssize_t ret;
492
493	init_sync_kiocb(&kiocb, filp);
494	kiocb.ki_pos = (ppos ? *ppos : 0);
495	iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
496
497	ret = call_write_iter(filp, &kiocb, &iter);
498	BUG_ON(ret == -EIOCBQUEUED);
499	if (ret > 0 && ppos)
500		*ppos = kiocb.ki_pos;
501	return ret;
502}
503
504/* caller is responsible for file_start_write/file_end_write */
505ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
506{
507	struct kiocb kiocb;
508	ssize_t ret;
509
510	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
511		return -EBADF;
512	if (!(file->f_mode & FMODE_CAN_WRITE))
513		return -EINVAL;
514	/*
515	 * Also fail if ->write_iter and ->write are both wired up as that
516	 * implies very convoluted semantics.
517	 */
518	if (unlikely(!file->f_op->write_iter || file->f_op->write))
519		return warn_unsupported(file, "write");
520
521	init_sync_kiocb(&kiocb, file);
522	kiocb.ki_pos = pos ? *pos : 0;
523	ret = file->f_op->write_iter(&kiocb, from);
524	if (ret > 0) {
525		if (pos)
526			*pos = kiocb.ki_pos;
527		fsnotify_modify(file);
528		add_wchar(current, ret);
529	}
530	inc_syscw(current);
531	return ret;
532}
533
534/* caller is responsible for file_start_write/file_end_write */
535ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
536{
537	struct kvec iov = {
538		.iov_base	= (void *)buf,
539		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
540	};
541	struct iov_iter iter;
542	iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
543	return __kernel_write_iter(file, &iter, pos);
544}
545/*
546 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
547 * but autofs is one of the few internal kernel users that actually
548 * wants this _and_ can be built as a module. So we need to export
549 * this symbol for autofs, even though it really isn't appropriate
550 * for any other kernel modules.
551 */
552EXPORT_SYMBOL_GPL(__kernel_write);
553
554ssize_t kernel_write(struct file *file, const void *buf, size_t count,
555			    loff_t *pos)
556{
557	ssize_t ret;
558
559	ret = rw_verify_area(WRITE, file, pos, count);
560	if (ret)
561		return ret;
562
563	file_start_write(file);
564	ret =  __kernel_write(file, buf, count, pos);
565	file_end_write(file);
566	return ret;
567}
568EXPORT_SYMBOL(kernel_write);
569
570ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
571{
572	ssize_t ret;
573
574	if (!(file->f_mode & FMODE_WRITE))
575		return -EBADF;
576	if (!(file->f_mode & FMODE_CAN_WRITE))
577		return -EINVAL;
578	if (unlikely(!access_ok(buf, count)))
579		return -EFAULT;
580
581	ret = rw_verify_area(WRITE, file, pos, count);
582	if (ret)
583		return ret;
584	if (count > MAX_RW_COUNT)
585		count =  MAX_RW_COUNT;
586	file_start_write(file);
587	if (file->f_op->write)
588		ret = file->f_op->write(file, buf, count, pos);
589	else if (file->f_op->write_iter)
590		ret = new_sync_write(file, buf, count, pos);
591	else
592		ret = -EINVAL;
593	if (ret > 0) {
594		fsnotify_modify(file);
595		add_wchar(current, ret);
596	}
597	inc_syscw(current);
598	file_end_write(file);
599	return ret;
600}
601
602/* file_ppos returns &file->f_pos or NULL if file is stream */
603static inline loff_t *file_ppos(struct file *file)
604{
605	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
606}
607
608ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
609{
610	struct fd f = fdget_pos(fd);
611	ssize_t ret = -EBADF;
612
613	if (f.file) {
614		loff_t pos, *ppos = file_ppos(f.file);
615		if (ppos) {
616			pos = *ppos;
617			ppos = &pos;
618		}
619		ret = vfs_read(f.file, buf, count, ppos);
620		if (ret >= 0 && ppos)
621			f.file->f_pos = pos;
622		fdput_pos(f);
623	}
624	return ret;
625}
626
/* read system call: forwards its arguments unchanged to ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	return ksys_read(fd, buf, count);
}
631
632ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
633{
634	struct fd f = fdget_pos(fd);
635	ssize_t ret = -EBADF;
636
637	if (f.file) {
638		loff_t pos, *ppos = file_ppos(f.file);
639		if (ppos) {
640			pos = *ppos;
641			ppos = &pos;
642		}
643		ret = vfs_write(f.file, buf, count, ppos);
644		if (ret >= 0 && ppos)
645			f.file->f_pos = pos;
646		fdput_pos(f);
647	}
648
649	return ret;
650}
651
/* write system call: forwards its arguments unchanged to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}
657
658ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
659		     loff_t pos)
660{
661	struct fd f;
662	ssize_t ret = -EBADF;
663
664	if (pos < 0)
665		return -EINVAL;
666
667	f = fdget(fd);
668	if (f.file) {
669		ret = -ESPIPE;
670		if (f.file->f_mode & FMODE_PREAD)
671			ret = vfs_read(f.file, buf, count, &pos);
672		fdput(f);
673	}
674
675	return ret;
676}
677
/* pread64 system call: forwards its arguments unchanged to ksys_pread64(). */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	return ksys_pread64(fd, buf, count, pos);
}
683
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
/*
 * Compat pread64 system call: the position is declared with
 * compat_arg_u64_dual() and reassembled with compat_arg_u64_glue() before
 * being handed to ksys_pread64().
 */
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif
691
692ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
693		      size_t count, loff_t pos)
694{
695	struct fd f;
696	ssize_t ret = -EBADF;
697
698	if (pos < 0)
699		return -EINVAL;
700
701	f = fdget(fd);
702	if (f.file) {
703		ret = -ESPIPE;
704		if (f.file->f_mode & FMODE_PWRITE)
705			ret = vfs_write(f.file, buf, count, &pos);
706		fdput(f);
707	}
708
709	return ret;
710}
711
/* pwrite64 system call: forwards its arguments unchanged to ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
			 size_t, count, loff_t, pos)
{
	return ksys_pwrite64(fd, buf, count, pos);
}
717
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
/*
 * Compat pwrite64 system call: the position is declared with
 * compat_arg_u64_dual() and reassembled with compat_arg_u64_glue() before
 * being handed to ksys_pwrite64().
 */
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif
725
726static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
727		loff_t *ppos, int type, rwf_t flags)
728{
729	struct kiocb kiocb;
730	ssize_t ret;
731
732	init_sync_kiocb(&kiocb, filp);
733	ret = kiocb_set_rw_flags(&kiocb, flags);
734	if (ret)
735		return ret;
736	kiocb.ki_pos = (ppos ? *ppos : 0);
737
738	if (type == READ)
739		ret = call_read_iter(filp, &kiocb, iter);
740	else
741		ret = call_write_iter(filp, &kiocb, iter);
742	BUG_ON(ret == -EIOCBQUEUED);
743	if (ppos)
744		*ppos = kiocb.ki_pos;
745	return ret;
746}
747
748/* Do it by hand, with file-ops */
749static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
750		loff_t *ppos, int type, rwf_t flags)
751{
752	ssize_t ret = 0;
753
754	if (flags & ~RWF_HIPRI)
755		return -EOPNOTSUPP;
756
757	while (iov_iter_count(iter)) {
758		ssize_t nr;
759
760		if (type == READ) {
761			nr = filp->f_op->read(filp, iter_iov_addr(iter),
762						iter_iov_len(iter), ppos);
763		} else {
764			nr = filp->f_op->write(filp, iter_iov_addr(iter),
765						iter_iov_len(iter), ppos);
766		}
767
768		if (nr < 0) {
769			if (!ret)
770				ret = nr;
771			break;
772		}
773		ret += nr;
774		if (nr != iter_iov_len(iter))
775			break;
776		iov_iter_advance(iter, nr);
777	}
778
779	return ret;
780}
781
782ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
783			   struct iov_iter *iter)
784{
785	size_t tot_len;
786	ssize_t ret = 0;
787
788	if (!file->f_op->read_iter)
789		return -EINVAL;
790	if (!(file->f_mode & FMODE_READ))
791		return -EBADF;
792	if (!(file->f_mode & FMODE_CAN_READ))
793		return -EINVAL;
794
795	tot_len = iov_iter_count(iter);
796	if (!tot_len)
797		goto out;
798	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
799	if (ret < 0)
800		return ret;
801
802	ret = call_read_iter(file, iocb, iter);
803out:
804	if (ret >= 0)
805		fsnotify_access(file);
806	return ret;
807}
808EXPORT_SYMBOL(vfs_iocb_iter_read);
809
810ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
811		      rwf_t flags)
812{
813	size_t tot_len;
814	ssize_t ret = 0;
815
816	if (!file->f_op->read_iter)
817		return -EINVAL;
818	if (!(file->f_mode & FMODE_READ))
819		return -EBADF;
820	if (!(file->f_mode & FMODE_CAN_READ))
821		return -EINVAL;
822
823	tot_len = iov_iter_count(iter);
824	if (!tot_len)
825		goto out;
826	ret = rw_verify_area(READ, file, ppos, tot_len);
827	if (ret < 0)
828		return ret;
829
830	ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
831out:
832	if (ret >= 0)
833		fsnotify_access(file);
834	return ret;
835}
836EXPORT_SYMBOL(vfs_iter_read);
837
838/*
839 * Caller is responsible for calling kiocb_end_write() on completion
840 * if async iocb was queued.
841 */
842ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
843			    struct iov_iter *iter)
844{
845	size_t tot_len;
846	ssize_t ret = 0;
847
848	if (!file->f_op->write_iter)
849		return -EINVAL;
850	if (!(file->f_mode & FMODE_WRITE))
851		return -EBADF;
852	if (!(file->f_mode & FMODE_CAN_WRITE))
853		return -EINVAL;
854
855	tot_len = iov_iter_count(iter);
856	if (!tot_len)
857		return 0;
858	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
859	if (ret < 0)
860		return ret;
861
862	kiocb_start_write(iocb);
863	ret = call_write_iter(file, iocb, iter);
864	if (ret != -EIOCBQUEUED)
865		kiocb_end_write(iocb);
866	if (ret > 0)
867		fsnotify_modify(file);
868
869	return ret;
870}
871EXPORT_SYMBOL(vfs_iocb_iter_write);
872
873ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
874		       rwf_t flags)
875{
876	size_t tot_len;
877	ssize_t ret;
878
879	if (!(file->f_mode & FMODE_WRITE))
880		return -EBADF;
881	if (!(file->f_mode & FMODE_CAN_WRITE))
882		return -EINVAL;
883	if (!file->f_op->write_iter)
884		return -EINVAL;
885
886	tot_len = iov_iter_count(iter);
887	if (!tot_len)
888		return 0;
889
890	ret = rw_verify_area(WRITE, file, ppos, tot_len);
891	if (ret < 0)
892		return ret;
893
894	file_start_write(file);
895	ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
896	if (ret > 0)
897		fsnotify_modify(file);
898	file_end_write(file);
899
900	return ret;
901}
902EXPORT_SYMBOL(vfs_iter_write);
903
904static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
905			 unsigned long vlen, loff_t *pos, rwf_t flags)
906{
907	struct iovec iovstack[UIO_FASTIOV];
908	struct iovec *iov = iovstack;
909	struct iov_iter iter;
910	size_t tot_len;
911	ssize_t ret = 0;
912
913	if (!(file->f_mode & FMODE_READ))
914		return -EBADF;
915	if (!(file->f_mode & FMODE_CAN_READ))
916		return -EINVAL;
917
918	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
919			   &iter);
920	if (ret < 0)
921		return ret;
922
923	tot_len = iov_iter_count(&iter);
924	if (!tot_len)
925		goto out;
926
927	ret = rw_verify_area(READ, file, pos, tot_len);
928	if (ret < 0)
929		goto out;
930
931	if (file->f_op->read_iter)
932		ret = do_iter_readv_writev(file, &iter, pos, READ, flags);
933	else
934		ret = do_loop_readv_writev(file, &iter, pos, READ, flags);
935out:
936	if (ret >= 0)
937		fsnotify_access(file);
938	kfree(iov);
939	return ret;
940}
941
942static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
943			  unsigned long vlen, loff_t *pos, rwf_t flags)
944{
945	struct iovec iovstack[UIO_FASTIOV];
946	struct iovec *iov = iovstack;
947	struct iov_iter iter;
948	size_t tot_len;
949	ssize_t ret = 0;
950
951	if (!(file->f_mode & FMODE_WRITE))
952		return -EBADF;
953	if (!(file->f_mode & FMODE_CAN_WRITE))
954		return -EINVAL;
955
956	ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
957			   &iter);
958	if (ret < 0)
959		return ret;
960
961	tot_len = iov_iter_count(&iter);
962	if (!tot_len)
963		goto out;
964
965	ret = rw_verify_area(WRITE, file, pos, tot_len);
966	if (ret < 0)
967		goto out;
968
969	file_start_write(file);
970	if (file->f_op->write_iter)
971		ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags);
972	else
973		ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags);
974	if (ret > 0)
975		fsnotify_modify(file);
976	file_end_write(file);
977out:
978	kfree(iov);
979	return ret;
980}
981
982static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
983			unsigned long vlen, rwf_t flags)
984{
985	struct fd f = fdget_pos(fd);
986	ssize_t ret = -EBADF;
987
988	if (f.file) {
989		loff_t pos, *ppos = file_ppos(f.file);
990		if (ppos) {
991			pos = *ppos;
992			ppos = &pos;
993		}
994		ret = vfs_readv(f.file, vec, vlen, ppos, flags);
995		if (ret >= 0 && ppos)
996			f.file->f_pos = pos;
997		fdput_pos(f);
998	}
999
1000	if (ret > 0)
1001		add_rchar(current, ret);
1002	inc_syscr(current);
1003	return ret;
1004}
1005
1006static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1007			 unsigned long vlen, rwf_t flags)
1008{
1009	struct fd f = fdget_pos(fd);
1010	ssize_t ret = -EBADF;
1011
1012	if (f.file) {
1013		loff_t pos, *ppos = file_ppos(f.file);
1014		if (ppos) {
1015			pos = *ppos;
1016			ppos = &pos;
1017		}
1018		ret = vfs_writev(f.file, vec, vlen, ppos, flags);
1019		if (ret >= 0 && ppos)
1020			f.file->f_pos = pos;
1021		fdput_pos(f);
1022	}
1023
1024	if (ret > 0)
1025		add_wchar(current, ret);
1026	inc_syscw(current);
1027	return ret;
1028}
1029
1030static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1031{
1032#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1033	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1034}
1035
1036static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1037			 unsigned long vlen, loff_t pos, rwf_t flags)
1038{
1039	struct fd f;
1040	ssize_t ret = -EBADF;
1041
1042	if (pos < 0)
1043		return -EINVAL;
1044
1045	f = fdget(fd);
1046	if (f.file) {
1047		ret = -ESPIPE;
1048		if (f.file->f_mode & FMODE_PREAD)
1049			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1050		fdput(f);
1051	}
1052
1053	if (ret > 0)
1054		add_rchar(current, ret);
1055	inc_syscr(current);
1056	return ret;
1057}
1058
1059static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1060			  unsigned long vlen, loff_t pos, rwf_t flags)
1061{
1062	struct fd f;
1063	ssize_t ret = -EBADF;
1064
1065	if (pos < 0)
1066		return -EINVAL;
1067
1068	f = fdget(fd);
1069	if (f.file) {
1070		ret = -ESPIPE;
1071		if (f.file->f_mode & FMODE_PWRITE)
1072			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1073		fdput(f);
1074	}
1075
1076	if (ret > 0)
1077		add_wchar(current, ret);
1078	inc_syscw(current);
1079	return ret;
1080}
1081
/* readv system call: vectored read at the current position, with no flags. */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_readv(fd, vec, vlen, 0);
}
1087
/* writev system call: vectored write at the current position, with no flags. */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_writev(fd, vec, vlen, 0);
}
1093
1094SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1095		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1096{
1097	loff_t pos = pos_from_hilo(pos_h, pos_l);
1098
1099	return do_preadv(fd, vec, vlen, pos, 0);
1100}
1101
1102SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1103		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1104		rwf_t, flags)
1105{
1106	loff_t pos = pos_from_hilo(pos_h, pos_l);
1107
1108	if (pos == -1)
1109		return do_readv(fd, vec, vlen, flags);
1110
1111	return do_preadv(fd, vec, vlen, pos, flags);
1112}
1113
1114SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1115		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1116{
1117	loff_t pos = pos_from_hilo(pos_h, pos_l);
1118
1119	return do_pwritev(fd, vec, vlen, pos, 0);
1120}
1121
1122SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1123		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1124		rwf_t, flags)
1125{
1126	loff_t pos = pos_from_hilo(pos_h, pos_l);
1127
1128	if (pos == -1)
1129		return do_writev(fd, vec, vlen, flags);
1130
1131	return do_pwritev(fd, vec, vlen, pos, flags);
1132}
1133
1134/*
1135 * Various compat syscalls.  Note that they all pretend to take a native
1136 * iovec - import_iovec will properly treat those as compat_iovecs based on
1137 * in_compat_syscall().
1138 */
1139#ifdef CONFIG_COMPAT
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64 system call: the position arrives as a single loff_t and
 * is passed straight to do_preadv() with no flags.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_preadv(fd, vec, vlen, pos, 0);
}
#endif
1148
1149COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1150		const struct iovec __user *, vec,
1151		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1152{
1153	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1154
1155	return do_preadv(fd, vec, vlen, pos, 0);
1156}
1157
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2 system call: the position arrives as a single loff_t.
 * A position of -1 selects do_readv(), i.e. the file's current position.
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	return pos == -1 ? do_readv(fd, vec, vlen, flags)
			 : do_preadv(fd, vec, vlen, pos, flags);
}
#endif
1168
1169COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1170		const struct iovec __user *, vec,
1171		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1172		rwf_t, flags)
1173{
1174	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1175
1176	if (pos == -1)
1177		return do_readv(fd, vec, vlen, flags);
1178	return do_preadv(fd, vec, vlen, pos, flags);
1179}
1180
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64 system call: the position arrives as a single loff_t
 * and is passed straight to do_pwritev() with no flags.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif
1189
1190COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1191		const struct iovec __user *,vec,
1192		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1193{
1194	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1195
1196	return do_pwritev(fd, vec, vlen, pos, 0);
1197}
1198
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2 system call: the position arrives as a single loff_t.
 * A position of -1 selects do_writev(), i.e. the file's current position.
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	return pos == -1 ? do_writev(fd, vec, vlen, flags)
			 : do_pwritev(fd, vec, vlen, pos, flags);
}
#endif
1209
1210COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1211		const struct iovec __user *,vec,
1212		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1213{
1214	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1215
1216	if (pos == -1)
1217		return do_writev(fd, vec, vlen, flags);
1218	return do_pwritev(fd, vec, vlen, pos, flags);
1219}
1220#endif /* CONFIG_COMPAT */
1221
1222static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1223			   size_t count, loff_t max)
1224{
1225	struct fd in, out;
1226	struct inode *in_inode, *out_inode;
1227	struct pipe_inode_info *opipe;
1228	loff_t pos;
1229	loff_t out_pos;
1230	ssize_t retval;
1231	int fl;
1232
1233	/*
1234	 * Get input file, and verify that it is ok..
1235	 */
1236	retval = -EBADF;
1237	in = fdget(in_fd);
1238	if (!in.file)
1239		goto out;
1240	if (!(in.file->f_mode & FMODE_READ))
1241		goto fput_in;
1242	retval = -ESPIPE;
1243	if (!ppos) {
1244		pos = in.file->f_pos;
1245	} else {
1246		pos = *ppos;
1247		if (!(in.file->f_mode & FMODE_PREAD))
1248			goto fput_in;
1249	}
1250	retval = rw_verify_area(READ, in.file, &pos, count);
1251	if (retval < 0)
1252		goto fput_in;
1253	if (count > MAX_RW_COUNT)
1254		count =  MAX_RW_COUNT;
1255
1256	/*
1257	 * Get output file, and verify that it is ok..
1258	 */
1259	retval = -EBADF;
1260	out = fdget(out_fd);
1261	if (!out.file)
1262		goto fput_in;
1263	if (!(out.file->f_mode & FMODE_WRITE))
1264		goto fput_out;
1265	in_inode = file_inode(in.file);
1266	out_inode = file_inode(out.file);
1267	out_pos = out.file->f_pos;
1268
1269	if (!max)
1270		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1271
1272	if (unlikely(pos + count > max)) {
1273		retval = -EOVERFLOW;
1274		if (pos >= max)
1275			goto fput_out;
1276		count = max - pos;
1277	}
1278
1279	fl = 0;
1280#if 0
1281	/*
1282	 * We need to debate whether we can enable this or not. The
1283	 * man page documents EAGAIN return for the output at least,
1284	 * and the application is arguably buggy if it doesn't expect
1285	 * EAGAIN on a non-blocking file descriptor.
1286	 */
1287	if (in.file->f_flags & O_NONBLOCK)
1288		fl = SPLICE_F_NONBLOCK;
1289#endif
1290	opipe = get_pipe_info(out.file, true);
1291	if (!opipe) {
1292		retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1293		if (retval < 0)
1294			goto fput_out;
1295		retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
1296					  count, fl);
1297	} else {
1298		if (out.file->f_flags & O_NONBLOCK)
1299			fl |= SPLICE_F_NONBLOCK;
1300
1301		retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
1302	}
1303
1304	if (retval > 0) {
1305		add_rchar(current, retval);
1306		add_wchar(current, retval);
1307		fsnotify_access(in.file);
1308		fsnotify_modify(out.file);
1309		out.file->f_pos = out_pos;
1310		if (ppos)
1311			*ppos = pos;
1312		else
1313			in.file->f_pos = pos;
1314	}
1315
1316	inc_syscr(current);
1317	inc_syscw(current);
1318	if (pos > max)
1319		retval = -EOVERFLOW;
1320
1321fput_out:
1322	fdput(out);
1323fput_in:
1324	fdput(in);
1325out:
1326	return retval;
1327}
1328
1329SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1330{
1331	loff_t pos;
1332	off_t off;
1333	ssize_t ret;
1334
1335	if (offset) {
1336		if (unlikely(get_user(off, offset)))
1337			return -EFAULT;
1338		pos = off;
1339		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1340		if (unlikely(put_user(pos, offset)))
1341			return -EFAULT;
1342		return ret;
1343	}
1344
1345	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1346}
1347
1348SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1349{
1350	loff_t pos;
1351	ssize_t ret;
1352
1353	if (offset) {
1354		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1355			return -EFAULT;
1356		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1357		if (unlikely(put_user(pos, offset)))
1358			return -EFAULT;
1359		return ret;
1360	}
1361
1362	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1363}
1364
1365#ifdef CONFIG_COMPAT
1366COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1367		compat_off_t __user *, offset, compat_size_t, count)
1368{
1369	loff_t pos;
1370	off_t off;
1371	ssize_t ret;
1372
1373	if (offset) {
1374		if (unlikely(get_user(off, offset)))
1375			return -EFAULT;
1376		pos = off;
1377		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1378		if (unlikely(put_user(pos, offset)))
1379			return -EFAULT;
1380		return ret;
1381	}
1382
1383	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1384}
1385
1386COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1387		compat_loff_t __user *, offset, compat_size_t, count)
1388{
1389	loff_t pos;
1390	ssize_t ret;
1391
1392	if (offset) {
1393		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1394			return -EFAULT;
1395		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1396		if (unlikely(put_user(pos, offset)))
1397			return -EFAULT;
1398		return ret;
1399	}
1400
1401	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1402}
1403#endif
1404
1405/*
1406 * Performs necessary checks before doing a file copy
1407 *
1408 * Can adjust amount of bytes to copy via @req_count argument.
1409 * Returns appropriate error code that caller should return or
1410 * zero in case the copy should be allowed.
1411 */
1412static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
1413				    struct file *file_out, loff_t pos_out,
1414				    size_t *req_count, unsigned int flags)
1415{
1416	struct inode *inode_in = file_inode(file_in);
1417	struct inode *inode_out = file_inode(file_out);
1418	uint64_t count = *req_count;
1419	loff_t size_in;
1420	int ret;
1421
1422	ret = generic_file_rw_checks(file_in, file_out);
1423	if (ret)
1424		return ret;
1425
1426	/*
1427	 * We allow some filesystems to handle cross sb copy, but passing
1428	 * a file of the wrong filesystem type to filesystem driver can result
1429	 * in an attempt to dereference the wrong type of ->private_data, so
1430	 * avoid doing that until we really have a good reason.
1431	 *
1432	 * nfs and cifs define several different file_system_type structures
1433	 * and several different sets of file_operations, but they all end up
1434	 * using the same ->copy_file_range() function pointer.
1435	 */
1436	if (flags & COPY_FILE_SPLICE) {
1437		/* cross sb splice is allowed */
1438	} else if (file_out->f_op->copy_file_range) {
1439		if (file_in->f_op->copy_file_range !=
1440		    file_out->f_op->copy_file_range)
1441			return -EXDEV;
1442	} else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
1443		return -EXDEV;
1444	}
1445
1446	/* Don't touch certain kinds of inodes */
1447	if (IS_IMMUTABLE(inode_out))
1448		return -EPERM;
1449
1450	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1451		return -ETXTBSY;
1452
1453	/* Ensure offsets don't wrap. */
1454	if (pos_in + count < pos_in || pos_out + count < pos_out)
1455		return -EOVERFLOW;
1456
1457	/* Shorten the copy to EOF */
1458	size_in = i_size_read(inode_in);
1459	if (pos_in >= size_in)
1460		count = 0;
1461	else
1462		count = min(count, size_in - (uint64_t)pos_in);
1463
1464	ret = generic_write_check_limits(file_out, pos_out, &count);
1465	if (ret)
1466		return ret;
1467
1468	/* Don't allow overlapped copying within the same file. */
1469	if (inode_in == inode_out &&
1470	    pos_out + count > pos_in &&
1471	    pos_out < pos_in + count)
1472		return -EINVAL;
1473
1474	*req_count = count;
1475	return 0;
1476}
1477
1478/*
1479 * copy_file_range() differs from regular file read and write in that it
1480 * specifically allows return partial success.  When it does so is up to
1481 * the copy_file_range method.
1482 */
1483ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1484			    struct file *file_out, loff_t pos_out,
1485			    size_t len, unsigned int flags)
1486{
1487	ssize_t ret;
1488	bool splice = flags & COPY_FILE_SPLICE;
1489	bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;
1490
1491	if (flags & ~COPY_FILE_SPLICE)
1492		return -EINVAL;
1493
1494	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1495				       flags);
1496	if (unlikely(ret))
1497		return ret;
1498
1499	ret = rw_verify_area(READ, file_in, &pos_in, len);
1500	if (unlikely(ret))
1501		return ret;
1502
1503	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1504	if (unlikely(ret))
1505		return ret;
1506
1507	if (len == 0)
1508		return 0;
1509
1510	file_start_write(file_out);
1511
1512	/*
1513	 * Cloning is supported by more file systems, so we implement copy on
1514	 * same sb using clone, but for filesystems where both clone and copy
1515	 * are supported (e.g. nfs,cifs), we only call the copy method.
1516	 */
1517	if (!splice && file_out->f_op->copy_file_range) {
1518		ret = file_out->f_op->copy_file_range(file_in, pos_in,
1519						      file_out, pos_out,
1520						      len, flags);
1521	} else if (!splice && file_in->f_op->remap_file_range && samesb) {
1522		ret = file_in->f_op->remap_file_range(file_in, pos_in,
1523				file_out, pos_out,
1524				min_t(loff_t, MAX_RW_COUNT, len),
1525				REMAP_FILE_CAN_SHORTEN);
1526		/* fallback to splice */
1527		if (ret <= 0)
1528			splice = true;
1529	} else if (samesb) {
1530		/* Fallback to splice for same sb copy for backward compat */
1531		splice = true;
1532	}
1533
1534	file_end_write(file_out);
1535
1536	if (!splice)
1537		goto done;
1538
1539	/*
1540	 * We can get here for same sb copy of filesystems that do not implement
1541	 * ->copy_file_range() in case filesystem does not support clone or in
1542	 * case filesystem supports clone but rejected the clone request (e.g.
1543	 * because it was not block aligned).
1544	 *
1545	 * In both cases, fall back to kernel copy so we are able to maintain a
1546	 * consistent story about which filesystems support copy_file_range()
1547	 * and which filesystems do not, that will allow userspace tools to
1548	 * make consistent desicions w.r.t using copy_file_range().
1549	 *
1550	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
1551	 * for server-side-copy between any two sb.
1552	 *
1553	 * In any case, we call do_splice_direct() and not splice_file_range(),
1554	 * without file_start_write() held, to avoid possible deadlocks related
1555	 * to splicing from input file, while file_start_write() is held on
1556	 * the output file on a different sb.
1557	 */
1558	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1559			       min_t(size_t, len, MAX_RW_COUNT), 0);
1560done:
1561	if (ret > 0) {
1562		fsnotify_access(file_in);
1563		add_rchar(current, ret);
1564		fsnotify_modify(file_out);
1565		add_wchar(current, ret);
1566	}
1567
1568	inc_syscr(current);
1569	inc_syscw(current);
1570
1571	return ret;
1572}
1573EXPORT_SYMBOL(vfs_copy_file_range);
1574
1575SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1576		int, fd_out, loff_t __user *, off_out,
1577		size_t, len, unsigned int, flags)
1578{
1579	loff_t pos_in;
1580	loff_t pos_out;
1581	struct fd f_in;
1582	struct fd f_out;
1583	ssize_t ret = -EBADF;
1584
1585	f_in = fdget(fd_in);
1586	if (!f_in.file)
1587		goto out2;
1588
1589	f_out = fdget(fd_out);
1590	if (!f_out.file)
1591		goto out1;
1592
1593	ret = -EFAULT;
1594	if (off_in) {
1595		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1596			goto out;
1597	} else {
1598		pos_in = f_in.file->f_pos;
1599	}
1600
1601	if (off_out) {
1602		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1603			goto out;
1604	} else {
1605		pos_out = f_out.file->f_pos;
1606	}
1607
1608	ret = -EINVAL;
1609	if (flags != 0)
1610		goto out;
1611
1612	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1613				  flags);
1614	if (ret > 0) {
1615		pos_in += ret;
1616		pos_out += ret;
1617
1618		if (off_in) {
1619			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1620				ret = -EFAULT;
1621		} else {
1622			f_in.file->f_pos = pos_in;
1623		}
1624
1625		if (off_out) {
1626			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1627				ret = -EFAULT;
1628		} else {
1629			f_out.file->f_pos = pos_out;
1630		}
1631	}
1632
1633out:
1634	fdput(f_out);
1635out1:
1636	fdput(f_in);
1637out2:
1638	return ret;
1639}
1640
1641/*
1642 * Don't operate on ranges the page cache doesn't support, and don't exceed the
1643 * LFS limits.  If pos is under the limit it becomes a short access.  If it
1644 * exceeds the limit we return -EFBIG.
1645 */
1646int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
1647{
1648	struct inode *inode = file->f_mapping->host;
1649	loff_t max_size = inode->i_sb->s_maxbytes;
1650	loff_t limit = rlimit(RLIMIT_FSIZE);
1651
1652	if (limit != RLIM_INFINITY) {
1653		if (pos >= limit) {
1654			send_sig(SIGXFSZ, current, 0);
1655			return -EFBIG;
1656		}
1657		*count = min(*count, limit - pos);
1658	}
1659
1660	if (!(file->f_flags & O_LARGEFILE))
1661		max_size = MAX_NON_LFS;
1662
1663	if (unlikely(pos >= max_size))
1664		return -EFBIG;
1665
1666	*count = min(*count, max_size - pos);
1667
1668	return 0;
1669}
1670
1671/* Like generic_write_checks(), but takes size of write instead of iter. */
1672int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
1673{
1674	struct file *file = iocb->ki_filp;
1675	struct inode *inode = file->f_mapping->host;
1676
1677	if (IS_SWAPFILE(inode))
1678		return -ETXTBSY;
1679
1680	if (!*count)
1681		return 0;
1682
1683	if (iocb->ki_flags & IOCB_APPEND)
1684		iocb->ki_pos = i_size_read(inode);
1685
1686	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1687	    !((iocb->ki_flags & IOCB_DIRECT) ||
1688	      (file->f_mode & FMODE_BUF_WASYNC)))
1689		return -EINVAL;
1690
1691	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
1692}
1693EXPORT_SYMBOL(generic_write_checks_count);
1694
1695/*
1696 * Performs necessary checks before doing a write
1697 *
1698 * Can adjust writing position or amount of bytes to write.
1699 * Returns appropriate error code that caller should return or
1700 * zero in case that write should be allowed.
1701 */
1702ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
1703{
1704	loff_t count = iov_iter_count(from);
1705	int ret;
1706
1707	ret = generic_write_checks_count(iocb, &count);
1708	if (ret)
1709		return ret;
1710
1711	iov_iter_truncate(from, count);
1712	return iov_iter_count(from);
1713}
1714EXPORT_SYMBOL(generic_write_checks);
1715
1716/*
1717 * Performs common checks before doing a file copy/clone
1718 * from @file_in to @file_out.
1719 */
1720int generic_file_rw_checks(struct file *file_in, struct file *file_out)
1721{
1722	struct inode *inode_in = file_inode(file_in);
1723	struct inode *inode_out = file_inode(file_out);
1724
1725	/* Don't copy dirs, pipes, sockets... */
1726	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1727		return -EISDIR;
1728	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1729		return -EINVAL;
1730
1731	if (!(file_in->f_mode & FMODE_READ) ||
1732	    !(file_out->f_mode & FMODE_WRITE) ||
1733	    (file_out->f_flags & O_APPEND))
1734		return -EBADF;
1735
1736	return 0;
1737}
1738