vfs_vnops.c revision 330897
1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
13 * Copyright (c) 2013, 2014 The FreeBSD Foundation
14 *
15 * Portions of this software were developed by Konstantin Belousov
16 * under sponsorship from the FreeBSD Foundation.
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 *    notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 *    notice, this list of conditions and the following disclaimer in the
25 *    documentation and/or other materials provided with the distribution.
26 * 4. Neither the name of the University nor the names of its contributors
27 *    may be used to endorse or promote products derived from this software
28 *    without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 *
42 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
43 */
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD: stable/11/sys/kern/vfs_vnops.c 330897 2018-03-14 03:19:51Z eadler $");
47
48#include "opt_hwpmc_hooks.h"
49
50#include <sys/param.h>
51#include <sys/systm.h>
52#include <sys/disk.h>
53#include <sys/fail.h>
54#include <sys/fcntl.h>
55#include <sys/file.h>
56#include <sys/kdb.h>
57#include <sys/stat.h>
58#include <sys/priv.h>
59#include <sys/proc.h>
60#include <sys/limits.h>
61#include <sys/lock.h>
62#include <sys/mman.h>
63#include <sys/mount.h>
64#include <sys/mutex.h>
65#include <sys/namei.h>
66#include <sys/vnode.h>
67#include <sys/bio.h>
68#include <sys/buf.h>
69#include <sys/filio.h>
70#include <sys/resourcevar.h>
71#include <sys/rwlock.h>
72#include <sys/sx.h>
73#include <sys/sysctl.h>
74#include <sys/ttycom.h>
75#include <sys/conf.h>
76#include <sys/syslog.h>
77#include <sys/unistd.h>
78#include <sys/user.h>
79
80#include <security/audit/audit.h>
81#include <security/mac/mac_framework.h>
82
83#include <vm/vm.h>
84#include <vm/vm_extern.h>
85#include <vm/pmap.h>
86#include <vm/vm_map.h>
87#include <vm/vm_object.h>
88#include <vm/vm_page.h>
89#include <vm/vnode_pager.h>
90
91#ifdef HWPMC_HOOKS
92#include <sys/pmckern.h>
93#endif
94
95static fo_rdwr_t	vn_read;
96static fo_rdwr_t	vn_write;
97static fo_rdwr_t	vn_io_fault;
98static fo_truncate_t	vn_truncate;
99static fo_ioctl_t	vn_ioctl;
100static fo_poll_t	vn_poll;
101static fo_kqfilter_t	vn_kqfilter;
102static fo_stat_t	vn_statfile;
103static fo_close_t	vn_closefile;
104static fo_mmap_t	vn_mmap;
105
106struct 	fileops vnops = {
107	.fo_read = vn_io_fault,
108	.fo_write = vn_io_fault,
109	.fo_truncate = vn_truncate,
110	.fo_ioctl = vn_ioctl,
111	.fo_poll = vn_poll,
112	.fo_kqfilter = vn_kqfilter,
113	.fo_stat = vn_statfile,
114	.fo_close = vn_closefile,
115	.fo_chmod = vn_chmod,
116	.fo_chown = vn_chown,
117	.fo_sendfile = vn_sendfile,
118	.fo_seek = vn_seek,
119	.fo_fill_kinfo = vn_fill_kinfo,
120	.fo_mmap = vn_mmap,
121	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
122};
123
124static const int io_hold_cnt = 16;
125static int vn_io_fault_enable = 1;
126SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
127    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
128static int vn_io_fault_prefault = 0;
129SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW,
130    &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
131static u_long vn_io_faults_cnt;
132SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
133    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
134
135/*
136 * Returns true if vn_io_fault mode of handling the i/o request should
137 * be used.
138 */
139static bool
140do_vn_io_fault(struct vnode *vp, struct uio *uio)
141{
142	struct mount *mp;
143
144	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
145	    (mp = vp->v_mount) != NULL &&
146	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
147}
148
149/*
150 * Structure used to pass arguments to vn_io_fault1(), to do either
151 * file- or vnode-based I/O calls.
152 */
153struct vn_io_fault_args {
154	enum {
155		VN_IO_FAULT_FOP,
156		VN_IO_FAULT_VOP
157	} kind;
158	struct ucred *cred;
159	int flags;
160	union {
161		struct fop_args_tag {
162			struct file *fp;
163			fo_rdwr_t *doio;
164		} fop_args;
165		struct vop_args_tag {
166			struct vnode *vp;
167		} vop_args;
168	} args;
169};
170
171static int vn_io_fault1(struct vnode *vp, struct uio *uio,
172    struct vn_io_fault_args *args, struct thread *td);
173
174int
175vn_open(ndp, flagp, cmode, fp)
176	struct nameidata *ndp;
177	int *flagp, cmode;
178	struct file *fp;
179{
180	struct thread *td = ndp->ni_cnd.cn_thread;
181
182	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
183}
184
185/*
186 * Common code for vnode open operations via a name lookup.
187 * Lookup the vnode and invoke VOP_CREATE if needed.
188 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
189 *
190 * Note that this does NOT free nameidata for the successful case,
191 * due to the NDINIT being done elsewhere.
192 */
193int
194vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
195    struct ucred *cred, struct file *fp)
196{
197	struct vnode *vp;
198	struct mount *mp;
199	struct thread *td = ndp->ni_cnd.cn_thread;
200	struct vattr vat;
201	struct vattr *vap = &vat;
202	int fmode, error;
203
204restart:
205	fmode = *flagp;
206	if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
207	    O_EXCL | O_DIRECTORY))
208		return (EINVAL);
209	else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
210		ndp->ni_cnd.cn_nameiop = CREATE;
211		/*
212		 * Set NOCACHE to avoid flushing the cache when
213		 * rolling in many files at once.
214		*/
215		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE;
216		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
217			ndp->ni_cnd.cn_flags |= FOLLOW;
218		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
219			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
220		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
221			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
222		bwillwrite();
223		if ((error = namei(ndp)) != 0)
224			return (error);
225		if (ndp->ni_vp == NULL) {
226			VATTR_NULL(vap);
227			vap->va_type = VREG;
228			vap->va_mode = cmode;
229			if (fmode & O_EXCL)
230				vap->va_vaflags |= VA_EXCLUSIVE;
231			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
232				NDFREE(ndp, NDF_ONLY_PNBUF);
233				vput(ndp->ni_dvp);
234				if ((error = vn_start_write(NULL, &mp,
235				    V_XSLEEP | PCATCH)) != 0)
236					return (error);
237				goto restart;
238			}
239			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
240				ndp->ni_cnd.cn_flags |= MAKEENTRY;
241#ifdef MAC
242			error = mac_vnode_check_create(cred, ndp->ni_dvp,
243			    &ndp->ni_cnd, vap);
244			if (error == 0)
245#endif
246				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
247						   &ndp->ni_cnd, vap);
248			vput(ndp->ni_dvp);
249			vn_finished_write(mp);
250			if (error) {
251				NDFREE(ndp, NDF_ONLY_PNBUF);
252				return (error);
253			}
254			fmode &= ~O_TRUNC;
255			vp = ndp->ni_vp;
256		} else {
257			if (ndp->ni_dvp == ndp->ni_vp)
258				vrele(ndp->ni_dvp);
259			else
260				vput(ndp->ni_dvp);
261			ndp->ni_dvp = NULL;
262			vp = ndp->ni_vp;
263			if (fmode & O_EXCL) {
264				error = EEXIST;
265				goto bad;
266			}
267			fmode &= ~O_CREAT;
268		}
269	} else {
270		ndp->ni_cnd.cn_nameiop = LOOKUP;
271		ndp->ni_cnd.cn_flags = ISOPEN |
272		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
273		if (!(fmode & FWRITE))
274			ndp->ni_cnd.cn_flags |= LOCKSHARED;
275		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
276			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
277		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
278			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
279		if ((error = namei(ndp)) != 0)
280			return (error);
281		vp = ndp->ni_vp;
282	}
283	error = vn_open_vnode(vp, fmode, cred, td, fp);
284	if (error)
285		goto bad;
286	*flagp = fmode;
287	return (0);
288bad:
289	NDFREE(ndp, NDF_ONLY_PNBUF);
290	vput(vp);
291	*flagp = fmode;
292	ndp->ni_vp = NULL;
293	return (error);
294}
295
296/*
297 * Common code for vnode open operations once a vnode is located.
298 * Check permissions, and call the VOP_OPEN routine.
299 */
300int
301vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
302    struct thread *td, struct file *fp)
303{
304	accmode_t accmode;
305	struct flock lf;
306	int error, lock_flags, type;
307
308	if (vp->v_type == VLNK)
309		return (EMLINK);
310	if (vp->v_type == VSOCK)
311		return (EOPNOTSUPP);
312	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
313		return (ENOTDIR);
314	accmode = 0;
315	if (fmode & (FWRITE | O_TRUNC)) {
316		if (vp->v_type == VDIR)
317			return (EISDIR);
318		accmode |= VWRITE;
319	}
320	if (fmode & FREAD)
321		accmode |= VREAD;
322	if (fmode & FEXEC)
323		accmode |= VEXEC;
324	if ((fmode & O_APPEND) && (fmode & FWRITE))
325		accmode |= VAPPEND;
326#ifdef MAC
327	if (fmode & O_CREAT)
328		accmode |= VCREAT;
329	if (fmode & O_VERIFY)
330		accmode |= VVERIFY;
331	error = mac_vnode_check_open(cred, vp, accmode);
332	if (error)
333		return (error);
334
335	accmode &= ~(VCREAT | VVERIFY);
336#endif
337	if ((fmode & O_CREAT) == 0) {
338		if (accmode & VWRITE) {
339			error = vn_writechk(vp);
340			if (error)
341				return (error);
342		}
343		if (accmode) {
344		        error = VOP_ACCESS(vp, accmode, cred, td);
345			if (error)
346				return (error);
347		}
348	}
349	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
350		vn_lock(vp, LK_UPGRADE | LK_RETRY);
351	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
352		return (error);
353
354	while ((fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
355		KASSERT(fp != NULL, ("open with flock requires fp"));
356		if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) {
357			error = EOPNOTSUPP;
358			break;
359		}
360		lock_flags = VOP_ISLOCKED(vp);
361		VOP_UNLOCK(vp, 0);
362		lf.l_whence = SEEK_SET;
363		lf.l_start = 0;
364		lf.l_len = 0;
365		if (fmode & O_EXLOCK)
366			lf.l_type = F_WRLCK;
367		else
368			lf.l_type = F_RDLCK;
369		type = F_FLOCK;
370		if ((fmode & FNONBLOCK) == 0)
371			type |= F_WAIT;
372		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
373		if (error == 0)
374			fp->f_flag |= FHASLOCK;
375		vn_lock(vp, lock_flags | LK_RETRY);
376		if (error != 0)
377			break;
378		if ((vp->v_iflag & VI_DOOMED) != 0) {
379			error = ENOENT;
380			break;
381		}
382
383		/*
384		 * Another thread might have used this vnode as an
385		 * executable while the vnode lock was dropped.
386		 * Ensure the vnode is still able to be opened for
387		 * writing after the lock has been obtained.
388		 */
389		if ((accmode & VWRITE) != 0)
390			error = vn_writechk(vp);
391		break;
392	}
393
394	if (error != 0) {
395		fp->f_flag |= FOPENFAILED;
396		fp->f_vnode = vp;
397		if (fp->f_ops == &badfileops) {
398			fp->f_type = DTYPE_VNODE;
399			fp->f_ops = &vnops;
400		}
401		vref(vp);
402	} else if  ((fmode & FWRITE) != 0) {
403		VOP_ADD_WRITECOUNT(vp, 1);
404		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
405		    __func__, vp, vp->v_writecount);
406	}
407	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
408	return (error);
409}
410
411/*
412 * Check for write permissions on the specified vnode.
413 * Prototype text segments cannot be written.
414 */
415int
416vn_writechk(vp)
417	register struct vnode *vp;
418{
419
420	ASSERT_VOP_LOCKED(vp, "vn_writechk");
421	/*
422	 * If there's shared text associated with
423	 * the vnode, try to free it up once.  If
424	 * we fail, we can't allow writing.
425	 */
426	if (VOP_IS_TEXT(vp))
427		return (ETXTBSY);
428
429	return (0);
430}
431
432/*
433 * Vnode close call
434 */
435static int
436vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
437    struct thread *td, bool keep_ref)
438{
439	struct mount *mp;
440	int error, lock_flags;
441
442	if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
443	    MNT_EXTENDED_SHARED(vp->v_mount))
444		lock_flags = LK_SHARED;
445	else
446		lock_flags = LK_EXCLUSIVE;
447
448	vn_start_write(vp, &mp, V_WAIT);
449	vn_lock(vp, lock_flags | LK_RETRY);
450	AUDIT_ARG_VNODE1(vp);
451	if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
452		VNASSERT(vp->v_writecount > 0, vp,
453		    ("vn_close: negative writecount"));
454		VOP_ADD_WRITECOUNT(vp, -1);
455		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
456		    __func__, vp, vp->v_writecount);
457	}
458	error = VOP_CLOSE(vp, flags, file_cred, td);
459	if (keep_ref)
460		VOP_UNLOCK(vp, 0);
461	else
462		vput(vp);
463	vn_finished_write(mp);
464	return (error);
465}
466
467int
468vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
469    struct thread *td)
470{
471
472	return (vn_close1(vp, flags, file_cred, td, false));
473}
474
475/*
476 * Heuristic to detect sequential operation.
477 */
478static int
479sequential_heuristic(struct uio *uio, struct file *fp)
480{
481
482	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
483	if (fp->f_flag & FRDAHEAD)
484		return (fp->f_seqcount << IO_SEQSHIFT);
485
486	/*
487	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
488	 * that the first I/O is normally considered to be slightly
489	 * sequential.  Seeking to offset 0 doesn't change sequentiality
490	 * unless previous seeks have reduced f_seqcount to 0, in which
491	 * case offset 0 is not special.
492	 */
493	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
494	    uio->uio_offset == fp->f_nextoff) {
495		/*
496		 * f_seqcount is in units of fixed-size blocks so that it
497		 * depends mainly on the amount of sequential I/O and not
498		 * much on the number of sequential I/O's.  The fixed size
499		 * of 16384 is hard-coded here since it is (not quite) just
500		 * a magic size that works well here.  This size is more
501		 * closely related to the best I/O size for real disks than
502		 * to any block size used by software.
503		 */
504		fp->f_seqcount += howmany(uio->uio_resid, 16384);
505		if (fp->f_seqcount > IO_SEQMAX)
506			fp->f_seqcount = IO_SEQMAX;
507		return (fp->f_seqcount << IO_SEQSHIFT);
508	}
509
510	/* Not sequential.  Quickly draw-down sequentiality. */
511	if (fp->f_seqcount > 1)
512		fp->f_seqcount = 1;
513	else
514		fp->f_seqcount = 0;
515	return (0);
516}
517
518/*
519 * Package up an I/O request on a vnode into a uio and do it.
520 */
521int
522vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
523    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
524    struct ucred *file_cred, ssize_t *aresid, struct thread *td)
525{
526	struct uio auio;
527	struct iovec aiov;
528	struct mount *mp;
529	struct ucred *cred;
530	void *rl_cookie;
531	struct vn_io_fault_args args;
532	int error, lock_flags;
533
534	auio.uio_iov = &aiov;
535	auio.uio_iovcnt = 1;
536	aiov.iov_base = base;
537	aiov.iov_len = len;
538	auio.uio_resid = len;
539	auio.uio_offset = offset;
540	auio.uio_segflg = segflg;
541	auio.uio_rw = rw;
542	auio.uio_td = td;
543	error = 0;
544
545	if ((ioflg & IO_NODELOCKED) == 0) {
546		if ((ioflg & IO_RANGELOCKED) == 0) {
547			if (rw == UIO_READ) {
548				rl_cookie = vn_rangelock_rlock(vp, offset,
549				    offset + len);
550			} else {
551				rl_cookie = vn_rangelock_wlock(vp, offset,
552				    offset + len);
553			}
554		} else
555			rl_cookie = NULL;
556		mp = NULL;
557		if (rw == UIO_WRITE) {
558			if (vp->v_type != VCHR &&
559			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
560			    != 0)
561				goto out;
562			if (MNT_SHARED_WRITES(mp) ||
563			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
564				lock_flags = LK_SHARED;
565			else
566				lock_flags = LK_EXCLUSIVE;
567		} else
568			lock_flags = LK_SHARED;
569		vn_lock(vp, lock_flags | LK_RETRY);
570	} else
571		rl_cookie = NULL;
572
573	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
574#ifdef MAC
575	if ((ioflg & IO_NOMACCHECK) == 0) {
576		if (rw == UIO_READ)
577			error = mac_vnode_check_read(active_cred, file_cred,
578			    vp);
579		else
580			error = mac_vnode_check_write(active_cred, file_cred,
581			    vp);
582	}
583#endif
584	if (error == 0) {
585		if (file_cred != NULL)
586			cred = file_cred;
587		else
588			cred = active_cred;
589		if (do_vn_io_fault(vp, &auio)) {
590			args.kind = VN_IO_FAULT_VOP;
591			args.cred = cred;
592			args.flags = ioflg;
593			args.args.vop_args.vp = vp;
594			error = vn_io_fault1(vp, &auio, &args, td);
595		} else if (rw == UIO_READ) {
596			error = VOP_READ(vp, &auio, ioflg, cred);
597		} else /* if (rw == UIO_WRITE) */ {
598			error = VOP_WRITE(vp, &auio, ioflg, cred);
599		}
600	}
601	if (aresid)
602		*aresid = auio.uio_resid;
603	else
604		if (auio.uio_resid && error == 0)
605			error = EIO;
606	if ((ioflg & IO_NODELOCKED) == 0) {
607		VOP_UNLOCK(vp, 0);
608		if (mp != NULL)
609			vn_finished_write(mp);
610	}
611 out:
612	if (rl_cookie != NULL)
613		vn_rangelock_unlock(vp, rl_cookie);
614	return (error);
615}
616
617/*
618 * Package up an I/O request on a vnode into a uio and do it.  The I/O
619 * request is split up into smaller chunks and we try to avoid saturating
620 * the buffer cache while potentially holding a vnode locked, so we
621 * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
622 * to give other processes a chance to lock the vnode (either other processes
623 * core'ing the same binary, or unrelated processes scanning the directory).
624 */
625int
626vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
627    file_cred, aresid, td)
628	enum uio_rw rw;
629	struct vnode *vp;
630	void *base;
631	size_t len;
632	off_t offset;
633	enum uio_seg segflg;
634	int ioflg;
635	struct ucred *active_cred;
636	struct ucred *file_cred;
637	size_t *aresid;
638	struct thread *td;
639{
640	int error = 0;
641	ssize_t iaresid;
642
643	do {
644		int chunk;
645
646		/*
647		 * Force `offset' to a multiple of MAXBSIZE except possibly
648		 * for the first chunk, so that filesystems only need to
649		 * write full blocks except possibly for the first and last
650		 * chunks.
651		 */
652		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
653
654		if (chunk > len)
655			chunk = len;
656		if (rw != UIO_READ && vp->v_type == VREG)
657			bwillwrite();
658		iaresid = 0;
659		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
660		    ioflg, active_cred, file_cred, &iaresid, td);
661		len -= chunk;	/* aresid calc already includes length */
662		if (error)
663			break;
664		offset += chunk;
665		base = (char *)base + chunk;
666		kern_yield(PRI_USER);
667	} while (len);
668	if (aresid)
669		*aresid = len + iaresid;
670	return (error);
671}
672
673off_t
674foffset_lock(struct file *fp, int flags)
675{
676	struct mtx *mtxp;
677	off_t res;
678
679	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
680
681#if OFF_MAX <= LONG_MAX
682	/*
683	 * Caller only wants the current f_offset value.  Assume that
684	 * the long and shorter integer types reads are atomic.
685	 */
686	if ((flags & FOF_NOLOCK) != 0)
687		return (fp->f_offset);
688#endif
689
690	/*
691	 * According to McKusick the vn lock was protecting f_offset here.
692	 * It is now protected by the FOFFSET_LOCKED flag.
693	 */
694	mtxp = mtx_pool_find(mtxpool_sleep, fp);
695	mtx_lock(mtxp);
696	if ((flags & FOF_NOLOCK) == 0) {
697		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
698			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
699			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
700			    "vofflock", 0);
701		}
702		fp->f_vnread_flags |= FOFFSET_LOCKED;
703	}
704	res = fp->f_offset;
705	mtx_unlock(mtxp);
706	return (res);
707}
708
709void
710foffset_unlock(struct file *fp, off_t val, int flags)
711{
712	struct mtx *mtxp;
713
714	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
715
716#if OFF_MAX <= LONG_MAX
717	if ((flags & FOF_NOLOCK) != 0) {
718		if ((flags & FOF_NOUPDATE) == 0)
719			fp->f_offset = val;
720		if ((flags & FOF_NEXTOFF) != 0)
721			fp->f_nextoff = val;
722		return;
723	}
724#endif
725
726	mtxp = mtx_pool_find(mtxpool_sleep, fp);
727	mtx_lock(mtxp);
728	if ((flags & FOF_NOUPDATE) == 0)
729		fp->f_offset = val;
730	if ((flags & FOF_NEXTOFF) != 0)
731		fp->f_nextoff = val;
732	if ((flags & FOF_NOLOCK) == 0) {
733		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
734		    ("Lost FOFFSET_LOCKED"));
735		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
736			wakeup(&fp->f_vnread_flags);
737		fp->f_vnread_flags = 0;
738	}
739	mtx_unlock(mtxp);
740}
741
742void
743foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
744{
745
746	if ((flags & FOF_OFFSET) == 0)
747		uio->uio_offset = foffset_lock(fp, flags);
748}
749
750void
751foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
752{
753
754	if ((flags & FOF_OFFSET) == 0)
755		foffset_unlock(fp, uio->uio_offset, flags);
756}
757
758static int
759get_advice(struct file *fp, struct uio *uio)
760{
761	struct mtx *mtxp;
762	int ret;
763
764	ret = POSIX_FADV_NORMAL;
765	if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
766		return (ret);
767
768	mtxp = mtx_pool_find(mtxpool_sleep, fp);
769	mtx_lock(mtxp);
770	if (fp->f_advice != NULL &&
771	    uio->uio_offset >= fp->f_advice->fa_start &&
772	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
773		ret = fp->f_advice->fa_advice;
774	mtx_unlock(mtxp);
775	return (ret);
776}
777
778/*
779 * File table vnode read routine.
780 */
781static int
782vn_read(fp, uio, active_cred, flags, td)
783	struct file *fp;
784	struct uio *uio;
785	struct ucred *active_cred;
786	int flags;
787	struct thread *td;
788{
789	struct vnode *vp;
790	off_t orig_offset;
791	int error, ioflag;
792	int advice;
793
794	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
795	    uio->uio_td, td));
796	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
797	vp = fp->f_vnode;
798	ioflag = 0;
799	if (fp->f_flag & FNONBLOCK)
800		ioflag |= IO_NDELAY;
801	if (fp->f_flag & O_DIRECT)
802		ioflag |= IO_DIRECT;
803	advice = get_advice(fp, uio);
804	vn_lock(vp, LK_SHARED | LK_RETRY);
805
806	switch (advice) {
807	case POSIX_FADV_NORMAL:
808	case POSIX_FADV_SEQUENTIAL:
809	case POSIX_FADV_NOREUSE:
810		ioflag |= sequential_heuristic(uio, fp);
811		break;
812	case POSIX_FADV_RANDOM:
813		/* Disable read-ahead for random I/O. */
814		break;
815	}
816	orig_offset = uio->uio_offset;
817
818#ifdef MAC
819	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
820	if (error == 0)
821#endif
822		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
823	fp->f_nextoff = uio->uio_offset;
824	VOP_UNLOCK(vp, 0);
825	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
826	    orig_offset != uio->uio_offset)
827		/*
828		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
829		 * for the backing file after a POSIX_FADV_NOREUSE
830		 * read(2).
831		 */
832		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
833		    POSIX_FADV_DONTNEED);
834	return (error);
835}
836
837/*
838 * File table vnode write routine.
839 */
840static int
841vn_write(fp, uio, active_cred, flags, td)
842	struct file *fp;
843	struct uio *uio;
844	struct ucred *active_cred;
845	int flags;
846	struct thread *td;
847{
848	struct vnode *vp;
849	struct mount *mp;
850	off_t orig_offset;
851	int error, ioflag, lock_flags;
852	int advice;
853
854	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
855	    uio->uio_td, td));
856	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
857	vp = fp->f_vnode;
858	if (vp->v_type == VREG)
859		bwillwrite();
860	ioflag = IO_UNIT;
861	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
862		ioflag |= IO_APPEND;
863	if (fp->f_flag & FNONBLOCK)
864		ioflag |= IO_NDELAY;
865	if (fp->f_flag & O_DIRECT)
866		ioflag |= IO_DIRECT;
867	if ((fp->f_flag & O_FSYNC) ||
868	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
869		ioflag |= IO_SYNC;
870	mp = NULL;
871	if (vp->v_type != VCHR &&
872	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
873		goto unlock;
874
875	advice = get_advice(fp, uio);
876
877	if (MNT_SHARED_WRITES(mp) ||
878	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
879		lock_flags = LK_SHARED;
880	} else {
881		lock_flags = LK_EXCLUSIVE;
882	}
883
884	vn_lock(vp, lock_flags | LK_RETRY);
885	switch (advice) {
886	case POSIX_FADV_NORMAL:
887	case POSIX_FADV_SEQUENTIAL:
888	case POSIX_FADV_NOREUSE:
889		ioflag |= sequential_heuristic(uio, fp);
890		break;
891	case POSIX_FADV_RANDOM:
892		/* XXX: Is this correct? */
893		break;
894	}
895	orig_offset = uio->uio_offset;
896
897#ifdef MAC
898	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
899	if (error == 0)
900#endif
901		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
902	fp->f_nextoff = uio->uio_offset;
903	VOP_UNLOCK(vp, 0);
904	if (vp->v_type != VCHR)
905		vn_finished_write(mp);
906	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
907	    orig_offset != uio->uio_offset)
908		/*
909		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
910		 * for the backing file after a POSIX_FADV_NOREUSE
911		 * write(2).
912		 */
913		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
914		    POSIX_FADV_DONTNEED);
915unlock:
916	return (error);
917}
918
919/*
920 * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
921 * prevent the following deadlock:
922 *
923 * Assume that the thread A reads from the vnode vp1 into userspace
924 * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
925 * currently not resident, then system ends up with the call chain
926 *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
927 *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
928 * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
929 * If, at the same time, thread B reads from vnode vp2 into buffer buf2
930 * backed by the pages of vnode vp1, and some page in buf2 is not
931 * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
932 *
933 * To prevent the lock order reversal and deadlock, vn_io_fault() does
934 * not allow page faults to happen during VOP_READ() or VOP_WRITE().
935 * Instead, it first tries to do the whole range i/o with pagefaults
936 * disabled. If all pages in the i/o buffer are resident and mapped,
937 * VOP will succeed (ignoring the genuine filesystem errors).
938 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
939 * i/o in chunks, with all pages in the chunk prefaulted and held
940 * using vm_fault_quick_hold_pages().
941 *
942 * Filesystems using this deadlock avoidance scheme should use the
943 * array of the held pages from uio, saved in the curthread->td_ma,
944 * instead of doing uiomove().  A helper function
945 * vn_io_fault_uiomove() converts uiomove request into
946 * uiomove_fromphys() over td_ma array.
947 *
948 * Since vnode locks do not cover the whole i/o anymore, rangelocks
949 * make the current i/o request atomic with respect to other i/os and
950 * truncations.
951 */
952
953/*
954 * Decode vn_io_fault_args and perform the corresponding i/o.
955 */
956static int
957vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
958    struct thread *td)
959{
960
961	switch (args->kind) {
962	case VN_IO_FAULT_FOP:
963		return ((args->args.fop_args.doio)(args->args.fop_args.fp,
964		    uio, args->cred, args->flags, td));
965	case VN_IO_FAULT_VOP:
966		if (uio->uio_rw == UIO_READ) {
967			return (VOP_READ(args->args.vop_args.vp, uio,
968			    args->flags, args->cred));
969		} else if (uio->uio_rw == UIO_WRITE) {
970			return (VOP_WRITE(args->args.vop_args.vp, uio,
971			    args->flags, args->cred));
972		}
973		break;
974	}
975	panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind,
976	    uio->uio_rw);
977}
978
979static int
980vn_io_fault_touch(char *base, const struct uio *uio)
981{
982	int r;
983
984	r = fubyte(base);
985	if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
986		return (EFAULT);
987	return (0);
988}
989
990static int
991vn_io_fault_prefault_user(const struct uio *uio)
992{
993	char *base;
994	const struct iovec *iov;
995	size_t len;
996	ssize_t resid;
997	int error, i;
998
999	KASSERT(uio->uio_segflg == UIO_USERSPACE,
1000	    ("vn_io_fault_prefault userspace"));
1001
1002	error = i = 0;
1003	iov = uio->uio_iov;
1004	resid = uio->uio_resid;
1005	base = iov->iov_base;
1006	len = iov->iov_len;
1007	while (resid > 0) {
1008		error = vn_io_fault_touch(base, uio);
1009		if (error != 0)
1010			break;
1011		if (len < PAGE_SIZE) {
1012			if (len != 0) {
1013				error = vn_io_fault_touch(base + len - 1, uio);
1014				if (error != 0)
1015					break;
1016				resid -= len;
1017			}
1018			if (++i >= uio->uio_iovcnt)
1019				break;
1020			iov = uio->uio_iov + i;
1021			base = iov->iov_base;
1022			len = iov->iov_len;
1023		} else {
1024			len -= PAGE_SIZE;
1025			base += PAGE_SIZE;
1026			resid -= PAGE_SIZE;
1027		}
1028	}
1029	return (error);
1030}
1031
1032/*
1033 * Common code for vn_io_fault(), agnostic to the kind of i/o request.
1034 * Uses vn_io_fault_doio() to make the call to an actual i/o function.
1035 * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1036 * into args and call vn_io_fault1() to handle faults during the user
1037 * mode buffer accesses.
1038 */
1039static int
1040vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
1041    struct thread *td)
1042{
1043	vm_page_t ma[io_hold_cnt + 2];
1044	struct uio *uio_clone, short_uio;
1045	struct iovec short_iovec[1];
1046	vm_page_t *prev_td_ma;
1047	vm_prot_t prot;
1048	vm_offset_t addr, end;
1049	size_t len, resid;
1050	ssize_t adv;
1051	int error, cnt, save, saveheld, prev_td_ma_cnt;
1052
1053	if (vn_io_fault_prefault) {
1054		error = vn_io_fault_prefault_user(uio);
1055		if (error != 0)
1056			return (error); /* Or ignore ? */
1057	}
1058
1059	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1060
1061	/*
1062	 * The UFS follows IO_UNIT directive and replays back both
1063	 * uio_offset and uio_resid if an error is encountered during the
1064	 * operation.  But, since the iovec may be already advanced,
1065	 * uio is still in an inconsistent state.
1066	 *
1067	 * Cache a copy of the original uio, which is advanced to the redo
1068	 * point using UIO_NOCOPY below.
1069	 */
1070	uio_clone = cloneuio(uio);
1071	resid = uio->uio_resid;
1072
1073	short_uio.uio_segflg = UIO_USERSPACE;
1074	short_uio.uio_rw = uio->uio_rw;
1075	short_uio.uio_td = uio->uio_td;
1076
1077	save = vm_fault_disable_pagefaults();
1078	error = vn_io_fault_doio(args, uio, td);
1079	if (error != EFAULT)
1080		goto out;
1081
1082	atomic_add_long(&vn_io_faults_cnt, 1);
1083	uio_clone->uio_segflg = UIO_NOCOPY;
1084	uiomove(NULL, resid - uio->uio_resid, uio_clone);
1085	uio_clone->uio_segflg = uio->uio_segflg;
1086
1087	saveheld = curthread_pflags_set(TDP_UIOHELD);
1088	prev_td_ma = td->td_ma;
1089	prev_td_ma_cnt = td->td_ma_cnt;
1090
1091	while (uio_clone->uio_resid != 0) {
1092		len = uio_clone->uio_iov->iov_len;
1093		if (len == 0) {
1094			KASSERT(uio_clone->uio_iovcnt >= 1,
1095			    ("iovcnt underflow"));
1096			uio_clone->uio_iov++;
1097			uio_clone->uio_iovcnt--;
1098			continue;
1099		}
1100		if (len > io_hold_cnt * PAGE_SIZE)
1101			len = io_hold_cnt * PAGE_SIZE;
1102		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1103		end = round_page(addr + len);
1104		if (end < addr) {
1105			error = EFAULT;
1106			break;
1107		}
1108		cnt = atop(end - trunc_page(addr));
1109		/*
1110		 * A perfectly misaligned address and length could cause
1111		 * both the start and the end of the chunk to use partial
1112		 * page.  +2 accounts for such a situation.
1113		 */
1114		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1115		    addr, len, prot, ma, io_hold_cnt + 2);
1116		if (cnt == -1) {
1117			error = EFAULT;
1118			break;
1119		}
1120		short_uio.uio_iov = &short_iovec[0];
1121		short_iovec[0].iov_base = (void *)addr;
1122		short_uio.uio_iovcnt = 1;
1123		short_uio.uio_resid = short_iovec[0].iov_len = len;
1124		short_uio.uio_offset = uio_clone->uio_offset;
1125		td->td_ma = ma;
1126		td->td_ma_cnt = cnt;
1127
1128		error = vn_io_fault_doio(args, &short_uio, td);
1129		vm_page_unhold_pages(ma, cnt);
1130		adv = len - short_uio.uio_resid;
1131
1132		uio_clone->uio_iov->iov_base =
1133		    (char *)uio_clone->uio_iov->iov_base + adv;
1134		uio_clone->uio_iov->iov_len -= adv;
1135		uio_clone->uio_resid -= adv;
1136		uio_clone->uio_offset += adv;
1137
1138		uio->uio_resid -= adv;
1139		uio->uio_offset += adv;
1140
1141		if (error != 0 || adv == 0)
1142			break;
1143	}
1144	td->td_ma = prev_td_ma;
1145	td->td_ma_cnt = prev_td_ma_cnt;
1146	curthread_pflags_restore(saveheld);
1147out:
1148	vm_fault_enable_pagefaults(save);
1149	free(uio_clone, M_IOV);
1150	return (error);
1151}
1152
1153static int
1154vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
1155    int flags, struct thread *td)
1156{
1157	fo_rdwr_t *doio;
1158	struct vnode *vp;
1159	void *rl_cookie;
1160	struct vn_io_fault_args args;
1161	int error;
1162
1163	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1164	vp = fp->f_vnode;
1165	foffset_lock_uio(fp, uio, flags);
1166	if (do_vn_io_fault(vp, uio)) {
1167		args.kind = VN_IO_FAULT_FOP;
1168		args.args.fop_args.fp = fp;
1169		args.args.fop_args.doio = doio;
1170		args.cred = active_cred;
1171		args.flags = flags | FOF_OFFSET;
1172		if (uio->uio_rw == UIO_READ) {
1173			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1174			    uio->uio_offset + uio->uio_resid);
1175		} else if ((fp->f_flag & O_APPEND) != 0 ||
1176		    (flags & FOF_OFFSET) == 0) {
1177			/* For appenders, punt and lock the whole range. */
1178			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1179		} else {
1180			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1181			    uio->uio_offset + uio->uio_resid);
1182		}
1183		error = vn_io_fault1(vp, uio, &args, td);
1184		vn_rangelock_unlock(vp, rl_cookie);
1185	} else {
1186		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
1187	}
1188	foffset_unlock_uio(fp, uio, flags);
1189	return (error);
1190}
1191
1192/*
1193 * Helper function to perform the requested uiomove operation using
1194 * the held pages for io->uio_iov[0].iov_base buffer instead of
1195 * copyin/copyout.  Access to the pages with uiomove_fromphys()
1196 * instead of iov_base prevents page faults that could occur due to
1197 * pmap_collect() invalidating the mapping created by
1198 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1199 * object cleanup revoking the write access from page mappings.
1200 *
1201 * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1202 * instead of plain uiomove().
1203 */
1204int
1205vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1206{
1207	struct uio transp_uio;
1208	struct iovec transp_iov[1];
1209	struct thread *td;
1210	size_t adv;
1211	int error, pgadv;
1212
1213	td = curthread;
1214	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1215	    uio->uio_segflg != UIO_USERSPACE)
1216		return (uiomove(data, xfersize, uio));
1217
1218	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1219	transp_iov[0].iov_base = data;
1220	transp_uio.uio_iov = &transp_iov[0];
1221	transp_uio.uio_iovcnt = 1;
1222	if (xfersize > uio->uio_resid)
1223		xfersize = uio->uio_resid;
1224	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1225	transp_uio.uio_offset = 0;
1226	transp_uio.uio_segflg = UIO_SYSSPACE;
1227	/*
1228	 * Since transp_iov points to data, and td_ma page array
1229	 * corresponds to original uio->uio_iov, we need to invert the
1230	 * direction of the i/o operation as passed to
1231	 * uiomove_fromphys().
1232	 */
1233	switch (uio->uio_rw) {
1234	case UIO_WRITE:
1235		transp_uio.uio_rw = UIO_READ;
1236		break;
1237	case UIO_READ:
1238		transp_uio.uio_rw = UIO_WRITE;
1239		break;
1240	}
1241	transp_uio.uio_td = uio->uio_td;
1242	error = uiomove_fromphys(td->td_ma,
1243	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1244	    xfersize, &transp_uio);
1245	adv = xfersize - transp_uio.uio_resid;
1246	pgadv =
1247	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1248	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1249	td->td_ma += pgadv;
1250	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1251	    pgadv));
1252	td->td_ma_cnt -= pgadv;
1253	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1254	uio->uio_iov->iov_len -= adv;
1255	uio->uio_resid -= adv;
1256	uio->uio_offset += adv;
1257	return (error);
1258}
1259
1260int
1261vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1262    struct uio *uio)
1263{
1264	struct thread *td;
1265	vm_offset_t iov_base;
1266	int cnt, pgadv;
1267
1268	td = curthread;
1269	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1270	    uio->uio_segflg != UIO_USERSPACE)
1271		return (uiomove_fromphys(ma, offset, xfersize, uio));
1272
1273	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1274	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1275	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1276	switch (uio->uio_rw) {
1277	case UIO_WRITE:
1278		pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1279		    offset, cnt);
1280		break;
1281	case UIO_READ:
1282		pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1283		    cnt);
1284		break;
1285	}
1286	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1287	td->td_ma += pgadv;
1288	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1289	    pgadv));
1290	td->td_ma_cnt -= pgadv;
1291	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1292	uio->uio_iov->iov_len -= cnt;
1293	uio->uio_resid -= cnt;
1294	uio->uio_offset += cnt;
1295	return (0);
1296}
1297
1298
1299/*
1300 * File table truncate routine.
1301 */
1302static int
1303vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1304    struct thread *td)
1305{
1306	struct vattr vattr;
1307	struct mount *mp;
1308	struct vnode *vp;
1309	void *rl_cookie;
1310	int error;
1311
1312	vp = fp->f_vnode;
1313
1314	/*
1315	 * Lock the whole range for truncation.  Otherwise split i/o
1316	 * might happen partly before and partly after the truncation.
1317	 */
1318	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1319	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1320	if (error)
1321		goto out1;
1322	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1323	if (vp->v_type == VDIR) {
1324		error = EISDIR;
1325		goto out;
1326	}
1327#ifdef MAC
1328	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1329	if (error)
1330		goto out;
1331#endif
1332	error = vn_writechk(vp);
1333	if (error == 0) {
1334		VATTR_NULL(&vattr);
1335		vattr.va_size = length;
1336		if ((fp->f_flag & O_FSYNC) != 0)
1337			vattr.va_vaflags |= VA_SYNC;
1338		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
1339	}
1340out:
1341	VOP_UNLOCK(vp, 0);
1342	vn_finished_write(mp);
1343out1:
1344	vn_rangelock_unlock(vp, rl_cookie);
1345	return (error);
1346}
1347
1348/*
1349 * File table vnode stat routine.
1350 */
1351static int
1352vn_statfile(fp, sb, active_cred, td)
1353	struct file *fp;
1354	struct stat *sb;
1355	struct ucred *active_cred;
1356	struct thread *td;
1357{
1358	struct vnode *vp = fp->f_vnode;
1359	int error;
1360
1361	vn_lock(vp, LK_SHARED | LK_RETRY);
1362	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
1363	VOP_UNLOCK(vp, 0);
1364
1365	return (error);
1366}
1367
1368/*
1369 * Stat a vnode; implementation for the stat syscall
1370 */
1371int
1372vn_stat(vp, sb, active_cred, file_cred, td)
1373	struct vnode *vp;
1374	register struct stat *sb;
1375	struct ucred *active_cred;
1376	struct ucred *file_cred;
1377	struct thread *td;
1378{
1379	struct vattr vattr;
1380	register struct vattr *vap;
1381	int error;
1382	u_short mode;
1383
1384	AUDIT_ARG_VNODE1(vp);
1385#ifdef MAC
1386	error = mac_vnode_check_stat(active_cred, file_cred, vp);
1387	if (error)
1388		return (error);
1389#endif
1390
1391	vap = &vattr;
1392
1393	/*
1394	 * Initialize defaults for new and unusual fields, so that file
1395	 * systems which don't support these fields don't need to know
1396	 * about them.
1397	 */
1398	vap->va_birthtime.tv_sec = -1;
1399	vap->va_birthtime.tv_nsec = 0;
1400	vap->va_fsid = VNOVAL;
1401	vap->va_rdev = NODEV;
1402
1403	error = VOP_GETATTR(vp, vap, active_cred);
1404	if (error)
1405		return (error);
1406
1407	/*
1408	 * Zero the spare stat fields
1409	 */
1410	bzero(sb, sizeof *sb);
1411
1412	/*
1413	 * Copy from vattr table
1414	 */
1415	if (vap->va_fsid != VNOVAL)
1416		sb->st_dev = vap->va_fsid;
1417	else
1418		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
1419	sb->st_ino = vap->va_fileid;
1420	mode = vap->va_mode;
1421	switch (vap->va_type) {
1422	case VREG:
1423		mode |= S_IFREG;
1424		break;
1425	case VDIR:
1426		mode |= S_IFDIR;
1427		break;
1428	case VBLK:
1429		mode |= S_IFBLK;
1430		break;
1431	case VCHR:
1432		mode |= S_IFCHR;
1433		break;
1434	case VLNK:
1435		mode |= S_IFLNK;
1436		break;
1437	case VSOCK:
1438		mode |= S_IFSOCK;
1439		break;
1440	case VFIFO:
1441		mode |= S_IFIFO;
1442		break;
1443	default:
1444		return (EBADF);
1445	}
1446	sb->st_mode = mode;
1447	sb->st_nlink = vap->va_nlink;
1448	sb->st_uid = vap->va_uid;
1449	sb->st_gid = vap->va_gid;
1450	sb->st_rdev = vap->va_rdev;
1451	if (vap->va_size > OFF_MAX)
1452		return (EOVERFLOW);
1453	sb->st_size = vap->va_size;
1454	sb->st_atim = vap->va_atime;
1455	sb->st_mtim = vap->va_mtime;
1456	sb->st_ctim = vap->va_ctime;
1457	sb->st_birthtim = vap->va_birthtime;
1458
1459        /*
1460	 * According to www.opengroup.org, the meaning of st_blksize is
1461	 *   "a filesystem-specific preferred I/O block size for this
1462	 *    object.  In some filesystem types, this may vary from file
1463	 *    to file"
1464	 * Use miminum/default of PAGE_SIZE (e.g. for VCHR).
1465	 */
1466
1467	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
1468
1469	sb->st_flags = vap->va_flags;
1470	if (priv_check(td, PRIV_VFS_GENERATION))
1471		sb->st_gen = 0;
1472	else
1473		sb->st_gen = vap->va_gen;
1474
1475	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
1476	return (0);
1477}
1478
1479/*
1480 * File table vnode ioctl routine.
1481 */
1482static int
1483vn_ioctl(fp, com, data, active_cred, td)
1484	struct file *fp;
1485	u_long com;
1486	void *data;
1487	struct ucred *active_cred;
1488	struct thread *td;
1489{
1490	struct vattr vattr;
1491	struct vnode *vp;
1492	int error;
1493
1494	vp = fp->f_vnode;
1495	switch (vp->v_type) {
1496	case VDIR:
1497	case VREG:
1498		switch (com) {
1499		case FIONREAD:
1500			vn_lock(vp, LK_SHARED | LK_RETRY);
1501			error = VOP_GETATTR(vp, &vattr, active_cred);
1502			VOP_UNLOCK(vp, 0);
1503			if (error == 0)
1504				*(int *)data = vattr.va_size - fp->f_offset;
1505			return (error);
1506		case FIONBIO:
1507		case FIOASYNC:
1508			return (0);
1509		default:
1510			return (VOP_IOCTL(vp, com, data, fp->f_flag,
1511			    active_cred, td));
1512		}
1513	default:
1514		return (ENOTTY);
1515	}
1516}
1517
1518/*
1519 * File table vnode poll routine.
1520 */
1521static int
1522vn_poll(fp, events, active_cred, td)
1523	struct file *fp;
1524	int events;
1525	struct ucred *active_cred;
1526	struct thread *td;
1527{
1528	struct vnode *vp;
1529	int error;
1530
1531	vp = fp->f_vnode;
1532#ifdef MAC
1533	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1534	AUDIT_ARG_VNODE1(vp);
1535	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1536	VOP_UNLOCK(vp, 0);
1537	if (!error)
1538#endif
1539
1540	error = VOP_POLL(vp, events, fp->f_cred, td);
1541	return (error);
1542}
1543
1544/*
1545 * Acquire the requested lock and then check for validity.  LK_RETRY
1546 * permits vn_lock to return doomed vnodes.
1547 */
1548int
1549_vn_lock(struct vnode *vp, int flags, char *file, int line)
1550{
1551	int error;
1552
1553	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1554	    ("vn_lock: no locktype"));
1555	VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count"));
1556retry:
1557	error = VOP_LOCK1(vp, flags, file, line);
1558	flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
1559	KASSERT((flags & LK_RETRY) == 0 || error == 0,
1560	    ("vn_lock: error %d incompatible with flags %#x", error, flags));
1561
1562	if ((flags & LK_RETRY) == 0) {
1563		if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) {
1564			VOP_UNLOCK(vp, 0);
1565			error = ENOENT;
1566		}
1567	} else if (error != 0)
1568		goto retry;
1569	return (error);
1570}
1571
1572/*
1573 * File table vnode close routine.
1574 */
1575static int
1576vn_closefile(struct file *fp, struct thread *td)
1577{
1578	struct vnode *vp;
1579	struct flock lf;
1580	int error;
1581	bool ref;
1582
1583	vp = fp->f_vnode;
1584	fp->f_ops = &badfileops;
1585	ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
1586
1587	error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
1588
1589	if (__predict_false(ref)) {
1590		lf.l_whence = SEEK_SET;
1591		lf.l_start = 0;
1592		lf.l_len = 0;
1593		lf.l_type = F_UNLCK;
1594		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1595		vrele(vp);
1596	}
1597	return (error);
1598}
1599
1600static bool
1601vn_suspendable(struct mount *mp)
1602{
1603
1604	return (mp->mnt_op->vfs_susp_clean != NULL);
1605}
1606
1607/*
1608 * Preparing to start a filesystem write operation. If the operation is
1609 * permitted, then we bump the count of operations in progress and
1610 * proceed. If a suspend request is in progress, we wait until the
1611 * suspension is over, and then proceed.
1612 */
1613static int
1614vn_start_write_locked(struct mount *mp, int flags)
1615{
1616	int error, mflags;
1617
1618	mtx_assert(MNT_MTX(mp), MA_OWNED);
1619	error = 0;
1620
1621	/*
1622	 * Check on status of suspension.
1623	 */
1624	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1625	    mp->mnt_susp_owner != curthread) {
1626		mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
1627		    (flags & PCATCH) : 0) | (PUSER - 1);
1628		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1629			if (flags & V_NOWAIT) {
1630				error = EWOULDBLOCK;
1631				goto unlock;
1632			}
1633			error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
1634			    "suspfs", 0);
1635			if (error)
1636				goto unlock;
1637		}
1638	}
1639	if (flags & V_XSLEEP)
1640		goto unlock;
1641	mp->mnt_writeopcount++;
1642unlock:
1643	if (error != 0 || (flags & V_XSLEEP) != 0)
1644		MNT_REL(mp);
1645	MNT_IUNLOCK(mp);
1646	return (error);
1647}
1648
1649int
1650vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
1651{
1652	struct mount *mp;
1653	int error;
1654
1655	KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1656	    ("V_MNTREF requires mp"));
1657
1658	error = 0;
1659	/*
1660	 * If a vnode is provided, get and return the mount point that
1661	 * to which it will write.
1662	 */
1663	if (vp != NULL) {
1664		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1665			*mpp = NULL;
1666			if (error != EOPNOTSUPP)
1667				return (error);
1668			return (0);
1669		}
1670	}
1671	if ((mp = *mpp) == NULL)
1672		return (0);
1673
1674	if (!vn_suspendable(mp)) {
1675		if (vp != NULL || (flags & V_MNTREF) != 0)
1676			vfs_rel(mp);
1677		return (0);
1678	}
1679
1680	/*
1681	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1682	 * a vfs_ref().
1683	 * As long as a vnode is not provided we need to acquire a
1684	 * refcount for the provided mountpoint too, in order to
1685	 * emulate a vfs_ref().
1686	 */
1687	MNT_ILOCK(mp);
1688	if (vp == NULL && (flags & V_MNTREF) == 0)
1689		MNT_REF(mp);
1690
1691	return (vn_start_write_locked(mp, flags));
1692}
1693
1694/*
1695 * Secondary suspension. Used by operations such as vop_inactive
1696 * routines that are needed by the higher level functions. These
1697 * are allowed to proceed until all the higher level functions have
1698 * completed (indicated by mnt_writeopcount dropping to zero). At that
1699 * time, these operations are halted until the suspension is over.
1700 */
1701int
1702vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
1703{
1704	struct mount *mp;
1705	int error;
1706
1707	KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1708	    ("V_MNTREF requires mp"));
1709
1710 retry:
1711	if (vp != NULL) {
1712		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1713			*mpp = NULL;
1714			if (error != EOPNOTSUPP)
1715				return (error);
1716			return (0);
1717		}
1718	}
1719	/*
1720	 * If we are not suspended or have not yet reached suspended
1721	 * mode, then let the operation proceed.
1722	 */
1723	if ((mp = *mpp) == NULL)
1724		return (0);
1725
1726	if (!vn_suspendable(mp)) {
1727		if (vp != NULL || (flags & V_MNTREF) != 0)
1728			vfs_rel(mp);
1729		return (0);
1730	}
1731
1732	/*
1733	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1734	 * a vfs_ref().
1735	 * As long as a vnode is not provided we need to acquire a
1736	 * refcount for the provided mountpoint too, in order to
1737	 * emulate a vfs_ref().
1738	 */
1739	MNT_ILOCK(mp);
1740	if (vp == NULL && (flags & V_MNTREF) == 0)
1741		MNT_REF(mp);
1742	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1743		mp->mnt_secondary_writes++;
1744		mp->mnt_secondary_accwrites++;
1745		MNT_IUNLOCK(mp);
1746		return (0);
1747	}
1748	if (flags & V_NOWAIT) {
1749		MNT_REL(mp);
1750		MNT_IUNLOCK(mp);
1751		return (EWOULDBLOCK);
1752	}
1753	/*
1754	 * Wait for the suspension to finish.
1755	 */
1756	error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
1757	    ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
1758	    "suspfs", 0);
1759	vfs_rel(mp);
1760	if (error == 0)
1761		goto retry;
1762	return (error);
1763}
1764
1765/*
1766 * Filesystem write operation has completed. If we are suspending and this
1767 * operation is the last one, notify the suspender that the suspension is
1768 * now in effect.
1769 */
1770void
1771vn_finished_write(mp)
1772	struct mount *mp;
1773{
1774	if (mp == NULL || !vn_suspendable(mp))
1775		return;
1776	MNT_ILOCK(mp);
1777	MNT_REL(mp);
1778	mp->mnt_writeopcount--;
1779	if (mp->mnt_writeopcount < 0)
1780		panic("vn_finished_write: neg cnt");
1781	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1782	    mp->mnt_writeopcount <= 0)
1783		wakeup(&mp->mnt_writeopcount);
1784	MNT_IUNLOCK(mp);
1785}
1786
1787
1788/*
1789 * Filesystem secondary write operation has completed. If we are
1790 * suspending and this operation is the last one, notify the suspender
1791 * that the suspension is now in effect.
1792 */
1793void
1794vn_finished_secondary_write(mp)
1795	struct mount *mp;
1796{
1797	if (mp == NULL || !vn_suspendable(mp))
1798		return;
1799	MNT_ILOCK(mp);
1800	MNT_REL(mp);
1801	mp->mnt_secondary_writes--;
1802	if (mp->mnt_secondary_writes < 0)
1803		panic("vn_finished_secondary_write: neg cnt");
1804	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1805	    mp->mnt_secondary_writes <= 0)
1806		wakeup(&mp->mnt_secondary_writes);
1807	MNT_IUNLOCK(mp);
1808}
1809
1810
1811
1812/*
1813 * Request a filesystem to suspend write operations.
1814 */
1815int
1816vfs_write_suspend(struct mount *mp, int flags)
1817{
1818	int error;
1819
1820	MPASS(vn_suspendable(mp));
1821
1822	MNT_ILOCK(mp);
1823	if (mp->mnt_susp_owner == curthread) {
1824		MNT_IUNLOCK(mp);
1825		return (EALREADY);
1826	}
1827	while (mp->mnt_kern_flag & MNTK_SUSPEND)
1828		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
1829
1830	/*
1831	 * Unmount holds a write reference on the mount point.  If we
1832	 * own busy reference and drain for writers, we deadlock with
1833	 * the reference draining in the unmount path.  Callers of
1834	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
1835	 * vfs_busy() reference is owned and caller is not in the
1836	 * unmount context.
1837	 */
1838	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
1839	    (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
1840		MNT_IUNLOCK(mp);
1841		return (EBUSY);
1842	}
1843
1844	mp->mnt_kern_flag |= MNTK_SUSPEND;
1845	mp->mnt_susp_owner = curthread;
1846	if (mp->mnt_writeopcount > 0)
1847		(void) msleep(&mp->mnt_writeopcount,
1848		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
1849	else
1850		MNT_IUNLOCK(mp);
1851	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
1852		vfs_write_resume(mp, 0);
1853	return (error);
1854}
1855
1856/*
1857 * Request a filesystem to resume write operations.
1858 */
1859void
1860vfs_write_resume(struct mount *mp, int flags)
1861{
1862
1863	MPASS(vn_suspendable(mp));
1864
1865	MNT_ILOCK(mp);
1866	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1867		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
1868		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
1869				       MNTK_SUSPENDED);
1870		mp->mnt_susp_owner = NULL;
1871		wakeup(&mp->mnt_writeopcount);
1872		wakeup(&mp->mnt_flag);
1873		curthread->td_pflags &= ~TDP_IGNSUSP;
1874		if ((flags & VR_START_WRITE) != 0) {
1875			MNT_REF(mp);
1876			mp->mnt_writeopcount++;
1877		}
1878		MNT_IUNLOCK(mp);
1879		if ((flags & VR_NO_SUSPCLR) == 0)
1880			VFS_SUSP_CLEAN(mp);
1881	} else if ((flags & VR_START_WRITE) != 0) {
1882		MNT_REF(mp);
1883		vn_start_write_locked(mp, 0);
1884	} else {
1885		MNT_IUNLOCK(mp);
1886	}
1887}
1888
1889/*
1890 * Helper loop around vfs_write_suspend() for filesystem unmount VFS
1891 * methods.
1892 */
1893int
1894vfs_write_suspend_umnt(struct mount *mp)
1895{
1896	int error;
1897
1898	MPASS(vn_suspendable(mp));
1899	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
1900	    ("vfs_write_suspend_umnt: recursed"));
1901
1902	/* dounmount() already called vn_start_write(). */
1903	for (;;) {
1904		vn_finished_write(mp);
1905		error = vfs_write_suspend(mp, 0);
1906		if (error != 0) {
1907			vn_start_write(NULL, &mp, V_WAIT);
1908			return (error);
1909		}
1910		MNT_ILOCK(mp);
1911		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
1912			break;
1913		MNT_IUNLOCK(mp);
1914		vn_start_write(NULL, &mp, V_WAIT);
1915	}
1916	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
1917	wakeup(&mp->mnt_flag);
1918	MNT_IUNLOCK(mp);
1919	curthread->td_pflags |= TDP_IGNSUSP;
1920	return (0);
1921}
1922
1923/*
1924 * Implement kqueues for files by translating it to vnode operation.
1925 */
1926static int
1927vn_kqfilter(struct file *fp, struct knote *kn)
1928{
1929
1930	return (VOP_KQFILTER(fp->f_vnode, kn));
1931}
1932
1933/*
1934 * Simplified in-kernel wrapper calls for extended attribute access.
1935 * Both calls pass in a NULL credential, authorizing as "kernel" access.
1936 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1937 */
1938int
1939vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1940    const char *attrname, int *buflen, char *buf, struct thread *td)
1941{
1942	struct uio	auio;
1943	struct iovec	iov;
1944	int	error;
1945
1946	iov.iov_len = *buflen;
1947	iov.iov_base = buf;
1948
1949	auio.uio_iov = &iov;
1950	auio.uio_iovcnt = 1;
1951	auio.uio_rw = UIO_READ;
1952	auio.uio_segflg = UIO_SYSSPACE;
1953	auio.uio_td = td;
1954	auio.uio_offset = 0;
1955	auio.uio_resid = *buflen;
1956
1957	if ((ioflg & IO_NODELOCKED) == 0)
1958		vn_lock(vp, LK_SHARED | LK_RETRY);
1959
1960	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1961
1962	/* authorize attribute retrieval as kernel */
1963	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1964	    td);
1965
1966	if ((ioflg & IO_NODELOCKED) == 0)
1967		VOP_UNLOCK(vp, 0);
1968
1969	if (error == 0) {
1970		*buflen = *buflen - auio.uio_resid;
1971	}
1972
1973	return (error);
1974}
1975
1976/*
1977 * XXX failure mode if partially written?
1978 */
1979int
1980vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1981    const char *attrname, int buflen, char *buf, struct thread *td)
1982{
1983	struct uio	auio;
1984	struct iovec	iov;
1985	struct mount	*mp;
1986	int	error;
1987
1988	iov.iov_len = buflen;
1989	iov.iov_base = buf;
1990
1991	auio.uio_iov = &iov;
1992	auio.uio_iovcnt = 1;
1993	auio.uio_rw = UIO_WRITE;
1994	auio.uio_segflg = UIO_SYSSPACE;
1995	auio.uio_td = td;
1996	auio.uio_offset = 0;
1997	auio.uio_resid = buflen;
1998
1999	if ((ioflg & IO_NODELOCKED) == 0) {
2000		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2001			return (error);
2002		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2003	}
2004
2005	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2006
2007	/* authorize attribute setting as kernel */
2008	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
2009
2010	if ((ioflg & IO_NODELOCKED) == 0) {
2011		vn_finished_write(mp);
2012		VOP_UNLOCK(vp, 0);
2013	}
2014
2015	return (error);
2016}
2017
2018int
2019vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
2020    const char *attrname, struct thread *td)
2021{
2022	struct mount	*mp;
2023	int	error;
2024
2025	if ((ioflg & IO_NODELOCKED) == 0) {
2026		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2027			return (error);
2028		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2029	}
2030
2031	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2032
2033	/* authorize attribute removal as kernel */
2034	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
2035	if (error == EOPNOTSUPP)
2036		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
2037		    NULL, td);
2038
2039	if ((ioflg & IO_NODELOCKED) == 0) {
2040		vn_finished_write(mp);
2041		VOP_UNLOCK(vp, 0);
2042	}
2043
2044	return (error);
2045}
2046
2047static int
2048vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
2049    struct vnode **rvp)
2050{
2051
2052	return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
2053}
2054
2055int
2056vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
2057{
2058
2059	return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2060	    lkflags, rvp));
2061}
2062
2063int
2064vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
2065    int lkflags, struct vnode **rvp)
2066{
2067	struct mount *mp;
2068	int ltype, error;
2069
2070	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
2071	mp = vp->v_mount;
2072	ltype = VOP_ISLOCKED(vp);
2073	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
2074	    ("vn_vget_ino: vp not locked"));
2075	error = vfs_busy(mp, MBF_NOWAIT);
2076	if (error != 0) {
2077		vfs_ref(mp);
2078		VOP_UNLOCK(vp, 0);
2079		error = vfs_busy(mp, 0);
2080		vn_lock(vp, ltype | LK_RETRY);
2081		vfs_rel(mp);
2082		if (error != 0)
2083			return (ENOENT);
2084		if (vp->v_iflag & VI_DOOMED) {
2085			vfs_unbusy(mp);
2086			return (ENOENT);
2087		}
2088	}
2089	VOP_UNLOCK(vp, 0);
2090	error = alloc(mp, alloc_arg, lkflags, rvp);
2091	vfs_unbusy(mp);
2092	if (*rvp != vp)
2093		vn_lock(vp, ltype | LK_RETRY);
2094	if (vp->v_iflag & VI_DOOMED) {
2095		if (error == 0) {
2096			if (*rvp == vp)
2097				vunref(vp);
2098			else
2099				vput(*rvp);
2100		}
2101		error = ENOENT;
2102	}
2103	return (error);
2104}
2105
2106int
2107vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
2108    struct thread *td)
2109{
2110
2111	if (vp->v_type != VREG || td == NULL)
2112		return (0);
2113	if ((uoff_t)uio->uio_offset + uio->uio_resid >
2114	    lim_cur(td, RLIMIT_FSIZE)) {
2115		PROC_LOCK(td->td_proc);
2116		kern_psignal(td->td_proc, SIGXFSZ);
2117		PROC_UNLOCK(td->td_proc);
2118		return (EFBIG);
2119	}
2120	return (0);
2121}
2122
2123int
2124vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2125    struct thread *td)
2126{
2127	struct vnode *vp;
2128
2129	vp = fp->f_vnode;
2130#ifdef AUDIT
2131	vn_lock(vp, LK_SHARED | LK_RETRY);
2132	AUDIT_ARG_VNODE1(vp);
2133	VOP_UNLOCK(vp, 0);
2134#endif
2135	return (setfmode(td, active_cred, vp, mode));
2136}
2137
2138int
2139vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2140    struct thread *td)
2141{
2142	struct vnode *vp;
2143
2144	vp = fp->f_vnode;
2145#ifdef AUDIT
2146	vn_lock(vp, LK_SHARED | LK_RETRY);
2147	AUDIT_ARG_VNODE1(vp);
2148	VOP_UNLOCK(vp, 0);
2149#endif
2150	return (setfown(td, active_cred, vp, uid, gid));
2151}
2152
2153void
2154vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2155{
2156	vm_object_t object;
2157
2158	if ((object = vp->v_object) == NULL)
2159		return;
2160	VM_OBJECT_WLOCK(object);
2161	vm_object_page_remove(object, start, end, 0);
2162	VM_OBJECT_WUNLOCK(object);
2163}
2164
2165int
2166vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
2167{
2168	struct vattr va;
2169	daddr_t bn, bnp;
2170	uint64_t bsize;
2171	off_t noff;
2172	int error;
2173
2174	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2175	    ("Wrong command %lu", cmd));
2176
2177	if (vn_lock(vp, LK_SHARED) != 0)
2178		return (EBADF);
2179	if (vp->v_type != VREG) {
2180		error = ENOTTY;
2181		goto unlock;
2182	}
2183	error = VOP_GETATTR(vp, &va, cred);
2184	if (error != 0)
2185		goto unlock;
2186	noff = *off;
2187	if (noff >= va.va_size) {
2188		error = ENXIO;
2189		goto unlock;
2190	}
2191	bsize = vp->v_mount->mnt_stat.f_iosize;
2192	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
2193		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2194		if (error == EOPNOTSUPP) {
2195			error = ENOTTY;
2196			goto unlock;
2197		}
2198		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
2199		    (bnp != -1 && cmd == FIOSEEKDATA)) {
2200			noff = bn * bsize;
2201			if (noff < *off)
2202				noff = *off;
2203			goto unlock;
2204		}
2205	}
2206	if (noff > va.va_size)
2207		noff = va.va_size;
2208	/* noff == va.va_size. There is an implicit hole at the end of file. */
2209	if (cmd == FIOSEEKDATA)
2210		error = ENXIO;
2211unlock:
2212	VOP_UNLOCK(vp, 0);
2213	if (error == 0)
2214		*off = noff;
2215	return (error);
2216}
2217
2218int
2219vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
2220{
2221	struct ucred *cred;
2222	struct vnode *vp;
2223	struct vattr vattr;
2224	off_t foffset, size;
2225	int error, noneg;
2226
2227	cred = td->td_ucred;
2228	vp = fp->f_vnode;
2229	foffset = foffset_lock(fp, 0);
2230	noneg = (vp->v_type != VCHR);
2231	error = 0;
2232	switch (whence) {
2233	case L_INCR:
2234		if (noneg &&
2235		    (foffset < 0 ||
2236		    (offset > 0 && foffset > OFF_MAX - offset))) {
2237			error = EOVERFLOW;
2238			break;
2239		}
2240		offset += foffset;
2241		break;
2242	case L_XTND:
2243		vn_lock(vp, LK_SHARED | LK_RETRY);
2244		error = VOP_GETATTR(vp, &vattr, cred);
2245		VOP_UNLOCK(vp, 0);
2246		if (error)
2247			break;
2248
2249		/*
2250		 * If the file references a disk device, then fetch
2251		 * the media size and use that to determine the ending
2252		 * offset.
2253		 */
2254		if (vattr.va_size == 0 && vp->v_type == VCHR &&
2255		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2256			vattr.va_size = size;
2257		if (noneg &&
2258		    (vattr.va_size > OFF_MAX ||
2259		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2260			error = EOVERFLOW;
2261			break;
2262		}
2263		offset += vattr.va_size;
2264		break;
2265	case L_SET:
2266		break;
2267	case SEEK_DATA:
2268		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2269		break;
2270	case SEEK_HOLE:
2271		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2272		break;
2273	default:
2274		error = EINVAL;
2275	}
2276	if (error == 0 && noneg && offset < 0)
2277		error = EINVAL;
2278	if (error != 0)
2279		goto drop;
2280	VFS_KNOTE_UNLOCKED(vp, 0);
2281	td->td_uretoff.tdu_off = offset;
2282drop:
2283	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2284	return (error);
2285}
2286
2287int
2288vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
2289    struct thread *td)
2290{
2291	int error;
2292
2293	/*
2294	 * Grant permission if the caller is the owner of the file, or
2295	 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
2296	 * on the file.  If the time pointer is null, then write
2297	 * permission on the file is also sufficient.
2298	 *
2299	 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2300	 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2301	 * will be allowed to set the times [..] to the current
2302	 * server time.
2303	 */
2304	error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2305	if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2306		error = VOP_ACCESS(vp, VWRITE, cred, td);
2307	return (error);
2308}
2309
2310int
2311vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2312{
2313	struct vnode *vp;
2314	int error;
2315
2316	if (fp->f_type == DTYPE_FIFO)
2317		kif->kf_type = KF_TYPE_FIFO;
2318	else
2319		kif->kf_type = KF_TYPE_VNODE;
2320	vp = fp->f_vnode;
2321	vref(vp);
2322	FILEDESC_SUNLOCK(fdp);
2323	error = vn_fill_kinfo_vnode(vp, kif);
2324	vrele(vp);
2325	FILEDESC_SLOCK(fdp);
2326	return (error);
2327}
2328
2329static inline void
2330vn_fill_junk(struct kinfo_file *kif)
2331{
2332	size_t len, olen;
2333
2334	/*
2335	 * Simulate vn_fullpath returning changing values for a given
2336	 * vp during e.g. coredump.
2337	 */
2338	len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
2339	olen = strlen(kif->kf_path);
2340	if (len < olen)
2341		strcpy(&kif->kf_path[len - 1], "$");
2342	else
2343		for (; olen < len; olen++)
2344			strcpy(&kif->kf_path[olen], "A");
2345}
2346
2347int
2348vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
2349{
2350	struct vattr va;
2351	char *fullpath, *freepath;
2352	int error;
2353
2354	kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
2355	freepath = NULL;
2356	fullpath = "-";
2357	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
2358	if (error == 0) {
2359		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2360	}
2361	if (freepath != NULL)
2362		free(freepath, M_TEMP);
2363
2364	KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
2365		vn_fill_junk(kif);
2366	);
2367
2368	/*
2369	 * Retrieve vnode attributes.
2370	 */
2371	va.va_fsid = VNOVAL;
2372	va.va_rdev = NODEV;
2373	vn_lock(vp, LK_SHARED | LK_RETRY);
2374	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
2375	VOP_UNLOCK(vp, 0);
2376	if (error != 0)
2377		return (error);
2378	if (va.va_fsid != VNOVAL)
2379		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
2380	else
2381		kif->kf_un.kf_file.kf_file_fsid =
2382		    vp->v_mount->mnt_stat.f_fsid.val[0];
2383	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
2384	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
2385	kif->kf_un.kf_file.kf_file_size = va.va_size;
2386	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
2387	return (0);
2388}
2389
2390int
2391vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
2392    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
2393    struct thread *td)
2394{
2395#ifdef HWPMC_HOOKS
2396	struct pmckern_map_in pkm;
2397#endif
2398	struct mount *mp;
2399	struct vnode *vp;
2400	vm_object_t object;
2401	vm_prot_t maxprot;
2402	boolean_t writecounted;
2403	int error;
2404
2405#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
2406    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
2407	/*
2408	 * POSIX shared-memory objects are defined to have
2409	 * kernel persistence, and are not defined to support
2410	 * read(2)/write(2) -- or even open(2).  Thus, we can
2411	 * use MAP_ASYNC to trade on-disk coherence for speed.
2412	 * The shm_open(3) library routine turns on the FPOSIXSHM
2413	 * flag to request this behavior.
2414	 */
2415	if ((fp->f_flag & FPOSIXSHM) != 0)
2416		flags |= MAP_NOSYNC;
2417#endif
2418	vp = fp->f_vnode;
2419
2420	/*
2421	 * Ensure that file and memory protections are
2422	 * compatible.  Note that we only worry about
2423	 * writability if mapping is shared; in this case,
2424	 * current and max prot are dictated by the open file.
2425	 * XXX use the vnode instead?  Problem is: what
2426	 * credentials do we use for determination? What if
2427	 * proc does a setuid?
2428	 */
2429	mp = vp->v_mount;
2430	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
2431		maxprot = VM_PROT_NONE;
2432		if ((prot & VM_PROT_EXECUTE) != 0)
2433			return (EACCES);
2434	} else
2435		maxprot = VM_PROT_EXECUTE;
2436	if ((fp->f_flag & FREAD) != 0)
2437		maxprot |= VM_PROT_READ;
2438	else if ((prot & VM_PROT_READ) != 0)
2439		return (EACCES);
2440
2441	/*
2442	 * If we are sharing potential changes via MAP_SHARED and we
2443	 * are trying to get write permission although we opened it
2444	 * without asking for it, bail out.
2445	 */
2446	if ((flags & MAP_SHARED) != 0) {
2447		if ((fp->f_flag & FWRITE) != 0)
2448			maxprot |= VM_PROT_WRITE;
2449		else if ((prot & VM_PROT_WRITE) != 0)
2450			return (EACCES);
2451	} else {
2452		maxprot |= VM_PROT_WRITE;
2453		cap_maxprot |= VM_PROT_WRITE;
2454	}
2455	maxprot &= cap_maxprot;
2456
2457	/*
2458	 * For regular files and shared memory, POSIX requires that
2459	 * the value of foff be a legitimate offset within the data
2460	 * object.  In particular, negative offsets are invalid.
2461	 * Blocking negative offsets and overflows here avoids
2462	 * possible wraparound or user-level access into reserved
2463	 * ranges of the data object later.  In contrast, POSIX does
2464	 * not dictate how offsets are used by device drivers, so in
2465	 * the case of a device mapping a negative offset is passed
2466	 * on.
2467	 */
2468	if (
2469#ifdef _LP64
2470	    size > OFF_MAX ||
2471#endif
2472	    foff < 0 || foff > OFF_MAX - size)
2473		return (EINVAL);
2474
2475	writecounted = FALSE;
2476	error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
2477	    &foff, &object, &writecounted);
2478	if (error != 0)
2479		return (error);
2480	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
2481	    foff, writecounted, td);
2482	if (error != 0) {
2483		/*
2484		 * If this mapping was accounted for in the vnode's
2485		 * writecount, then undo that now.
2486		 */
2487		if (writecounted)
2488			vnode_pager_release_writecount(object, 0, size);
2489		vm_object_deallocate(object);
2490	}
2491#ifdef HWPMC_HOOKS
2492	/* Inform hwpmc(4) if an executable is being mapped. */
2493	if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
2494		if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
2495			pkm.pm_file = vp;
2496			pkm.pm_address = (uintptr_t) *addr;
2497			PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
2498		}
2499	}
2500#endif
2501	return (error);
2502}
2503