vfs_vnops.c revision 276649
139291Sfenner/*-
239291Sfenner * Copyright (c) 1982, 1986, 1989, 1993
339291Sfenner *	The Regents of the University of California.  All rights reserved.
439291Sfenner * (c) UNIX System Laboratories, Inc.
539291Sfenner * All or some portions of this file are derived from material licensed
639291Sfenner * to the University of California by American Telephone and Telegraph
739291Sfenner * Co. or Unix System Laboratories, Inc. and are reproduced herein with
839291Sfenner * the permission of UNIX System Laboratories, Inc.
939291Sfenner *
1039291Sfenner * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
1139291Sfenner * Copyright (c) 2013, 2014 The FreeBSD Foundation
1239291Sfenner *
1339291Sfenner * Portions of this software were developed by Konstantin Belousov
1439291Sfenner * under sponsorship from the FreeBSD Foundation.
1539291Sfenner *
1639291Sfenner * Redistribution and use in source and binary forms, with or without
1739291Sfenner * modification, are permitted provided that the following conditions
1839291Sfenner * are met:
1939291Sfenner * 1. Redistributions of source code must retain the above copyright
2039291Sfenner *    notice, this list of conditions and the following disclaimer.
2139291Sfenner * 2. Redistributions in binary form must reproduce the above copyright
2239291Sfenner *    notice, this list of conditions and the following disclaimer in the
2339291Sfenner *    documentation and/or other materials provided with the distribution.
2439291Sfenner * 4. Neither the name of the University nor the names of its contributors
2539291Sfenner *    may be used to endorse or promote products derived from this software
2639291Sfenner *    without specific prior written permission.
2739291Sfenner *
2839291Sfenner * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2939291Sfenner * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3039291Sfenner * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
3139291Sfenner * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
3239291Sfenner * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
3339291Sfenner * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
3439291Sfenner * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
3539291Sfenner * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
3639291Sfenner * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3739291Sfenner * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3839291Sfenner * SUCH DAMAGE.
3939291Sfenner *
4039291Sfenner *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
4139291Sfenner */
4239291Sfenner
4339291Sfenner#include <sys/cdefs.h>
4439291Sfenner__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_vnops.c 276649 2015-01-04 00:49:45Z kib $");
4539291Sfenner
4639291Sfenner#include <sys/param.h>
4739291Sfenner#include <sys/systm.h>
4839291Sfenner#include <sys/disk.h>
4939291Sfenner#include <sys/fcntl.h>
5039291Sfenner#include <sys/file.h>
51#include <sys/kdb.h>
52#include <sys/stat.h>
53#include <sys/priv.h>
54#include <sys/proc.h>
55#include <sys/limits.h>
56#include <sys/lock.h>
57#include <sys/mount.h>
58#include <sys/mutex.h>
59#include <sys/namei.h>
60#include <sys/vnode.h>
61#include <sys/bio.h>
62#include <sys/buf.h>
63#include <sys/filio.h>
64#include <sys/resourcevar.h>
65#include <sys/rwlock.h>
66#include <sys/sx.h>
67#include <sys/sysctl.h>
68#include <sys/ttycom.h>
69#include <sys/conf.h>
70#include <sys/syslog.h>
71#include <sys/unistd.h>
72
73#include <security/audit/audit.h>
74#include <security/mac/mac_framework.h>
75
76#include <vm/vm.h>
77#include <vm/vm_extern.h>
78#include <vm/pmap.h>
79#include <vm/vm_map.h>
80#include <vm/vm_object.h>
81#include <vm/vm_page.h>
82
83static fo_rdwr_t	vn_read;
84static fo_rdwr_t	vn_write;
85static fo_rdwr_t	vn_io_fault;
86static fo_truncate_t	vn_truncate;
87static fo_ioctl_t	vn_ioctl;
88static fo_poll_t	vn_poll;
89static fo_kqfilter_t	vn_kqfilter;
90static fo_stat_t	vn_statfile;
91static fo_close_t	vn_closefile;
92
93struct 	fileops vnops = {
94	.fo_read = vn_io_fault,
95	.fo_write = vn_io_fault,
96	.fo_truncate = vn_truncate,
97	.fo_ioctl = vn_ioctl,
98	.fo_poll = vn_poll,
99	.fo_kqfilter = vn_kqfilter,
100	.fo_stat = vn_statfile,
101	.fo_close = vn_closefile,
102	.fo_chmod = vn_chmod,
103	.fo_chown = vn_chown,
104	.fo_sendfile = vn_sendfile,
105	.fo_seek = vn_seek,
106	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
107};
108
109static const int io_hold_cnt = 16;
110static int vn_io_fault_enable = 1;
111SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
112    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
113static u_long vn_io_faults_cnt;
114SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
115    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
116
117/*
118 * Returns true if vn_io_fault mode of handling the i/o request should
119 * be used.
120 */
121static bool
122do_vn_io_fault(struct vnode *vp, struct uio *uio)
123{
124	struct mount *mp;
125
126	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
127	    (mp = vp->v_mount) != NULL &&
128	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
129}
130
131/*
132 * Structure used to pass arguments to vn_io_fault1(), to do either
133 * file- or vnode-based I/O calls.
134 */
135struct vn_io_fault_args {
136	enum {
137		VN_IO_FAULT_FOP,
138		VN_IO_FAULT_VOP
139	} kind;
140	struct ucred *cred;
141	int flags;
142	union {
143		struct fop_args_tag {
144			struct file *fp;
145			fo_rdwr_t *doio;
146		} fop_args;
147		struct vop_args_tag {
148			struct vnode *vp;
149		} vop_args;
150	} args;
151};
152
153static int vn_io_fault1(struct vnode *vp, struct uio *uio,
154    struct vn_io_fault_args *args, struct thread *td);
155
156int
157vn_open(ndp, flagp, cmode, fp)
158	struct nameidata *ndp;
159	int *flagp, cmode;
160	struct file *fp;
161{
162	struct thread *td = ndp->ni_cnd.cn_thread;
163
164	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
165}
166
167/*
168 * Common code for vnode open operations via a name lookup.
169 * Lookup the vnode and invoke VOP_CREATE if needed.
170 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
171 *
172 * Note that this does NOT free nameidata for the successful case,
173 * due to the NDINIT being done elsewhere.
174 */
175int
176vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
177    struct ucred *cred, struct file *fp)
178{
179	struct vnode *vp;
180	struct mount *mp;
181	struct thread *td = ndp->ni_cnd.cn_thread;
182	struct vattr vat;
183	struct vattr *vap = &vat;
184	int fmode, error;
185
186restart:
187	fmode = *flagp;
188	if (fmode & O_CREAT) {
189		ndp->ni_cnd.cn_nameiop = CREATE;
190		/*
191		 * Set NOCACHE to avoid flushing the cache when
192		 * rolling in many files at once.
193		 */
194		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE;
195		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
196			ndp->ni_cnd.cn_flags |= FOLLOW;
197		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
198			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
199		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
200			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
201		bwillwrite();
202		if ((error = namei(ndp)) != 0)
203			return (error);
204		if (ndp->ni_vp == NULL) {
205			VATTR_NULL(vap);
206			vap->va_type = VREG;
207			vap->va_mode = cmode;
208			if (fmode & O_EXCL)
209				vap->va_vaflags |= VA_EXCLUSIVE;
210			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
211				NDFREE(ndp, NDF_ONLY_PNBUF);
212				vput(ndp->ni_dvp);
213				if ((error = vn_start_write(NULL, &mp,
214				    V_XSLEEP | PCATCH)) != 0)
215					return (error);
216				goto restart;
217			}
218			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
219				ndp->ni_cnd.cn_flags |= MAKEENTRY;
220#ifdef MAC
221			error = mac_vnode_check_create(cred, ndp->ni_dvp,
222			    &ndp->ni_cnd, vap);
223			if (error == 0)
224#endif
225				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
226						   &ndp->ni_cnd, vap);
227			vput(ndp->ni_dvp);
228			vn_finished_write(mp);
229			if (error) {
230				NDFREE(ndp, NDF_ONLY_PNBUF);
231				return (error);
232			}
233			fmode &= ~O_TRUNC;
234			vp = ndp->ni_vp;
235		} else {
236			if (ndp->ni_dvp == ndp->ni_vp)
237				vrele(ndp->ni_dvp);
238			else
239				vput(ndp->ni_dvp);
240			ndp->ni_dvp = NULL;
241			vp = ndp->ni_vp;
242			if (fmode & O_EXCL) {
243				error = EEXIST;
244				goto bad;
245			}
246			fmode &= ~O_CREAT;
247		}
248	} else {
249		ndp->ni_cnd.cn_nameiop = LOOKUP;
250		ndp->ni_cnd.cn_flags = ISOPEN |
251		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
252		if (!(fmode & FWRITE))
253			ndp->ni_cnd.cn_flags |= LOCKSHARED;
254		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
255			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
256		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
257			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
258		if ((error = namei(ndp)) != 0)
259			return (error);
260		vp = ndp->ni_vp;
261	}
262	error = vn_open_vnode(vp, fmode, cred, td, fp);
263	if (error)
264		goto bad;
265	*flagp = fmode;
266	return (0);
267bad:
268	NDFREE(ndp, NDF_ONLY_PNBUF);
269	vput(vp);
270	*flagp = fmode;
271	ndp->ni_vp = NULL;
272	return (error);
273}
274
275/*
276 * Common code for vnode open operations once a vnode is located.
277 * Check permissions, and call the VOP_OPEN routine.
278 */
279int
280vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
281    struct thread *td, struct file *fp)
282{
283	struct mount *mp;
284	accmode_t accmode;
285	struct flock lf;
286	int error, have_flock, lock_flags, type;
287
288	if (vp->v_type == VLNK)
289		return (EMLINK);
290	if (vp->v_type == VSOCK)
291		return (EOPNOTSUPP);
292	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
293		return (ENOTDIR);
294	accmode = 0;
295	if (fmode & (FWRITE | O_TRUNC)) {
296		if (vp->v_type == VDIR)
297			return (EISDIR);
298		accmode |= VWRITE;
299	}
300	if (fmode & FREAD)
301		accmode |= VREAD;
302	if (fmode & FEXEC)
303		accmode |= VEXEC;
304	if ((fmode & O_APPEND) && (fmode & FWRITE))
305		accmode |= VAPPEND;
306#ifdef MAC
307	error = mac_vnode_check_open(cred, vp, accmode);
308	if (error)
309		return (error);
310#endif
311	if ((fmode & O_CREAT) == 0) {
312		if (accmode & VWRITE) {
313			error = vn_writechk(vp);
314			if (error)
315				return (error);
316		}
317		if (accmode) {
318			error = VOP_ACCESS(vp, accmode, cred, td);
319			if (error)
320				return (error);
321		}
322	}
323	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
324		vn_lock(vp, LK_UPGRADE | LK_RETRY);
325	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
326		return (error);
327
328	if (fmode & (O_EXLOCK | O_SHLOCK)) {
329		KASSERT(fp != NULL, ("open with flock requires fp"));
330		lock_flags = VOP_ISLOCKED(vp);
331		VOP_UNLOCK(vp, 0);
332		lf.l_whence = SEEK_SET;
333		lf.l_start = 0;
334		lf.l_len = 0;
335		if (fmode & O_EXLOCK)
336			lf.l_type = F_WRLCK;
337		else
338			lf.l_type = F_RDLCK;
339		type = F_FLOCK;
340		if ((fmode & FNONBLOCK) == 0)
341			type |= F_WAIT;
342		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
343		have_flock = (error == 0);
344		vn_lock(vp, lock_flags | LK_RETRY);
345		if (error == 0 && vp->v_iflag & VI_DOOMED)
346			error = ENOENT;
347		/*
348		 * Another thread might have used this vnode as an
349		 * executable while the vnode lock was dropped.
350		 * Ensure the vnode is still able to be opened for
351		 * writing after the lock has been obtained.
352		 */
353		if (error == 0 && accmode & VWRITE)
354			error = vn_writechk(vp);
355		if (error) {
356			VOP_UNLOCK(vp, 0);
357			if (have_flock) {
358				lf.l_whence = SEEK_SET;
359				lf.l_start = 0;
360				lf.l_len = 0;
361				lf.l_type = F_UNLCK;
362				(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf,
363				    F_FLOCK);
364			}
365			vn_start_write(vp, &mp, V_WAIT);
366			vn_lock(vp, lock_flags | LK_RETRY);
367			(void)VOP_CLOSE(vp, fmode, cred, td);
368			vn_finished_write(mp);
369			/* Prevent second close from fdrop()->vn_close(). */
370			if (fp != NULL)
371				fp->f_ops = &badfileops;
372			return (error);
373		}
374		fp->f_flag |= FHASLOCK;
375	}
376	if (fmode & FWRITE) {
377		VOP_ADD_WRITECOUNT(vp, 1);
378		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
379		    __func__, vp, vp->v_writecount);
380	}
381	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
382	return (0);
383}
384
385/*
386 * Check for write permissions on the specified vnode.
387 * Prototype text segments cannot be written.
388 */
389int
390vn_writechk(vp)
391	register struct vnode *vp;
392{
393
394	ASSERT_VOP_LOCKED(vp, "vn_writechk");
395	/*
396	 * If there's shared text associated with
397	 * the vnode, try to free it up once.  If
398	 * we fail, we can't allow writing.
399	 */
400	if (VOP_IS_TEXT(vp))
401		return (ETXTBSY);
402
403	return (0);
404}
405
406/*
407 * Vnode close call
408 */
409int
410vn_close(vp, flags, file_cred, td)
411	register struct vnode *vp;
412	int flags;
413	struct ucred *file_cred;
414	struct thread *td;
415{
416	struct mount *mp;
417	int error, lock_flags;
418
419	if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
420	    MNT_EXTENDED_SHARED(vp->v_mount))
421		lock_flags = LK_SHARED;
422	else
423		lock_flags = LK_EXCLUSIVE;
424
425	vn_start_write(vp, &mp, V_WAIT);
426	vn_lock(vp, lock_flags | LK_RETRY);
427	if (flags & FWRITE) {
428		VNASSERT(vp->v_writecount > 0, vp,
429		    ("vn_close: negative writecount"));
430		VOP_ADD_WRITECOUNT(vp, -1);
431		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
432		    __func__, vp, vp->v_writecount);
433	}
434	error = VOP_CLOSE(vp, flags, file_cred, td);
435	vput(vp);
436	vn_finished_write(mp);
437	return (error);
438}
439
440/*
441 * Heuristic to detect sequential operation.
442 */
443static int
444sequential_heuristic(struct uio *uio, struct file *fp)
445{
446
447	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
448	if (fp->f_flag & FRDAHEAD)
449		return (fp->f_seqcount << IO_SEQSHIFT);
450
451	/*
452	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
453	 * that the first I/O is normally considered to be slightly
454	 * sequential.  Seeking to offset 0 doesn't change sequentiality
455	 * unless previous seeks have reduced f_seqcount to 0, in which
456	 * case offset 0 is not special.
457	 */
458	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
459	    uio->uio_offset == fp->f_nextoff) {
460		/*
461		 * f_seqcount is in units of fixed-size blocks so that it
462		 * depends mainly on the amount of sequential I/O and not
463		 * much on the number of sequential I/O's.  The fixed size
464		 * of 16384 is hard-coded here since it is (not quite) just
465		 * a magic size that works well here.  This size is more
466		 * closely related to the best I/O size for real disks than
467		 * to any block size used by software.
468		 */
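		/*
		 * Worked example of the arithmetic above: a single 128 KB
		 * sequential read adds howmany(131072, 16384) == 8 to
		 * f_seqcount.
		 */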
469		fp->f_seqcount += howmany(uio->uio_resid, 16384);
470		if (fp->f_seqcount > IO_SEQMAX)
471			fp->f_seqcount = IO_SEQMAX;
472		return (fp->f_seqcount << IO_SEQSHIFT);
473	}
474
475	/* Not sequential.  Quickly draw-down sequentiality. */
476	if (fp->f_seqcount > 1)
477		fp->f_seqcount = 1;
478	else
479		fp->f_seqcount = 0;
480	return (0);
481}
482
483/*
484 * Package up an I/O request on a vnode into a uio and do it.
485 */
486int
487vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
488    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
489    struct ucred *file_cred, ssize_t *aresid, struct thread *td)
490{
491	struct uio auio;
492	struct iovec aiov;
493	struct mount *mp;
494	struct ucred *cred;
495	void *rl_cookie;
496	struct vn_io_fault_args args;
497	int error, lock_flags;
498
499	auio.uio_iov = &aiov;
500	auio.uio_iovcnt = 1;
501	aiov.iov_base = base;
502	aiov.iov_len = len;
503	auio.uio_resid = len;
504	auio.uio_offset = offset;
505	auio.uio_segflg = segflg;
506	auio.uio_rw = rw;
507	auio.uio_td = td;
508	error = 0;
509
510	if ((ioflg & IO_NODELOCKED) == 0) {
511		if ((ioflg & IO_RANGELOCKED) == 0) {
512			if (rw == UIO_READ) {
513				rl_cookie = vn_rangelock_rlock(vp, offset,
514				    offset + len);
515			} else {
516				rl_cookie = vn_rangelock_wlock(vp, offset,
517				    offset + len);
518			}
519		} else
520			rl_cookie = NULL;
521		mp = NULL;
522		if (rw == UIO_WRITE) {
523			if (vp->v_type != VCHR &&
524			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
525			    != 0)
526				goto out;
527			if (MNT_SHARED_WRITES(mp) ||
528			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
529				lock_flags = LK_SHARED;
530			else
531				lock_flags = LK_EXCLUSIVE;
532		} else
533			lock_flags = LK_SHARED;
534		vn_lock(vp, lock_flags | LK_RETRY);
535	} else
536		rl_cookie = NULL;
537
538	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
539#ifdef MAC
540	if ((ioflg & IO_NOMACCHECK) == 0) {
541		if (rw == UIO_READ)
542			error = mac_vnode_check_read(active_cred, file_cred,
543			    vp);
544		else
545			error = mac_vnode_check_write(active_cred, file_cred,
546			    vp);
547	}
548#endif
549	if (error == 0) {
550		if (file_cred != NULL)
551			cred = file_cred;
552		else
553			cred = active_cred;
554		if (do_vn_io_fault(vp, &auio)) {
555			args.kind = VN_IO_FAULT_VOP;
556			args.cred = cred;
557			args.flags = ioflg;
558			args.args.vop_args.vp = vp;
559			error = vn_io_fault1(vp, &auio, &args, td);
560		} else if (rw == UIO_READ) {
561			error = VOP_READ(vp, &auio, ioflg, cred);
562		} else /* if (rw == UIO_WRITE) */ {
563			error = VOP_WRITE(vp, &auio, ioflg, cred);
564		}
565	}
566	if (aresid)
567		*aresid = auio.uio_resid;
568	else
569		if (auio.uio_resid && error == 0)
570			error = EIO;
571	if ((ioflg & IO_NODELOCKED) == 0) {
572		VOP_UNLOCK(vp, 0);
573		if (mp != NULL)
574			vn_finished_write(mp);
575	}
576 out:
577	if (rl_cookie != NULL)
578		vn_rangelock_unlock(vp, rl_cookie);
579	return (error);
580}
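
/*
 * Illustrative use of vn_rdwr() (a sketch, not taken from this file):
 * read the first 512 bytes of a referenced, unlocked vnode vp into a
 * kernel buffer on behalf of thread td:
 *
 *	char buf[512];
 *	ssize_t resid;
 *	int error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0, UIO_SYSSPACE,
 *	    0, td->td_ucred, NOCRED, &resid, td);
 *
 * With ioflg 0 the routine takes the range lock and the vnode lock
 * itself; IO_NODELOCKED is passed only when the caller already holds
 * the vnode lock.
 */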
581
582/*
583 * Package up an I/O request on a vnode into a uio and do it.  The I/O
584 * request is split up into smaller chunks and we try to avoid saturating
585 * the buffer cache while potentially holding a vnode locked, so we
586 * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
587 * to give other processes a chance to lock the vnode (either other processes
588 * core'ing the same binary, or unrelated processes scanning the directory).
589 */
590int
591vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
592    file_cred, aresid, td)
593	enum uio_rw rw;
594	struct vnode *vp;
595	void *base;
596	size_t len;
597	off_t offset;
598	enum uio_seg segflg;
599	int ioflg;
600	struct ucred *active_cred;
601	struct ucred *file_cred;
602	size_t *aresid;
603	struct thread *td;
604{
605	int error = 0;
606	ssize_t iaresid;
607
608	do {
609		int chunk;
610
611		/*
612		 * Force `offset' to a multiple of MAXBSIZE except possibly
613		 * for the first chunk, so that filesystems only need to
614		 * write full blocks except possibly for the first and last
615		 * chunks.
616		 */
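		/*
		 * Illustrative numbers: with MAXBSIZE of 65536 and an
		 * offset of 100000, chunk is 65536 - 34464 = 31072, which
		 * realigns the following chunks to MAXBSIZE boundaries.
		 */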
617		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
618
619		if (chunk > len)
620			chunk = len;
621		if (rw != UIO_READ && vp->v_type == VREG)
622			bwillwrite();
623		iaresid = 0;
624		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
625		    ioflg, active_cred, file_cred, &iaresid, td);
626		len -= chunk;	/* aresid calc already includes length */
627		if (error)
628			break;
629		offset += chunk;
630		base = (char *)base + chunk;
631		kern_yield(PRI_USER);
632	} while (len);
633	if (aresid)
634		*aresid = len + iaresid;
635	return (error);
636}
637
638off_t
639foffset_lock(struct file *fp, int flags)
640{
641	struct mtx *mtxp;
642	off_t res;
643
644	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
645
646#if OFF_MAX <= LONG_MAX
647	/*
648	 * Caller only wants the current f_offset value.  Assume that reads
649	 * of long and shorter integer types are atomic.
650	 */
651	if ((flags & FOF_NOLOCK) != 0)
652		return (fp->f_offset);
653#endif
654
655	/*
656	 * According to McKusick the vn lock was protecting f_offset here.
657	 * It is now protected by the FOFFSET_LOCKED flag.
658	 */
659	mtxp = mtx_pool_find(mtxpool_sleep, fp);
660	mtx_lock(mtxp);
661	if ((flags & FOF_NOLOCK) == 0) {
662		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
663			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
664			msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
665			    "vofflock", 0);
666		}
667		fp->f_vnread_flags |= FOFFSET_LOCKED;
668	}
669	res = fp->f_offset;
670	mtx_unlock(mtxp);
671	return (res);
672}
673
674void
675foffset_unlock(struct file *fp, off_t val, int flags)
676{
677	struct mtx *mtxp;
678
679	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
680
681#if OFF_MAX <= LONG_MAX
682	if ((flags & FOF_NOLOCK) != 0) {
683		if ((flags & FOF_NOUPDATE) == 0)
684			fp->f_offset = val;
685		if ((flags & FOF_NEXTOFF) != 0)
686			fp->f_nextoff = val;
687		return;
688	}
689#endif
690
691	mtxp = mtx_pool_find(mtxpool_sleep, fp);
692	mtx_lock(mtxp);
693	if ((flags & FOF_NOUPDATE) == 0)
694		fp->f_offset = val;
695	if ((flags & FOF_NEXTOFF) != 0)
696		fp->f_nextoff = val;
697	if ((flags & FOF_NOLOCK) == 0) {
698		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
699		    ("Lost FOFFSET_LOCKED"));
700		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
701			wakeup(&fp->f_vnread_flags);
702		fp->f_vnread_flags = 0;
703	}
704	mtx_unlock(mtxp);
705}
706
707void
708foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
709{
710
711	if ((flags & FOF_OFFSET) == 0)
712		uio->uio_offset = foffset_lock(fp, flags);
713}
714
715void
716foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
717{
718
719	if ((flags & FOF_OFFSET) == 0)
720		foffset_unlock(fp, uio->uio_offset, flags);
721}
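
/*
 * Illustrative pattern (mirrors vn_io_fault() below): an fo_read or
 * fo_write method that maintains f_offset brackets its i/o with
 *
 *	foffset_lock_uio(fp, uio, flags);
 *	error = ... do the i/o with FOF_OFFSET set ...;
 *	foffset_unlock_uio(fp, uio, flags);
 *
 * Both calls are no-ops when the caller passed FOF_OFFSET.
 */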
722
723static int
724get_advice(struct file *fp, struct uio *uio)
725{
726	struct mtx *mtxp;
727	int ret;
728
729	ret = POSIX_FADV_NORMAL;
730	if (fp->f_advice == NULL)
731		return (ret);
732
733	mtxp = mtx_pool_find(mtxpool_sleep, fp);
734	mtx_lock(mtxp);
735	if (uio->uio_offset >= fp->f_advice->fa_start &&
736	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
737		ret = fp->f_advice->fa_advice;
738	mtx_unlock(mtxp);
739	return (ret);
740}
741
742/*
743 * File table vnode read routine.
744 */
745static int
746vn_read(fp, uio, active_cred, flags, td)
747	struct file *fp;
748	struct uio *uio;
749	struct ucred *active_cred;
750	int flags;
751	struct thread *td;
752{
753	struct vnode *vp;
754	struct mtx *mtxp;
755	int error, ioflag;
756	int advice;
757	off_t offset, start, end;
758
759	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
760	    uio->uio_td, td));
761	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
762	vp = fp->f_vnode;
763	ioflag = 0;
764	if (fp->f_flag & FNONBLOCK)
765		ioflag |= IO_NDELAY;
766	if (fp->f_flag & O_DIRECT)
767		ioflag |= IO_DIRECT;
768	advice = get_advice(fp, uio);
769	vn_lock(vp, LK_SHARED | LK_RETRY);
770
771	switch (advice) {
772	case POSIX_FADV_NORMAL:
773	case POSIX_FADV_SEQUENTIAL:
774	case POSIX_FADV_NOREUSE:
775		ioflag |= sequential_heuristic(uio, fp);
776		break;
777	case POSIX_FADV_RANDOM:
778		/* Disable read-ahead for random I/O. */
779		break;
780	}
781	offset = uio->uio_offset;
782
783#ifdef MAC
784	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
785	if (error == 0)
786#endif
787		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
788	fp->f_nextoff = uio->uio_offset;
789	VOP_UNLOCK(vp, 0);
790	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
791	    offset != uio->uio_offset) {
792		/*
793		 * Use POSIX_FADV_DONTNEED to flush clean pages and
794		 * buffers for the backing file after a
795		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
796		 * case of using POSIX_FADV_NOREUSE with sequential
797		 * access, track the previous implicit DONTNEED
798		 * request and grow this request to include the
799		 * current read(2) in addition to the previous
800		 * DONTNEED.  With purely sequential access this will
801		 * cause the DONTNEED requests to continuously grow to
802		 * cover all of the previously read regions of the
803		 * file.  This allows filesystem blocks that are
804		 * accessed by multiple calls to read(2) to be flushed
805		 * once the last read(2) finishes.
806		 */
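		/*
		 * For instance (illustrative offsets): if the previous
		 * NOREUSE read covered [0, 65535] and this read covered
		 * [65536, 131071], the DONTNEED request below grows to
		 * cover [0, 131071].
		 */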
807		start = offset;
808		end = uio->uio_offset - 1;
809		mtxp = mtx_pool_find(mtxpool_sleep, fp);
810		mtx_lock(mtxp);
811		if (fp->f_advice != NULL &&
812		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
813			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
814				start = fp->f_advice->fa_prevstart;
815			else if (fp->f_advice->fa_prevstart != 0 &&
816			    fp->f_advice->fa_prevstart == end + 1)
817				end = fp->f_advice->fa_prevend;
818			fp->f_advice->fa_prevstart = start;
819			fp->f_advice->fa_prevend = end;
820		}
821		mtx_unlock(mtxp);
822		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
823	}
824	return (error);
825}
826
827/*
828 * File table vnode write routine.
829 */
830static int
831vn_write(fp, uio, active_cred, flags, td)
832	struct file *fp;
833	struct uio *uio;
834	struct ucred *active_cred;
835	int flags;
836	struct thread *td;
837{
838	struct vnode *vp;
839	struct mount *mp;
840	struct mtx *mtxp;
841	int error, ioflag, lock_flags;
842	int advice;
843	off_t offset, start, end;
844
845	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
846	    uio->uio_td, td));
847	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
848	vp = fp->f_vnode;
849	if (vp->v_type == VREG)
850		bwillwrite();
851	ioflag = IO_UNIT;
852	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
853		ioflag |= IO_APPEND;
854	if (fp->f_flag & FNONBLOCK)
855		ioflag |= IO_NDELAY;
856	if (fp->f_flag & O_DIRECT)
857		ioflag |= IO_DIRECT;
858	if ((fp->f_flag & O_FSYNC) ||
859	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
860		ioflag |= IO_SYNC;
861	mp = NULL;
862	if (vp->v_type != VCHR &&
863	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
864		goto unlock;
865
866	advice = get_advice(fp, uio);
867
868	if (MNT_SHARED_WRITES(mp) ||
869	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
870		lock_flags = LK_SHARED;
871	} else {
872		lock_flags = LK_EXCLUSIVE;
873	}
874
875	vn_lock(vp, lock_flags | LK_RETRY);
876	switch (advice) {
877	case POSIX_FADV_NORMAL:
878	case POSIX_FADV_SEQUENTIAL:
879	case POSIX_FADV_NOREUSE:
880		ioflag |= sequential_heuristic(uio, fp);
881		break;
882	case POSIX_FADV_RANDOM:
883		/* XXX: Is this correct? */
884		break;
885	}
886	offset = uio->uio_offset;
887
888#ifdef MAC
889	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
890	if (error == 0)
891#endif
892		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
893	fp->f_nextoff = uio->uio_offset;
894	VOP_UNLOCK(vp, 0);
895	if (vp->v_type != VCHR)
896		vn_finished_write(mp);
897	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
898	    offset != uio->uio_offset) {
899		/*
900		 * Use POSIX_FADV_DONTNEED to flush clean pages and
901		 * buffers for the backing file after a
902		 * POSIX_FADV_NOREUSE write(2).  To optimize the
903		 * common case of using POSIX_FADV_NOREUSE with
904		 * sequential access, track the previous implicit
905		 * DONTNEED request and grow this request to include
906		 * the current write(2) in addition to the previous
907		 * DONTNEED.  With purely sequential access this will
908		 * cause the DONTNEED requests to continuously grow to
909		 * cover all of the previously written regions of the
910		 * file.
911		 *
912		 * Note that the blocks just written are almost
913		 * certainly still dirty, so this only works when
914		 * VOP_ADVISE() calls from subsequent writes push out
915		 * the data written by this write(2) once the backing
916		 * buffers are clean.  However, as compared to forcing
917		 * IO_DIRECT, this gives much saner behavior.  Write
918		 * clustering is still allowed, and clean pages are
919		 * merely moved to the cache page queue rather than
920		 * outright thrown away.  This means a subsequent
921		 * read(2) can still avoid hitting the disk if the
922		 * pages have not been reclaimed.
923		 *
924		 * This does make POSIX_FADV_NOREUSE largely useless
925		 * with non-sequential access.  However, sequential
926		 * access is the more common use case and the flag is
927		 * merely advisory.
928		 */
929		start = offset;
930		end = uio->uio_offset - 1;
931		mtxp = mtx_pool_find(mtxpool_sleep, fp);
932		mtx_lock(mtxp);
933		if (fp->f_advice != NULL &&
934		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
935			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
936				start = fp->f_advice->fa_prevstart;
937			else if (fp->f_advice->fa_prevstart != 0 &&
938			    fp->f_advice->fa_prevstart == end + 1)
939				end = fp->f_advice->fa_prevend;
940			fp->f_advice->fa_prevstart = start;
941			fp->f_advice->fa_prevend = end;
942		}
943		mtx_unlock(mtxp);
944		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
945	}
946
947unlock:
948	return (error);
949}
950
951/*
952 * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
953 * prevent the following deadlock:
954 *
955 * Assume that thread A reads from vnode vp1 into a userspace
956 * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
957 * currently not resident, then the system ends up with the call chain
958 *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
959 *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
960 * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
961 * If, at the same time, thread B reads from vnode vp2 into buffer buf2
962 * backed by the pages of vnode vp1, and some page in buf2 is not
963 * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
964 *
965 * To prevent the lock order reversal and deadlock, vn_io_fault() does
966 * not allow page faults to happen during VOP_READ() or VOP_WRITE().
967 * Instead, it first tries to do the whole range i/o with pagefaults
968 * disabled. If all pages in the i/o buffer are resident and mapped,
969 * the VOP will succeed (barring genuine filesystem errors).
970 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
971 * i/o in chunks, with all pages in the chunk prefaulted and held
972 * using vm_fault_quick_hold_pages().
973 *
974 * Filesystems using this deadlock avoidance scheme should use the
975 * array of the held pages from uio, saved in the curthread->td_ma,
976 * instead of doing uiomove().  A helper function
977 * vn_io_fault_uiomove() converts uiomove request into
978 * uiomove_fromphys() over td_ma array.
979 *
980 * Since vnode locks do not cover the whole i/o anymore, rangelocks
981 * make the current i/o request atomic with respect to other i/os and
982 * truncations.
983 */
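
/*
 * Illustrative sketch (hypothetical filesystem code, not part of this
 * file): a filesystem that sets MNTK_NO_IOPF at mount time performs the
 * user copy inside VOP_READ()/VOP_WRITE() as
 *
 *	error = vn_io_fault_uiomove(base, xfersize, uio);
 *
 * instead of uiomove(base, xfersize, uio), where "base" and "xfersize"
 * stand for the buffer cache address and length of the current block.
 * When TDP_UIOHELD is set, the copy then goes through the pages held in
 * td_ma; otherwise it degenerates to a plain uiomove().
 */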
984
985/*
986 * Decode vn_io_fault_args and perform the corresponding i/o.
987 */
988static int
989vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
990    struct thread *td)
991{
992
993	switch (args->kind) {
994	case VN_IO_FAULT_FOP:
995		return ((args->args.fop_args.doio)(args->args.fop_args.fp,
996		    uio, args->cred, args->flags, td));
997	case VN_IO_FAULT_VOP:
998		if (uio->uio_rw == UIO_READ) {
999			return (VOP_READ(args->args.vop_args.vp, uio,
1000			    args->flags, args->cred));
1001		} else if (uio->uio_rw == UIO_WRITE) {
1002			return (VOP_WRITE(args->args.vop_args.vp, uio,
1003			    args->flags, args->cred));
1004		}
1005		break;
1006	}
1007	panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind,
1008	    uio->uio_rw);
1009}
1010
1011/*
1012 * Common code for vn_io_fault(), agnostic to the kind of i/o request.
1013 * Uses vn_io_fault_doio() to make the call to an actual i/o function.
1014 * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1015 * into args and call vn_io_fault1() to handle faults during the user
1016 * mode buffer accesses.
1017 */
1018static int
1019vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
1020    struct thread *td)
1021{
1022	vm_page_t ma[io_hold_cnt + 2];
1023	struct uio *uio_clone, short_uio;
1024	struct iovec short_iovec[1];
1025	vm_page_t *prev_td_ma;
1026	vm_prot_t prot;
1027	vm_offset_t addr, end;
1028	size_t len, resid;
1029	ssize_t adv;
1030	int error, cnt, save, saveheld, prev_td_ma_cnt;
1031
1032	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1033
1034	/*
1035	 * UFS follows the IO_UNIT directive and rolls back both
1036	 * uio_offset and uio_resid if an error is encountered during the
1037	 * operation.  But, since the iovec may already be advanced,
1038	 * the uio is still in an inconsistent state.
1039	 *
1040	 * Cache a copy of the original uio, which is advanced to the redo
1041	 * point using UIO_NOCOPY below.
1042	 */
1043	uio_clone = cloneuio(uio);
1044	resid = uio->uio_resid;
1045
1046	short_uio.uio_segflg = UIO_USERSPACE;
1047	short_uio.uio_rw = uio->uio_rw;
1048	short_uio.uio_td = uio->uio_td;
1049
1050	save = vm_fault_disable_pagefaults();
1051	error = vn_io_fault_doio(args, uio, td);
1052	if (error != EFAULT)
1053		goto out;
1054
1055	atomic_add_long(&vn_io_faults_cnt, 1);
1056	uio_clone->uio_segflg = UIO_NOCOPY;
1057	uiomove(NULL, resid - uio->uio_resid, uio_clone);
1058	uio_clone->uio_segflg = uio->uio_segflg;
1059
1060	saveheld = curthread_pflags_set(TDP_UIOHELD);
1061	prev_td_ma = td->td_ma;
1062	prev_td_ma_cnt = td->td_ma_cnt;
1063
1064	while (uio_clone->uio_resid != 0) {
1065		len = uio_clone->uio_iov->iov_len;
1066		if (len == 0) {
1067			KASSERT(uio_clone->uio_iovcnt >= 1,
1068			    ("iovcnt underflow"));
1069			uio_clone->uio_iov++;
1070			uio_clone->uio_iovcnt--;
1071			continue;
1072		}
1073		if (len > io_hold_cnt * PAGE_SIZE)
1074			len = io_hold_cnt * PAGE_SIZE;
1075		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1076		end = round_page(addr + len);
1077		if (end < addr) {
1078			error = EFAULT;
1079			break;
1080		}
1081		cnt = atop(end - trunc_page(addr));
1082		/*
1083		 * A perfectly misaligned address and length could cause
1084		 * both the start and the end of the chunk to use a partial
1085		 * page.  The +2 accounts for such a situation.
1086		 */
1087		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1088		    addr, len, prot, ma, io_hold_cnt + 2);
1089		if (cnt == -1) {
1090			error = EFAULT;
1091			break;
1092		}
1093		short_uio.uio_iov = &short_iovec[0];
1094		short_iovec[0].iov_base = (void *)addr;
1095		short_uio.uio_iovcnt = 1;
1096		short_uio.uio_resid = short_iovec[0].iov_len = len;
1097		short_uio.uio_offset = uio_clone->uio_offset;
1098		td->td_ma = ma;
1099		td->td_ma_cnt = cnt;
1100
1101		error = vn_io_fault_doio(args, &short_uio, td);
1102		vm_page_unhold_pages(ma, cnt);
1103		adv = len - short_uio.uio_resid;
1104
1105		uio_clone->uio_iov->iov_base =
1106		    (char *)uio_clone->uio_iov->iov_base + adv;
1107		uio_clone->uio_iov->iov_len -= adv;
1108		uio_clone->uio_resid -= adv;
1109		uio_clone->uio_offset += adv;
1110
1111		uio->uio_resid -= adv;
1112		uio->uio_offset += adv;
1113
1114		if (error != 0 || adv == 0)
1115			break;
1116	}
1117	td->td_ma = prev_td_ma;
1118	td->td_ma_cnt = prev_td_ma_cnt;
1119	curthread_pflags_restore(saveheld);
1120out:
1121	vm_fault_enable_pagefaults(save);
1122	free(uio_clone, M_IOV);
1123	return (error);
1124}
1125
1126static int
1127vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
1128    int flags, struct thread *td)
1129{
1130	fo_rdwr_t *doio;
1131	struct vnode *vp;
1132	void *rl_cookie;
1133	struct vn_io_fault_args args;
1134	int error;
1135
1136	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1137	vp = fp->f_vnode;
1138	foffset_lock_uio(fp, uio, flags);
1139	if (do_vn_io_fault(vp, uio)) {
1140		args.kind = VN_IO_FAULT_FOP;
1141		args.args.fop_args.fp = fp;
1142		args.args.fop_args.doio = doio;
1143		args.cred = active_cred;
1144		args.flags = flags | FOF_OFFSET;
1145		if (uio->uio_rw == UIO_READ) {
1146			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1147			    uio->uio_offset + uio->uio_resid);
1148		} else if ((fp->f_flag & O_APPEND) != 0 ||
1149		    (flags & FOF_OFFSET) == 0) {
1150			/* For appenders, punt and lock the whole range. */
1151			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1152		} else {
1153			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1154			    uio->uio_offset + uio->uio_resid);
1155		}
1156		error = vn_io_fault1(vp, uio, &args, td);
1157		vn_rangelock_unlock(vp, rl_cookie);
1158	} else {
1159		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
1160	}
1161	foffset_unlock_uio(fp, uio, flags);
1162	return (error);
1163}
1164
1165/*
1166 * Helper function to perform the requested uiomove operation using
1167 * the held pages for io->uio_iov[0].iov_base buffer instead of
1168 * copyin/copyout.  Access to the pages with uiomove_fromphys()
1169 * instead of iov_base prevents page faults that could occur due to
1170 * pmap_collect() invalidating the mapping created by
1171 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1172 * object cleanup revoking the write access from page mappings.
1173 *
1174 * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1175 * instead of plain uiomove().
1176 */
1177int
1178vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1179{
1180	struct uio transp_uio;
1181	struct iovec transp_iov[1];
1182	struct thread *td;
1183	size_t adv;
1184	int error, pgadv;
1185
1186	td = curthread;
1187	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1188	    uio->uio_segflg != UIO_USERSPACE)
1189		return (uiomove(data, xfersize, uio));
1190
1191	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1192	transp_iov[0].iov_base = data;
1193	transp_uio.uio_iov = &transp_iov[0];
1194	transp_uio.uio_iovcnt = 1;
1195	if (xfersize > uio->uio_resid)
1196		xfersize = uio->uio_resid;
1197	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1198	transp_uio.uio_offset = 0;
1199	transp_uio.uio_segflg = UIO_SYSSPACE;
1200	/*
1201	 * Since transp_iov points to data, and td_ma page array
1202	 * corresponds to original uio->uio_iov, we need to invert the
1203	 * direction of the i/o operation as passed to
1204	 * uiomove_fromphys().
1205	 */
1206	switch (uio->uio_rw) {
1207	case UIO_WRITE:
1208		transp_uio.uio_rw = UIO_READ;
1209		break;
1210	case UIO_READ:
1211		transp_uio.uio_rw = UIO_WRITE;
1212		break;
1213	}
1214	transp_uio.uio_td = uio->uio_td;
1215	error = uiomove_fromphys(td->td_ma,
1216	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1217	    xfersize, &transp_uio);
1218	adv = xfersize - transp_uio.uio_resid;
1219	pgadv =
1220	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1221	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1222	td->td_ma += pgadv;
1223	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1224	    pgadv));
1225	td->td_ma_cnt -= pgadv;
1226	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1227	uio->uio_iov->iov_len -= adv;
1228	uio->uio_resid -= adv;
1229	uio->uio_offset += adv;
1230	return (error);
1231}
1232
1233int
1234vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1235    struct uio *uio)
1236{
1237	struct thread *td;
1238	vm_offset_t iov_base;
1239	int cnt, pgadv;
1240
1241	td = curthread;
1242	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1243	    uio->uio_segflg != UIO_USERSPACE)
1244		return (uiomove_fromphys(ma, offset, xfersize, uio));
1245
1246	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1247	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1248	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1249	switch (uio->uio_rw) {
1250	case UIO_WRITE:
1251		pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1252		    offset, cnt);
1253		break;
1254	case UIO_READ:
1255		pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1256		    cnt);
1257		break;
1258	}
1259	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1260	td->td_ma += pgadv;
1261	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1262	    pgadv));
1263	td->td_ma_cnt -= pgadv;
1264	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1265	uio->uio_iov->iov_len -= cnt;
1266	uio->uio_resid -= cnt;
1267	uio->uio_offset += cnt;
1268	return (0);
1269}
1270
1271
1272/*
1273 * File table truncate routine.
1274 */
1275static int
1276vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1277    struct thread *td)
1278{
1279	struct vattr vattr;
1280	struct mount *mp;
1281	struct vnode *vp;
1282	void *rl_cookie;
1283	int error;
1284
1285	vp = fp->f_vnode;
1286
1287	/*
1288	 * Lock the whole range for truncation.  Otherwise split i/o
1289	 * might happen partly before and partly after the truncation.
1290	 */
1291	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1292	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1293	if (error)
1294		goto out1;
1295	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1296	if (vp->v_type == VDIR) {
1297		error = EISDIR;
1298		goto out;
1299	}
1300#ifdef MAC
1301	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1302	if (error)
1303		goto out;
1304#endif
1305	error = vn_writechk(vp);
1306	if (error == 0) {
1307		VATTR_NULL(&vattr);
1308		vattr.va_size = length;
1309		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
1310	}
1311out:
1312	VOP_UNLOCK(vp, 0);
1313	vn_finished_write(mp);
1314out1:
1315	vn_rangelock_unlock(vp, rl_cookie);
1316	return (error);
1317}
1318
1319/*
1320 * File table vnode stat routine.
1321 */
1322static int
1323vn_statfile(fp, sb, active_cred, td)
1324	struct file *fp;
1325	struct stat *sb;
1326	struct ucred *active_cred;
1327	struct thread *td;
1328{
1329	struct vnode *vp = fp->f_vnode;
1330	int error;
1331
1332	vn_lock(vp, LK_SHARED | LK_RETRY);
1333	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
1334	VOP_UNLOCK(vp, 0);
1335
1336	return (error);
1337}
1338
1339/*
1340 * Stat a vnode; implementation for the stat syscall
1341 */
1342int
1343vn_stat(vp, sb, active_cred, file_cred, td)
1344	struct vnode *vp;
1345	register struct stat *sb;
1346	struct ucred *active_cred;
1347	struct ucred *file_cred;
1348	struct thread *td;
1349{
1350	struct vattr vattr;
1351	register struct vattr *vap;
1352	int error;
1353	u_short mode;
1354
1355#ifdef MAC
1356	error = mac_vnode_check_stat(active_cred, file_cred, vp);
1357	if (error)
1358		return (error);
1359#endif
1360
1361	vap = &vattr;
1362
1363	/*
1364	 * Initialize defaults for new and unusual fields, so that file
1365	 * systems which don't support these fields don't need to know
1366	 * about them.
1367	 */
1368	vap->va_birthtime.tv_sec = -1;
1369	vap->va_birthtime.tv_nsec = 0;
1370	vap->va_fsid = VNOVAL;
1371	vap->va_rdev = NODEV;
1372
1373	error = VOP_GETATTR(vp, vap, active_cred);
1374	if (error)
1375		return (error);
1376
1377	/*
1378	 * Zero the spare stat fields
1379	 */
1380	bzero(sb, sizeof *sb);
1381
1382	/*
1383	 * Copy from vattr table
1384	 */
1385	if (vap->va_fsid != VNOVAL)
1386		sb->st_dev = vap->va_fsid;
1387	else
1388		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
1389	sb->st_ino = vap->va_fileid;
1390	mode = vap->va_mode;
1391	switch (vap->va_type) {
1392	case VREG:
1393		mode |= S_IFREG;
1394		break;
1395	case VDIR:
1396		mode |= S_IFDIR;
1397		break;
1398	case VBLK:
1399		mode |= S_IFBLK;
1400		break;
1401	case VCHR:
1402		mode |= S_IFCHR;
1403		break;
1404	case VLNK:
1405		mode |= S_IFLNK;
1406		break;
1407	case VSOCK:
1408		mode |= S_IFSOCK;
1409		break;
1410	case VFIFO:
1411		mode |= S_IFIFO;
1412		break;
1413	default:
1414		return (EBADF);
1415	}
1416	sb->st_mode = mode;
1417	sb->st_nlink = vap->va_nlink;
1418	sb->st_uid = vap->va_uid;
1419	sb->st_gid = vap->va_gid;
1420	sb->st_rdev = vap->va_rdev;
1421	if (vap->va_size > OFF_MAX)
1422		return (EOVERFLOW);
1423	sb->st_size = vap->va_size;
1424	sb->st_atim = vap->va_atime;
1425	sb->st_mtim = vap->va_mtime;
1426	sb->st_ctim = vap->va_ctime;
1427	sb->st_birthtim = vap->va_birthtime;
1428
1429	/*
1430	 * According to www.opengroup.org, the meaning of st_blksize is
1431	 *   "a filesystem-specific preferred I/O block size for this
1432	 *    object.  In some filesystem types, this may vary from file
1433	 *    to file"
1434	 * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
1435	 */
1436
1437	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
1438
1439	sb->st_flags = vap->va_flags;
1440	if (priv_check(td, PRIV_VFS_GENERATION))
1441		sb->st_gen = 0;
1442	else
1443		sb->st_gen = vap->va_gen;
1444
1445	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
1446	return (0);
1447}
1448
1449/*
1450 * File table vnode ioctl routine.
1451 */
1452static int
1453vn_ioctl(fp, com, data, active_cred, td)
1454	struct file *fp;
1455	u_long com;
1456	void *data;
1457	struct ucred *active_cred;
1458	struct thread *td;
1459{
1460	struct vattr vattr;
1461	struct vnode *vp;
1462	int error;
1463
1464	vp = fp->f_vnode;
1465	switch (vp->v_type) {
1466	case VDIR:
1467	case VREG:
1468		switch (com) {
1469		case FIONREAD:
1470			vn_lock(vp, LK_SHARED | LK_RETRY);
1471			error = VOP_GETATTR(vp, &vattr, active_cred);
1472			VOP_UNLOCK(vp, 0);
1473			if (error == 0)
1474				*(int *)data = vattr.va_size - fp->f_offset;
1475			return (error);
1476		case FIONBIO:
1477		case FIOASYNC:
1478			return (0);
1479		default:
1480			return (VOP_IOCTL(vp, com, data, fp->f_flag,
1481			    active_cred, td));
1482		}
1483	default:
1484		return (ENOTTY);
1485	}
1486}
1487
1488/*
1489 * File table vnode poll routine.
1490 */
1491static int
1492vn_poll(fp, events, active_cred, td)
1493	struct file *fp;
1494	int events;
1495	struct ucred *active_cred;
1496	struct thread *td;
1497{
1498	struct vnode *vp;
1499	int error;
1500
1501	vp = fp->f_vnode;
1502#ifdef MAC
1503	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1504	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1505	VOP_UNLOCK(vp, 0);
1506	if (!error)
1507#endif
1508
1509	error = VOP_POLL(vp, events, fp->f_cred, td);
1510	return (error);
1511}
1512
1513/*
1514 * Acquire the requested lock and then check for validity.  LK_RETRY
1515 * permits vn_lock to return doomed vnodes.
1516 */
1517int
1518_vn_lock(struct vnode *vp, int flags, char *file, int line)
1519{
1520	int error;
1521
1522	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1523	    ("vn_lock called with no locktype."));
1524	do {
1525#ifdef DEBUG_VFS_LOCKS
1526		KASSERT(vp->v_holdcnt != 0,
1527		    ("vn_lock %p: zero hold count", vp));
1528#endif
1529		error = VOP_LOCK1(vp, flags, file, line);
1530		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
1531		KASSERT((flags & LK_RETRY) == 0 || error == 0,
1532		    ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
1533		    flags, error));
1534		/*
1535		 * Callers specify LK_RETRY if they wish to get dead vnodes.
1536		 * If RETRY is not set, we return ENOENT instead.
1537		 */
1538		if (error == 0 && vp->v_iflag & VI_DOOMED &&
1539		    (flags & LK_RETRY) == 0) {
1540			VOP_UNLOCK(vp, 0);
1541			error = ENOENT;
1542			break;
1543		}
1544	} while (flags & LK_RETRY && error != 0);
1545	return (error);
1546}
1547
1548/*
1549 * File table vnode close routine.
1550 */
1551static int
1552vn_closefile(fp, td)
1553	struct file *fp;
1554	struct thread *td;
1555{
1556	struct vnode *vp;
1557	struct flock lf;
1558	int error;
1559
1560	vp = fp->f_vnode;
1561	fp->f_ops = &badfileops;
1562
1563	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK)
1564		vref(vp);
1565
1566	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
1567
1568	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
1569		lf.l_whence = SEEK_SET;
1570		lf.l_start = 0;
1571		lf.l_len = 0;
1572		lf.l_type = F_UNLCK;
1573		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1574		vrele(vp);
1575	}
1576	return (error);
1577}
1578
1579/*
1580 * Preparing to start a filesystem write operation. If the operation is
1581 * permitted, then we bump the count of operations in progress and
1582 * proceed. If a suspend request is in progress, we wait until the
1583 * suspension is over, and then proceed.
1584 */
1585static int
1586vn_start_write_locked(struct mount *mp, int flags)
1587{
1588	int error, mflags;
1589
1590	mtx_assert(MNT_MTX(mp), MA_OWNED);
1591	error = 0;
1592
1593	/*
1594	 * Check on status of suspension.
1595	 */
1596	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1597	    mp->mnt_susp_owner != curthread) {
1598		mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
1599		    (flags & PCATCH) : 0) | (PUSER - 1);
1600		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1601			if (flags & V_NOWAIT) {
1602				error = EWOULDBLOCK;
1603				goto unlock;
1604			}
1605			error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
1606			    "suspfs", 0);
1607			if (error)
1608				goto unlock;
1609		}
1610	}
1611	if (flags & V_XSLEEP)
1612		goto unlock;
1613	mp->mnt_writeopcount++;
1614unlock:
1615	if (error != 0 || (flags & V_XSLEEP) != 0)
1616		MNT_REL(mp);
1617	MNT_IUNLOCK(mp);
1618	return (error);
1619}
1620
1621int
1622vn_start_write(vp, mpp, flags)
1623	struct vnode *vp;
1624	struct mount **mpp;
1625	int flags;
1626{
1627	struct mount *mp;
1628	int error;
1629
1630	error = 0;
1631	/*
1632	 * If a vnode is provided, get and return the mount point to
1633	 * which it will write.
1634	 */
1635	if (vp != NULL) {
1636		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1637			*mpp = NULL;
1638			if (error != EOPNOTSUPP)
1639				return (error);
1640			return (0);
1641		}
1642	}
1643	if ((mp = *mpp) == NULL)
1644		return (0);
1645
1646	/*
1647	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1648	 * a vfs_ref().
1649	 * If a vnode was not provided, we need to acquire a reference on
1650	 * the provided mountpoint ourselves, in order to emulate a
1651	 * vfs_ref().
1652	 */
1653	MNT_ILOCK(mp);
1654	if (vp == NULL)
1655		MNT_REF(mp);
1656
1657	return (vn_start_write_locked(mp, flags));
1658}
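
/*
 * Illustrative usage of the suspension interlock (the same pattern as
 * vn_truncate() above): a write path wraps the modifying VOP between
 * vn_start_write() and vn_finished_write():
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_SETATTR(vp, &vattr, cred);
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 */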
1659
1660/*
1661 * Secondary suspension. Used by operations such as vop_inactive
1662 * routines that are needed by the higher level functions. These
1663 * are allowed to proceed until all the higher level functions have
1664 * completed (indicated by mnt_writeopcount dropping to zero). At that
1665 * time, these operations are halted until the suspension is over.
1666 */
1667int
1668vn_start_secondary_write(vp, mpp, flags)
1669	struct vnode *vp;
1670	struct mount **mpp;
1671	int flags;
1672{
1673	struct mount *mp;
1674	int error;
1675
1676 retry:
1677	if (vp != NULL) {
1678		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1679			*mpp = NULL;
1680			if (error != EOPNOTSUPP)
1681				return (error);
1682			return (0);
1683		}
1684	}
1685	/*
1686	 * If we are not suspended or have not yet reached suspended
1687	 * mode, then let the operation proceed.
1688	 */
1689	if ((mp = *mpp) == NULL)
1690		return (0);
1691
1692	/*
1693	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1694	 * a vfs_ref().
1695	 * If a vnode was not provided, we need to acquire a reference on
1696	 * the provided mountpoint ourselves, in order to emulate a
1697	 * vfs_ref().
1698	 */
1699	MNT_ILOCK(mp);
1700	if (vp == NULL)
1701		MNT_REF(mp);
1702	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1703		mp->mnt_secondary_writes++;
1704		mp->mnt_secondary_accwrites++;
1705		MNT_IUNLOCK(mp);
1706		return (0);
1707	}
1708	if (flags & V_NOWAIT) {
1709		MNT_REL(mp);
1710		MNT_IUNLOCK(mp);
1711		return (EWOULDBLOCK);
1712	}
1713	/*
1714	 * Wait for the suspension to finish.
1715	 */
1716	error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
1717	    ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
1718	    "suspfs", 0);
1719	vfs_rel(mp);
1720	if (error == 0)
1721		goto retry;
1722	return (error);
1723}
1724
1725/*
1726 * Filesystem write operation has completed. If we are suspending and this
1727 * operation is the last one, notify the suspender that the suspension is
1728 * now in effect.
1729 */
1730void
1731vn_finished_write(mp)
1732	struct mount *mp;
1733{
1734	if (mp == NULL)
1735		return;
1736	MNT_ILOCK(mp);
1737	MNT_REL(mp);
1738	mp->mnt_writeopcount--;
1739	if (mp->mnt_writeopcount < 0)
1740		panic("vn_finished_write: neg cnt");
1741	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1742	    mp->mnt_writeopcount <= 0)
1743		wakeup(&mp->mnt_writeopcount);
1744	MNT_IUNLOCK(mp);
1745}
1746
1747
1748/*
1749 * Filesystem secondary write operation has completed. If we are
1750 * suspending and this operation is the last one, notify the suspender
1751 * that the suspension is now in effect.
1752 */
1753void
1754vn_finished_secondary_write(mp)
1755	struct mount *mp;
1756{
1757	if (mp == NULL)
1758		return;
1759	MNT_ILOCK(mp);
1760	MNT_REL(mp);
1761	mp->mnt_secondary_writes--;
1762	if (mp->mnt_secondary_writes < 0)
1763		panic("vn_finished_secondary_write: neg cnt");
1764	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1765	    mp->mnt_secondary_writes <= 0)
1766		wakeup(&mp->mnt_secondary_writes);
1767	MNT_IUNLOCK(mp);
1768}
1769
1770
1771
1772/*
1773 * Request a filesystem to suspend write operations.
1774 */
1775int
1776vfs_write_suspend(struct mount *mp, int flags)
1777{
1778	int error;
1779
1780	MNT_ILOCK(mp);
1781	if (mp->mnt_susp_owner == curthread) {
1782		MNT_IUNLOCK(mp);
1783		return (EALREADY);
1784	}
1785	while (mp->mnt_kern_flag & MNTK_SUSPEND)
1786		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
1787
1788	/*
1789	 * Unmount holds a write reference on the mount point.  If we
1790	 * own busy reference and drain for writers, we deadlock with
1791	 * the reference draining in the unmount path.  Callers of
1792	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
1793	 * vfs_busy() reference is owned and caller is not in the
1794	 * unmount context.
1795	 */
1796	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
1797	    (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
1798		MNT_IUNLOCK(mp);
1799		return (EBUSY);
1800	}
1801
1802	mp->mnt_kern_flag |= MNTK_SUSPEND;
1803	mp->mnt_susp_owner = curthread;
1804	if (mp->mnt_writeopcount > 0)
1805		(void) msleep(&mp->mnt_writeopcount,
1806		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
1807	else
1808		MNT_IUNLOCK(mp);
1809	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
1810		vfs_write_resume(mp, 0);
1811	return (error);
1812}
1813
1814/*
1815 * Request a filesystem to resume write operations.
1816 */
1817void
1818vfs_write_resume(struct mount *mp, int flags)
1819{
1820
1821	MNT_ILOCK(mp);
1822	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1823		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
1824		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
1825				       MNTK_SUSPENDED);
1826		mp->mnt_susp_owner = NULL;
1827		wakeup(&mp->mnt_writeopcount);
1828		wakeup(&mp->mnt_flag);
1829		curthread->td_pflags &= ~TDP_IGNSUSP;
1830		if ((flags & VR_START_WRITE) != 0) {
1831			MNT_REF(mp);
1832			mp->mnt_writeopcount++;
1833		}
1834		MNT_IUNLOCK(mp);
1835		if ((flags & VR_NO_SUSPCLR) == 0)
1836			VFS_SUSP_CLEAN(mp);
1837	} else if ((flags & VR_START_WRITE) != 0) {
1838		MNT_REF(mp);
1839		vn_start_write_locked(mp, 0);
1840	} else {
1841		MNT_IUNLOCK(mp);
1842	}
1843}
1844
1845/*
1846 * Helper loop around vfs_write_suspend() for filesystem unmount VFS
1847 * methods.
1848 */
1849int
1850vfs_write_suspend_umnt(struct mount *mp)
1851{
1852	int error;
1853
1854	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
1855	    ("vfs_write_suspend_umnt: recursed"));
1856
1857	/* dounmount() already called vn_start_write(). */
1858	for (;;) {
1859		vn_finished_write(mp);
1860		error = vfs_write_suspend(mp, 0);
1861		if (error != 0) {
1862			vn_start_write(NULL, &mp, V_WAIT);
1863			return (error);
1864		}
1865		MNT_ILOCK(mp);
1866		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
1867			break;
1868		MNT_IUNLOCK(mp);
1869		vn_start_write(NULL, &mp, V_WAIT);
1870	}
1871	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
1872	wakeup(&mp->mnt_flag);
1873	MNT_IUNLOCK(mp);
1874	curthread->td_pflags |= TDP_IGNSUSP;
1875	return (0);
1876}
1877
1878/*
1879 * Implement kqueues for files by translating them into vnode operations.
1880 */
1881static int
1882vn_kqfilter(struct file *fp, struct knote *kn)
1883{
1884
1885	return (VOP_KQFILTER(fp->f_vnode, kn));
1886}
1887
1888/*
1889 * Simplified in-kernel wrapper calls for extended attribute access.
1890 * Both calls pass in a NULL credential, authorizing as "kernel" access.
1891 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1892 */
1893int
1894vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1895    const char *attrname, int *buflen, char *buf, struct thread *td)
1896{
1897	struct uio	auio;
1898	struct iovec	iov;
1899	int	error;
1900
1901	iov.iov_len = *buflen;
1902	iov.iov_base = buf;
1903
1904	auio.uio_iov = &iov;
1905	auio.uio_iovcnt = 1;
1906	auio.uio_rw = UIO_READ;
1907	auio.uio_segflg = UIO_SYSSPACE;
1908	auio.uio_td = td;
1909	auio.uio_offset = 0;
1910	auio.uio_resid = *buflen;
1911
1912	if ((ioflg & IO_NODELOCKED) == 0)
1913		vn_lock(vp, LK_SHARED | LK_RETRY);
1914
1915	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1916
1917	/* authorize attribute retrieval as kernel */
1918	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1919	    td);
1920
1921	if ((ioflg & IO_NODELOCKED) == 0)
1922		VOP_UNLOCK(vp, 0);
1923
1924	if (error == 0) {
1925		*buflen = *buflen - auio.uio_resid;
1926	}
1927
1928	return (error);
1929}
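/*
 * Illustrative sketch (not part of the original file) of the wrapper
 * above.  The attribute name "example" and the calling function are
 * hypothetical, and EXTATTR_NAMESPACE_SYSTEM assumes <sys/extattr.h>.
 * The vnode is assumed to be locked already, hence IO_NODELOCKED.
 * Note that *buflen is in/out: buffer size on entry, bytes copied on
 * return.
 */
#if 0
static int
example_read_attr(struct vnode *vp, struct thread *td)
{
	char buf[128];
	int buflen, error;

	buflen = sizeof(buf);
	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
	    "example", &buflen, buf, td);
	if (error == 0)
		printf("attribute \"example\": %d bytes\n", buflen);
	return (error);
}
#endif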
1930
1931/*
1932 * XXX failure mode if partially written?
1933 */
1934int
1935vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1936    const char *attrname, int buflen, char *buf, struct thread *td)
1937{
1938	struct uio	auio;
1939	struct iovec	iov;
1940	struct mount	*mp;
1941	int	error;
1942
1943	iov.iov_len = buflen;
1944	iov.iov_base = buf;
1945
1946	auio.uio_iov = &iov;
1947	auio.uio_iovcnt = 1;
1948	auio.uio_rw = UIO_WRITE;
1949	auio.uio_segflg = UIO_SYSSPACE;
1950	auio.uio_td = td;
1951	auio.uio_offset = 0;
1952	auio.uio_resid = buflen;
1953
1954	if ((ioflg & IO_NODELOCKED) == 0) {
1955		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1956			return (error);
1957		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1958	}
1959
1960	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1961
1962	/* authorize attribute setting as kernel */
1963	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1964
1965	if ((ioflg & IO_NODELOCKED) == 0) {
1966		vn_finished_write(mp);
1967		VOP_UNLOCK(vp, 0);
1968	}
1969
1970	return (error);
1971}
1972
1973int
1974vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1975    const char *attrname, struct thread *td)
1976{
1977	struct mount	*mp;
1978	int	error;
1979
1980	if ((ioflg & IO_NODELOCKED) == 0) {
1981		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1982			return (error);
1983		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1984	}
1985
1986	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1987
1988	/* authorize attribute removal as kernel */
1989	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
1990	if (error == EOPNOTSUPP)
1991		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1992		    NULL, td);
1993
1994	if ((ioflg & IO_NODELOCKED) == 0) {
1995		vn_finished_write(mp);
1996		VOP_UNLOCK(vp, 0);
1997	}
1998
1999	return (error);
2000}
2001
2002static int
2003vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
2004    struct vnode **rvp)
2005{
2006
2007	return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
2008}
2009
2010int
2011vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
2012{
2013
2014	return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2015	    lkflags, rvp));
2016}
2017
2018int
2019vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
2020    int lkflags, struct vnode **rvp)
2021{
2022	struct mount *mp;
2023	int ltype, error;
2024
2025	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_gen");
2026	mp = vp->v_mount;
2027	ltype = VOP_ISLOCKED(vp);
2028	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
2029	    ("vn_vget_ino: vp not locked"));
2030	error = vfs_busy(mp, MBF_NOWAIT);
2031	if (error != 0) {
2032		vfs_ref(mp);
2033		VOP_UNLOCK(vp, 0);
2034		error = vfs_busy(mp, 0);
2035		vn_lock(vp, ltype | LK_RETRY);
2036		vfs_rel(mp);
2037		if (error != 0)
2038			return (ENOENT);
2039		if (vp->v_iflag & VI_DOOMED) {
2040			vfs_unbusy(mp);
2041			return (ENOENT);
2042		}
2043	}
2044	VOP_UNLOCK(vp, 0);
2045	error = alloc(mp, alloc_arg, lkflags, rvp);
2046	vfs_unbusy(mp);
2047	if (*rvp != vp)
2048		vn_lock(vp, ltype | LK_RETRY);
2049	if (vp->v_iflag & VI_DOOMED) {
2050		if (error == 0) {
2051			if (*rvp == vp)
2052				vunref(vp);
2053			else
2054				vput(*rvp);
2055		}
2056		error = ENOENT;
2057	}
2058	return (error);
2059}
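/*
 * Illustrative sketch (not part of the original file): the classic
 * consumer of vn_vget_ino() is a ".." lookup, where a filesystem needs
 * a vnode by inode number while it already holds another vnode lock.
 * The helper performs the unlock/VFS_VGET()/relock dance and the
 * VI_DOOMED re-check.  The function name and inode source here are
 * hypothetical; dvp stays locked across the call.
 */
#if 0
static int
example_get_parent(struct vnode *dvp, ino_t parent_ino, struct vnode **vpp)
{

	return (vn_vget_ino(dvp, parent_ino, LK_EXCLUSIVE, vpp));
}
#endif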
2060
2061int
2062vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
2063    const struct thread *td)
2064{
2065
2066	if (vp->v_type != VREG || td == NULL)
2067		return (0);
2068	PROC_LOCK(td->td_proc);
2069	if ((uoff_t)uio->uio_offset + uio->uio_resid >
2070	    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
2071		kern_psignal(td->td_proc, SIGXFSZ);
2072		PROC_UNLOCK(td->td_proc);
2073		return (EFBIG);
2074	}
2075	PROC_UNLOCK(td->td_proc);
2076	return (0);
2077}
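/*
 * Illustrative sketch (not part of the original file): a VOP_WRITE
 * implementation, here a hypothetical xx_write_check() fragment,
 * consults the RLIMIT_FSIZE limit before growing the file.
 * vn_rlimit_fsize() delivers SIGXFSZ and returns EFBIG when the write
 * would exceed the limit.
 */
#if 0
static int
xx_write_check(struct vnode *vp, struct uio *uio)
{

	if (vn_rlimit_fsize(vp, uio, uio->uio_td) != 0)
		return (EFBIG);
	/* ... perform the actual write ... */
	return (0);
}
#endif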
2078
2079int
2080vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2081    struct thread *td)
2082{
2083	struct vnode *vp;
2084
2085	vp = fp->f_vnode;
2086#ifdef AUDIT
2087	vn_lock(vp, LK_SHARED | LK_RETRY);
2088	AUDIT_ARG_VNODE1(vp);
2089	VOP_UNLOCK(vp, 0);
2090#endif
2091	return (setfmode(td, active_cred, vp, mode));
2092}
2093
2094int
2095vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2096    struct thread *td)
2097{
2098	struct vnode *vp;
2099
2100	vp = fp->f_vnode;
2101#ifdef AUDIT
2102	vn_lock(vp, LK_SHARED | LK_RETRY);
2103	AUDIT_ARG_VNODE1(vp);
2104	VOP_UNLOCK(vp, 0);
2105#endif
2106	return (setfown(td, active_cred, vp, uid, gid));
2107}
2108
2109void
2110vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2111{
2112	vm_object_t object;
2113
2114	if ((object = vp->v_object) == NULL)
2115		return;
2116	VM_OBJECT_WLOCK(object);
2117	vm_object_page_remove(object, start, end, 0);
2118	VM_OBJECT_WUNLOCK(object);
2119}
2120
2121int
2122vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
2123{
2124	struct vattr va;
2125	daddr_t bn, bnp;
2126	uint64_t bsize;
2127	off_t noff;
2128	int error;
2129
2130	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2131	    ("Wrong command %lu", cmd));
2132
2133	if (vn_lock(vp, LK_SHARED) != 0)
2134		return (EBADF);
2135	if (vp->v_type != VREG) {
2136		error = ENOTTY;
2137		goto unlock;
2138	}
2139	error = VOP_GETATTR(vp, &va, cred);
2140	if (error != 0)
2141		goto unlock;
2142	noff = *off;
2143	if (noff >= va.va_size) {
2144		error = ENXIO;
2145		goto unlock;
2146	}
2147	bsize = vp->v_mount->mnt_stat.f_iosize;
2148	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
2149		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2150		if (error == EOPNOTSUPP) {
2151			error = ENOTTY;
2152			goto unlock;
2153		}
2154		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
2155		    (bnp != -1 && cmd == FIOSEEKDATA)) {
2156			noff = bn * bsize;
2157			if (noff < *off)
2158				noff = *off;
2159			goto unlock;
2160		}
2161	}
2162	if (noff > va.va_size)
2163		noff = va.va_size;
2164	/* noff == va.va_size. There is an implicit hole at the end of file. */
2165	if (cmd == FIOSEEKDATA)
2166		error = ENXIO;
2167unlock:
2168	VOP_UNLOCK(vp, 0);
2169	if (error == 0)
2170		*off = noff;
2171	return (error);
2172}
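/*
 * Illustrative sketch (not part of the original file): a filesystem
 * whose VOP_BMAP reports unallocated blocks as -1 can forward the
 * FIOSEEKHOLE/FIOSEEKDATA ioctls to the helper above.  The xx_ioctl()
 * name is hypothetical.
 */
#if 0
static int
xx_ioctl(struct vop_ioctl_args *ap)
{

	switch (ap->a_command) {
	case FIOSEEKDATA:
	case FIOSEEKHOLE:
		return (vn_bmap_seekhole(ap->a_vp, ap->a_command,
		    (off_t *)ap->a_data, ap->a_cred));
	default:
		return (ENOTTY);
	}
}
#endif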
2173
2174int
2175vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
2176{
2177	struct ucred *cred;
2178	struct vnode *vp;
2179	struct vattr vattr;
2180	off_t foffset, size;
2181	int error, noneg;
2182
2183	cred = td->td_ucred;
2184	vp = fp->f_vnode;
2185	foffset = foffset_lock(fp, 0);
2186	noneg = (vp->v_type != VCHR);
2187	error = 0;
2188	switch (whence) {
2189	case L_INCR:
2190		if (noneg &&
2191		    (foffset < 0 ||
2192		    (offset > 0 && foffset > OFF_MAX - offset))) {
2193			error = EOVERFLOW;
2194			break;
2195		}
2196		offset += foffset;
2197		break;
2198	case L_XTND:
2199		vn_lock(vp, LK_SHARED | LK_RETRY);
2200		error = VOP_GETATTR(vp, &vattr, cred);
2201		VOP_UNLOCK(vp, 0);
2202		if (error)
2203			break;
2204
2205		/*
2206		 * If the file references a disk device, then fetch
2207		 * the media size and use that to determine the ending
2208		 * offset.
2209		 */
2210		if (vattr.va_size == 0 && vp->v_type == VCHR &&
2211		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2212			vattr.va_size = size;
2213		if (noneg &&
2214		    (vattr.va_size > OFF_MAX ||
2215		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2216			error = EOVERFLOW;
2217			break;
2218		}
2219		offset += vattr.va_size;
2220		break;
2221	case L_SET:
2222		break;
2223	case SEEK_DATA:
2224		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2225		break;
2226	case SEEK_HOLE:
2227		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2228		break;
2229	default:
2230		error = EINVAL;
2231	}
2232	if (error == 0 && noneg && offset < 0)
2233		error = EINVAL;
2234	if (error != 0)
2235		goto drop;
2236	VFS_KNOTE_UNLOCKED(vp, 0);
2237	*(off_t *)(td->td_retval) = offset;
2238drop:
2239	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2240	return (error);
2241}
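/*
 * Illustrative sketch (not part of the original file) of the userland
 * view of the SEEK_HOLE/SEEK_DATA handling above; this is ordinary
 * application code, shown only as a comment.  fd is assumed to be an
 * open descriptor for a sparse regular file.  lseek(2) fails with
 * ENXIO once no further data exists past the given offset, which ends
 * the loop.
 *
 *	off_t data, hole = 0;
 *
 *	while ((data = lseek(fd, hole, SEEK_DATA)) != -1 &&
 *	    (hole = lseek(fd, data, SEEK_HOLE)) != -1)
 *		printf("data region: %jd .. %jd\n",
 *		    (intmax_t)data, (intmax_t)hole);
 */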
2242
2243int
2244vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
2245    struct thread *td)
2246{
2247	int error;
2248
2249	/*
2250	 * Grant permission if the caller is the owner of the file, or
2251	 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
2252	 * the file.  If the time pointer is null, then write
2253	 * permission on the file is also sufficient.
2254	 *
2255	 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2256	 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2257	 * will be allowed to set the times [..] to the current
2258	 * server time.
2259	 */
2260	error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2261	if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2262		error = VOP_ACCESS(vp, VWRITE, cred, td);
2263	return (error);
2264}
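/*
 * Illustrative sketch (not part of the original file): a VOP_SETATTR
 * implementation, here a hypothetical xx_setattr_times() fragment,
 * consults the helper above before applying file times requested
 * through *vap.
 */
#if 0
static int
xx_setattr_times(struct vnode *vp, struct vattr *vap, struct ucred *cred,
    struct thread *td)
{
	int error;

	error = vn_utimes_perm(vp, vap, cred, td);
	if (error != 0)
		return (error);
	/* ... copy the requested times from *vap into the inode ... */
	return (0);
}
#endif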
2265