/*	$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.14 (Berkeley) 6/15/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/fstrans.h>
#include <sys/atomic.h>
#include <sys/filedesc.h>
#include <sys/wapbl.h>
#include <sys/mman.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#ifdef UNION
#include <fs/union/union.h>
#endif

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);

#include <sys/verified_exec.h>

static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags);
static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags);
static int vn_closefile(file_t *fp);
static int vn_poll(file_t *fp, int events);
static int vn_fcntl(file_t *fp, u_int com, void *data);
static int vn_statfile(file_t *fp, struct stat *sb);
static int vn_ioctl(file_t *fp, u_long com, void *data);
static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);
static int vn_seek(struct file *, off_t, int, off_t *, int);
static int vn_advlock(struct file *, void *, int, struct flock *, int);
static int vn_fpathconf(struct file *, int, register_t *);
static int vn_posix_fadvise(struct file *, off_t, off_t, int);
static int vn_truncate(file_t *, off_t);

const struct fileops vnops = {
	.fo_name = "vn",
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_fcntl = vn_fcntl,
	.fo_poll = vn_poll,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_kqfilter = vn_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = vn_mmap,
	.fo_seek = vn_seek,
	.fo_advlock = vn_advlock,
	.fo_fpathconf = vn_fpathconf,
	.fo_posix_fadvise = vn_posix_fadvise,
	.fo_truncate = vn_truncate,
};

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * at_dvp is the directory for openat(), if any.
 * pb is the path.
 * nmode is additional namei flags, restricted to TRYEMULROOT and NOCHROOT.
 * fmode is the open flags, converted from O_* to F*
 * cmode is the creation file permissions.
 *
 * XXX shouldn't cmode be mode_t?
 *
 * On success produces either a locked vnode in *ret_vp, or NULL in
 * *ret_vp and a file descriptor number in *ret_fd.
 *
 * The caller may pass NULL for ret_fd (and ret_domove), in which case
 * EOPNOTSUPP will be produced in the cases that would otherwise return
 * a file descriptor.
 *
 * Note that callers that want no-follow behavior should pass
 * O_NOFOLLOW in fmode. Neither FOLLOW nor NOFOLLOW in nmode is
 * honored.
 */
int
vn_open(struct vnode *at_dvp, struct pathbuf *pb,
	int nmode, int fmode, int cmode,
	struct vnode **ret_vp, bool *ret_domove, int *ret_fd)
{
	struct nameidata nd;
	struct vnode *vp = NULL;
	struct lwp *l = curlwp;
	kauth_cred_t cred = l->l_cred;
	struct vattr va;
	int error;
	const char *pathstring;

	KASSERT((nmode & (TRYEMULROOT | NOCHROOT)) == nmode);

	KASSERT(ret_vp != NULL);
	KASSERT((ret_domove == NULL) == (ret_fd == NULL));

	if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
		return EINVAL;

	NDINIT(&nd, LOOKUP, nmode, pb);
	if (at_dvp != NULL)
		NDAT(&nd, at_dvp);

	nd.ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;

	if (fmode & O_CREAT) {
		nd.ni_cnd.cn_nameiop = CREATE;
		nd.ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 &&
		    ((fmode & O_NOFOLLOW) == 0))
			nd.ni_cnd.cn_flags |= FOLLOW;
		if ((fmode & O_EXCL) == 0)
			nd.ni_cnd.cn_flags |= NONEXCLHACK;
	} else {
		nd.ni_cnd.cn_nameiop = LOOKUP;
		nd.ni_cnd.cn_flags |= LOCKLEAF;
		if ((fmode & O_NOFOLLOW) == 0)
			nd.ni_cnd.cn_flags |= FOLLOW;
	}

	pathstring = pathbuf_stringcopy_get(nd.ni_pathbuf);
	if (pathstring == NULL) {
		return ENOMEM;
	}

	/*
	 * When this "interface" was exposed to do_open() it used
	 * to initialize l_dupfd to -newfd-1 (thus passing in the
	 * new file handle number to use)... but nothing in the
	 * kernel uses that value. So just send 0.
	 */
	l->l_dupfd = 0;

	error = namei(&nd);
	if (error)
		goto out;

	vp = nd.ni_vp;

#if NVERIEXEC > 0
	error = veriexec_openchk(l, nd.ni_vp, pathstring, fmode);
	if (error) {
		/* We have to release the locks ourselves */
		/*
		 * 20210604 dholland passing NONEXCLHACK means we can
		 * get ni_dvp == NULL back if ni_vp exists, and we should
		 * treat that like the non-O_CREAT case.
		 */
		if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
			if (vp == NULL) {
				vput(nd.ni_dvp);
			} else {
				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
				if (nd.ni_dvp == nd.ni_vp)
					vrele(nd.ni_dvp);
				else
					vput(nd.ni_dvp);
				nd.ni_dvp = NULL;
				vput(vp);
				vp = NULL;
			}
		} else {
			vput(vp);
			vp = NULL;
		}
		goto out;
	}
#endif /* NVERIEXEC > 0 */

	/*
	 * 20210604 dholland ditto
	 */
	if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
		if (nd.ni_vp == NULL) {
			vattr_null(&va);
			va.va_type = VREG;
			va.va_mode = cmode;
			if (fmode & O_EXCL)
				 va.va_vaflags |= VA_EXCLUSIVE;
			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
					   &nd.ni_cnd, &va);
			if (error) {
				vput(nd.ni_dvp);
				goto out;
			}
			fmode &= ~O_TRUNC;
			vp = nd.ni_vp;
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vput(nd.ni_dvp);
		} else {
			VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			nd.ni_dvp = NULL;
			vp = nd.ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else if ((fmode & O_CREAT) != 0) {
		/*
		 * 20210606 dholland passing NONEXCLHACK means this
		 * case exists; it is the same as the following one
		 * but also needs to do things in the second (exists)
		 * half of the following block. (Besides handling
		 * ni_dvp, anyway.)
		 */
		vp = nd.ni_vp;
		KASSERT((fmode & O_EXCL) == 0);
		fmode &= ~O_CREAT;
	} else {
		vp = nd.ni_vp;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (nd.ni_vp->v_type == VLNK) {
		error = EFTYPE;
		goto bad;
	}

	if ((fmode & O_CREAT) == 0) {
		error = vn_openchk(vp, cred, fmode);
		if (error != 0)
			goto bad;
	}

	if (fmode & O_TRUNC) {
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error != 0)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
		goto bad;
	if (fmode & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

bad:
	if (error) {
		vput(vp);
		vp = NULL;
	}
out:
	pathbuf_stringcopy_put(nd.ni_pathbuf, pathstring);

	switch (error) {
	case EDUPFD:
	case EMOVEFD:
		/* if the caller isn't prepared to handle fds, fail for them */
		if (ret_fd == NULL) {
			error = EOPNOTSUPP;
			break;
		}
		*ret_vp = NULL;
		*ret_domove = error == EMOVEFD;
		*ret_fd = l->l_dupfd;
		error = 0;
		break;
	case 0:
		KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
		*ret_vp = vp;
		break;
	}
	l->l_dupfd = 0;
	return error;
}
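
/*
 * Example (illustrative only, not part of this file): a kernel
 * subsystem opening a file by path with vn_open().  The path, flags,
 * and error handling here are assumptions for the sketch.  vn_open()
 * returns a locked vnode, while vn_close() expects an unlocked one,
 * so the vnode is unlocked before use:
 *
 *	struct pathbuf *pb;
 *	struct vnode *vp;
 *	int error;
 *
 *	pb = pathbuf_create("/etc/example.conf");
 *	if (pb == NULL)
 *		return ENOMEM;
 *	error = vn_open(NULL, pb, 0, FREAD, 0, &vp, NULL, NULL);
 *	pathbuf_destroy(pb);
 *	if (error != 0)
 *		return error;
 *	VOP_UNLOCK(vp);
 *	... read from the file, e.g. with vn_rdwr() ...
 *	error = vn_close(vp, FREAD, curlwp->l_cred);
 */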

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{

	/*
	 * If the vnode is in use as a process's text,
	 * we can't allow writing.
	 */
	if (vp->v_iflag & VI_TEXT)
		return ETXTBSY;
	return 0;
}

int
vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags)
{
	int permbits = 0;
	int error;

	if (vp->v_type == VNON || vp->v_type == VBAD)
		return ENXIO;

	if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR)
		return ENOTDIR;

	if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG)
		return EFTYPE;

	if ((fflags & FREAD) != 0) {
		permbits = VREAD;
	}
	if ((fflags & FEXEC) != 0) {
		permbits |= VEXEC;
	}
	if ((fflags & (FWRITE | O_TRUNC)) != 0) {
		permbits |= VWRITE;
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		error = vn_writechk(vp);
		if (error != 0)
			goto bad;
	}
	error = VOP_ACCESS(vp, permbits, cred);
bad:
	return error;
}

/*
 * Mark a vnode as having executable mappings.
 */
void
vn_markexec(struct vnode *vp)
{

	if ((vp->v_iflag & VI_EXECMAP) != 0) {
		/* Safe unlocked, as long as caller holds a reference. */
		return;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
		vp->v_iflag |= VI_EXECMAP;
	}
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
}

/*
 * Mark a vnode as being the text of a process.
 * Fail if the vnode is currently writable.
 */
int
vn_marktext(struct vnode *vp)
{

	if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) {
		/* Safe unlocked, as long as caller holds a reference. */
		return 0;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if (vp->v_writecount != 0) {
		KASSERT((vp->v_iflag & VI_TEXT) == 0);
		mutex_exit(vp->v_interlock);
		rw_exit(vp->v_uobj.vmobjlock);
		return ETXTBSY;
	}
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
	}
	vp->v_iflag |= (VI_TEXT | VI_EXECMAP);
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
	return 0;
}

/*
 * Vnode close call
 *
 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node.
 */
int
vn_close(struct vnode *vp, int flags, kauth_cred_t cred)
{
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (flags & FWRITE) {
		mutex_enter(vp->v_interlock);
		KASSERT(vp->v_writecount > 0);
		vp->v_writecount--;
		mutex_exit(vp->v_interlock);
	}
	error = VOP_CLOSE(vp, flags, cred);
	vput(vp);
	return error;
}

static int
enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct lwp *l = curlwp;
	off_t testoff;

	if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG)
		return 0;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	if (ioflag & IO_APPEND)
		testoff = vp->v_size;
	else
		testoff = uio->uio_offset;

	if (testoff + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		mutex_enter(&proc_lock);
		psignal(l->l_proc, SIGXFSZ);
		mutex_exit(&proc_lock);
		return EFBIG;
	}

	return 0;
}
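
/*
 * Worked example of the check above (assumed numbers): with
 * RLIMIT_FSIZE rlim_cur set to 1 MiB (1048576), a UIO_WRITE of
 * uio_resid = 4096 at uio_offset = 1048575 yields
 * testoff + resid = 1052671 > 1048576, so the process is sent
 * SIGXFSZ and the write fails with EFBIG.  For IO_APPEND the check
 * starts from the current file size rather than the uio offset.
 */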

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
    struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_READ) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
		} else /* UIO_WRITE */ {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_rw = rw;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}

	if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
		goto out;

	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}

	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;

 out:
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}
	return error;
}
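
/*
 * Example (illustrative only): reading the first 512 bytes of a file
 * into a kernel buffer with vn_rdwr().  The buffer and the residual
 * handling are assumptions for the sketch; the vnode is not already
 * locked here, so IO_NODELOCKED is not passed:
 *
 *	char buf[512];
 *	size_t resid;
 *	int error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0,
 *	    UIO_SYSSPACE, 0, curlwp->l_cred, &resid, curlwp);
 *	if (error == 0)
 *		... sizeof(buf) - resid bytes are valid in buf ...
 */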

int
vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
    struct lwp *l, off_t **cookies, int *ncookies)
{
	struct vnode *vp = fp->f_vnode;
	struct iovec aiov;
	struct uio auio;
	int error, eofflag;

	/* Limit the size on any kernel buffers used by VOP_READDIR */
	count = uimin(MAXBSIZE, count);

unionread:
	if (vp->v_type != VDIR)
		return EINVAL;
	aiov.iov_base = bf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		KASSERT(l == curlwp);
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}
	auio.uio_resid = count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	mutex_enter(&fp->f_lock);
	auio.uio_offset = fp->f_offset;
	mutex_exit(&fp->f_lock);
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
	    ncookies);
	mutex_enter(&fp->f_lock);
	fp->f_offset = auio.uio_offset;
	mutex_exit(&fp->f_lock);
	VOP_UNLOCK(vp);
	if (error)
		return error;

	if (count == auio.uio_resid && vn_union_readdir_hook) {
		struct vnode *ovp = vp;

		error = (*vn_union_readdir_hook)(&vp, fp, l);
		if (error)
			return error;
		if (vp != ovp)
			goto unionread;
	}

	if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		vref(vp);
		mutex_enter(&fp->f_lock);
		fp->f_vnode = vp;
		fp->f_offset = 0;
		mutex_exit(&fp->f_lock);
		vrele(tvp);
		goto unionread;
	}
	*done = count - auio.uio_resid;
	return error;
}

/*
 * File table vnode read routine.
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice);
	fflag = fp->f_flag;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
		ioflag |= IO_SYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	if (offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) != 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	uio->uio_offset = *offset;
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	count = uio->uio_resid;
	error = VOP_READ(vp, uio, ioflag, cred);
	if (flags & FOF_UPDATE_OFFSET)
		*offset += count - uio->uio_resid;
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
	fflag = fp->f_flag;
	if (vp->v_type == VREG && (fflag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & FFSYNC ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	else if (fflag & FDSYNC)
		ioflag |= IO_DSYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;

	if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
		goto out;

	error = VOP_WRITE(vp, uio, ioflag, cred);

	if (flags & FOF_UPDATE_OFFSET) {
		if (ioflag & IO_APPEND) {
			/*
			 * SUSv3 describes the behaviour for count = 0 as follows:
			 * "Before any action ... is taken, and if nbyte is zero
			 * and the file is a regular file, the write() function
			 * ... in the absence of errors ... shall return zero
			 * and have no other results."
			 */
			if (count)
				*offset = uio->uio_offset;
		} else
			*offset += count - uio->uio_resid;
	}

 out:
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(file_t *fp, struct stat *sb)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = vn_stat(vp, sb);
	VOP_UNLOCK(vp);
	return error;
}

int
vn_stat(struct vnode *vp, struct stat *sb)
{
	struct vattr va;
	int error;
	mode_t mode;

	memset(&va, 0, sizeof(va));
	error = VOP_GETATTR(vp, &va, kauth_cred_get());
	if (error)
		return error;
	/*
	 * Copy from vattr table
	 */
	memset(sb, 0, sizeof(*sb));
	sb->st_dev = va.va_fsid;
	sb->st_ino = va.va_fileid;
	mode = va.va_mode;
	switch (vp->v_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return EBADF;
	}
	sb->st_mode = mode;
	sb->st_nlink = va.va_nlink;
	sb->st_uid = va.va_uid;
	sb->st_gid = va.va_gid;
	sb->st_rdev = va.va_rdev;
	sb->st_size = va.va_size;
	sb->st_atimespec = va.va_atime;
	sb->st_mtimespec = va.va_mtime;
	sb->st_ctimespec = va.va_ctime;
	sb->st_birthtimespec = va.va_birthtime;
	sb->st_blksize = va.va_blocksize;
	sb->st_flags = va.va_flags;
	sb->st_gen = 0;
	sb->st_blocks = va.va_bytes / S_BLKSIZE;
	return 0;
}

/*
 * File table vnode fcntl routine.
 */
static int
vn_fcntl(file_t *fp, u_int com, void *data)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
	return error;
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(file_t *fp, u_long com, void *data)
{
	struct vnode *vp = fp->f_vnode, *ovp;
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
			if (error == 0) {
				if (vp->v_type == VDIR)
					mutex_enter(&fp->f_lock);
				*(int *)data = vattr.va_size - fp->f_offset;
				if (vp->v_type == VDIR)
					mutex_exit(&fp->f_lock);
			}
			VOP_UNLOCK(vp);
			if (error)
				return error;
			return 0;
		}
		if ((com == FIONWRITE) || (com == FIONSPACE)) {
			/*
			 * Files don't have send queues, so there are
			 * never any bytes in them, nor is there any
			 * free space in them.
			 */
			*(int *)data = 0;
			return 0;
		}
		if (com == FIOGETBMAP) {
			daddr_t *block;

			if (*(daddr_t *)data < 0)
				return EINVAL;
			block = (daddr_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, *block, NULL, block, NULL);
			VOP_UNLOCK(vp);
			return error;
		}
		if (com == OFIOGETBMAP) {
			daddr_t ibn, obn;

			if (*(int32_t *)data < 0)
				return EINVAL;
			ibn = (daddr_t)*(int32_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
			VOP_UNLOCK(vp);
			*(int32_t *)data = (int32_t)obn;
			return error;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return 0;			/* XXX */
		/* FALLTHROUGH */
	case VFIFO:
	case VCHR:
	case VBLK:
		error = VOP_IOCTL(vp, com, data, fp->f_flag,
		    kauth_cred_get());
		if (error == 0 && com == TIOCSCTTY) {
			vref(vp);
			mutex_enter(&proc_lock);
			ovp = curproc->p_session->s_ttyvp;
			curproc->p_session->s_ttyvp = vp;
			mutex_exit(&proc_lock);
			if (ovp != NULL)
				vrele(ovp);
		}
		return error;

	default:
		return EPASSTHROUGH;
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(file_t *fp, int events)
{

	return VOP_POLL(fp->f_vnode, events);
}

/*
 * File table vnode kqfilter routine.
 */
int
vn_kqfilter(file_t *fp, struct knote *kn)
{

	return VOP_KQFILTER(fp->f_vnode, kn);
}

static int
vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	struct vattr va;
	struct lwp *l;
	vm_prot_t maxprot;
	off_t off;
	int error, flags;
	bool needwritemap;

	l = curlwp;

	off = *offp;
	flags = *flagsp;
	maxprot = VM_PROT_EXECUTE;

	KASSERT(size > 0);

	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VCHR &&
	    vp->v_type != VBLK) {
		/* only REG/CHR/BLK support mmap */
		return ENODEV;
	}
	if (vp->v_type != VCHR && off < 0) {
		return EINVAL;
	}
#if SIZE_MAX > UINT32_MAX	/* XXX -Wtype-limits */
	if (vp->v_type != VCHR && size > __type_max(off_t)) {
		return EOVERFLOW;
	}
#endif
	if (vp->v_type != VCHR && off > __type_max(off_t) - size) {
		/* no offset wrapping */
		return EOVERFLOW;
	}

	/* special case: catch SunOS style /dev/zero */
	if (vp->v_type == VCHR &&
	    (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
		*uobjp = NULL;
		*maxprotp = VM_PROT_ALL;
		return 0;
	}

	/*
	 * Old programs may not select a specific sharing type, so
	 * default to an appropriate one.
	 *
	 * XXX: how does MAP_ANON fit in the picture?
	 */
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
		struct proc *p = l->l_proc;
		printf("WARNING: defaulted mmap() share type to "
		       "%s (pid %d command %s)\n", vp->v_type == VCHR ?
		       "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
		       p->p_comm);
#endif
		if (vp->v_type == VCHR)
			flags |= MAP_SHARED;	/* for a device */
		else
			flags |= MAP_PRIVATE;	/* for a file */
	}

	/*
	 * MAP_PRIVATE device mappings don't make sense (and aren't
	 * supported anyway).  However, some programs rely on this,
	 * so just change it to MAP_SHARED.
	 */
	if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
		flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
	}

	/*
	 * now check protection
	 */

	/* check read access */
	if (fp->f_flag & FREAD)
		maxprot |= VM_PROT_READ;
	else if (prot & PROT_READ) {
		return EACCES;
	}

	/* check write access, shared case first */
	if (flags & MAP_SHARED) {
		/*
		 * If the file is writable, add PROT_WRITE to maxprot
		 * only if the file is not immutable, append-only, or
		 * a snapshot.  Otherwise, if PROT_WRITE was asked
		 * for, return EPERM.
		 */
		if (fp->f_flag & FWRITE) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &va, l->l_cred);
			VOP_UNLOCK(vp);
			if (error) {
				return error;
			}
			if ((va.va_flags &
			     (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
				maxprot |= VM_PROT_WRITE;
			else if (prot & PROT_WRITE) {
				return EPERM;
			}
		} else if (prot & PROT_WRITE) {
			return EACCES;
		}
	} else {
		/* MAP_PRIVATE mappings can always be written to (copy-on-write) */
		maxprot |= VM_PROT_WRITE;
	}

	/*
	 * Don't allow mmap for EXEC if the file system
	 * is mounted NOEXEC.
	 */
	if ((prot & PROT_EXEC) != 0 &&
	    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
		return EACCES;
	}

	if (vp->v_type != VCHR) {
		error = VOP_MMAP(vp, prot, curlwp->l_cred);
		if (error) {
			return error;
		}
		vref(vp);
		uobj = &vp->v_uobj;

		/*
		 * If the vnode is being mapped with PROT_EXEC,
		 * then mark it as text.
		 */
		if (prot & PROT_EXEC) {
			vn_markexec(vp);
		}
	} else {
		int i = maxprot;

		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC or PROT_WRITE, but we don't really
		 * XXX have a better way of handling this, right now
		 */
		do {
			uobj = udv_attach(vp->v_rdev,
					  (flags & MAP_SHARED) ? i :
					  (i & ~VM_PROT_WRITE), off, size);
			i--;
		} while ((uobj == NULL) && (i > 0));
		if (uobj == NULL) {
			return EINVAL;
		}
		*advicep = UVM_ADV_RANDOM;
	}

	/*
	 * Set vnode flags to indicate the new kinds of mapping.
	 * We take the vnode lock in exclusive mode here to serialize
	 * with direct I/O.
	 *
	 * Safe to check for these flag values without a lock, as
	 * long as a reference to the vnode is held.
	 */
	needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
		(flags & MAP_SHARED) != 0 &&
		(maxprot & VM_PROT_WRITE) != 0;
	if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vp->v_vflag |= VV_MAPPED;
		if (needwritemap) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_WRMAP;
			mutex_exit(vp->v_interlock);
			rw_exit(vp->v_uobj.vmobjlock);
		}
		VOP_UNLOCK(vp);
	}

#if NVERIEXEC > 0

	/*
	 * Check if the file can be executed indirectly.
	 *
	 * XXX: This gives false warnings about "Incorrect access type"
	 * XXX: if the mapping is not executable. Harmless, but will be
	 * XXX: fixed as part of other changes.
	 */
	if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
			    NULL)) {

		/*
		 * Don't allow executable mappings if we can't
		 * indirectly execute the file.
		 */
		if (prot & VM_PROT_EXECUTE) {
			return EPERM;
		}

		/*
		 * Strip the executable bit from 'maxprot' to make sure
		 * it can't be made executable later.
		 */
		maxprot &= ~VM_PROT_EXECUTE;
	}
#endif /* NVERIEXEC > 0 */

	*uobjp = uobj;
	*maxprotp = maxprot;
	*flagsp = flags;

	return 0;
}
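
/*
 * Sketch of how vn_mmap() is reached (assumed call chain, for
 * orientation only): sys_mmap() looks up the descriptor and invokes
 * the file's fo_mmap hook, which for vnodes is this routine:
 *
 *	error = (*fp->f_ops->fo_mmap)(fp, &off, size, prot, &flags,
 *	    &advice, &uobj, &maxprot);
 *
 * On success the returned uvm_object (with a reference held) and the
 * clamped maxprot are handed on to the UVM mapping code.
 */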

static int
vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp,
    int flags)
{
	const off_t OFF_MIN = __type_min(off_t);
	const off_t OFF_MAX = __type_max(off_t);
	kauth_cred_t cred = fp->f_cred;
	off_t oldoff, newoff;
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int error;

	if (vp->v_type == VFIFO)
		return ESPIPE;

	if (flags & FOF_UPDATE_OFFSET)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);

	/* Compute the old and new offsets.  */
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	oldoff = fp->f_offset;
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	switch (whence) {
	case SEEK_CUR:
		if (delta > 0) {
			if (oldoff > 0 && delta > OFF_MAX - oldoff) {
				newoff = OFF_MAX;
				break;
			}
		} else {
			if (oldoff < 0 && delta < OFF_MIN - oldoff) {
				newoff = OFF_MIN;
				break;
			}
		}
		newoff = oldoff + delta;
		break;
	case SEEK_END:
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			goto out;
		if (vattr.va_size > OFF_MAX ||
		    delta > OFF_MAX - (off_t)vattr.va_size) {
			newoff = OFF_MAX;
			break;
		}
		newoff = delta + vattr.va_size;
		break;
	case SEEK_SET:
		newoff = delta;
		break;
	default:
		error = EINVAL;
		goto out;
	}

	/* Pass the proposed change to the file system to audit.  */
	error = VOP_SEEK(vp, oldoff, newoff, cred);
	if (error)
		goto out;

	/* Success!  */
	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;
	error = 0;

out:	VOP_UNLOCK(vp);
	return error;
}
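
/*
 * Worked example of the SEEK_CUR clamping above (assumed values):
 * with oldoff = OFF_MAX - 10 and delta = 100, delta > OFF_MAX - oldoff
 * (100 > 10), so newoff saturates at OFF_MAX instead of wrapping
 * negative; the symmetric check clamps at OFF_MIN for a negative
 * delta from a negative offset.
 */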

static int
vn_advlock(struct file *fp, void *id, int op, struct flock *fl,
    int flags)
{
	struct vnode *const vp = fp->f_vnode;

	if (fl->l_whence == SEEK_CUR) {
		vn_lock(vp, LK_SHARED | LK_RETRY);
		fl->l_start += fp->f_offset;
		VOP_UNLOCK(vp);
	}

	return VOP_ADVLOCK(vp, id, op, fl, flags);
}

static int
vn_fpathconf(struct file *fp, int name, register_t *retval)
{
	struct vnode *const vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_PATHCONF(vp, name, retval);
	VOP_UNLOCK(vp);

	return error;
}

static int
vn_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
	const off_t OFF_MAX = __type_max(off_t);
	struct vnode *vp = fp->f_vnode;
	off_t endoffset;
	int error;

	if (offset < 0) {
		return EINVAL;
	}
	if (len == 0) {
		endoffset = OFF_MAX;
	} else if (len > 0 && (OFF_MAX - offset) >= len) {
		endoffset = offset + len;
	} else {
		return EINVAL;
	}

	CTASSERT(POSIX_FADV_NORMAL == UVM_ADV_NORMAL);
	CTASSERT(POSIX_FADV_RANDOM == UVM_ADV_RANDOM);
	CTASSERT(POSIX_FADV_SEQUENTIAL == UVM_ADV_SEQUENTIAL);

	switch (advice) {
	case POSIX_FADV_WILLNEED:
	case POSIX_FADV_DONTNEED:
		if (vp->v_type != VREG && vp->v_type != VBLK)
			return 0;
		break;
	}

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_SEQUENTIAL:
		/*
		 * We ignore offset and size.  Must lock the file to
		 * do this, as f_advice is sub-word sized.
		 */
		mutex_enter(&fp->f_lock);
		fp->f_advice = (u_char)advice;
		mutex_exit(&fp->f_lock);
		error = 0;
		break;

	case POSIX_FADV_WILLNEED:
		error = uvm_readahead(&vp->v_uobj, offset, endoffset - offset);
		break;

	case POSIX_FADV_DONTNEED:
		/*
		 * Align the region to page boundaries as VOP_PUTPAGES expects
		 * by shrinking it.  We shrink instead of expand because we
		 * do not want to deactivate cache outside of the requested
		 * region.  It means that if the specified region is smaller
		 * than PAGE_SIZE, we do nothing.
		 */
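		/*
		 * Worked example (PAGE_SIZE assumed 4096): a request
		 * with offset = 100 and endoffset = 5000 shrinks to
		 * [round_page(100), trunc_page(5000)) = [4096, 4096),
		 * which is empty, so nothing is deactivated; offset =
		 * 0 and endoffset = 8192 would flush pages [0, 8192).
		 */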
		if (offset <= trunc_page(OFF_MAX) &&
		    round_page(offset) < trunc_page(endoffset)) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			error = VOP_PUTPAGES(vp,
			    round_page(offset), trunc_page(endoffset),
			    PGO_DEACTIVATE | PGO_CLEANIT);
		} else {
			error = 0;
		}
		break;

	case POSIX_FADV_NOREUSE:
		/* Not implemented yet. */
		error = 0;
		break;
	default:
		error = EINVAL;
		break;
	}

	return error;
}

static int
vn_truncate(file_t *fp, off_t length)
{
	struct vattr vattr;
	struct vnode *vp;
	int error = 0;

	if (length < 0)
		return EINVAL;

	if ((fp->f_flag & FWRITE) == 0)
		return EINVAL;
	vp = fp->f_vnode;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR)
		error = EISDIR;
	else if ((error = vn_writechk(vp)) == 0) {
		vattr_null(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
	VOP_UNLOCK(vp);

	return error;
}


/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
vn_lock(struct vnode *vp, int flags)
{
	struct lwp *l;
	int error;

	KASSERT(vrefcnt(vp) > 0);
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
	    LK_UPGRADE|LK_DOWNGRADE)) == 0);
	KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));

#ifdef DIAGNOSTIC
	if (wapbl_vphaswapbl(vp))
		WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
#endif

	/* Get a more useful report for lockstat. */
	l = curlwp;
	KASSERT(l->l_rwcallsite == 0);
	l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);

	error = VOP_LOCK(vp, flags);

	l->l_rwcallsite = 0;

	switch (flags & (LK_RETRY | LK_NOWAIT)) {
	case 0:
		KASSERT(error == 0 || error == ENOENT);
		break;
	case LK_RETRY:
		KASSERT(error == 0);
		break;
	case LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY || error == ENOENT);
		break;
	case LK_RETRY | LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY);
		break;
	}

	return error;
}
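
/*
 * Example of the usual pattern (illustrative): take the lock, perform
 * vnode operations, then release with VOP_UNLOCK().  With LK_RETRY
 * the lock is acquired even if the vnode is dying, rather than
 * failing with ENOENT, so no error check is needed:
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);
 *	error = VOP_GETATTR(vp, &va, cred);
 *	VOP_UNLOCK(vp);
 */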

/*
 * File table vnode close routine.
 */
static int
vn_closefile(file_t *fp)
{

	return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing a "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t *buflen, void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = *buflen;
	aiov.iov_base = bf;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
	    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return error;
}
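
/*
 * Example (illustrative only; the attribute name is an assumption):
 * fetching a system-namespace extended attribute into a local buffer
 * with an unlocked vnode:
 *
 *	char buf[64];
 *	size_t buflen = sizeof(buf);
 *	int error;
 *
 *	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "example", &buflen, buf, curlwp);
 *	if (error == 0)
 *		... buflen now holds the number of bytes returned ...
 */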

/*
 * XXX Failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t buflen, const void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = buflen;
	aiov.iov_base = __UNCONST(bf);		/* XXXUNCONST kills const */

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct lwp *l)
{
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_fifo_bypass(void *v)
{
	struct vop_generic_args *ap = v;

	return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
}

/*
 * Open block device by device number
 */
int
vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
{
	int     error;

	if ((error = bdevvp(dev, vpp)) != 0)
		return error;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
		vput(*vpp);
		return error;
	}
	mutex_enter((*vpp)->v_interlock);
	(*vpp)->v_writecount++;
	mutex_exit((*vpp)->v_interlock);
	VOP_UNLOCK(*vpp);

	return 0;
}

/*
 * Lookup the provided name in the filesystem.  If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
int
vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l)
{
	struct vnode *vp;
	dev_t dev;
	enum vtype vt;
	int     error;

	error = vn_open(NULL, pb, 0, FREAD | FWRITE, 0, &vp, NULL, NULL);
	if (error != 0)
		return error;

	dev = vp->v_rdev;
	vt = vp->v_type;

	VOP_UNLOCK(vp);
	(void) vn_close(vp, FREAD | FWRITE, l->l_cred);

	if (vt != VBLK)
		return ENOTBLK;

	return vn_bdev_open(dev, vpp, l);
}
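
/*
 * Example (illustrative only; the device path and pathbuf lifetime
 * handling are assumptions): opening a block device by name, e.g.
 * from a kernel facility that is given a path to a disk:
 *
 *	struct pathbuf *pb;
 *	struct vnode *vp;
 *	int error;
 *
 *	pb = pathbuf_create("/dev/wd0a");
 *	if (pb == NULL)
 *		return ENOMEM;
 *	error = vn_bdev_openpath(pb, &vp, curlwp);
 *	pathbuf_destroy(pb);
 */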

static long
vn_knote_to_interest(const struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
		/*
		 * Writing to the file or changing its attributes can
		 * set the file size, which impacts the readability
		 * filter.
		 *
		 * (No need to set NOTE_EXTEND here; it's only ever
		 * sent with other hints; see vnode_if.c.)
		 */
		return NOTE_WRITE | NOTE_ATTRIB;

	case EVFILT_VNODE:
		return kn->kn_sfflags;

	case EVFILT_WRITE:
	default:
		return 0;
	}
}

void
vn_knote_attach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/*
	 * In the case of layered / stacked file systems, knotes
	 * should only ever be associated with the base vnode.
	 */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We maintain a bitmask of the kevents that there is interest in,
	 * to minimize the impact of having watchers.  It's silly to have
	 * to traverse vn_klist every time a read or write happens simply
	 * because there is someone interested in knowing when the file
	 * is deleted, for example.
	 */

	mutex_enter(vp->v_interlock);
	SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext);
	SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
		interest |= vn_knote_to_interest(kn);
	}
	vk->vk_interest = interest;
	mutex_exit(vp->v_interlock);
}

void
vn_knote_detach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/* See above. */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We special case removing the head of the list, because:
	 *
	 * 1. It's extremely likely that we're detaching the only
	 *    knote.
	 *
	 * 2. We're already traversing the whole list, so we don't
	 *    want to use the generic SLIST_REMOVE() which would
	 *    traverse it *again*.
	 */

	mutex_enter(vp->v_interlock);
	if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) {
		SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext);
		SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
			interest |= vn_knote_to_interest(kn);
		}
		vk->vk_interest = interest;
	} else {
		struct knote *thiskn, *nextkn, *prevkn = NULL;

		SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) {
			if (thiskn == kn) {
				KASSERT(kn != NULL);
				KASSERT(prevkn != NULL);
				SLIST_REMOVE_AFTER(prevkn, kn_selnext);
				kn = NULL;
			} else {
				interest |= vn_knote_to_interest(thiskn);
				prevkn = thiskn;
			}
		}
		vk->vk_interest = interest;
	}
	mutex_exit(vp->v_interlock);
}