1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 2007-2009 Google Inc. and Amit Singh
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are
9 * met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 *   notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above
14 *   copyright notice, this list of conditions and the following disclaimer
15 *   in the documentation and/or other materials provided with the
16 *   distribution.
17 * * Neither the name of Google Inc. nor the names of its
18 *   contributors may be used to endorse or promote products derived from
19 *   this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Copyright (C) 2005 Csaba Henk.
34 * All rights reserved.
35 *
36 * Copyright (c) 2019 The FreeBSD Foundation
37 *
38 * Portions of this software were developed by BFF Storage Systems, LLC under
39 * sponsorship from the FreeBSD Foundation.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 *    notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 *    notice, this list of conditions and the following disclaimer in the
48 *    documentation and/or other materials provided with the distribution.
49 *
50 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
61 */
62
63#include <sys/cdefs.h>
64__FBSDID("$FreeBSD$");
65
66#include <sys/param.h>
67#include <sys/systm.h>
68#include <sys/counter.h>
69#include <sys/module.h>
70#include <sys/errno.h>
71#include <sys/kernel.h>
72#include <sys/conf.h>
73#include <sys/uio.h>
74#include <sys/malloc.h>
75#include <sys/queue.h>
76#include <sys/lock.h>
77#include <sys/mutex.h>
78#include <sys/sdt.h>
79#include <sys/sx.h>
80#include <sys/proc.h>
81#include <sys/mount.h>
82#include <sys/vnode.h>
83#include <sys/namei.h>
84#include <sys/stat.h>
85#include <sys/unistd.h>
86#include <sys/filedesc.h>
87#include <sys/file.h>
88#include <sys/fcntl.h>
89#include <sys/dirent.h>
90#include <sys/bio.h>
91#include <sys/buf.h>
92#include <sys/sysctl.h>
93#include <sys/priv.h>
94
95#include "fuse.h"
96#include "fuse_file.h"
97#include "fuse_internal.h"
98#include "fuse_io.h"
99#include "fuse_ipc.h"
100#include "fuse_node.h"
101#include "fuse_file.h"
102
103SDT_PROVIDER_DECLARE(fusefs);
104/*
105 * Fuse trace probe:
106 * arg0: verbosity.  Higher numbers give more verbose messages
107 * arg1: Textual message
108 */
109SDT_PROBE_DEFINE2(fusefs, , internal, trace, "int", "char*");
110
111#ifdef ZERO_PAD_INCOMPLETE_BUFS
112static int isbzero(void *buf, size_t len);
113
114#endif
115
116counter_u64_t fuse_lookup_cache_hits;
117counter_u64_t fuse_lookup_cache_misses;
118
119SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_hits, CTLFLAG_RD,
120    &fuse_lookup_cache_hits, "number of positive cache hits in lookup");
121
122SYSCTL_COUNTER_U64(_vfs_fusefs_stats, OID_AUTO, lookup_cache_misses, CTLFLAG_RD,
123    &fuse_lookup_cache_misses, "number of cache misses in lookup");
124
125int
126fuse_internal_get_cached_vnode(struct mount* mp, ino_t ino, int flags,
127	struct vnode **vpp)
128{
129	struct bintime now;
130	struct thread *td = curthread;
131	uint64_t nodeid = ino;
132	int error;
133
134	*vpp = NULL;
135
136	error = vfs_hash_get(mp, fuse_vnode_hash(nodeid), flags, td, vpp,
137	    fuse_vnode_cmp, &nodeid);
138	if (error)
139		return error;
140	/*
141	 * Check the entry cache timeout.  We have to do this within fusefs
142	 * instead of by using cache_enter_time/cache_lookup because those
143	 * routines are only intended to work with pathnames, not inodes
144	 */
145	if (*vpp != NULL) {
146		getbinuptime(&now);
147		if (bintime_cmp(&(VTOFUD(*vpp)->entry_cache_timeout), &now, >)){
148			counter_u64_add(fuse_lookup_cache_hits, 1);
149			return 0;
150		} else {
151			/* Entry cache timeout */
152			counter_u64_add(fuse_lookup_cache_misses, 1);
153			cache_purge(*vpp);
154			vput(*vpp);
155			*vpp = NULL;
156		}
157	}
158	return 0;
159}
160
161SDT_PROBE_DEFINE0(fusefs, , internal, access_vadmin);
162/* Synchronously send a FUSE_ACCESS operation */
163int
164fuse_internal_access(struct vnode *vp,
165    accmode_t mode,
166    struct thread *td,
167    struct ucred *cred)
168{
169	int err = 0;
170	uint32_t mask = F_OK;
171	int dataflags;
172	int vtype;
173	struct mount *mp;
174	struct fuse_dispatcher fdi;
175	struct fuse_access_in *fai;
176	struct fuse_data *data;
177
178	mp = vnode_mount(vp);
179	vtype = vnode_vtype(vp);
180
181	data = fuse_get_mpdata(mp);
182	dataflags = data->dataflags;
183
184	if (mode == 0)
185		return 0;
186
187	if (mode & VMODIFY_PERMS && vfs_isrdonly(mp)) {
188		switch (vp->v_type) {
189		case VDIR:
190			/* FALLTHROUGH */
191		case VLNK:
192			/* FALLTHROUGH */
193		case VREG:
194			return EROFS;
195		default:
196			break;
197		}
198	}
199
200	/* Unless explicitly permitted, deny everyone except the fs owner. */
201	if (!(dataflags & FSESS_DAEMON_CAN_SPY)) {
202		if (fuse_match_cred(data->daemoncred, cred))
203			return EPERM;
204	}
205
206	if (dataflags & FSESS_DEFAULT_PERMISSIONS) {
207		struct vattr va;
208
209		fuse_internal_getattr(vp, &va, cred, td);
210		return vaccess(vp->v_type, va.va_mode, va.va_uid,
211		    va.va_gid, mode, cred, NULL);
212	}
213
214	if (mode & VADMIN) {
215		/*
216		 * The FUSE protocol doesn't have an equivalent of VADMIN, so
217		 * it's a bug if we ever reach this point with that bit set.
218		 */
219		SDT_PROBE0(fusefs, , internal, access_vadmin);
220	}
221
222	if (!fsess_isimpl(mp, FUSE_ACCESS))
223		return 0;
224
225	if ((mode & (VWRITE | VAPPEND)) != 0)
226		mask |= W_OK;
227	if ((mode & VREAD) != 0)
228		mask |= R_OK;
229	if ((mode & VEXEC) != 0)
230		mask |= X_OK;
231
232	fdisp_init(&fdi, sizeof(*fai));
233	fdisp_make_vp(&fdi, FUSE_ACCESS, vp, td, cred);
234
235	fai = fdi.indata;
236	fai->mask = mask;
237
238	err = fdisp_wait_answ(&fdi);
239	fdisp_destroy(&fdi);
240
241	if (err == ENOSYS) {
242		fsess_set_notimpl(mp, FUSE_ACCESS);
243		err = 0;
244	}
245	return err;
246}
247
248/*
249 * Cache FUSE attributes from attr, in attribute cache associated with vnode
250 * 'vp'.  Optionally, if argument 'vap' is not NULL, store a copy of the
251 * converted attributes there as well.
252 *
253 * If the nominal attribute cache TTL is zero, do not cache on the 'vp' (but do
254 * return the result to the caller).
255 */
void
fuse_internal_cache_attrs(struct vnode *vp, struct fuse_attr *attr,
	uint64_t attr_valid, uint32_t attr_valid_nsec, struct vattr *vap)
{
	struct mount *mp;
	struct fuse_vnode_data *fvdat;
	struct fuse_data *data;
	struct vattr *vp_cache_at;

	mp = vnode_mount(vp);
	fvdat = VTOFUD(vp);
	data = fuse_get_mpdata(mp);

	/* Caller must hold the vnode lock exclusively. */
	ASSERT_VOP_ELOCKED(vp, "fuse_internal_cache_attrs");

	/* Convert the daemon's TTL into an absolute expiry time. */
	fuse_validity_2_bintime(attr_valid, attr_valid_nsec,
		&fvdat->attr_cache_timeout);

	/* Fix our buffers if the filesize changed without us knowing */
	if (vnode_isreg(vp) && attr->size != fvdat->cached_attrs.va_size) {
		(void)fuse_vnode_setsize(vp, attr->size);
		fvdat->cached_attrs.va_size = attr->size;
	}

	/*
	 * Pick the destination: cache on the vnode when the TTL is nonzero,
	 * otherwise fill the caller's vattr directly (if one was given).
	 */
	if (attr_valid > 0 || attr_valid_nsec > 0)
		vp_cache_at = &(fvdat->cached_attrs);
	else if (vap != NULL)
		vp_cache_at = vap;
	else
		return;

	/* Translate the FUSE attributes into a struct vattr. */
	vattr_null(vp_cache_at);
	vp_cache_at->va_fsid = mp->mnt_stat.f_fsid.val[0];
	vp_cache_at->va_fileid = attr->ino;
	vp_cache_at->va_mode = attr->mode & ~S_IFMT;
	vp_cache_at->va_nlink     = attr->nlink;
	vp_cache_at->va_uid       = attr->uid;
	vp_cache_at->va_gid       = attr->gid;
	vp_cache_at->va_rdev      = attr->rdev;
	vp_cache_at->va_size      = attr->size;
	/* XXX on i386, seconds are truncated to 32 bits */
	vp_cache_at->va_atime.tv_sec  = attr->atime;
	vp_cache_at->va_atime.tv_nsec = attr->atimensec;
	vp_cache_at->va_mtime.tv_sec  = attr->mtime;
	vp_cache_at->va_mtime.tv_nsec = attr->mtimensec;
	vp_cache_at->va_ctime.tv_sec  = attr->ctime;
	vp_cache_at->va_ctime.tv_nsec = attr->ctimensec;
	/* blksize was added to fuse_attr in protocol 7.9. */
	if (fuse_libabi_geq(data, 7, 9) && attr->blksize > 0)
		vp_cache_at->va_blocksize = attr->blksize;
	else
		vp_cache_at->va_blocksize = PAGE_SIZE;
	vp_cache_at->va_type = IFTOVT(attr->mode);
	vp_cache_at->va_bytes = attr->blocks * S_BLKSIZE;
	vp_cache_at->va_flags = 0;

	/* If we cached on the vnode, also copy out to the caller. */
	if (vap != vp_cache_at && vap != NULL)
		memcpy(vap, vp_cache_at, sizeof(*vap));
}
314
315
316/* fsync */
317
318int
319fuse_internal_fsync_callback(struct fuse_ticket *tick, struct uio *uio)
320{
321	if (tick->tk_aw_ohead.error == ENOSYS) {
322		fsess_set_notimpl(tick->tk_data->mp, fticket_opcode(tick));
323	}
324	return 0;
325}
326
327int
328fuse_internal_fsync(struct vnode *vp,
329    struct thread *td,
330    int waitfor,
331    bool datasync)
332{
333	struct fuse_fsync_in *ffsi = NULL;
334	struct fuse_dispatcher fdi;
335	struct fuse_filehandle *fufh;
336	struct fuse_vnode_data *fvdat = VTOFUD(vp);
337	struct mount *mp = vnode_mount(vp);
338	int op = FUSE_FSYNC;
339	int err = 0;
340
341	if (!fsess_isimpl(vnode_mount(vp),
342	    (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) {
343		return 0;
344	}
345	if (vnode_isdir(vp))
346		op = FUSE_FSYNCDIR;
347
348	if (!fsess_isimpl(mp, op))
349		return 0;
350
351	fdisp_init(&fdi, sizeof(*ffsi));
352	/*
353	 * fsync every open file handle for this file, because we can't be sure
354	 * which file handle the caller is really referring to.
355	 */
356	LIST_FOREACH(fufh, &fvdat->handles, next) {
357		fdi.iosize = sizeof(*ffsi);
358		if (ffsi == NULL)
359			fdisp_make_vp(&fdi, op, vp, td, NULL);
360		else
361			fdisp_refresh_vp(&fdi, op, vp, td, NULL);
362		ffsi = fdi.indata;
363		ffsi->fh = fufh->fh_id;
364		ffsi->fsync_flags = 0;
365
366		if (datasync)
367			ffsi->fsync_flags = 1;
368
369		if (waitfor == MNT_WAIT) {
370			err = fdisp_wait_answ(&fdi);
371		} else {
372			fuse_insert_callback(fdi.tick,
373				fuse_internal_fsync_callback);
374			fuse_insert_message(fdi.tick, false);
375		}
376		if (err == ENOSYS) {
377			/* ENOSYS means "success, and don't call again" */
378			fsess_set_notimpl(mp, op);
379			err = 0;
380			break;
381		}
382	}
383	fdisp_destroy(&fdi);
384
385	return err;
386}
387
388/* Asynchronous invalidation */
389SDT_PROBE_DEFINE3(fusefs, , internal, invalidate_entry,
390	"struct vnode*", "struct fuse_notify_inval_entry_out*", "char*");
/*
 * Handle a FUSE_NOTIFY_INVAL_ENTRY message from the daemon: invalidate the
 * name-cache entry for (parent, name) and purge the parent's attribute
 * cache.  The payload (a fuse_notify_inval_entry_out followed by the name)
 * is read from 'uio'.
 */
int
fuse_internal_invalidate_entry(struct mount *mp, struct uio *uio)
{
	struct fuse_notify_inval_entry_out fnieo;
	struct componentname cn;
	struct vnode *dvp, *vp;
	char name[PATH_MAX];
	int err;

	if ((err = uiomove(&fnieo, sizeof(fnieo), uio)) != 0)
		return (err);

	/* Reject names too long for the local buffer (untrusted length). */
	if (fnieo.namelen >= sizeof(name))
		return (EINVAL);

	if ((err = uiomove(name, fnieo.namelen, uio)) != 0)
		return (err);
	name[fnieo.namelen] = '\0';
	/* fusefs does not cache "." or ".." entries */
	if (strncmp(name, ".", sizeof(".")) == 0 ||
	    strncmp(name, "..", sizeof("..")) == 0)
		return (0);

	/* The root vnode is never in the fusefs vnode hash. */
	if (fnieo.parent == FUSE_ROOT_ID)
		err = VFS_ROOT(mp, LK_SHARED, &dvp);
	else
		err = fuse_internal_get_cached_vnode( mp, fnieo.parent,
			LK_SHARED, &dvp);
	SDT_PROBE3(fusefs, , internal, invalidate_entry, dvp, &fnieo, name);
	/*
	 * If dvp is not in the cache, then it must've been reclaimed.  And
	 * since fuse_vnop_reclaim does a cache_purge, name's entry must've
	 * been invalidated already.  So we can safely return if dvp == NULL
	 */
	if (err != 0 || dvp == NULL)
		return (err);
	/*
	 * XXX we can't check dvp's generation because the FUSE invalidate
	 * entry message doesn't include it.  Worse case is that we invalidate
	 * an entry that didn't need to be invalidated.
	 */

	cn.cn_nameiop = LOOKUP;
	cn.cn_flags = 0;	/* !MAKEENTRY means free cached entry */
	cn.cn_thread = curthread;
	cn.cn_cred = curthread->td_ucred;
	cn.cn_lkflags = LK_SHARED;
	cn.cn_pnbuf = NULL;
	cn.cn_nameptr = name;
	cn.cn_namelen = fnieo.namelen;
	/*
	 * Without MAKEENTRY, cache_lookup frees any cached entry and is
	 * expected to return 0 (no new entry is created).
	 */
	err = cache_lookup(dvp, &vp, &cn, NULL, NULL);
	MPASS(err == 0);
	/* The daemon presumably updated the parent's mtime/ctime. */
	fuse_vnode_clear_attr_cache(dvp);
	vput(dvp);
	return (0);
}
447
448SDT_PROBE_DEFINE2(fusefs, , internal, invalidate_inode,
449	"struct vnode*", "struct fuse_notify_inval_inode_out *");
/*
 * Handle a FUSE_NOTIFY_INVAL_INODE message from the daemon: invalidate the
 * cached attributes of the given inode and, when a data range is indicated
 * (off >= 0), flush and invalidate its buffers as well.
 */
int
fuse_internal_invalidate_inode(struct mount *mp, struct uio *uio)
{
	struct fuse_notify_inval_inode_out fniio;
	struct vnode *vp;
	int err;

	if ((err = uiomove(&fniio, sizeof(fniio), uio)) != 0)
		return (err);

	/* The root vnode is never in the fusefs vnode hash. */
	if (fniio.ino == FUSE_ROOT_ID)
		err = VFS_ROOT(mp, LK_EXCLUSIVE, &vp);
	else
		err = fuse_internal_get_cached_vnode(mp, fniio.ino, LK_SHARED,
			&vp);
	SDT_PROBE2(fusefs, , internal, invalidate_inode, vp, &fniio);
	/* If the vnode is not cached there is nothing to invalidate. */
	if (err != 0 || vp == NULL)
		return (err);
	/*
	 * XXX we can't check vp's generation because the FUSE invalidate
	 * entry message doesn't include it.  Worse case is that we invalidate
	 * an inode that didn't need to be invalidated.
	 */

	/*
	 * Flush and invalidate buffers if off >= 0.  Technically we only need
	 * to flush and invalidate the range of offsets [off, off + len), but
	 * for simplicity's sake we do everything.
	 */
	if (fniio.off >= 0)
		fuse_io_invalbuf(vp, curthread);
	fuse_vnode_clear_attr_cache(vp);
	vput(vp);
	return (0);
}
485
486/* mknod */
487int
488fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
489	struct componentname *cnp, struct vattr *vap)
490{
491	struct fuse_data *data;
492	struct fuse_mknod_in fmni;
493	size_t insize;
494
495	data = fuse_get_mpdata(dvp->v_mount);
496
497	fmni.mode = MAKEIMODE(vap->va_type, vap->va_mode);
498	fmni.rdev = vap->va_rdev;
499	if (fuse_libabi_geq(data, 7, 12)) {
500		insize = sizeof(fmni);
501		fmni.umask = curthread->td_proc->p_fd->fd_cmask;
502	} else {
503		insize = FUSE_COMPAT_MKNOD_IN_SIZE;
504	}
505	return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKNOD, &fmni,
506	    insize, vap->va_type));
507}
508
509/* readdir */
510
/*
 * Read directory entries from the daemon via FUSE_READDIR and copy them out
 * to 'uio', starting at directory offset 'startoff'.  'cookediov' is scratch
 * space for translating fuse_dirents into struct dirents; 'ncookies' and
 * 'cookies' (may be NULL) receive NFS-style directory cookies.
 */
int
fuse_internal_readdir(struct vnode *vp,
    struct uio *uio,
    off_t startoff,
    struct fuse_filehandle *fufh,
    struct fuse_iov *cookediov,
    int *ncookies,
    u_long *cookies)
{
	int err = 0;
	struct fuse_dispatcher fdi;
	struct fuse_read_in *fri = NULL;
	int fnd_start;

	if (uio_resid(uio) == 0)
		return 0;
	fdisp_init(&fdi, 0);

	/*
	 * Note that we DO NOT have a UIO_SYSSPACE here (so no need for p2p
	 * I/O).
	 */

	/*
	 * fnd_start is set non-zero once the offset in the directory gets
	 * to the startoff.  This is done because directories must be read
	 * from the beginning (offset == 0) when fuse_vnop_readdir() needs
	 * to do an open of the directory.
	 * If it is not set non-zero here, it will be set non-zero in
	 * fuse_internal_readdir_processdata() when uio_offset == startoff.
	 */
	fnd_start = 0;
	if (uio->uio_offset == startoff)
		fnd_start = 1;
	/* Keep issuing FUSE_READDIR requests until the caller's buffer fills. */
	while (uio_resid(uio) > 0) {
		fdi.iosize = sizeof(*fri);
		/* First iteration builds the request; later ones reuse it. */
		if (fri == NULL)
			fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
		else
			fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);

		fri = fdi.indata;
		fri->fh = fufh->fh_id;
		fri->offset = uio_offset(uio);
		fri->size = MIN(uio->uio_resid,
		    fuse_get_mpdata(vp->v_mount)->max_read);

		if ((err = fdisp_wait_answ(&fdi)))
			break;
		if ((err = fuse_internal_readdir_processdata(uio, startoff,
		    &fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov,
		    ncookies, &cookies)))
			break;
	}

	fdisp_destroy(&fdi);
	/* -1 is the internal "finished" sentinel, not an error. */
	return ((err == -1) ? 0 : err);
}
569
570/*
571 * Return -1 to indicate that this readdir is finished, 0 if it copied
572 * all the directory data read in and it may be possible to read more
573 * and greater than 0 for a failure.
574 */
int
fuse_internal_readdir_processdata(struct uio *uio,
    off_t startoff,
    int *fnd_start,
    size_t reqsize,
    void *buf,
    size_t bufsize,
    struct fuse_iov *cookediov,
    int *ncookies,
    u_long **cookiesp)
{
	int err = 0;
	int oreclen;
	size_t freclen;

	struct dirent *de;
	struct fuse_dirent *fudge;
	u_long *cookies;

	cookies = *cookiesp;
	/* Not even room for a fuse_dirent header: nothing to process. */
	if (bufsize < FUSE_NAME_OFFSET)
		return -1;
	/* Walk the packed fuse_dirent records in the reply buffer. */
	for (;;) {
		if (bufsize < FUSE_NAME_OFFSET) {
			err = -1;
			break;
		}
		fudge = (struct fuse_dirent *)buf;
		freclen = FUSE_DIRENT_SIZE(fudge);

		if (bufsize < freclen) {
			/*
			 * This indicates a partial directory entry at the
			 * end of the directory data.
			 */
			err = -1;
			break;
		}
#ifdef ZERO_PAD_INCOMPLETE_BUFS
		/* An all-zero header means zero-padding, i.e. end of data. */
		if (isbzero(buf, FUSE_NAME_OFFSET)) {
			err = -1;
			break;
		}
#endif

		/* Reject names the daemon shouldn't have sent (untrusted). */
		if (!fudge->namelen || fudge->namelen > MAXNAMLEN) {
			err = EINVAL;
			break;
		}
		/* Size the record as a struct dirent for this name length. */
		oreclen = GENERIC_DIRSIZ((struct pseudo_dirent *)
					    &fudge->namelen);

		if (oreclen > uio_resid(uio)) {
			/* Out of space for the dir so we are done. */
			err = -1;
			break;
		}
		/*
		 * Don't start to copy the directory entries out until
		 * the requested offset in the directory is found.
		 */
		if (*fnd_start != 0) {
			/* Translate the fuse_dirent into a struct dirent. */
			fiov_adjust(cookediov, oreclen);
			bzero(cookediov->base, oreclen);

			de = (struct dirent *)cookediov->base;
			de->d_fileno = fudge->ino;
			de->d_off = fudge->off;
			de->d_reclen = oreclen;
			de->d_type = fudge->type;
			de->d_namlen = fudge->namelen;
			memcpy((char *)cookediov->base + sizeof(struct dirent) -
			       MAXNAMLEN - 1,
			       (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
			dirent_terminate(de);

			err = uiomove(cookediov->base, cookediov->len, uio);
			if (err)
				break;
			/* Record an NFS-style cookie if the caller wants one. */
			if (cookies != NULL) {
				if (*ncookies == 0) {
					err = -1;
					break;
				}
				*cookies = fudge->off;
				cookies++;
				(*ncookies)--;
			}
		} else if (startoff == fudge->off)
			*fnd_start = 1;
		/* Advance to the next packed record. */
		buf = (char *)buf + freclen;
		bufsize -= freclen;
		uio_setoffset(uio, fudge->off);
	}
	*cookiesp = cookies;

	return err;
}
673
674/* remove */
675
/*
 * Send a FUSE_UNLINK or FUSE_RMDIR ('op') removing entry 'cnp' of 'vp' from
 * directory 'dvp', then update local caches to match.
 */
int
fuse_internal_remove(struct vnode *dvp,
    struct vnode *vp,
    struct componentname *cnp,
    enum fuse_opcode op)
{
	struct fuse_dispatcher fdi;
	nlink_t nlink;
	int err = 0;

	/* The request payload is just the NUL-terminated name. */
	fdisp_init(&fdi, cnp->cn_namelen + 1);
	fdisp_make_vp(&fdi, op, dvp, cnp->cn_thread, cnp->cn_cred);

	memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen);
	((char *)fdi.indata)[cnp->cn_namelen] = '\0';

	err = fdisp_wait_answ(&fdi);
	fdisp_destroy(&fdi);

	if (err)
		return (err);

	/*
	 * Access the cached nlink even if the attr cached has expired.  If
	 * it's inaccurate, the worst that will happen is:
	 * 1) We'll recycle the vnode even though the file has another link we
	 *    don't know about, costing a bit of cpu time, or
	 * 2) We won't recycle the vnode even though all of its links are gone.
	 *    It will linger around until vnlru reclaims it, costing a bit of
	 *    temporary memory.
	 */
	/*
	 * Post-decrement: 'nlink' gets the link count from before the
	 * removal, while the cached value is decremented in place.
	 */
	nlink = VTOFUD(vp)->cached_attrs.va_nlink--;

	/*
	 * Purge the parent's attribute cache because the daemon
	 * should've updated its mtime and ctime.
	 */
	fuse_vnode_clear_attr_cache(dvp);

	/* NB: nlink could be zero if it was never cached */
	if (nlink <= 1 || vnode_vtype(vp) == VDIR) {
		/* Last link (or a directory): the node is gone. */
		fuse_internal_vnode_disappear(vp);
	} else {
		/* Other links remain; just drop stale name-cache entries. */
		cache_purge(vp);
		fuse_vnode_update(vp, FN_CTIMECHANGE);
	}

	return err;
}
725
726/* rename */
727
728int
729fuse_internal_rename(struct vnode *fdvp,
730    struct componentname *fcnp,
731    struct vnode *tdvp,
732    struct componentname *tcnp)
733{
734	struct fuse_dispatcher fdi;
735	struct fuse_rename_in *fri;
736	int err = 0;
737
738	fdisp_init(&fdi, sizeof(*fri) + fcnp->cn_namelen + tcnp->cn_namelen + 2);
739	fdisp_make_vp(&fdi, FUSE_RENAME, fdvp, tcnp->cn_thread, tcnp->cn_cred);
740
741	fri = fdi.indata;
742	fri->newdir = VTOI(tdvp);
743	memcpy((char *)fdi.indata + sizeof(*fri), fcnp->cn_nameptr,
744	    fcnp->cn_namelen);
745	((char *)fdi.indata)[sizeof(*fri) + fcnp->cn_namelen] = '\0';
746	memcpy((char *)fdi.indata + sizeof(*fri) + fcnp->cn_namelen + 1,
747	    tcnp->cn_nameptr, tcnp->cn_namelen);
748	((char *)fdi.indata)[sizeof(*fri) + fcnp->cn_namelen +
749	    tcnp->cn_namelen + 1] = '\0';
750
751	err = fdisp_wait_answ(&fdi);
752	fdisp_destroy(&fdi);
753	return err;
754}
755
756/* strategy */
757
758/* entity creation */
759
760void
761fuse_internal_newentry_makerequest(struct mount *mp,
762    uint64_t dnid,
763    struct componentname *cnp,
764    enum fuse_opcode op,
765    void *buf,
766    size_t bufsize,
767    struct fuse_dispatcher *fdip)
768{
769	fdip->iosize = bufsize + cnp->cn_namelen + 1;
770
771	fdisp_make(fdip, op, mp, dnid, cnp->cn_thread, cnp->cn_cred);
772	memcpy(fdip->indata, buf, bufsize);
773	memcpy((char *)fdip->indata + bufsize, cnp->cn_nameptr, cnp->cn_namelen);
774	((char *)fdip->indata)[bufsize + cnp->cn_namelen] = '\0';
775}
776
777int
778fuse_internal_newentry_core(struct vnode *dvp,
779    struct vnode **vpp,
780    struct componentname *cnp,
781    enum vtype vtyp,
782    struct fuse_dispatcher *fdip)
783{
784	int err = 0;
785	struct fuse_entry_out *feo;
786	struct mount *mp = vnode_mount(dvp);
787
788	if ((err = fdisp_wait_answ(fdip))) {
789		return err;
790	}
791	feo = fdip->answ;
792
793	if ((err = fuse_internal_checkentry(feo, vtyp))) {
794		return err;
795	}
796	err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vtyp);
797	if (err) {
798		fuse_internal_forget_send(mp, cnp->cn_thread, cnp->cn_cred,
799		    feo->nodeid, 1);
800		return err;
801	}
802
803	/*
804	 * Purge the parent's attribute cache because the daemon should've
805	 * updated its mtime and ctime
806	 */
807	fuse_vnode_clear_attr_cache(dvp);
808
809	fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
810		feo->attr_valid_nsec, NULL);
811
812	return err;
813}
814
815int
816fuse_internal_newentry(struct vnode *dvp,
817    struct vnode **vpp,
818    struct componentname *cnp,
819    enum fuse_opcode op,
820    void *buf,
821    size_t bufsize,
822    enum vtype vtype)
823{
824	int err;
825	struct fuse_dispatcher fdi;
826	struct mount *mp = vnode_mount(dvp);
827
828	fdisp_init(&fdi, 0);
829	fuse_internal_newentry_makerequest(mp, VTOI(dvp), cnp, op, buf,
830	    bufsize, &fdi);
831	err = fuse_internal_newentry_core(dvp, vpp, cnp, vtype, &fdi);
832	fdisp_destroy(&fdi);
833
834	return err;
835}
836
837/* entity destruction */
838
839int
840fuse_internal_forget_callback(struct fuse_ticket *ftick, struct uio *uio)
841{
842	fuse_internal_forget_send(ftick->tk_data->mp, curthread, NULL,
843	    ((struct fuse_in_header *)ftick->tk_ms_fiov.base)->nodeid, 1);
844
845	return 0;
846}
847
848void
849fuse_internal_forget_send(struct mount *mp,
850    struct thread *td,
851    struct ucred *cred,
852    uint64_t nodeid,
853    uint64_t nlookup)
854{
855
856	struct fuse_dispatcher fdi;
857	struct fuse_forget_in *ffi;
858
859	/*
860         * KASSERT(nlookup > 0, ("zero-times forget for vp #%llu",
861         *         (long long unsigned) nodeid));
862         */
863
864	fdisp_init(&fdi, sizeof(*ffi));
865	fdisp_make(&fdi, FUSE_FORGET, mp, nodeid, td, cred);
866
867	ffi = fdi.indata;
868	ffi->nlookup = nlookup;
869
870	fuse_insert_message(fdi.tick, false);
871	fdisp_destroy(&fdi);
872}
873
874SDT_PROBE_DEFINE2(fusefs, , internal, getattr_cache_incoherent,
875	"struct vnode*", "struct fuse_attr_out*");
876
/*
 * Fetch the vnode's attributes from the daemon via FUSE_GETATTR, cache them,
 * and optionally copy them into *vap.  Locally-dirty fields (size, ctime,
 * mtime) are preserved over the daemon's values.
 */
int
fuse_internal_do_getattr(struct vnode *vp, struct vattr *vap,
	struct ucred *cred, struct thread *td)
{
	struct fuse_dispatcher fdi;
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	struct fuse_getattr_in *fgai;
	struct fuse_attr_out *fao;
	off_t old_filesize = fvdat->cached_attrs.va_size;
	struct timespec old_ctime = fvdat->cached_attrs.va_ctime;
	struct timespec old_mtime = fvdat->cached_attrs.va_mtime;
	enum vtype vtyp;
	int err;

	fdisp_init(&fdi, sizeof(*fgai));
	fdisp_make_vp(&fdi, FUSE_GETATTR, vp, td, cred);
	fgai = fdi.indata;
	/*
	 * We could look up a file handle and set it in fgai->fh, but that
	 * involves extra runtime work and I'm unaware of any file systems that
	 * care.
	 */
	fgai->getattr_flags = 0;
	if ((err = fdisp_wait_answ(&fdi))) {
		/* ENOENT means the daemon no longer knows this node. */
		if (err == ENOENT)
			fuse_internal_vnode_disappear(vp);
		goto out;
	}

	fao = (struct fuse_attr_out *)fdi.answ;
	vtyp = IFTOVT(fao->attr.mode);
	/*
	 * Keep locally-modified, not-yet-flushed values in preference to
	 * what the daemon reported.
	 */
	if (fvdat->flag & FN_SIZECHANGE)
		fao->attr.size = old_filesize;
	if (fvdat->flag & FN_CTIMECHANGE) {
		fao->attr.ctime = old_ctime.tv_sec;
		fao->attr.ctimensec = old_ctime.tv_nsec;
	}
	if (fvdat->flag & FN_MTIMECHANGE) {
		fao->attr.mtime = old_mtime.tv_sec;
		fao->attr.mtimensec = old_mtime.tv_nsec;
	}
	if (vnode_isreg(vp) &&
	    fvdat->cached_attrs.va_size != VNOVAL &&
	    fao->attr.size != fvdat->cached_attrs.va_size) {
		/*
		 * The server changed the file's size even though we had it
		 * cached!  That's a server bug.
		 */
		SDT_PROBE2(fusefs, , internal, getattr_cache_incoherent, vp,
		    fao);
		printf("%s: cache incoherent on %s!  "
		    "Buggy FUSE server detected.  To prevent data corruption, "
		    "disable the data cache by mounting with -o direct_io, or "
		    "as directed otherwise by your FUSE server's "
		    "documentation\n", __func__,
		    vnode_mount(vp)->mnt_stat.f_mntonname);
		int iosize = fuse_iosize(vp);
		v_inval_buf_range(vp, 0, INT64_MAX, iosize);
	}
	fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
		fao->attr_valid_nsec, vap);
	/* A type change means this vnode no longer describes the file. */
	if (vtyp != vnode_vtype(vp)) {
		fuse_internal_vnode_disappear(vp);
		err = ENOENT;
	}

out:
	fdisp_destroy(&fdi);
	return err;
}
948
949/* Read a vnode's attributes from cache or fetch them from the fuse daemon */
950int
951fuse_internal_getattr(struct vnode *vp, struct vattr *vap, struct ucred *cred,
952	struct thread *td)
953{
954	struct vattr *attrs;
955
956	if ((attrs = VTOVA(vp)) != NULL) {
957		*vap = *attrs;	/* struct copy */
958		return 0;
959	}
960
961	return fuse_internal_do_getattr(vp, vap, cred, td);
962}
963
964void
965fuse_internal_vnode_disappear(struct vnode *vp)
966{
967	struct fuse_vnode_data *fvdat = VTOFUD(vp);
968
969	ASSERT_VOP_ELOCKED(vp, "fuse_internal_vnode_disappear");
970	fvdat->flag |= FN_REVOKED;
971	cache_purge(vp);
972}
973
974/* fuse start/stop */
975
976SDT_PROBE_DEFINE2(fusefs, , internal, init_done,
977	"struct fuse_data*", "struct fuse_init_out*");
978int
979fuse_internal_init_callback(struct fuse_ticket *tick, struct uio *uio)
980{
981	int err = 0;
982	struct fuse_data *data = tick->tk_data;
983	struct fuse_init_out *fiio;
984
985	if ((err = tick->tk_aw_ohead.error)) {
986		goto out;
987	}
988	if ((err = fticket_pull(tick, uio))) {
989		goto out;
990	}
991	fiio = fticket_resp(tick)->base;
992
993	data->fuse_libabi_major = fiio->major;
994	data->fuse_libabi_minor = fiio->minor;
995	if (!fuse_libabi_geq(data, 7, 4)) {
996		/*
997		 * With a little work we could support servers as old as 7.1.
998		 * But there would be little payoff.
999		 */
1000		SDT_PROBE2(fusefs, , internal, trace, 1,
1001			"userpace version too low");
1002		err = EPROTONOSUPPORT;
1003		goto out;
1004	}
1005
1006	if (fuse_libabi_geq(data, 7, 5)) {
1007		if (fticket_resp(tick)->len == sizeof(struct fuse_init_out) ||
1008		    fticket_resp(tick)->len == FUSE_COMPAT_22_INIT_OUT_SIZE) {
1009			data->max_write = fiio->max_write;
1010			if (fiio->flags & FUSE_ASYNC_READ)
1011				data->dataflags |= FSESS_ASYNC_READ;
1012			if (fiio->flags & FUSE_POSIX_LOCKS)
1013				data->dataflags |= FSESS_POSIX_LOCKS;
1014			if (fiio->flags & FUSE_EXPORT_SUPPORT)
1015				data->dataflags |= FSESS_EXPORT_SUPPORT;
1016			/*
1017			 * Don't bother to check FUSE_BIG_WRITES, because it's
1018			 * redundant with max_write
1019			 */
1020			/*
1021			 * max_background and congestion_threshold are not
1022			 * implemented
1023			 */
1024		} else {
1025			err = EINVAL;
1026		}
1027	} else {
1028		/* Old fixed values */
1029		data->max_write = 4096;
1030	}
1031
1032	if (fuse_libabi_geq(data, 7, 6))
1033		data->max_readahead_blocks = fiio->max_readahead / maxbcachebuf;
1034
1035	if (!fuse_libabi_geq(data, 7, 7))
1036		fsess_set_notimpl(data->mp, FUSE_INTERRUPT);
1037
1038	if (!fuse_libabi_geq(data, 7, 8)) {
1039		fsess_set_notimpl(data->mp, FUSE_BMAP);
1040		fsess_set_notimpl(data->mp, FUSE_DESTROY);
1041	}
1042
1043	if (fuse_libabi_geq(data, 7, 23) && fiio->time_gran >= 1 &&
1044	    fiio->time_gran <= 1000000000)
1045		data->time_gran = fiio->time_gran;
1046	else
1047		data->time_gran = 1;
1048
1049	if (!fuse_libabi_geq(data, 7, 23))
1050		data->cache_mode = fuse_data_cache_mode;
1051	else if (fiio->flags & FUSE_WRITEBACK_CACHE)
1052		data->cache_mode = FUSE_CACHE_WB;
1053	else
1054		data->cache_mode = FUSE_CACHE_WT;
1055
1056out:
1057	if (err) {
1058		fdata_set_dead(data);
1059	}
1060	FUSE_LOCK();
1061	data->dataflags |= FSESS_INITED;
1062	SDT_PROBE2(fusefs, , internal, init_done, data, fiio);
1063	wakeup(&data->ticketer);
1064	FUSE_UNLOCK();
1065
1066	return 0;
1067}
1068
1069void
1070fuse_internal_send_init(struct fuse_data *data, struct thread *td)
1071{
1072	struct fuse_init_in *fiii;
1073	struct fuse_dispatcher fdi;
1074
1075	fdisp_init(&fdi, sizeof(*fiii));
1076	fdisp_make(&fdi, FUSE_INIT, data->mp, 0, td, NULL);
1077	fiii = fdi.indata;
1078	fiii->major = FUSE_KERNEL_VERSION;
1079	fiii->minor = FUSE_KERNEL_MINOR_VERSION;
1080	/*
1081	 * fusefs currently reads ahead no more than one cache block at a time.
1082	 * See fuse_read_biobackend
1083	 */
1084	fiii->max_readahead = maxbcachebuf;
1085	/*
1086	 * Unsupported features:
1087	 * FUSE_FILE_OPS: No known FUSE server or client supports it
1088	 * FUSE_ATOMIC_O_TRUNC: our VFS cannot support it
1089	 * FUSE_DONT_MASK: unlike Linux, FreeBSD always applies the umask, even
1090	 *	when default ACLs are in use.
1091	 * FUSE_SPLICE_WRITE, FUSE_SPLICE_MOVE, FUSE_SPLICE_READ: FreeBSD
1092	 *	doesn't have splice(2).
1093	 * FUSE_FLOCK_LOCKS: not yet implemented
1094	 * FUSE_HAS_IOCTL_DIR: not yet implemented
1095	 * FUSE_AUTO_INVAL_DATA: not yet implemented
1096	 * FUSE_DO_READDIRPLUS: not yet implemented
1097	 * FUSE_READDIRPLUS_AUTO: not yet implemented
1098	 * FUSE_ASYNC_DIO: not yet implemented
1099	 * FUSE_NO_OPEN_SUPPORT: not yet implemented
1100	 */
1101	fiii->flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_EXPORT_SUPPORT
1102		| FUSE_BIG_WRITES | FUSE_WRITEBACK_CACHE;
1103
1104	fuse_insert_callback(fdi.tick, fuse_internal_init_callback);
1105	fuse_insert_message(fdi.tick, false);
1106	fdisp_destroy(&fdi);
1107}
1108
1109/*
1110 * Send a FUSE_SETATTR operation with no permissions checks.  If cred is NULL,
1111 * send the request with root credentials
1112 */
1113int fuse_internal_setattr(struct vnode *vp, struct vattr *vap,
1114	struct thread *td, struct ucred *cred)
1115{
1116	struct fuse_vnode_data *fvdat;
1117	struct fuse_dispatcher fdi;
1118	struct fuse_setattr_in *fsai;
1119	struct mount *mp;
1120	pid_t pid = td->td_proc->p_pid;
1121	struct fuse_data *data;
1122	int dataflags;
1123	int err = 0;
1124	enum vtype vtyp;
1125	int sizechanged = -1;
1126	uint64_t newsize = 0;
1127
1128	mp = vnode_mount(vp);
1129	fvdat = VTOFUD(vp);
1130	data = fuse_get_mpdata(mp);
1131	dataflags = data->dataflags;
1132
1133	fdisp_init(&fdi, sizeof(*fsai));
1134	fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred);
1135	if (!cred) {
1136		fdi.finh->uid = 0;
1137		fdi.finh->gid = 0;
1138	}
1139	fsai = fdi.indata;
1140	fsai->valid = 0;
1141
1142	if (vap->va_uid != (uid_t)VNOVAL) {
1143		fsai->uid = vap->va_uid;
1144		fsai->valid |= FATTR_UID;
1145	}
1146	if (vap->va_gid != (gid_t)VNOVAL) {
1147		fsai->gid = vap->va_gid;
1148		fsai->valid |= FATTR_GID;
1149	}
1150	if (vap->va_size != VNOVAL) {
1151		struct fuse_filehandle *fufh = NULL;
1152
1153		/*Truncate to a new value. */
1154		fsai->size = vap->va_size;
1155		sizechanged = 1;
1156		newsize = vap->va_size;
1157		fsai->valid |= FATTR_SIZE;
1158
1159		fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid);
1160		if (fufh) {
1161			fsai->fh = fufh->fh_id;
1162			fsai->valid |= FATTR_FH;
1163		}
1164		VTOFUD(vp)->flag &= ~FN_SIZECHANGE;
1165	}
1166	if (vap->va_atime.tv_sec != VNOVAL) {
1167		fsai->atime = vap->va_atime.tv_sec;
1168		fsai->atimensec = vap->va_atime.tv_nsec;
1169		fsai->valid |= FATTR_ATIME;
1170		if (vap->va_vaflags & VA_UTIMES_NULL)
1171			fsai->valid |= FATTR_ATIME_NOW;
1172	}
1173	if (vap->va_mtime.tv_sec != VNOVAL) {
1174		fsai->mtime = vap->va_mtime.tv_sec;
1175		fsai->mtimensec = vap->va_mtime.tv_nsec;
1176		fsai->valid |= FATTR_MTIME;
1177		if (vap->va_vaflags & VA_UTIMES_NULL)
1178			fsai->valid |= FATTR_MTIME_NOW;
1179	} else if (fvdat->flag & FN_MTIMECHANGE) {
1180		fsai->mtime = fvdat->cached_attrs.va_mtime.tv_sec;
1181		fsai->mtimensec = fvdat->cached_attrs.va_mtime.tv_nsec;
1182		fsai->valid |= FATTR_MTIME;
1183	}
1184	if (fuse_libabi_geq(data, 7, 23) && fvdat->flag & FN_CTIMECHANGE) {
1185		fsai->ctime = fvdat->cached_attrs.va_ctime.tv_sec;
1186		fsai->ctimensec = fvdat->cached_attrs.va_ctime.tv_nsec;
1187		fsai->valid |= FATTR_CTIME;
1188	}
1189	if (vap->va_mode != (mode_t)VNOVAL) {
1190		fsai->mode = vap->va_mode & ALLPERMS;
1191		fsai->valid |= FATTR_MODE;
1192	}
1193	if (!fsai->valid) {
1194		goto out;
1195	}
1196
1197	if ((err = fdisp_wait_answ(&fdi)))
1198		goto out;
1199	vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode);
1200
1201	if (vnode_vtype(vp) != vtyp) {
1202		if (vnode_vtype(vp) == VNON && vtyp != VNON) {
1203			SDT_PROBE2(fusefs, , internal, trace, 1, "FUSE: Dang! "
1204				"vnode_vtype is VNON and vtype isn't.");
1205		} else {
1206			/*
1207	                 * STALE vnode, ditch
1208	                 *
1209			 * The vnode has changed its type "behind our back".
1210			 * There's nothing really we can do, so let us just
1211			 * force an internal revocation and tell the caller to
1212			 * try again, if interested.
1213	                 */
1214			fuse_internal_vnode_disappear(vp);
1215			err = EAGAIN;
1216		}
1217	}
1218	if (err == 0) {
1219		struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ;
1220		fuse_vnode_undirty_cached_timestamps(vp);
1221		fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid,
1222			fao->attr_valid_nsec, NULL);
1223	}
1224
1225out:
1226	fdisp_destroy(&fdi);
1227	return err;
1228}
1229
1230#ifdef ZERO_PAD_INCOMPLETE_BUFS
/*
 * Return 1 if the first len bytes of buf are all zero, else 0.
 *
 * Use a size_t index: the original int index was compared against the
 * size_t len (signed/unsigned mismatch) and could not cover buffers
 * larger than INT_MAX bytes.
 */
static int
isbzero(void *buf, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++) {
		if (((char *)buf)[i])
			return (0);
	}

	return (1);
}
1243
1244#endif
1245
1246void
1247fuse_internal_init(void)
1248{
1249	fuse_lookup_cache_misses = counter_u64_alloc(M_WAITOK);
1250	fuse_lookup_cache_hits = counter_u64_alloc(M_WAITOK);
1251}
1252
1253void
1254fuse_internal_destroy(void)
1255{
1256	counter_u64_free(fuse_lookup_cache_hits);
1257	counter_u64_free(fuse_lookup_cache_misses);
1258}
1259