1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)nfs_vnops.c	8.16 (Berkeley) 5/27/95
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD$");
37
38/*
39 * vnode op calls for Sun NFS version 2 and 3
40 */
41
42#include "opt_inet.h"
43#include "opt_kdtrace.h"
44
45#include <sys/param.h>
46#include <sys/kernel.h>
47#include <sys/systm.h>
48#include <sys/resourcevar.h>
49#include <sys/proc.h>
50#include <sys/mount.h>
51#include <sys/bio.h>
52#include <sys/buf.h>
53#include <sys/jail.h>
54#include <sys/malloc.h>
55#include <sys/mbuf.h>
56#include <sys/namei.h>
57#include <sys/socket.h>
58#include <sys/vnode.h>
59#include <sys/dirent.h>
60#include <sys/fcntl.h>
61#include <sys/lockf.h>
62#include <sys/stat.h>
63#include <sys/sysctl.h>
64#include <sys/signalvar.h>
65
66#include <vm/vm.h>
67#include <vm/vm_extern.h>
68#include <vm/vm_object.h>
69
70#include <fs/fifofs/fifo.h>
71
72#include <nfs/nfsproto.h>
73#include <nfsclient/nfs.h>
74#include <nfsclient/nfsnode.h>
75#include <nfsclient/nfsmount.h>
76#include <nfs/nfs_kdtrace.h>
77#include <nfs/nfs_lock.h>
78#include <nfs/xdr_subs.h>
79#include <nfsclient/nfsm_subs.h>
80
81#include <net/if.h>
82#include <netinet/in.h>
83#include <netinet/in_var.h>
84
85#include <machine/stdarg.h>
86
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>

/*
 * Storage for the DTrace hooks of the NFS access cache.  Each probe
 * (flush done, get hit, get miss, load done) has one hook function
 * pointer and one 32-bit probe id.  These objects only exist when the
 * kernel is built with KDTRACE_HOOKS.
 */

/* Access cache flush. */
dtrace_nfsclient_accesscache_flush_probe_func_t
    dtrace_nfsclient_accesscache_flush_done_probe;
uint32_t nfsclient_accesscache_flush_done_id;

/* Access cache lookup: hit. */
dtrace_nfsclient_accesscache_get_probe_func_t
    dtrace_nfsclient_accesscache_get_hit_probe;
uint32_t nfsclient_accesscache_get_hit_id;

/* Access cache lookup: miss. */
dtrace_nfsclient_accesscache_get_probe_func_t
    dtrace_nfsclient_accesscache_get_miss_probe;
uint32_t nfsclient_accesscache_get_miss_id;

/* Access cache load (result of an ACCESS RPC). */
dtrace_nfsclient_accesscache_load_probe_func_t
    dtrace_nfsclient_accesscache_load_done_probe;
uint32_t nfsclient_accesscache_load_done_id;
#endif /* KDTRACE_HOOKS */
104
/*
 * Boolean constants used as flag arguments in this file
 * (e.g. nfsm_v3attrbuild(vap, TRUE)).
 */
#define	TRUE	1
#define	FALSE	0

/*
 * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these
 * calls are not in getblk() and brelse() so that they would not be necessary
 * here.
 *
 * When B_VMIO is not defined, vfs_busy_pages() expands to nothing.
 */
#ifndef B_VMIO
#define vfs_busy_pages(bp, f)
#endif
117
118static vop_read_t	nfsfifo_read;
119static vop_write_t	nfsfifo_write;
120static vop_close_t	nfsfifo_close;
121static int	nfs_flush(struct vnode *, int, int);
122static int	nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *);
123static vop_lookup_t	nfs_lookup;
124static vop_create_t	nfs_create;
125static vop_mknod_t	nfs_mknod;
126static vop_open_t	nfs_open;
127static vop_close_t	nfs_close;
128static vop_access_t	nfs_access;
129static vop_getattr_t	nfs_getattr;
130static vop_setattr_t	nfs_setattr;
131static vop_read_t	nfs_read;
132static vop_fsync_t	nfs_fsync;
133static vop_remove_t	nfs_remove;
134static vop_link_t	nfs_link;
135static vop_rename_t	nfs_rename;
136static vop_mkdir_t	nfs_mkdir;
137static vop_rmdir_t	nfs_rmdir;
138static vop_symlink_t	nfs_symlink;
139static vop_readdir_t	nfs_readdir;
140static vop_strategy_t	nfs_strategy;
141static	int	nfs_lookitup(struct vnode *, const char *, int,
142		    struct ucred *, struct thread *, struct nfsnode **);
143static	int	nfs_sillyrename(struct vnode *, struct vnode *,
144		    struct componentname *);
145static vop_access_t	nfsspec_access;
146static vop_readlink_t	nfs_readlink;
147static vop_print_t	nfs_print;
148static vop_advlock_t	nfs_advlock;
149static vop_advlockasync_t nfs_advlockasync;
150
/*
 * Global vfs data structures for nfs
 *
 * nfs_vnodeops is the vnode operations vector for NFS vnodes.  Any
 * operation not listed here is resolved through .vop_default, i.e.
 * default_vnodeops.  Entries are kept in alphabetical order except for
 * the getpages/putpages pair.
 *
 * nfs_getpages, nfs_putpages, nfs_inactive, nfs_reclaim and nfs_write
 * have no static prototype above; presumably they are declared in one of
 * the included nfsclient headers.
 */
struct vop_vector nfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_access =		nfs_access,
	.vop_advlock =		nfs_advlock,
	.vop_advlockasync =	nfs_advlockasync,
	.vop_close =		nfs_close,
	.vop_create =		nfs_create,
	.vop_fsync =		nfs_fsync,
	.vop_getattr =		nfs_getattr,
	.vop_getpages =		nfs_getpages,
	.vop_putpages =		nfs_putpages,
	.vop_inactive =		nfs_inactive,
	.vop_link =		nfs_link,
	.vop_lookup =		nfs_lookup,
	.vop_mkdir =		nfs_mkdir,
	.vop_mknod =		nfs_mknod,
	.vop_open =		nfs_open,
	.vop_print =		nfs_print,
	.vop_read =		nfs_read,
	.vop_readdir =		nfs_readdir,
	.vop_readlink =		nfs_readlink,
	.vop_reclaim =		nfs_reclaim,
	.vop_remove =		nfs_remove,
	.vop_rename =		nfs_rename,
	.vop_rmdir =		nfs_rmdir,
	.vop_setattr =		nfs_setattr,
	.vop_strategy =		nfs_strategy,
	.vop_symlink =		nfs_symlink,
	.vop_write =		nfs_write,
};
184
/*
 * Vnode operations vector for fifos that live on an NFS mount.
 * Operations not listed here are resolved through .vop_default, i.e.
 * fifo_specops.  Access checks go through nfsspec_access(); read, write
 * and close use the nfsfifo_*() wrappers; attribute and lifecycle
 * operations are shared with nfs_vnodeops.
 */
struct vop_vector nfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_access =		nfsspec_access,
	.vop_close =		nfsfifo_close,
	.vop_fsync =		nfs_fsync,
	.vop_getattr =		nfs_getattr,
	.vop_inactive =		nfs_inactive,
	.vop_print =		nfs_print,
	.vop_read =		nfsfifo_read,
	.vop_reclaim =		nfs_reclaim,
	.vop_setattr =		nfs_setattr,
	.vop_write =		nfsfifo_write,
};
198
/*
 * Prototypes for further file-local helpers: the mknod, remove and rename
 * RPC wrappers, and nfs_renameit(), which takes the source directory and
 * component name together with a struct sillyrename.
 */
static int	nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
			     struct componentname *cnp, struct vattr *vap);
static int	nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
			      struct ucred *cred, struct thread *td);
static int	nfs_renamerpc(struct vnode *fdvp, const char *fnameptr,
			      int fnamelen, struct vnode *tdvp,
			      const char *tnameptr, int tnamelen,
			      struct ucred *cred, struct thread *td);
static int	nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
			     struct sillyrename *sp);
209
/*
 * Global variables
 *
 * nfs_iod_mtx protects the shared nfsiod state (see the SMP locking note
 * below); nfs_iodwant[] and nfs_iodmount[] hold one entry per possible
 * async daemon, and nfs_numasync starts out at zero.
 */
struct mtx 	nfs_iod_mtx;
enum nfsiod_state nfs_iodwant[NFS_MAXASYNCDAEMON];
struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
int		 nfs_numasync = 0;
/* Size of a struct dirent without its name buffer (MAXNAMLEN + 1 bytes). */
#define	DIRHDSIZ	(sizeof (struct dirent) - (MAXNAMLEN + 1))

SYSCTL_DECL(_vfs_oldnfs);

/*
 * Lifetime added to an access cache entry's stamp when nfs_access()
 * decides whether the entry is still usable.  A value <= 0 also makes
 * nfs_access() ask the server only for the bits it needs instead of
 * issuing a blanket ACCESS request.
 */
static int	nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
	   &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");

/*
 * When non-zero (and NFSv3 with access caching enabled), nfs_getattr()
 * issues an ACCESS RPC on an attribute cache miss before falling back to
 * a GETATTR RPC.
 */
static int	nfs_prime_access_cache = 0;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, prime_access_cache, CTLFLAG_RW,
	   &nfs_prime_access_cache, 0,
	   "Prime NFS ACCESS cache when fetching attributes");

/* Selects the commit argument nfs_close() passes to nfs_flush() on NFSv3. */
static int	nfsv3_commit_on_close = 0;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW,
	   &nfsv3_commit_on_close, 0, "write+commit on close, else only write");

/* When non-zero, nfs_close() cleans the VM object's pages of regular files. */
static int	nfs_clean_pages_on_close = 1;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
	   &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");

/* Gates the O_DIRECT handling in nfs_open() and nfs_close(). */
int nfs_directio_enable = 0;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
	   &nfs_directio_enable, 0, "Enable NFS directio");

/*
 * This sysctl allows other processes to mmap a file that has been opened
 * O_DIRECT by a process.  In general, having processes mmap the file while
 * Direct IO is in progress can lead to data inconsistencies.  We allow it
 * by default anyway, to prevent a DoS attack in which a malicious user
 * opens files O_DIRECT to stop other users from mmap'ing them.
 * "Protected" environments where stricter consistency guarantees are
 * required can disable this knob.  The process that opened the file
 * O_DIRECT cannot mmap() the file, because mmap'ed IO on an O_DIRECT
 * open() is not meaningful.
 */
int nfs_directio_allow_mmap = 1;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
	   &nfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");

/* Disabled: read-only sysctls exporting the access cache hit/miss counters. */
#if 0
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
	   &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");

SYSCTL_INT(_vfs_oldnfs, OID_AUTO, access_cache_misses, CTLFLAG_RD,
	   &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count");
#endif

/* Every NFSv3 ACCESS permission bit. */
#define	NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY		\
			 | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE	\
			 | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP)
268
/*
 * SMP Locking Note :
 * The list of locks after the description of the lock is the ordering
 * of other locks acquired with the lock held.
 * np->n_mtx : Protects the fields in the nfsnode.
 *       VM Object Lock
 *       VI_MTX (acquired indirectly)
 * nmp->nm_mtx : Protects the fields in the nfsmount.
 *       rep->r_mtx
 * nfs_iod_mtx : Global lock, protects shared nfsiod state.
 * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
 *       nmp->nm_mtx
 *       rep->r_mtx
 * rep->r_mtx : Protects the fields in an nfsreq.
 */
284
285static int
286nfs3_access_otw(struct vnode *vp, int wmode, struct thread *td,
287    struct ucred *cred, uint32_t *retmode)
288{
289	const int v3 = 1;
290	u_int32_t *tl;
291	int error = 0, attrflag, i, lrupos;
292
293	struct mbuf *mreq, *mrep, *md, *mb;
294	caddr_t bpos, dpos;
295	u_int32_t rmode;
296	struct nfsnode *np = VTONFS(vp);
297
298	nfsstats.rpccnt[NFSPROC_ACCESS]++;
299	mreq = nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED);
300	mb = mreq;
301	bpos = mtod(mb, caddr_t);
302	nfsm_fhtom(vp, v3);
303	tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
304	*tl = txdr_unsigned(wmode);
305	nfsm_request(vp, NFSPROC_ACCESS, td, cred);
306	nfsm_postop_attr(vp, attrflag);
307	if (!error) {
308		lrupos = 0;
309		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
310		rmode = fxdr_unsigned(u_int32_t, *tl);
311		mtx_lock(&np->n_mtx);
312		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
313			if (np->n_accesscache[i].uid == cred->cr_uid) {
314				np->n_accesscache[i].mode = rmode;
315				np->n_accesscache[i].stamp = time_second;
316				break;
317			}
318			if (i > 0 && np->n_accesscache[i].stamp <
319			    np->n_accesscache[lrupos].stamp)
320				lrupos = i;
321		}
322		if (i == NFS_ACCESSCACHESIZE) {
323			np->n_accesscache[lrupos].uid = cred->cr_uid;
324			np->n_accesscache[lrupos].mode = rmode;
325			np->n_accesscache[lrupos].stamp = time_second;
326		}
327		mtx_unlock(&np->n_mtx);
328		if (retmode != NULL)
329			*retmode = rmode;
330		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, rmode, 0);
331	}
332	m_freem(mrep);
333nfsmout:
334#ifdef KDTRACE_HOOKS
335	if (error) {
336		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, 0,
337		    error);
338	}
339#endif
340	return (error);
341}
342
343/*
344 * nfs access vnode op.
345 * For nfs version 2, just return ok. File accesses may fail later.
346 * For nfs version 3, use the access rpc to check accessibility. If file modes
347 * are changed on the server, accesses might still fail later.
348 */
349static int
350nfs_access(struct vop_access_args *ap)
351{
352	struct vnode *vp = ap->a_vp;
353	int error = 0, i, gotahit;
354	u_int32_t mode, rmode, wmode;
355	int v3 = NFS_ISV3(vp);
356	struct nfsnode *np = VTONFS(vp);
357
358	/*
359	 * Disallow write attempts on filesystems mounted read-only;
360	 * unless the file is a socket, fifo, or a block or character
361	 * device resident on the filesystem.
362	 */
363	if ((ap->a_accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
364		switch (vp->v_type) {
365		case VREG:
366		case VDIR:
367		case VLNK:
368			return (EROFS);
369		default:
370			break;
371		}
372	}
373	/*
374	 * For nfs v3, check to see if we have done this recently, and if
375	 * so return our cached result instead of making an ACCESS call.
376	 * If not, do an access rpc, otherwise you are stuck emulating
377	 * ufs_access() locally using the vattr. This may not be correct,
378	 * since the server may apply other access criteria such as
379	 * client uid-->server uid mapping that we do not know about.
380	 */
381	if (v3) {
382		if (ap->a_accmode & VREAD)
383			mode = NFSV3ACCESS_READ;
384		else
385			mode = 0;
386		if (vp->v_type != VDIR) {
387			if (ap->a_accmode & VWRITE)
388				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND);
389			if (ap->a_accmode & VEXEC)
390				mode |= NFSV3ACCESS_EXECUTE;
391		} else {
392			if (ap->a_accmode & VWRITE)
393				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND |
394					 NFSV3ACCESS_DELETE);
395			if (ap->a_accmode & VEXEC)
396				mode |= NFSV3ACCESS_LOOKUP;
397		}
398		/* XXX safety belt, only make blanket request if caching */
399		if (nfsaccess_cache_timeout > 0) {
400			wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY |
401				NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE |
402				NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP;
403		} else {
404			wmode = mode;
405		}
406
407		/*
408		 * Does our cached result allow us to give a definite yes to
409		 * this request?
410		 */
411		gotahit = 0;
412		mtx_lock(&np->n_mtx);
413		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
414			if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) {
415				if (time_second < (np->n_accesscache[i].stamp +
416				    nfsaccess_cache_timeout) &&
417				    (np->n_accesscache[i].mode & mode) == mode) {
418					nfsstats.accesscache_hits++;
419					gotahit = 1;
420				}
421				break;
422			}
423		}
424		mtx_unlock(&np->n_mtx);
425#ifdef KDTRACE_HOOKS
426		if (gotahit)
427			KDTRACE_NFS_ACCESSCACHE_GET_HIT(vp,
428			    ap->a_cred->cr_uid, mode);
429		else
430			KDTRACE_NFS_ACCESSCACHE_GET_MISS(vp,
431			    ap->a_cred->cr_uid, mode);
432#endif
433		if (gotahit == 0) {
434			/*
435			 * Either a no, or a don't know.  Go to the wire.
436			 */
437			nfsstats.accesscache_misses++;
438		        error = nfs3_access_otw(vp, wmode, ap->a_td, ap->a_cred,
439			    &rmode);
440			if (!error) {
441				if ((rmode & mode) != mode)
442					error = EACCES;
443			}
444		}
445		return (error);
446	} else {
447		if ((error = nfsspec_access(ap)) != 0) {
448			return (error);
449		}
450		/*
451		 * Attempt to prevent a mapped root from accessing a file
452		 * which it shouldn't.  We try to read a byte from the file
453		 * if the user is root and the file is not zero length.
454		 * After calling nfsspec_access, we should have the correct
455		 * file size cached.
456		 */
457		mtx_lock(&np->n_mtx);
458		if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD)
459		    && VTONFS(vp)->n_size > 0) {
460			struct iovec aiov;
461			struct uio auio;
462			char buf[1];
463
464			mtx_unlock(&np->n_mtx);
465			aiov.iov_base = buf;
466			aiov.iov_len = 1;
467			auio.uio_iov = &aiov;
468			auio.uio_iovcnt = 1;
469			auio.uio_offset = 0;
470			auio.uio_resid = 1;
471			auio.uio_segflg = UIO_SYSSPACE;
472			auio.uio_rw = UIO_READ;
473			auio.uio_td = ap->a_td;
474
475			if (vp->v_type == VREG)
476				error = nfs_readrpc(vp, &auio, ap->a_cred);
477			else if (vp->v_type == VDIR) {
478				char* bp;
479				bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
480				aiov.iov_base = bp;
481				aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
482				error = nfs_readdirrpc(vp, &auio, ap->a_cred);
483				free(bp, M_TEMP);
484			} else if (vp->v_type == VLNK)
485				error = nfs_readlinkrpc(vp, &auio, ap->a_cred);
486			else
487				error = EACCES;
488		} else
489			mtx_unlock(&np->n_mtx);
490		return (error);
491	}
492}
493
494int nfs_otw_getattr_avoid = 0;
495
496/*
497 * nfs open vnode op
498 * Check to see if the type is ok
499 * and that deletion is not in progress.
500 * For paged in text files, you will need to flush the page cache
501 * if consistency is lost.
502 */
503/* ARGSUSED */
504static int
505nfs_open(struct vop_open_args *ap)
506{
507	struct vnode *vp = ap->a_vp;
508	struct nfsnode *np = VTONFS(vp);
509	struct vattr vattr;
510	int error;
511	int fmode = ap->a_mode;
512	struct ucred *cred;
513
514	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
515		return (EOPNOTSUPP);
516
517	/*
518	 * Get a valid lease. If cached data is stale, flush it.
519	 */
520	mtx_lock(&np->n_mtx);
521	if (np->n_flag & NMODIFIED) {
522		mtx_unlock(&np->n_mtx);
523		error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
524		if (error == EINTR || error == EIO)
525			return (error);
526		mtx_lock(&np->n_mtx);
527		np->n_attrstamp = 0;
528		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
529		if (vp->v_type == VDIR)
530			np->n_direofoffset = 0;
531		mtx_unlock(&np->n_mtx);
532		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
533		if (error)
534			return (error);
535		mtx_lock(&np->n_mtx);
536		np->n_mtime = vattr.va_mtime;
537	} else {
538		mtx_unlock(&np->n_mtx);
539		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
540		if (error)
541			return (error);
542		mtx_lock(&np->n_mtx);
543		if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
544			if (vp->v_type == VDIR)
545				np->n_direofoffset = 0;
546			mtx_unlock(&np->n_mtx);
547			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
548			if (error == EINTR || error == EIO) {
549				return (error);
550			}
551			mtx_lock(&np->n_mtx);
552			np->n_mtime = vattr.va_mtime;
553		}
554	}
555	/*
556	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
557	 */
558	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
559		if (np->n_directio_opens == 0) {
560			mtx_unlock(&np->n_mtx);
561			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
562			if (error)
563				return (error);
564			mtx_lock(&np->n_mtx);
565			np->n_flag |= NNONCACHE;
566		}
567		np->n_directio_opens++;
568	}
569
570	/*
571	 * If this is an open for writing, capture a reference to the
572	 * credentials, so they can be used by nfs_putpages(). Using
573	 * these write credentials is preferable to the credentials of
574	 * whatever thread happens to be doing the VOP_PUTPAGES() since
575	 * the write RPCs are less likely to fail with EACCES.
576	 */
577	if ((fmode & FWRITE) != 0) {
578		cred = np->n_writecred;
579		np->n_writecred = crhold(ap->a_cred);
580	} else
581		cred = NULL;
582	mtx_unlock(&np->n_mtx);
583	if (cred != NULL)
584		crfree(cred);
585	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
586	return (0);
587}
588
589/*
590 * nfs close vnode op
591 * What an NFS client should do upon close after writing is a debatable issue.
592 * Most NFS clients push delayed writes to the server upon close, basically for
593 * two reasons:
594 * 1 - So that any write errors may be reported back to the client process
595 *     doing the close system call. By far the two most likely errors are
596 *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
597 * 2 - To put a worst case upper bound on cache inconsistency between
598 *     multiple clients for the file.
599 * There is also a consistency problem for Version 2 of the protocol w.r.t.
600 * not being able to tell if other clients are writing a file concurrently,
601 * since there is no way of knowing if the changed modify time in the reply
602 * is only due to the write for this client.
603 * (NFS Version 3 provides weak cache consistency data in the reply that
604 *  should be sufficient to detect and handle this case.)
605 *
606 * The current code does the following:
607 * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
608 * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
609 *                     or commit them (this satisfies 1 and 2 except for the
610 *                     case where the server crashes after this close but
611 *                     before the commit RPC, which is felt to be "good
612 *                     enough". Changing the last argument to nfs_flush() to
613 *                     a 1 would force a commit operation, if it is felt a
614 *                     commit is necessary now.
615 */
616/* ARGSUSED */
617static int
618nfs_close(struct vop_close_args *ap)
619{
620	struct vnode *vp = ap->a_vp;
621	struct nfsnode *np = VTONFS(vp);
622	int error = 0;
623	int fmode = ap->a_fflag;
624
625	if (vp->v_type == VREG) {
626	    /*
627	     * Examine and clean dirty pages, regardless of NMODIFIED.
628	     * This closes a major hole in close-to-open consistency.
629	     * We want to push out all dirty pages (and buffers) on
630	     * close, regardless of whether they were dirtied by
631	     * mmap'ed writes or via write().
632	     */
633	    if (nfs_clean_pages_on_close && vp->v_object) {
634		VM_OBJECT_LOCK(vp->v_object);
635		vm_object_page_clean(vp->v_object, 0, 0, 0);
636		VM_OBJECT_UNLOCK(vp->v_object);
637	    }
638	    mtx_lock(&np->n_mtx);
639	    if (np->n_flag & NMODIFIED) {
640		mtx_unlock(&np->n_mtx);
641		if (NFS_ISV3(vp)) {
642		    /*
643		     * Under NFSv3 we have dirty buffers to dispose of.  We
644		     * must flush them to the NFS server.  We have the option
645		     * of waiting all the way through the commit rpc or just
646		     * waiting for the initial write.  The default is to only
647		     * wait through the initial write so the data is in the
648		     * server's cache, which is roughly similar to the state
649		     * a standard disk subsystem leaves the file in on close().
650		     *
651		     * We cannot clear the NMODIFIED bit in np->n_flag due to
652		     * potential races with other processes, and certainly
653		     * cannot clear it if we don't commit.
654		     */
655		    int cm = nfsv3_commit_on_close ? 1 : 0;
656		    error = nfs_flush(vp, MNT_WAIT, cm);
657		    /* np->n_flag &= ~NMODIFIED; */
658		} else
659		    error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
660		mtx_lock(&np->n_mtx);
661	    }
662	    if (np->n_flag & NWRITEERR) {
663		np->n_flag &= ~NWRITEERR;
664		error = np->n_error;
665	    }
666	    mtx_unlock(&np->n_mtx);
667	}
668	if (nfs_directio_enable)
669		KASSERT((np->n_directio_asyncwr == 0),
670			("nfs_close: dirty unflushed (%d) directio buffers\n",
671			 np->n_directio_asyncwr));
672	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
673		mtx_lock(&np->n_mtx);
674		KASSERT((np->n_directio_opens > 0),
675			("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
676		np->n_directio_opens--;
677		if (np->n_directio_opens == 0)
678			np->n_flag &= ~NNONCACHE;
679		mtx_unlock(&np->n_mtx);
680	}
681	return (error);
682}
683
684/*
685 * nfs getattr call from vfs.
686 */
687static int
688nfs_getattr(struct vop_getattr_args *ap)
689{
690	struct vnode *vp = ap->a_vp;
691	struct nfsnode *np = VTONFS(vp);
692	struct thread *td = curthread;
693	struct vattr *vap = ap->a_vap;
694	struct vattr vattr;
695	caddr_t bpos, dpos;
696	int error = 0;
697	struct mbuf *mreq, *mrep, *md, *mb;
698	int v3 = NFS_ISV3(vp);
699
700	/*
701	 * Update local times for special files.
702	 */
703	mtx_lock(&np->n_mtx);
704	if (np->n_flag & (NACC | NUPD))
705		np->n_flag |= NCHG;
706	mtx_unlock(&np->n_mtx);
707	/*
708	 * First look in the cache.
709	 */
710	if (nfs_getattrcache(vp, &vattr) == 0)
711		goto nfsmout;
712	if (v3 && nfs_prime_access_cache && nfsaccess_cache_timeout > 0) {
713		nfsstats.accesscache_misses++;
714		nfs3_access_otw(vp, NFSV3ACCESS_ALL, td, ap->a_cred, NULL);
715		if (nfs_getattrcache(vp, &vattr) == 0)
716			goto nfsmout;
717	}
718	nfsstats.rpccnt[NFSPROC_GETATTR]++;
719	mreq = nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3));
720	mb = mreq;
721	bpos = mtod(mb, caddr_t);
722	nfsm_fhtom(vp, v3);
723	nfsm_request(vp, NFSPROC_GETATTR, td, ap->a_cred);
724	if (!error) {
725		nfsm_loadattr(vp, &vattr);
726	}
727	m_freem(mrep);
728nfsmout:
729	vap->va_type = vattr.va_type;
730	vap->va_mode = vattr.va_mode;
731	vap->va_nlink = vattr.va_nlink;
732	vap->va_uid = vattr.va_uid;
733	vap->va_gid = vattr.va_gid;
734	vap->va_fsid = vattr.va_fsid;
735	vap->va_fileid = vattr.va_fileid;
736	vap->va_size = vattr.va_size;
737	vap->va_blocksize = vattr.va_blocksize;
738	vap->va_atime = vattr.va_atime;
739	vap->va_mtime = vattr.va_mtime;
740	vap->va_ctime = vattr.va_ctime;
741	vap->va_gen = vattr.va_gen;
742	vap->va_flags = vattr.va_flags;
743	vap->va_rdev = vattr.va_rdev;
744	vap->va_bytes = vattr.va_bytes;
745	vap->va_filerev = vattr.va_filerev;
746
747	return (error);
748}
749
750/*
751 * nfs setattr call.
752 */
753static int
754nfs_setattr(struct vop_setattr_args *ap)
755{
756	struct vnode *vp = ap->a_vp;
757	struct nfsnode *np = VTONFS(vp);
758	struct vattr *vap = ap->a_vap;
759	struct thread *td = curthread;
760	int error = 0;
761	u_quad_t tsize;
762
763#ifndef nolint
764	tsize = (u_quad_t)0;
765#endif
766
767	/*
768	 * Setting of flags is not supported.
769	 */
770	if (vap->va_flags != VNOVAL)
771		return (EOPNOTSUPP);
772
773	/*
774	 * Disallow write attempts if the filesystem is mounted read-only.
775	 */
776  	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
777	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
778	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
779	    (vp->v_mount->mnt_flag & MNT_RDONLY)) {
780		error = EROFS;
781		goto out;
782	}
783	if (vap->va_size != VNOVAL) {
784 		switch (vp->v_type) {
785 		case VDIR:
786 			return (EISDIR);
787 		case VCHR:
788 		case VBLK:
789 		case VSOCK:
790 		case VFIFO:
791			if (vap->va_mtime.tv_sec == VNOVAL &&
792			    vap->va_atime.tv_sec == VNOVAL &&
793			    vap->va_mode == (mode_t)VNOVAL &&
794			    vap->va_uid == (uid_t)VNOVAL &&
795			    vap->va_gid == (gid_t)VNOVAL)
796				return (0);
797 			vap->va_size = VNOVAL;
798 			break;
799 		default:
800			/*
801			 * Disallow write attempts if the filesystem is
802			 * mounted read-only.
803			 */
804			if (vp->v_mount->mnt_flag & MNT_RDONLY)
805				return (EROFS);
806			/*
807			 *  We run vnode_pager_setsize() early (why?),
808			 * we must set np->n_size now to avoid vinvalbuf
809			 * V_SAVE races that might setsize a lower
810			 * value.
811			 */
812			mtx_lock(&np->n_mtx);
813			tsize = np->n_size;
814			mtx_unlock(&np->n_mtx);
815			error = nfs_meta_setsize(vp, ap->a_cred, td,
816			    vap->va_size);
817			mtx_lock(&np->n_mtx);
818 			if (np->n_flag & NMODIFIED) {
819			    tsize = np->n_size;
820			    mtx_unlock(&np->n_mtx);
821 			    if (vap->va_size == 0)
822 				error = nfs_vinvalbuf(vp, 0, td, 1);
823 			    else
824 				error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
825 			    if (error) {
826				vnode_pager_setsize(vp, tsize);
827				goto out;
828			    }
829 			} else
830			    mtx_unlock(&np->n_mtx);
831			/*
832			 * np->n_size has already been set to vap->va_size
833			 * in nfs_meta_setsize(). We must set it again since
834			 * nfs_loadattrcache() could be called through
835			 * nfs_meta_setsize() and could modify np->n_size.
836			 */
837			mtx_lock(&np->n_mtx);
838 			np->n_vattr.va_size = np->n_size = vap->va_size;
839			mtx_unlock(&np->n_mtx);
840  		};
841  	} else {
842		mtx_lock(&np->n_mtx);
843		if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) &&
844		    (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
845			mtx_unlock(&np->n_mtx);
846			if ((error = nfs_vinvalbuf(vp, V_SAVE, td, 1)) != 0 &&
847			    (error == EINTR || error == EIO))
848				return error;
849		} else
850			mtx_unlock(&np->n_mtx);
851	}
852	error = nfs_setattrrpc(vp, vap, ap->a_cred);
853	if (error && vap->va_size != VNOVAL) {
854		mtx_lock(&np->n_mtx);
855		np->n_size = np->n_vattr.va_size = tsize;
856		vnode_pager_setsize(vp, tsize);
857		mtx_unlock(&np->n_mtx);
858	}
859out:
860	return (error);
861}
862
863/*
864 * Do an nfs setattr rpc.
865 */
866static int
867nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred)
868{
869	struct nfsv2_sattr *sp;
870	struct nfsnode *np = VTONFS(vp);
871	caddr_t bpos, dpos;
872	u_int32_t *tl;
873	int error = 0, i, wccflag = NFSV3_WCCRATTR;
874	struct mbuf *mreq, *mrep, *md, *mb;
875	int v3 = NFS_ISV3(vp);
876
877	nfsstats.rpccnt[NFSPROC_SETATTR]++;
878	mreq = nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3));
879	mb = mreq;
880	bpos = mtod(mb, caddr_t);
881	nfsm_fhtom(vp, v3);
882	if (v3) {
883		nfsm_v3attrbuild(vap, TRUE);
884		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
885		*tl = nfs_false;
886	} else {
887		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
888		if (vap->va_mode == (mode_t)VNOVAL)
889			sp->sa_mode = nfs_xdrneg1;
890		else
891			sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode);
892		if (vap->va_uid == (uid_t)VNOVAL)
893			sp->sa_uid = nfs_xdrneg1;
894		else
895			sp->sa_uid = txdr_unsigned(vap->va_uid);
896		if (vap->va_gid == (gid_t)VNOVAL)
897			sp->sa_gid = nfs_xdrneg1;
898		else
899			sp->sa_gid = txdr_unsigned(vap->va_gid);
900		sp->sa_size = txdr_unsigned(vap->va_size);
901		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
902		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
903	}
904	nfsm_request(vp, NFSPROC_SETATTR, curthread, cred);
905	if (v3) {
906		mtx_lock(&np->n_mtx);
907		for (i = 0; i < NFS_ACCESSCACHESIZE; i++)
908			np->n_accesscache[i].stamp = 0;
909		mtx_unlock(&np->n_mtx);
910		KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp);
911		nfsm_wcc_data(vp, wccflag);
912	} else
913		nfsm_loadattr(vp, NULL);
914	m_freem(mrep);
915nfsmout:
916	return (error);
917}
918
/*
 * nfs lookup call, one step at a time...
 * First look in cache
 * If not found, unlock the directory nfsnode and do the rpc
 *
 * Resolves the single path component described by ap->a_cnp inside the
 * directory ap->a_dvp and stores the resulting vnode in *ap->a_vpp
 * (NULLVP when nothing is returned).
 *
 * Returns 0 on success.  Error returns visible in this function:
 *   EROFS       - DELETE/RENAME of the last component on a read-only
 *                 mount, or CREATE/RENAME of a missing last component on
 *                 a read-only mount.
 *   ENOTDIR     - a_dvp is not a directory.
 *   EISDIR      - RENAME whose last component resolves to a_dvp itself.
 *   EJUSTRETURN - last component of a CREATE/RENAME does not exist.
 *   ENOENT      - name not found (possibly answered from the negative
 *                 name cache), or the directory was doomed meanwhile.
 *   other       - whatever VOP_ACCESS(), cache_lookup_times(), the
 *                 LOOKUP RPC, vfs_busy() or nfs_nget() reported.
 */
static int
nfs_lookup(struct vop_lookup_args *ap)
{
	struct componentname *cnp = ap->a_cnp;
	struct vnode *dvp = ap->a_dvp;
	struct vnode **vpp = ap->a_vpp;
	struct mount *mp = dvp->v_mount;
	struct vattr dvattr, vattr;
	struct timespec nctime;
	int flags = cnp->cn_flags;
	struct vnode *newvp;
	struct nfsmount *nmp;
	caddr_t bpos, dpos;
	struct mbuf *mreq, *mrep, *md, *mb;
	long len;
	nfsfh_t *fhp;
	struct nfsnode *np, *newnp;
	int error = 0, attrflag, dattrflag, fhsize, ltype, ncticks;
	int v3 = NFS_ISV3(dvp);
	struct thread *td = cnp->cn_thread;

	*vpp = NULLVP;
	/* Refuse name-modifying lookups of the final component on r/o mounts. */
	if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);
	if (dvp->v_type != VDIR)
		return (ENOTDIR);
	nmp = VFSTONFS(mp);
	np = VTONFS(dvp);
	/* The caller needs search (execute) permission on the directory. */
	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) {
		*vpp = NULLVP;
		return (error);
	}
	/*
	 * Consult the name cache.  As handled below: -1 is a positive hit
	 * (with the vnode in *vpp), ENOENT is a negative hit, any other
	 * positive value is returned as an error, and everything else falls
	 * through to the LOOKUP RPC.  nctime/ncticks receive the timestamp
	 * and tick count stored with the cache entry.
	 */
	error = cache_lookup_times(dvp, vpp, cnp, &nctime, &ncticks);
	if (error > 0 && error != ENOENT)
		return (error);
	if (error == -1) {
		/*
		 * Lookups of "." are special and always return the
		 * current directory.  cache_lookup() already handles
		 * associated locking bookkeeping, etc.
		 */
		if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
			/* XXX: Is this really correct? */
			if (cnp->cn_nameiop != LOOKUP &&
			    (flags & ISLASTCN))
				cnp->cn_flags |= SAVENAME;
			return (0);
		}

		/*
		 * We only accept a positive hit in the cache if the
		 * change time of the file matches our cached copy.
		 * Otherwise, we discard the cache entry and fallback
		 * to doing a lookup RPC.  We also only trust cache
		 * entries for less than nm_nametimeo seconds.
		 *
		 * To better handle stale file handles and attributes,
		 * clear the attribute cache of this node if it is a
		 * leaf component, part of an open() call, and not
		 * locally modified before fetching the attributes.
		 * This should allow stale file handles to be detected
		 * here where we can fall back to a LOOKUP RPC to
		 * recover rather than having nfs_open() detect the
		 * stale file handle and failing open(2) with ESTALE.
		 */
		newvp = *vpp;
		newnp = VTONFS(newvp);
		if (!(nmp->nm_flag & NFSMNT_NOCTO) &&
		    (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
		    !(newnp->n_flag & NMODIFIED)) {
			mtx_lock(&newnp->n_mtx);
			newnp->n_attrstamp = 0;
			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
			mtx_unlock(&newnp->n_mtx);
		}
		if ((u_int)(ticks - ncticks) < (nmp->nm_nametimeo * hz) &&
		    VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 &&
		    timespeccmp(&vattr.va_ctime, &nctime, ==)) {
			nfsstats.lookupcache_hits++;
			if (cnp->cn_nameiop != LOOKUP &&
			    (flags & ISLASTCN))
				cnp->cn_flags |= SAVENAME;
			return (0);
		}
		/*
		 * The cached entry was rejected: purge it and drop the vnode
		 * handed back by the cache before doing the RPC.  When the
		 * entry is the directory itself only the reference is
		 * released, so dvp is not unlocked here.
		 */
		cache_purge(newvp);
		if (dvp != newvp)
			vput(newvp);
		else
			vrele(newvp);
		*vpp = NULLVP;
	} else if (error == ENOENT) {
		if (dvp->v_iflag & VI_DOOMED)
			return (ENOENT);
		/*
		 * We only accept a negative hit in the cache if the
		 * modification time of the parent directory matches
		 * the cached copy in the name cache entry.
		 * Otherwise, we discard all of the negative cache
		 * entries for this directory.  We also only trust
		 * negative cache entries for up to nm_negnametimeo
		 * seconds.
		 */
		if ((u_int)(ticks - ncticks) < (nmp->nm_negnametimeo * hz) &&
		    VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 &&
		    timespeccmp(&vattr.va_mtime, &nctime, ==)) {
			nfsstats.lookupcache_hits++;
			return (ENOENT);
		}
		cache_purge_negative(dvp);
	}

	/*
	 * Cache miss (or rejected hit): build and send a LOOKUP RPC with the
	 * directory file handle and the component name.
	 * NOTE(review): the nfsm_* macros appear to rely on the local names
	 * mreq/mrep/md/mb/bpos/dpos/error and presumably jump to nfsmout on
	 * failure -- confirm against their definitions.
	 */
	attrflag = dattrflag = 0;
	error = 0;
	newvp = NULLVP;
	nfsstats.lookupcache_misses++;
	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
	len = cnp->cn_namelen;
	mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
	nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_thread, cnp->cn_cred);
	if (error) {
		/*
		 * For v3 the failed reply is still parsed for the directory's
		 * post-op attributes; vattr/dattrflag are used at nfsmout
		 * when deciding whether to add a negative cache entry.
		 */
		if (v3) {
			nfsm_postop_attr_va(dvp, dattrflag, &vattr);
			m_freem(mrep);
		}
		goto nfsmout;
	}
	nfsm_getfh(fhp, fhsize, v3);

	/*
	 * Handle RENAME case...
	 */
	if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
		/* The name resolved to the directory we are searching. */
		if (NFS_CMPFH(np, fhp, fhsize)) {
			m_freem(mrep);
			return (EISDIR);
		}
		/* Note: np is reused for the looked-up node from here on. */
		error = nfs_nget(mp, fhp, fhsize, &np, LK_EXCLUSIVE);
		if (error) {
			m_freem(mrep);
			return (error);
		}
		newvp = NFSTOV(np);
		if (v3) {
			nfsm_postop_attr(newvp, attrflag);
			nfsm_postop_attr(dvp, attrflag);
		} else
			nfsm_loadattr(newvp, NULL);
		*vpp = newvp;
		m_freem(mrep);
		cnp->cn_flags |= SAVENAME;
		return (0);
	}

	if (flags & ISDOTDOT) {
		/*
		 * ".." lookup: dvp is unlocked around nfs_nget() and relocked
		 * afterwards.  The mount is busied first; if that cannot be
		 * done without sleeping, dvp is unlocked while waiting, with
		 * a mount reference held across the wait.
		 */
		ltype = VOP_ISLOCKED(dvp);
		error = vfs_busy(mp, MBF_NOWAIT);
		if (error != 0) {
			vfs_ref(mp);
			VOP_UNLOCK(dvp, 0);
			error = vfs_busy(mp, 0);
			vn_lock(dvp, ltype | LK_RETRY);
			vfs_rel(mp);
			/* dvp may have been reclaimed while it was unlocked. */
			if (error == 0 && (dvp->v_iflag & VI_DOOMED)) {
				vfs_unbusy(mp);
				error = ENOENT;
			}
			if (error != 0) {
				m_freem(mrep);
				return (error);
			}
		}
		VOP_UNLOCK(dvp, 0);
		error = nfs_nget(mp, fhp, fhsize, &np, cnp->cn_lkflags);
		if (error == 0)
			newvp = NFSTOV(np);
		vfs_unbusy(mp);
		/* Relock dvp unless nfs_nget() handed back dvp itself. */
		if (newvp != dvp)
			vn_lock(dvp, ltype | LK_RETRY);
		if (dvp->v_iflag & VI_DOOMED) {
			/* Directory went away: drop the new vnode, if any. */
			if (error == 0) {
				if (newvp == dvp)
					vrele(newvp);
				else
					vput(newvp);
			}
			error = ENOENT;
		}
		if (error) {
			m_freem(mrep);
			return (error);
		}
	} else if (NFS_CMPFH(np, fhp, fhsize)) {
		/* Name resolves to the directory itself: add a reference. */
		VREF(dvp);
		newvp = dvp;
	} else {
		error = nfs_nget(mp, fhp, fhsize, &np, cnp->cn_lkflags);
		if (error) {
			m_freem(mrep);
			return (error);
		}
		newvp = NFSTOV(np);

		/*
		 * Flush the attribute cache when opening a leaf node
		 * to ensure that fresh attributes are fetched in
		 * nfs_open() if we are unable to fetch attributes
		 * from the LOOKUP reply.
		 */
		if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
		    !(np->n_flag & NMODIFIED)) {
			mtx_lock(&np->n_mtx);
			np->n_attrstamp = 0;
			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
			mtx_unlock(&np->n_mtx);
		}
	}
	/*
	 * Load attributes from the reply.  v3 carries optional post-op
	 * attributes for both the object and the directory; on the v2 path
	 * attrflag is simply set to 1 after nfsm_loadattr().
	 */
	if (v3) {
		nfsm_postop_attr_va(newvp, attrflag, &vattr);
		nfsm_postop_attr_va(dvp, dattrflag, &dvattr);
	} else {
		nfsm_loadattr(newvp, &vattr);
		attrflag = 1;
	}
	if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
		cnp->cn_flags |= SAVENAME;
	/*
	 * Enter a positive name cache entry only when attributes for the
	 * object were obtained (and, for directories, attributes of the
	 * parent as well).  The last component of a DELETE is not cached.
	 */
	if ((cnp->cn_flags & MAKEENTRY) &&
	    (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN)) &&
	    attrflag != 0 && (newvp->v_type != VDIR || dattrflag != 0))
		cache_enter_time(dvp, newvp, cnp, &vattr.va_ctime,
		    newvp->v_type != VDIR ? NULL : &dvattr.va_ctime);
	*vpp = newvp;
	m_freem(mrep);
nfsmout:
	if (error) {
		if (newvp != NULLVP) {
			vput(newvp);
			*vpp = NULLVP;
		}

		if (error != ENOENT)
			goto done;

		/* The requested file was not found. */
		if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
		    (flags & ISLASTCN)) {
			/*
			 * XXX: UFS does a full VOP_ACCESS(dvp,
			 * VWRITE) here instead of just checking
			 * MNT_RDONLY.
			 */
			if (mp->mnt_flag & MNT_RDONLY)
				return (EROFS);
			cnp->cn_flags |= SAVENAME;
			return (EJUSTRETURN);
		}

		if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE &&
		    dattrflag) {
			/*
			 * Cache the modification time of the parent
			 * directory from the post-op attributes in
			 * the name cache entry.  The negative cache
			 * entry will be ignored once the directory
			 * has changed.  Don't bother adding the entry
			 * if the directory has already changed.
			 */
			mtx_lock(&np->n_mtx);
			if (timespeccmp(&np->n_vattr.va_mtime,
			    &vattr.va_mtime, ==)) {
				mtx_unlock(&np->n_mtx);
				cache_enter_time(dvp, NULL, cnp,
				    &vattr.va_mtime, NULL);
			} else
				mtx_unlock(&np->n_mtx);
		}
		return (ENOENT);
	}
done:
	return (error);
}
1210
1211/*
1212 * nfs read call.
1213 * Just call nfs_bioread() to do the work.
1214 */
1215static int
1216nfs_read(struct vop_read_args *ap)
1217{
1218	struct vnode *vp = ap->a_vp;
1219
1220	switch (vp->v_type) {
1221	case VREG:
1222		return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
1223	case VDIR:
1224		return (EISDIR);
1225	default:
1226		return (EOPNOTSUPP);
1227	}
1228}
1229
1230/*
1231 * nfs readlink call
1232 */
1233static int
1234nfs_readlink(struct vop_readlink_args *ap)
1235{
1236	struct vnode *vp = ap->a_vp;
1237
1238	if (vp->v_type != VLNK)
1239		return (EINVAL);
1240	return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
1241}
1242
/*
 * Do a readlink rpc.
 * Called by nfs_doio() from below the buffer cache.
 *
 * Sends a READLINK request for vp and copies the returned link text into
 * uiop.  Returns 0 on success, otherwise the value left in `error' by the
 * request/parse macros.
 */
int
nfs_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
	caddr_t bpos, dpos;
	int error = 0, len, attrflag;
	struct mbuf *mreq, *mrep, *md, *mb;
	int v3 = NFS_ISV3(vp);

	nfsstats.rpccnt[NFSPROC_READLINK]++;
	/* The request consists of nothing but the file handle of vp. */
	mreq = nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(vp, v3);
	/*
	 * NOTE(review): no explicit goto targets nfsmout in this function;
	 * presumably the nfsm_* macros jump there on failure -- confirm.
	 */
	nfsm_request(vp, NFSPROC_READLINK, uiop->uio_td, cred);
	/* v3 replies carry post-op attributes ahead of the link data. */
	if (v3)
		nfsm_postop_attr(vp, attrflag);
	if (!error) {
		nfsm_strsiz(len, NFS_MAXPATHLEN);
		/*
		 * When the reported length equals the maximum, prefer the
		 * cached file size if it is non-zero and smaller.
		 */
		if (len == NFS_MAXPATHLEN) {
			struct nfsnode *np = VTONFS(vp);
			mtx_lock(&np->n_mtx);
			if (np->n_size && np->n_size < NFS_MAXPATHLEN)
				len = np->n_size;
			mtx_unlock(&np->n_mtx);
		}
		nfsm_mtouio(uiop, len);
	}
	m_freem(mrep);
nfsmout:
	return (error);
}
1278
/*
 * nfs read rpc call
 * Ditto above
 *
 * Reads uiop->uio_resid bytes starting at uiop->uio_offset from vp by
 * issuing one READ RPC per chunk of at most nm_rsize bytes and copying
 * each reply into uiop.  Stops early on end of file or a short read.
 *
 * Returns 0 on success, EFBIG if the requested range ends beyond
 * nm_maxfilesize (or the end offset wraps), or the error left by the
 * request/parse macros.
 */
int
nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
	u_int32_t *tl;
	caddr_t bpos, dpos;
	struct mbuf *mreq, *mrep, *md, *mb;
	struct nfsmount *nmp;
	off_t end;
	int error = 0, len, retlen, tsiz, eof, attrflag;
	int v3 = NFS_ISV3(vp);
	int rsize;

	/* eof is only assigned on the v3 path below; give it a value here. */
#ifndef nolint
	eof = 0;
#endif
	nmp = VFSTONFS(vp->v_mount);
	tsiz = uiop->uio_resid;
	/* nm_maxfilesize and nm_rsize are sampled under the mount mutex. */
	mtx_lock(&nmp->nm_mtx);
	end = uiop->uio_offset + tsiz;
	/* Second test catches an end offset that wrapped below the start. */
	if (end > nmp->nm_maxfilesize || end < uiop->uio_offset) {
		mtx_unlock(&nmp->nm_mtx);
		return (EFBIG);
	}
	rsize = nmp->nm_rsize;
	mtx_unlock(&nmp->nm_mtx);
	while (tsiz > 0) {
		nfsstats.rpccnt[NFSPROC_READ]++;
		len = (tsiz > rsize) ? rsize : tsiz;
		mreq = nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3);
		mb = mreq;
		bpos = mtod(mb, caddr_t);
		nfsm_fhtom(vp, v3);
		/*
		 * Three words of arguments: v3 uses a 64-bit offset plus a
		 * count; v2 uses a 32-bit offset, a count and a zero word.
		 */
		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED * 3);
		if (v3) {
			txdr_hyper(uiop->uio_offset, tl);
			*(tl + 2) = txdr_unsigned(len);
		} else {
			*tl++ = txdr_unsigned(uiop->uio_offset);
			*tl++ = txdr_unsigned(len);
			*tl = 0;
		}
		nfsm_request(vp, NFSPROC_READ, uiop->uio_td, cred);
		if (v3) {
			nfsm_postop_attr(vp, attrflag);
			if (error) {
				m_freem(mrep);
				goto nfsmout;
			}
			/* Two words follow; the second is the EOF flag. */
			tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
			eof = fxdr_unsigned(int, *(tl + 1));
		} else {
			nfsm_loadattr(vp, NULL);
		}
		/* Length of the returned data, bounded by rsize. */
		nfsm_strsiz(retlen, rsize);
		nfsm_mtouio(uiop, retlen);
		m_freem(mrep);
		tsiz -= retlen;
		/*
		 * Terminate the loop at end of file: v3 reports it
		 * explicitly (or returns no data), v2 signals it with a
		 * short read.
		 */
		if (v3) {
			if (eof || retlen == 0) {
				tsiz = 0;
			}
		} else if (retlen < len) {
			tsiz = 0;
		}
	}
nfsmout:
	return (error);
}
1351
/*
 * nfs write call
 *
 * Writes uiop->uio_resid bytes at uiop->uio_offset to vp using one WRITE
 * RPC per chunk of at most nm_wsize bytes.  uiop must describe a single
 * iovec (asserted below).
 *
 * *iomode      in:  stability level sent with each v3 request.
 *              out: lowest commitment level returned by any of the RPCs,
 *                   or NFSV3WRITE_FILESYNC when DOINGASYNC(vp) is true.
 * *must_commit set to 1 if a v3 reply carried a write verifier different
 *              from the one stored in the mount, 0 otherwise.
 *
 * Returns 0 on success, EFBIG if the range ends beyond nm_maxfilesize (or
 * the end offset wraps), NFSERR_IO if a v3 server reports zero bytes
 * written, or the error left by the request/parse macros.  On error
 * uio_resid is reset to the amount not yet accounted for.
 */
int
nfs_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
	     int *iomode, int *must_commit)
{
	u_int32_t *tl;
	int32_t backup;
	caddr_t bpos, dpos;
	struct mbuf *mreq, *mrep, *md, *mb;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	off_t end;
	int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
	int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;
	int wsize;

	KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1"));
	*must_commit = 0;
	tsiz = uiop->uio_resid;
	/* nm_maxfilesize and nm_wsize are sampled under the mount mutex. */
	mtx_lock(&nmp->nm_mtx);
	end = uiop->uio_offset + tsiz;
	/* Second test catches an end offset that wrapped below the start. */
	if (end > nmp->nm_maxfilesize || end < uiop->uio_offset) {
		mtx_unlock(&nmp->nm_mtx);
		return (EFBIG);
	}
	wsize = nmp->nm_wsize;
	mtx_unlock(&nmp->nm_mtx);
	while (tsiz > 0) {
		nfsstats.rpccnt[NFSPROC_WRITE]++;
		len = (tsiz > wsize) ? wsize : tsiz;
		mreq = nfsm_reqhead(vp, NFSPROC_WRITE,
			NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
		mb = mreq;
		bpos = mtod(mb, caddr_t);
		nfsm_fhtom(vp, v3);
		if (v3) {
			/* 64-bit offset, count, stable-how, data length. */
			tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
			txdr_hyper(uiop->uio_offset, tl);
			tl += 2;
			*tl++ = txdr_unsigned(len);
			*tl++ = txdr_unsigned(*iomode);
			*tl = txdr_unsigned(len);
		} else {
			u_int32_t x;

			tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
			/* Set both "begin" and "current" to non-garbage. */
			x = txdr_unsigned((u_int32_t)uiop->uio_offset);
			*tl++ = x;	/* "begin offset" */
			*tl++ = x;	/* "current offset" */
			x = txdr_unsigned(len);
			*tl++ = x;	/* total to this offset */
			*tl = x;	/* size of this write */
		}
		/* Append len bytes of payload from uiop to the request. */
		nfsm_uiotom(uiop, len);
		nfsm_request(vp, NFSPROC_WRITE, uiop->uio_td, cred);
		if (v3) {
			wccflag = NFSV3_WCCCHK;
			nfsm_wcc_data(vp, wccflag);
			if (!error) {
				/* count, committed level, write verifier */
				tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED
					+ NFSX_V3WRITEVERF);
				rlen = fxdr_unsigned(int, *tl++);
				if (rlen == 0) {
					error = NFSERR_IO;
					m_freem(mrep);
					break;
				} else if (rlen < len) {
					/*
					 * Short write: rewind the uio by the
					 * unwritten amount so the next
					 * iteration resends it.
					 */
					backup = len - rlen;
					uiop->uio_iov->iov_base =
					    (char *)uiop->uio_iov->iov_base -
					    backup;
					uiop->uio_iov->iov_len += backup;
					uiop->uio_offset -= backup;
					uiop->uio_resid += backup;
					len = rlen;
				}
				commit = fxdr_unsigned(int, *tl++);

				/*
				 * Return the lowest commitment level
				 * obtained by any of the RPCs.
				 */
				if (committed == NFSV3WRITE_FILESYNC)
					committed = commit;
				else if (committed == NFSV3WRITE_DATASYNC &&
					commit == NFSV3WRITE_UNSTABLE)
					committed = commit;
				/*
				 * Record the first verifier seen on this
				 * mount; if a later one differs, remember
				 * the new value and tell the caller that a
				 * commit is required.
				 */
				mtx_lock(&nmp->nm_mtx);
				if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
					NFSX_V3WRITEVERF);
				    nmp->nm_state |= NFSSTA_HASWRITEVERF;
				} else if (bcmp((caddr_t)tl,
				    (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) {
				    *must_commit = 1;
				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
					NFSX_V3WRITEVERF);
				}
				mtx_unlock(&nmp->nm_mtx);
			}
		} else {
			nfsm_loadattr(vp, NULL);
		}
		/* Copy the cached attribute mtime into the node's n_mtime. */
		if (wccflag) {
			mtx_lock(&(VTONFS(vp))->n_mtx);
			VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
			mtx_unlock(&(VTONFS(vp))->n_mtx);
		}
		m_freem(mrep);
		if (error)
			break;
		tsiz -= len;
	}
nfsmout:
	if (DOINGASYNC(vp))
		committed = NFSV3WRITE_FILESYNC;
	*iomode = committed;
	if (error)
		uiop->uio_resid = tsiz;
	return (error);
}
1475
/*
 * nfs mknod rpc
 * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
 * mode set to specify the file type and the size field for rdev.
 *
 * Creates the special file named by cnp in directory dvp with the
 * attributes in vap.  On success *vpp receives the new vnode.
 *
 * Returns 0 on success, EOPNOTSUPP when vap->va_type is not one of
 * VCHR, VBLK, VFIFO or VSOCK, or the error from VOP_GETATTR(), the RPC
 * or nfs_lookitup().
 */
static int
nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct vattr *vap)
{
	struct nfsv2_sattr *sp;
	u_int32_t *tl;
	struct vnode *newvp = NULL;
	struct nfsnode *np = NULL;
	struct vattr vattr;
	caddr_t bpos, dpos;
	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0;
	struct mbuf *mreq, *mrep, *md, *mb;
	u_int32_t rdev;
	int v3 = NFS_ISV3(dvp);

	/* rdev is only consumed by the v2 request (as sa_size) below. */
	if (vap->va_type == VCHR || vap->va_type == VBLK)
		rdev = txdr_unsigned(vap->va_rdev);
	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
		rdev = nfs_xdrneg1;
	else {
		return (EOPNOTSUPP);
	}
	/* The fetched attributes are not used further in this function. */
	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
		return (error);
	nfsstats.rpccnt[NFSPROC_MKNOD]++;
	mreq = nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED +
		+ nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
	if (v3) {
		/* File type, attributes, then major/minor for devices. */
		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
		*tl++ = vtonfsv3_type(vap->va_type);
		nfsm_v3attrbuild(vap, FALSE);
		if (vap->va_type == VCHR || vap->va_type == VBLK) {
			tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(major(vap->va_rdev));
			*tl = txdr_unsigned(minor(vap->va_rdev));
		}
	} else {
		/* v2: type is folded into the mode, rdev into the size. */
		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
		sp->sa_uid = nfs_xdrneg1;
		sp->sa_gid = nfs_xdrneg1;
		sp->sa_size = rdev;
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	}
	nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_thread, cnp->cn_cred);
	if (!error) {
		nfsm_mtofh(dvp, newvp, v3, gotvp);
		/*
		 * If the reply did not yield a vnode, fall back to looking
		 * the new name up explicitly.
		 */
		if (!gotvp) {
			if (newvp) {
				vput(newvp);
				newvp = NULL;
			}
			error = nfs_lookitup(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
			if (!error)
				newvp = NFSTOV(np);
		}
	}
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	m_freem(mrep);
nfsmout:
	if (error) {
		if (newvp)
			vput(newvp);
	} else {
		*vpp = newvp;
	}
	/*
	 * Mark the directory modified; when wccflag ended up zero also
	 * invalidate its cached attributes.  Both happen under n_mtx.
	 */
	mtx_lock(&(VTONFS(dvp))->n_mtx);
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag) {
		VTONFS(dvp)->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
	}
	mtx_unlock(&(VTONFS(dvp))->n_mtx);
	return (error);
}
1563
1564/*
1565 * nfs mknod vop
1566 * just call nfs_mknodrpc() to do the work.
1567 */
1568/* ARGSUSED */
1569static int
1570nfs_mknod(struct vop_mknod_args *ap)
1571{
1572	return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
1573}
1574
/*
 * Counter mixed into the verifier of exclusive v3 creates; incremented
 * for every such request in nfs_create().
 * NOTE(review): it is updated without any visible locking -- confirm
 * whether concurrent creates are expected to tolerate that.
 */
static u_long create_verf;
/*
 * nfs file create call
 *
 * Creates the file named by ap->a_cnp in directory ap->a_dvp with the
 * attributes in ap->a_vap; on success *ap->a_vpp receives the new vnode.
 * Sockets are passed on to nfs_mknodrpc().  A v3 exclusive create that
 * the server rejects with NFSERR_NOTSUPP is retried as a non-exclusive
 * create.
 *
 * Returns 0 on success or the error from VOP_GETATTR(), the RPC,
 * nfs_lookitup() or nfs_setattrrpc().
 */
static int
nfs_create(struct vop_create_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vattr *vap = ap->a_vap;
	struct componentname *cnp = ap->a_cnp;
	struct nfsv2_sattr *sp;
	u_int32_t *tl;
	struct nfsnode *np = NULL;
	struct vnode *newvp = NULL;
	caddr_t bpos, dpos;
	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0;
	struct mbuf *mreq, *mrep, *md, *mb;
	struct vattr vattr;
	int v3 = NFS_ISV3(dvp);

	/*
	 * Oops, not for me..
	 */
	if (vap->va_type == VSOCK) {
		error = nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap);
		return (error);
	}

	/* The fetched attributes are not used further in this function. */
	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0) {
		return (error);
	}
	if (vap->va_vaflags & VA_EXCLUSIVE)
		fmode |= O_EXCL;
again:
	nfsstats.rpccnt[NFSPROC_CREATE]++;
	mreq = nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED +
		nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
	if (v3) {
		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
		if (fmode & O_EXCL) {
			/*
			 * Exclusive create: send a verifier instead of
			 * attributes.  Its first word is the address of the
			 * first configured IPv4 interface when INET is
			 * compiled in and one exists, otherwise the current
			 * create_verf; its second word is the incremented
			 * create_verf.
			 */
			*tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE);
			tl = nfsm_build(u_int32_t *, NFSX_V3CREATEVERF);
#ifdef INET
			CURVNET_SET(CRED_TO_VNET(cnp->cn_cred));
			IN_IFADDR_RLOCK();
			if (!TAILQ_EMPTY(&V_in_ifaddrhead))
				*tl++ = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr.s_addr;
			else
#endif
				*tl++ = create_verf;
#ifdef INET
			IN_IFADDR_RUNLOCK();
			CURVNET_RESTORE();
#endif
			*tl = ++create_verf;
		} else {
			*tl = txdr_unsigned(NFSV3CREATE_UNCHECKED);
			nfsm_v3attrbuild(vap, FALSE);
		}
	} else {
		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
		sp->sa_uid = nfs_xdrneg1;
		sp->sa_gid = nfs_xdrneg1;
		sp->sa_size = 0;
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	}
	nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_thread, cnp->cn_cred);
	if (!error) {
		nfsm_mtofh(dvp, newvp, v3, gotvp);
		/*
		 * If the reply did not yield a vnode, fall back to looking
		 * the new name up explicitly.
		 */
		if (!gotvp) {
			if (newvp) {
				vput(newvp);
				newvp = NULL;
			}
			error = nfs_lookitup(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
			if (!error)
				newvp = NFSTOV(np);
		}
	}
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	m_freem(mrep);
nfsmout:
	if (error) {
		/* Server lacks exclusive create: retry without O_EXCL. */
		if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) {
			fmode &= ~O_EXCL;
			goto again;
		}
		if (newvp)
			vput(newvp);
	} else if (v3 && (fmode & O_EXCL)) {
		/*
		 * We are normally called with only a partially initialized
		 * VAP.  Since the NFSv3 spec says that server may use the
		 * file attributes to store the verifier, the spec requires
		 * us to do a SETATTR RPC. FreeBSD servers store the verifier
		 * in atime, but we can't really assume that all servers will
		 * so we ensure that our SETATTR sets both atime and mtime.
		 */
		if (vap->va_mtime.tv_sec == VNOVAL)
			vfs_timestamp(&vap->va_mtime);
		if (vap->va_atime.tv_sec == VNOVAL)
			vap->va_atime = vap->va_mtime;
		error = nfs_setattrrpc(newvp, vap, cnp->cn_cred);
		if (error)
			vput(newvp);
	}
	if (!error) {
		*ap->a_vpp = newvp;
	}
	/*
	 * Mark the directory modified; when wccflag ended up zero also
	 * invalidate its cached attributes.  Both happen under n_mtx.
	 */
	mtx_lock(&(VTONFS(dvp))->n_mtx);
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag) {
		VTONFS(dvp)->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
	}
	mtx_unlock(&(VTONFS(dvp))->n_mtx);
	return (error);
}
1701
1702/*
1703 * nfs file remove call
1704 * To try and make nfs semantics closer to ufs semantics, a file that has
1705 * other processes using the vnode is renamed instead of removed and then
1706 * removed later on the last close.
1707 * - If v_usecount > 1
1708 *	  If a rename is not already in the works
1709 *	     call nfs_sillyrename() to set it up
1710 *     else
1711 *	  do the remove rpc
1712 */
1713static int
1714nfs_remove(struct vop_remove_args *ap)
1715{
1716	struct vnode *vp = ap->a_vp;
1717	struct vnode *dvp = ap->a_dvp;
1718	struct componentname *cnp = ap->a_cnp;
1719	struct nfsnode *np = VTONFS(vp);
1720	int error = 0;
1721	struct vattr vattr;
1722
1723	KASSERT((cnp->cn_flags & HASBUF) != 0, ("nfs_remove: no name"));
1724	KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount"));
1725	if (vp->v_type == VDIR)
1726		error = EPERM;
1727	else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
1728	    !VOP_GETATTR(vp, &vattr, cnp->cn_cred) && vattr.va_nlink > 1)) {
1729		/*
1730		 * Purge the name cache so that the chance of a lookup for
1731		 * the name succeeding while the remove is in progress is
1732		 * minimized. Without node locking it can still happen, such
1733		 * that an I/O op returns ESTALE, but since you get this if
1734		 * another host removes the file..
1735		 */
1736		cache_purge(vp);
1737		/*
1738		 * throw away biocache buffers, mainly to avoid
1739		 * unnecessary delayed writes later.
1740		 */
1741		error = nfs_vinvalbuf(vp, 0, cnp->cn_thread, 1);
1742		/* Do the rpc */
1743		if (error != EINTR && error != EIO)
1744			error = nfs_removerpc(dvp, cnp->cn_nameptr,
1745				cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
1746		/*
1747		 * Kludge City: If the first reply to the remove rpc is lost..
1748		 *   the reply to the retransmitted request will be ENOENT
1749		 *   since the file was in fact removed
1750		 *   Therefore, we cheat and return success.
1751		 */
1752		if (error == ENOENT)
1753			error = 0;
1754	} else if (!np->n_sillyrename)
1755		error = nfs_sillyrename(dvp, vp, cnp);
1756	mtx_lock(&np->n_mtx);
1757	np->n_attrstamp = 0;
1758	mtx_unlock(&np->n_mtx);
1759	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
1760	return (error);
1761}
1762
1763/*
1764 * nfs file remove rpc called from nfs_inactive
1765 */
1766int
1767nfs_removeit(struct sillyrename *sp)
1768{
1769	/*
1770	 * Make sure that the directory vnode is still valid.
1771	 * XXX we should lock sp->s_dvp here.
1772	 */
1773	if (sp->s_dvp->v_type == VBAD)
1774		return (0);
1775	return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred,
1776		NULL));
1777}
1778
/*
 * Nfs remove rpc, called from nfs_remove() and nfs_removeit().
 *
 * Sends a REMOVE request for `name' (namelen bytes) in directory dvp on
 * behalf of cred/td.  Regardless of the outcome the directory node is
 * flagged NMODIFIED, and its attribute cache is invalidated when wccflag
 * ends up zero.
 *
 * Returns 0 on success or the error left by the request/parse macros.
 */
static int
nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
    struct ucred *cred, struct thread *td)
{
	caddr_t bpos, dpos;
	int error = 0, wccflag = NFSV3_WCCRATTR;
	struct mbuf *mreq, *mrep, *md, *mb;
	int v3 = NFS_ISV3(dvp);

	nfsstats.rpccnt[NFSPROC_REMOVE]++;
	/* Arguments: directory file handle followed by the name. */
	mreq = nfsm_reqhead(dvp, NFSPROC_REMOVE,
		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(name, namelen, NFS_MAXNAMLEN);
	/*
	 * NOTE(review): no explicit goto targets nfsmout in this function;
	 * presumably the nfsm_* macros jump there on failure -- confirm.
	 */
	nfsm_request(dvp, NFSPROC_REMOVE, td, cred);
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	m_freem(mrep);
nfsmout:
	mtx_lock(&(VTONFS(dvp))->n_mtx);
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag) {
		VTONFS(dvp)->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
	}
	mtx_unlock(&(VTONFS(dvp))->n_mtx);
	return (error);
}
1812
/*
 * nfs file rename call
 *
 * Renames ap->a_fcnp in directory ap->a_fdvp (vnode ap->a_fvp) to
 * ap->a_tcnp in directory ap->a_tdvp, replacing ap->a_tvp if it exists.
 * All four vnodes are released before returning, on every path.
 *
 * Returns 0 on success (an ENOENT result is also mapped to 0, see the
 * end of the function), EXDEV for a cross-mount rename, or the error
 * from vn_lock(), VOP_FSYNC() or the RENAME RPC.
 */
static int
nfs_rename(struct vop_rename_args *ap)
{
	struct vnode *fvp = ap->a_fvp;
	struct vnode *tvp = ap->a_tvp;
	struct vnode *fdvp = ap->a_fdvp;
	struct vnode *tdvp = ap->a_tdvp;
	struct componentname *tcnp = ap->a_tcnp;
	struct componentname *fcnp = ap->a_fcnp;
	int error;

	KASSERT((tcnp->cn_flags & HASBUF) != 0 &&
	    (fcnp->cn_flags & HASBUF) != 0, ("nfs_rename: no name"));
	/* Check for cross-device rename */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out;
	}

	/* Source and target are the same vnode: log it and succeed. */
	if (fvp == tvp) {
		nfs_printf("nfs_rename: fvp == tvp (can't happen)\n");
		error = 0;
		goto out;
	}
	/* fvp is locked here only for the duration of the fsync below. */
	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
		goto out;

	/*
	 * We have to flush B_DELWRI data prior to renaming
	 * the file.  If we don't, the delayed-write buffers
	 * can be flushed out later after the file has gone stale
	 * under NFSV3.  NFSV2 does not have this problem because
	 * ( as far as I can tell ) it flushes dirty buffers more
	 * often.
	 *
	 * Skip the rename operation if the fsync fails, this can happen
	 * due to the server's volume being full, when we pushed out data
	 * that was written back to our cache earlier. Not checking for
	 * this condition can result in potential (silent) data loss.
	 */
	error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
	VOP_UNLOCK(fvp, 0);
	if (!error && tvp)
		error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
	if (error)
		goto out;

	/*
	 * If the tvp exists and is in use, sillyrename it before doing the
	 * rename of the new file over it.
	 * XXX Can't sillyrename a directory.
	 */
	if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
		/* Sillyrename succeeded: tvp is no longer the target. */
		vput(tvp);
		tvp = NULL;
	}

	error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
		tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
		tcnp->cn_thread);

	/*
	 * When a directory was moved, purge the name cache entries of the
	 * source directory, and of the target directory too if a directory
	 * was replaced.
	 */
	if (fvp->v_type == VDIR) {
		if (tvp != NULL && tvp->v_type == VDIR)
			cache_purge(tdvp);
		cache_purge(fdvp);
	}

out:
	/*
	 * Release all vnodes.  tdvp and tvp are released with vput(), while
	 * fdvp and fvp are released with vrele().  When tdvp and tvp are the
	 * same vnode, one reference is dropped with vrele() and the other
	 * with the vput(tvp) below.
	 * NOTE(review): presumably tdvp/tvp arrive locked and fdvp/fvp
	 * merely referenced -- confirm against the VOP_RENAME contract.
	 */
	if (tdvp == tvp)
		vrele(tdvp);
	else
		vput(tdvp);
	if (tvp)
		vput(tvp);
	vrele(fdvp);
	vrele(fvp);
	/*
	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
	 */
	if (error == ENOENT)
		error = 0;
	return (error);
}
1901
1902/*
1903 * nfs file rename rpc called from nfs_remove() above
1904 */
1905static int
1906nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
1907    struct sillyrename *sp)
1908{
1909
1910	return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp,
1911	    sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread));
1912}
1913
1914/*
1915 * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
1916 */
1917static int
1918nfs_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen,
1919    struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred,
1920    struct thread *td)
1921{
1922	caddr_t bpos, dpos;
1923	int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR;
1924	struct mbuf *mreq, *mrep, *md, *mb;
1925	int v3 = NFS_ISV3(fdvp);
1926
1927	nfsstats.rpccnt[NFSPROC_RENAME]++;
1928	mreq = nfsm_reqhead(fdvp, NFSPROC_RENAME,
1929		(NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) +
1930		nfsm_rndup(tnamelen));
1931	mb = mreq;
1932	bpos = mtod(mb, caddr_t);
1933	nfsm_fhtom(fdvp, v3);
1934	nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN);
1935	nfsm_fhtom(tdvp, v3);
1936	nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN);
1937	nfsm_request(fdvp, NFSPROC_RENAME, td, cred);
1938	if (v3) {
1939		nfsm_wcc_data(fdvp, fwccflag);
1940		nfsm_wcc_data(tdvp, twccflag);
1941	}
1942	m_freem(mrep);
1943nfsmout:
1944	mtx_lock(&(VTONFS(fdvp))->n_mtx);
1945	VTONFS(fdvp)->n_flag |= NMODIFIED;
1946	mtx_unlock(&(VTONFS(fdvp))->n_mtx);
1947	mtx_lock(&(VTONFS(tdvp))->n_mtx);
1948	VTONFS(tdvp)->n_flag |= NMODIFIED;
1949	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
1950	if (!fwccflag) {
1951		VTONFS(fdvp)->n_attrstamp = 0;
1952		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(fdvp);
1953	}
1954	if (!twccflag) {
1955		VTONFS(tdvp)->n_attrstamp = 0;
1956		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
1957	}
1958	return (error);
1959}
1960
1961/*
1962 * nfs hard link create call
1963 */
1964static int
1965nfs_link(struct vop_link_args *ap)
1966{
1967	struct vnode *vp = ap->a_vp;
1968	struct vnode *tdvp = ap->a_tdvp;
1969	struct componentname *cnp = ap->a_cnp;
1970	caddr_t bpos, dpos;
1971	int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0;
1972	struct mbuf *mreq, *mrep, *md, *mb;
1973	int v3;
1974
1975	if (vp->v_mount != tdvp->v_mount) {
1976		return (EXDEV);
1977	}
1978
1979	/*
1980	 * Push all writes to the server, so that the attribute cache
1981	 * doesn't get "out of sync" with the server.
1982	 * XXX There should be a better way!
1983	 */
1984	VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
1985
1986	v3 = NFS_ISV3(vp);
1987	nfsstats.rpccnt[NFSPROC_LINK]++;
1988	mreq = nfsm_reqhead(vp, NFSPROC_LINK,
1989		NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
1990	mb = mreq;
1991	bpos = mtod(mb, caddr_t);
1992	nfsm_fhtom(vp, v3);
1993	nfsm_fhtom(tdvp, v3);
1994	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
1995	nfsm_request(vp, NFSPROC_LINK, cnp->cn_thread, cnp->cn_cred);
1996	if (v3) {
1997		nfsm_postop_attr(vp, attrflag);
1998		nfsm_wcc_data(tdvp, wccflag);
1999	}
2000	m_freem(mrep);
2001nfsmout:
2002	mtx_lock(&(VTONFS(tdvp))->n_mtx);
2003	VTONFS(tdvp)->n_flag |= NMODIFIED;
2004	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
2005	if (!attrflag) {
2006		VTONFS(vp)->n_attrstamp = 0;
2007		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
2008	}
2009	if (!wccflag) {
2010		VTONFS(tdvp)->n_attrstamp = 0;
2011		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
2012	}
2013	return (error);
2014}
2015
2016/*
2017 * nfs symbolic link create call
2018 */
2019static int
2020nfs_symlink(struct vop_symlink_args *ap)
2021{
2022	struct vnode *dvp = ap->a_dvp;
2023	struct vattr *vap = ap->a_vap;
2024	struct componentname *cnp = ap->a_cnp;
2025	struct nfsv2_sattr *sp;
2026	caddr_t bpos, dpos;
2027	int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp;
2028	struct mbuf *mreq, *mrep, *md, *mb;
2029	struct vnode *newvp = NULL;
2030	int v3 = NFS_ISV3(dvp);
2031
2032	nfsstats.rpccnt[NFSPROC_SYMLINK]++;
2033	slen = strlen(ap->a_target);
2034	mreq = nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED +
2035	    nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3));
2036	mb = mreq;
2037	bpos = mtod(mb, caddr_t);
2038	nfsm_fhtom(dvp, v3);
2039	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
2040	if (v3) {
2041		nfsm_v3attrbuild(vap, FALSE);
2042	}
2043	nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN);
2044	if (!v3) {
2045		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
2046		sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode);
2047		sp->sa_uid = nfs_xdrneg1;
2048		sp->sa_gid = nfs_xdrneg1;
2049		sp->sa_size = nfs_xdrneg1;
2050		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
2051		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
2052	}
2053
2054	/*
2055	 * Issue the NFS request and get the rpc response.
2056	 *
2057	 * Only NFSv3 responses returning an error of 0 actually return
2058	 * a file handle that can be converted into newvp without having
2059	 * to do an extra lookup rpc.
2060	 */
2061	nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_thread, cnp->cn_cred);
2062	if (v3) {
2063		if (error == 0)
2064			nfsm_mtofh(dvp, newvp, v3, gotvp);
2065		nfsm_wcc_data(dvp, wccflag);
2066	}
2067
2068	/*
2069	 * out code jumps -> here, mrep is also freed.
2070	 */
2071
2072	m_freem(mrep);
2073nfsmout:
2074
2075	/*
2076	 * If we do not have an error and we could not extract the newvp from
2077	 * the response due to the request being NFSv2, we have to do a
2078	 * lookup in order to obtain a newvp to return.
2079	 */
2080	if (error == 0 && newvp == NULL) {
2081		struct nfsnode *np = NULL;
2082
2083		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
2084		    cnp->cn_cred, cnp->cn_thread, &np);
2085		if (!error)
2086			newvp = NFSTOV(np);
2087	}
2088	if (error) {
2089		if (newvp)
2090			vput(newvp);
2091	} else {
2092		*ap->a_vpp = newvp;
2093	}
2094	mtx_lock(&(VTONFS(dvp))->n_mtx);
2095	VTONFS(dvp)->n_flag |= NMODIFIED;
2096	mtx_unlock(&(VTONFS(dvp))->n_mtx);
2097	if (!wccflag) {
2098		VTONFS(dvp)->n_attrstamp = 0;
2099		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
2100	}
2101	return (error);
2102}
2103
2104/*
2105 * nfs make dir call
2106 */
2107static int
2108nfs_mkdir(struct vop_mkdir_args *ap)
2109{
2110	struct vnode *dvp = ap->a_dvp;
2111	struct vattr *vap = ap->a_vap;
2112	struct componentname *cnp = ap->a_cnp;
2113	struct nfsv2_sattr *sp;
2114	int len;
2115	struct nfsnode *np = NULL;
2116	struct vnode *newvp = NULL;
2117	caddr_t bpos, dpos;
2118	int error = 0, wccflag = NFSV3_WCCRATTR;
2119	int gotvp = 0;
2120	struct mbuf *mreq, *mrep, *md, *mb;
2121	struct vattr vattr;
2122	int v3 = NFS_ISV3(dvp);
2123
2124	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
2125		return (error);
2126	len = cnp->cn_namelen;
2127	nfsstats.rpccnt[NFSPROC_MKDIR]++;
2128	mreq = nfsm_reqhead(dvp, NFSPROC_MKDIR,
2129	  NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3));
2130	mb = mreq;
2131	bpos = mtod(mb, caddr_t);
2132	nfsm_fhtom(dvp, v3);
2133	nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
2134	if (v3) {
2135		nfsm_v3attrbuild(vap, FALSE);
2136	} else {
2137		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
2138		sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode);
2139		sp->sa_uid = nfs_xdrneg1;
2140		sp->sa_gid = nfs_xdrneg1;
2141		sp->sa_size = nfs_xdrneg1;
2142		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
2143		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
2144	}
2145	nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_thread, cnp->cn_cred);
2146	if (!error)
2147		nfsm_mtofh(dvp, newvp, v3, gotvp);
2148	if (v3)
2149		nfsm_wcc_data(dvp, wccflag);
2150	m_freem(mrep);
2151nfsmout:
2152	mtx_lock(&(VTONFS(dvp))->n_mtx);
2153	VTONFS(dvp)->n_flag |= NMODIFIED;
2154	mtx_unlock(&(VTONFS(dvp))->n_mtx);
2155	if (!wccflag) {
2156		VTONFS(dvp)->n_attrstamp = 0;
2157		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
2158	}
2159	if (error == 0 && newvp == NULL) {
2160		error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred,
2161			cnp->cn_thread, &np);
2162		if (!error) {
2163			newvp = NFSTOV(np);
2164			if (newvp->v_type != VDIR)
2165				error = EEXIST;
2166		}
2167	}
2168	if (error) {
2169		if (newvp)
2170			vput(newvp);
2171	} else
2172		*ap->a_vpp = newvp;
2173	return (error);
2174}
2175
2176/*
2177 * nfs remove directory call
2178 */
2179static int
2180nfs_rmdir(struct vop_rmdir_args *ap)
2181{
2182	struct vnode *vp = ap->a_vp;
2183	struct vnode *dvp = ap->a_dvp;
2184	struct componentname *cnp = ap->a_cnp;
2185	caddr_t bpos, dpos;
2186	int error = 0, wccflag = NFSV3_WCCRATTR;
2187	struct mbuf *mreq, *mrep, *md, *mb;
2188	int v3 = NFS_ISV3(dvp);
2189
2190	if (dvp == vp)
2191		return (EINVAL);
2192	nfsstats.rpccnt[NFSPROC_RMDIR]++;
2193	mreq = nfsm_reqhead(dvp, NFSPROC_RMDIR,
2194		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
2195	mb = mreq;
2196	bpos = mtod(mb, caddr_t);
2197	nfsm_fhtom(dvp, v3);
2198	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
2199	nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_thread, cnp->cn_cred);
2200	if (v3)
2201		nfsm_wcc_data(dvp, wccflag);
2202	m_freem(mrep);
2203nfsmout:
2204	mtx_lock(&(VTONFS(dvp))->n_mtx);
2205	VTONFS(dvp)->n_flag |= NMODIFIED;
2206	mtx_unlock(&(VTONFS(dvp))->n_mtx);
2207	if (!wccflag) {
2208		VTONFS(dvp)->n_attrstamp = 0;
2209		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
2210	}
2211	cache_purge(dvp);
2212	cache_purge(vp);
2213	/*
2214	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
2215	 */
2216	if (error == ENOENT)
2217		error = 0;
2218	return (error);
2219}
2220
2221/*
2222 * nfs readdir call
2223 */
2224static int
2225nfs_readdir(struct vop_readdir_args *ap)
2226{
2227	struct vnode *vp = ap->a_vp;
2228	struct nfsnode *np = VTONFS(vp);
2229	struct uio *uio = ap->a_uio;
2230	int tresid, error = 0;
2231	struct vattr vattr;
2232
2233	if (vp->v_type != VDIR)
2234		return(EPERM);
2235
2236	/*
2237	 * First, check for hit on the EOF offset cache
2238	 */
2239	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
2240	    (np->n_flag & NMODIFIED) == 0) {
2241		if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) {
2242			mtx_lock(&np->n_mtx);
2243			if (!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
2244				mtx_unlock(&np->n_mtx);
2245				nfsstats.direofcache_hits++;
2246				goto out;
2247			} else
2248				mtx_unlock(&np->n_mtx);
2249		}
2250	}
2251
2252	/*
2253	 * Call nfs_bioread() to do the real work.
2254	 */
2255	tresid = uio->uio_resid;
2256	error = nfs_bioread(vp, uio, 0, ap->a_cred);
2257
2258	if (!error && uio->uio_resid == tresid) {
2259		nfsstats.direofcache_misses++;
2260	}
2261out:
2262	return (error);
2263}
2264
/*
 * Readdir rpc call.
 * Called from below the buffer cache by nfs_doio().
 *
 * Fills the single iovec of uiop with struct dirent records obtained from
 * one or more NFSPROC_READDIR requests against directory vp, using cred
 * for the RPCs.  The uio must have exactly one iovec and both its offset
 * and residual must be multiples of DIRBLKSIZ (asserted below).  Records
 * are packed so that none straddles a DIRBLKSIZ boundary: the last record
 * of each block has its d_reclen stretched to the end of the block.
 *
 * Returns 0 on success, NFSERR_BAD_COOKIE when no cookie is cached for
 * uiop->uio_offset, EBADRPC when an entry's name length is outside
 * 1..NFS_MAXNAMLEN, or whatever error the request/parsing macros leave in
 * `error'.
 *
 * NOTE(review): the nfsm_* macros presumably operate on the local
 * mreq/mrep/mb/md/bpos/dpos/error variables and may jump to the nfsmout
 * label on failure -- confirm against their definitions.
 */
int
nfs_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
	int len, left;
	struct dirent *dp = NULL;
	u_int32_t *tl;
	caddr_t cp;
	nfsuint64 *cookiep;
	caddr_t bpos, dpos;
	struct mbuf *mreq, *mrep, *md, *mb;
	nfsuint64 cookie;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsnode *dnp = VTONFS(vp);
	u_quad_t fileno;
	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
	int attrflag;
	int v3 = NFS_ISV3(vp);

	KASSERT(uiop->uio_iovcnt == 1 &&
	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
	    ("nfs readdirrpc bad uio"));

	/*
	 * If there is no cookie, assume directory was stale.
	 * The cookie is copied out while the cookie lock is held.
	 */
	nfs_dircookie_lock(dnp);
	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
	if (cookiep) {
		cookie = *cookiep;
		nfs_dircookie_unlock(dnp);
	} else {
		nfs_dircookie_unlock(dnp);
		return (NFSERR_BAD_COOKIE);
	}

	/*
	 * Loop around doing readdir rpc's of size nm_readdirsize
	 * truncated to a multiple of DIRBLKSIZ.
	 * The stopping criteria is EOF or buffer full.
	 */
	while (more_dirs && bigenough) {
		nfsstats.rpccnt[NFSPROC_READDIR]++;
		mreq = nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) +
			NFSX_READDIR(v3));
		mb = mreq;
		bpos = mtod(mb, caddr_t);
		nfsm_fhtom(vp, v3);
		/*
		 * Request arguments after the file handle: the cookie (two
		 * words plus the two-word cookie verifier for v3, one word
		 * for v2) followed by the byte count.
		 */
		if (v3) {
			tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
			*tl++ = cookie.nfsuquad[0];
			*tl++ = cookie.nfsuquad[1];
			mtx_lock(&dnp->n_mtx);
			*tl++ = dnp->n_cookieverf.nfsuquad[0];
			*tl++ = dnp->n_cookieverf.nfsuquad[1];
			mtx_unlock(&dnp->n_mtx);
		} else {
			tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
			*tl++ = cookie.nfsuquad[0];
		}
		*tl = txdr_unsigned(nmp->nm_readdirsize);
		nfsm_request(vp, NFSPROC_READDIR, uiop->uio_td, cred);
		if (v3) {
			/*
			 * v3 reply: post-op attributes, then (on success)
			 * the cookie verifier, which is saved in the node
			 * for the next request.
			 */
			nfsm_postop_attr(vp, attrflag);
			if (!error) {
				tl = nfsm_dissect(u_int32_t *,
				    2 * NFSX_UNSIGNED);
				mtx_lock(&dnp->n_mtx);
				dnp->n_cookieverf.nfsuquad[0] = *tl++;
				dnp->n_cookieverf.nfsuquad[1] = *tl;
				mtx_unlock(&dnp->n_mtx);
			} else {
				m_freem(mrep);
				goto nfsmout;
			}
		}
		/* "Another entry follows" flag for the first entry. */
		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
		more_dirs = fxdr_unsigned(int, *tl);

		/* loop thru the dir entries, doctoring them to 4bsd form */
		while (more_dirs && bigenough) {
			/* File id (64-bit for v3, 32-bit for v2) and name length. */
			if (v3) {
				tl = nfsm_dissect(u_int32_t *,
				    3 * NFSX_UNSIGNED);
				fileno = fxdr_hyper(tl);
				len = fxdr_unsigned(int, *(tl + 2));
			} else {
				tl = nfsm_dissect(u_int32_t *,
				    2 * NFSX_UNSIGNED);
				fileno = fxdr_unsigned(u_quad_t, *tl++);
				len = fxdr_unsigned(int, *tl);
			}
			if (len <= 0 || len > NFS_MAXNAMLEN) {
				error = EBADRPC;
				m_freem(mrep);
				goto nfsmout;
			}
			/*
			 * tlen is the space reserved for the name: the
			 * rounded-up length, with an extra word when the
			 * name already ends on a word boundary.
			 */
			tlen = nfsm_rndup(len);
			if (tlen == len)
				tlen += 4;	/* To ensure null termination */
			/*
			 * If this record will not fit in what is left of
			 * the current DIRBLKSIZ block, stretch the previous
			 * record over the remainder and start a new block.
			 */
			left = DIRBLKSIZ - blksiz;
			if ((tlen + DIRHDSIZ) > left) {
				dp->d_reclen += left;
				uiop->uio_iov->iov_base =
				    (char *)uiop->uio_iov->iov_base + left;
				uiop->uio_iov->iov_len -= left;
				uiop->uio_offset += left;
				uiop->uio_resid -= left;
				blksiz = 0;
			}
			/* Stop copying once the caller's buffer is exhausted. */
			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
				bigenough = 0;
			if (bigenough) {
				/*
				 * Write the dirent header in place, copy
				 * the name out of the reply, terminate it,
				 * and step the uio past the name area.
				 */
				dp = (struct dirent *)uiop->uio_iov->iov_base;
				dp->d_fileno = (int)fileno;
				dp->d_namlen = len;
				dp->d_reclen = tlen + DIRHDSIZ;
				dp->d_type = DT_UNKNOWN;
				blksiz += dp->d_reclen;
				if (blksiz == DIRBLKSIZ)
					blksiz = 0;
				uiop->uio_offset += DIRHDSIZ;
				uiop->uio_resid -= DIRHDSIZ;
				uiop->uio_iov->iov_base =
				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
				uiop->uio_iov->iov_len -= DIRHDSIZ;
				nfsm_mtouio(uiop, len);
				cp = uiop->uio_iov->iov_base;
				tlen -= len;
				*cp = '\0';	/* null terminate */
				uiop->uio_iov->iov_base =
				    (char *)uiop->uio_iov->iov_base + tlen;
				uiop->uio_iov->iov_len -= tlen;
				uiop->uio_offset += tlen;
				uiop->uio_resid -= tlen;
			} else
				nfsm_adv(nfsm_rndup(len));
			/*
			 * Entry cookie followed by the "another entry
			 * follows" flag.  The cookie is only remembered for
			 * entries that were actually copied out.
			 */
			if (v3) {
				tl = nfsm_dissect(u_int32_t *,
				    3 * NFSX_UNSIGNED);
			} else {
				tl = nfsm_dissect(u_int32_t *,
				    2 * NFSX_UNSIGNED);
			}
			if (bigenough) {
				cookie.nfsuquad[0] = *tl++;
				if (v3)
					cookie.nfsuquad[1] = *tl++;
			} else if (v3)
				tl += 2;
			else
				tl++;
			more_dirs = fxdr_unsigned(int, *tl);
		}
		/*
		 * If at end of rpc data, get the eof boolean
		 * (another RPC is needed only when it is not set).
		 */
		if (!more_dirs) {
			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
			more_dirs = (fxdr_unsigned(int, *tl) == 0);
		}
		m_freem(mrep);
	}
	/*
	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
	 * by increasing d_reclen for the last record.
	 */
	if (blksiz > 0) {
		left = DIRBLKSIZ - blksiz;
		dp->d_reclen += left;
		uiop->uio_iov->iov_base =
		    (char *)uiop->uio_iov->iov_base + left;
		uiop->uio_iov->iov_len -= left;
		uiop->uio_offset += left;
		uiop->uio_resid -= left;
	}

	/*
	 * We are now either at the end of the directory or have filled the
	 * block.  At end of directory the EOF offset is recorded in the
	 * node; otherwise the cookie of the last copied entry is stored for
	 * the offset where the next read will start.
	 */
	if (bigenough)
		dnp->n_direofoffset = uiop->uio_offset;
	else {
		if (uiop->uio_resid > 0)
			nfs_printf("EEK! readdirrpc resid > 0\n");
		nfs_dircookie_lock(dnp);
		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
		*cookiep = cookie;
		nfs_dircookie_unlock(dnp);
	}
nfsmout:
	return (error);
}
2463
2464/*
2465 * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc().
2466 */
2467int
2468nfs_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
2469{
2470	int len, left;
2471	struct dirent *dp;
2472	u_int32_t *tl;
2473	caddr_t cp;
2474	struct vnode *newvp;
2475	nfsuint64 *cookiep;
2476	caddr_t bpos, dpos, dpossav1, dpossav2;
2477	struct mbuf *mreq, *mrep, *md, *mb, *mdsav1, *mdsav2;
2478	struct nameidata nami, *ndp = &nami;
2479	struct componentname *cnp = &ndp->ni_cnd;
2480	nfsuint64 cookie;
2481	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2482	struct nfsnode *dnp = VTONFS(vp), *np;
2483	struct vattr vattr, dvattr;
2484	nfsfh_t *fhp;
2485	u_quad_t fileno;
2486	int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
2487	int attrflag, dattrflag, fhsize;
2488
2489#ifndef nolint
2490	dp = NULL;
2491#endif
2492	KASSERT(uiop->uio_iovcnt == 1 &&
2493	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
2494	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
2495	    ("nfs readdirplusrpc bad uio"));
2496	ndp->ni_dvp = vp;
2497	newvp = NULLVP;
2498
2499	/*
2500	 * If there is no cookie, assume directory was stale.
2501	 */
2502	nfs_dircookie_lock(dnp);
2503	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
2504	if (cookiep) {
2505		cookie = *cookiep;
2506		nfs_dircookie_unlock(dnp);
2507	} else {
2508		nfs_dircookie_unlock(dnp);
2509		return (NFSERR_BAD_COOKIE);
2510	}
2511	/*
2512	 * Loop around doing readdir rpc's of size nm_readdirsize
2513	 * truncated to a multiple of DIRBLKSIZ.
2514	 * The stopping criteria is EOF or buffer full.
2515	 */
2516	while (more_dirs && bigenough) {
2517		nfsstats.rpccnt[NFSPROC_READDIRPLUS]++;
2518		mreq = nfsm_reqhead(vp, NFSPROC_READDIRPLUS,
2519			NFSX_FH(1) + 6 * NFSX_UNSIGNED);
2520		mb = mreq;
2521		bpos = mtod(mb, caddr_t);
2522		nfsm_fhtom(vp, 1);
2523 		tl = nfsm_build(u_int32_t *, 6 * NFSX_UNSIGNED);
2524		*tl++ = cookie.nfsuquad[0];
2525		*tl++ = cookie.nfsuquad[1];
2526		mtx_lock(&dnp->n_mtx);
2527		*tl++ = dnp->n_cookieverf.nfsuquad[0];
2528		*tl++ = dnp->n_cookieverf.nfsuquad[1];
2529		mtx_unlock(&dnp->n_mtx);
2530		*tl++ = txdr_unsigned(nmp->nm_readdirsize);
2531		*tl = txdr_unsigned(nmp->nm_rsize);
2532		nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_td, cred);
2533		nfsm_postop_attr_va(vp, dattrflag, &dvattr);
2534		if (error) {
2535			m_freem(mrep);
2536			goto nfsmout;
2537		}
2538		tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
2539		mtx_lock(&dnp->n_mtx);
2540		dnp->n_cookieverf.nfsuquad[0] = *tl++;
2541		dnp->n_cookieverf.nfsuquad[1] = *tl++;
2542		mtx_unlock(&dnp->n_mtx);
2543		more_dirs = fxdr_unsigned(int, *tl);
2544
2545		/* loop thru the dir entries, doctoring them to 4bsd form */
2546		while (more_dirs && bigenough) {
2547			tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
2548			fileno = fxdr_hyper(tl);
2549			len = fxdr_unsigned(int, *(tl + 2));
2550			if (len <= 0 || len > NFS_MAXNAMLEN) {
2551				error = EBADRPC;
2552				m_freem(mrep);
2553				goto nfsmout;
2554			}
2555			tlen = nfsm_rndup(len);
2556			if (tlen == len)
2557				tlen += 4;	/* To ensure null termination*/
2558			left = DIRBLKSIZ - blksiz;
2559			if ((tlen + DIRHDSIZ) > left) {
2560				dp->d_reclen += left;
2561				uiop->uio_iov->iov_base =
2562				    (char *)uiop->uio_iov->iov_base + left;
2563				uiop->uio_iov->iov_len -= left;
2564				uiop->uio_offset += left;
2565				uiop->uio_resid -= left;
2566				blksiz = 0;
2567			}
2568			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
2569				bigenough = 0;
2570			if (bigenough) {
2571				dp = (struct dirent *)uiop->uio_iov->iov_base;
2572				dp->d_fileno = (int)fileno;
2573				dp->d_namlen = len;
2574				dp->d_reclen = tlen + DIRHDSIZ;
2575				dp->d_type = DT_UNKNOWN;
2576				blksiz += dp->d_reclen;
2577				if (blksiz == DIRBLKSIZ)
2578					blksiz = 0;
2579				uiop->uio_offset += DIRHDSIZ;
2580				uiop->uio_resid -= DIRHDSIZ;
2581				uiop->uio_iov->iov_base =
2582				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
2583				uiop->uio_iov->iov_len -= DIRHDSIZ;
2584				cnp->cn_nameptr = uiop->uio_iov->iov_base;
2585				cnp->cn_namelen = len;
2586				nfsm_mtouio(uiop, len);
2587				cp = uiop->uio_iov->iov_base;
2588				tlen -= len;
2589				*cp = '\0';
2590				uiop->uio_iov->iov_base =
2591				    (char *)uiop->uio_iov->iov_base + tlen;
2592				uiop->uio_iov->iov_len -= tlen;
2593				uiop->uio_offset += tlen;
2594				uiop->uio_resid -= tlen;
2595			} else
2596				nfsm_adv(nfsm_rndup(len));
2597			tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
2598			if (bigenough) {
2599				cookie.nfsuquad[0] = *tl++;
2600				cookie.nfsuquad[1] = *tl++;
2601			} else
2602				tl += 2;
2603
2604			/*
2605			 * Since the attributes are before the file handle
2606			 * (sigh), we must skip over the attributes and then
2607			 * come back and get them.
2608			 */
2609			attrflag = fxdr_unsigned(int, *tl);
2610			if (attrflag) {
2611			    dpossav1 = dpos;
2612			    mdsav1 = md;
2613			    nfsm_adv(NFSX_V3FATTR);
2614			    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2615			    doit = fxdr_unsigned(int, *tl);
2616			    /*
2617 			     * Skip loading the attrs for "..". There's a
2618 			     * race between loading the attrs here and
2619 			     * lookups that look for the directory currently
2620 			     * being read (in the parent). We try to acquire
2621 			     * the exclusive lock on ".." here, owning the
2622 			     * lock on the directory being read. Lookup will
2623 			     * hold the lock on ".." and try to acquire the
2624 			     * lock on the directory being read.
2625 			     *
2626 			     * There are other ways of fixing this, one would
2627 			     * be to do a trylock on the ".." vnode and skip
2628 			     * loading the attrs on ".." if it happens to be
2629 			     * locked by another process. But skipping the
2630 			     * attrload on ".." seems the easiest option.
2631 			     */
2632 			    if (strcmp(dp->d_name, "..") == 0) {
2633 				    doit = 0;
2634 				    /*
2635 				     * We've already skipped over the attrs,
2636 				     * skip over the filehandle. And store d_type
2637 				     * as VDIR.
2638 				     */
2639 				    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2640 				    i = fxdr_unsigned(int, *tl);
2641 				    nfsm_adv(nfsm_rndup(i));
2642 				    dp->d_type = IFTODT(VTTOIF(VDIR));
2643 			    }
2644			    if (doit) {
2645				nfsm_getfh(fhp, fhsize, 1);
2646				if (NFS_CMPFH(dnp, fhp, fhsize)) {
2647				    VREF(vp);
2648				    newvp = vp;
2649				    np = dnp;
2650				} else {
2651				    error = nfs_nget(vp->v_mount, fhp,
2652					fhsize, &np, LK_EXCLUSIVE);
2653				    if (error)
2654					doit = 0;
2655				    else
2656					newvp = NFSTOV(np);
2657				}
2658			    }
2659			    if (doit && bigenough) {
2660				dpossav2 = dpos;
2661				dpos = dpossav1;
2662				mdsav2 = md;
2663				md = mdsav1;
2664				nfsm_loadattr(newvp, &vattr);
2665				dpos = dpossav2;
2666				md = mdsav2;
2667				dp->d_type = IFTODT(VTTOIF(vattr.va_type));
2668				ndp->ni_vp = newvp;
2669				if (newvp->v_type != VDIR || dattrflag != 0)
2670				    cache_enter_time(ndp->ni_dvp, ndp->ni_vp,
2671					cnp, &vattr.va_ctime,
2672					newvp->v_type != VDIR ? NULL :
2673					&dvattr.va_ctime);
2674			    }
2675			} else {
2676			    /* Just skip over the file handle */
2677			    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2678			    i = fxdr_unsigned(int, *tl);
2679			    if (i) {
2680				    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2681				    fhsize = fxdr_unsigned(int, *tl);
2682				    nfsm_adv(nfsm_rndup(fhsize));
2683			    }
2684			}
2685			if (newvp != NULLVP) {
2686			    if (newvp == vp)
2687				vrele(newvp);
2688			    else
2689				vput(newvp);
2690			    newvp = NULLVP;
2691			}
2692			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2693			more_dirs = fxdr_unsigned(int, *tl);
2694		}
2695		/*
2696		 * If at end of rpc data, get the eof boolean
2697		 */
2698		if (!more_dirs) {
2699			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2700			more_dirs = (fxdr_unsigned(int, *tl) == 0);
2701		}
2702		m_freem(mrep);
2703	}
2704	/*
2705	 * Fill last record, iff any, out to a multiple of DIRBLKSIZ
2706	 * by increasing d_reclen for the last record.
2707	 */
2708	if (blksiz > 0) {
2709		left = DIRBLKSIZ - blksiz;
2710		dp->d_reclen += left;
2711		uiop->uio_iov->iov_base =
2712		    (char *)uiop->uio_iov->iov_base + left;
2713		uiop->uio_iov->iov_len -= left;
2714		uiop->uio_offset += left;
2715		uiop->uio_resid -= left;
2716	}
2717
2718	/*
2719	 * We are now either at the end of the directory or have filled the
2720	 * block.
2721	 */
2722	if (bigenough)
2723		dnp->n_direofoffset = uiop->uio_offset;
2724	else {
2725		if (uiop->uio_resid > 0)
2726			nfs_printf("EEK! readdirplusrpc resid > 0\n");
2727		nfs_dircookie_lock(dnp);
2728		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
2729		*cookiep = cookie;
2730		nfs_dircookie_unlock(dnp);
2731	}
2732nfsmout:
2733	if (newvp != NULLVP) {
2734	        if (newvp == vp)
2735			vrele(newvp);
2736		else
2737			vput(newvp);
2738		newvp = NULLVP;
2739	}
2740	return (error);
2741}
2742
2743/*
2744 * Silly rename. To make the NFS filesystem that is stateless look a little
2745 * more like the "ufs" a remove of an active vnode is translated to a rename
2746 * to a funny looking filename that is removed by nfs_inactive on the
2747 * nfsnode. There is the potential for another process on a different client
2748 * to create the same funny name between the nfs_lookitup() fails and the
2749 * nfs_rename() completes, but...
2750 */
2751static int
2752nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2753{
2754	struct sillyrename *sp;
2755	struct nfsnode *np;
2756	int error;
2757	short pid;
2758	unsigned int lticks;
2759
2760	cache_purge(dvp);
2761	np = VTONFS(vp);
2762	KASSERT(vp->v_type != VDIR, ("nfs: sillyrename dir"));
2763	sp = malloc(sizeof (struct sillyrename),
2764		M_NFSREQ, M_WAITOK);
2765	sp->s_cred = crhold(cnp->cn_cred);
2766	sp->s_dvp = dvp;
2767	sp->s_removeit = nfs_removeit;
2768	VREF(dvp);
2769
2770	/*
2771	 * Fudge together a funny name.
2772	 * Changing the format of the funny name to accomodate more
2773	 * sillynames per directory.
2774	 * The name is now changed to .nfs.<ticks>.<pid>.4, where ticks is
2775	 * CPU ticks since boot.
2776	 */
2777	pid = cnp->cn_thread->td_proc->p_pid;
2778	lticks = (unsigned int)ticks;
2779	for ( ; ; ) {
2780		sp->s_namlen = sprintf(sp->s_name,
2781				       ".nfs.%08x.%04x4.4", lticks,
2782				       pid);
2783		if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
2784				 cnp->cn_thread, NULL))
2785			break;
2786		lticks++;
2787	}
2788	error = nfs_renameit(dvp, cnp, sp);
2789	if (error)
2790		goto bad;
2791	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
2792		cnp->cn_thread, &np);
2793	np->n_sillyrename = sp;
2794	return (0);
2795bad:
2796	vrele(sp->s_dvp);
2797	crfree(sp->s_cred);
2798	free((caddr_t)sp, M_NFSREQ);
2799	return (error);
2800}
2801
2802/*
2803 * Look up a file name and optionally either update the file handle or
2804 * allocate an nfsnode, depending on the value of npp.
2805 * npp == NULL	--> just do the lookup
2806 * *npp == NULL --> allocate a new nfsnode and make sure attributes are
2807 *			handled too
2808 * *npp != NULL --> update the file handle in the vnode
2809 */
2810static int
2811nfs_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred,
2812    struct thread *td, struct nfsnode **npp)
2813{
2814	struct vnode *newvp = NULL;
2815	struct nfsnode *np, *dnp = VTONFS(dvp);
2816	caddr_t bpos, dpos;
2817	int error = 0, fhlen, attrflag;
2818	struct mbuf *mreq, *mrep, *md, *mb;
2819	nfsfh_t *nfhp;
2820	int v3 = NFS_ISV3(dvp);
2821
2822	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
2823	mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
2824		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
2825	mb = mreq;
2826	bpos = mtod(mb, caddr_t);
2827	nfsm_fhtom(dvp, v3);
2828	nfsm_strtom(name, len, NFS_MAXNAMLEN);
2829	nfsm_request(dvp, NFSPROC_LOOKUP, td, cred);
2830	if (npp && !error) {
2831		nfsm_getfh(nfhp, fhlen, v3);
2832		if (*npp) {
2833		    np = *npp;
2834		    if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) {
2835			free((caddr_t)np->n_fhp, M_NFSBIGFH);
2836			np->n_fhp = &np->n_fh;
2837		    } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH)
2838			np->n_fhp =(nfsfh_t *)malloc(fhlen, M_NFSBIGFH, M_WAITOK);
2839		    bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen);
2840		    np->n_fhsize = fhlen;
2841		    newvp = NFSTOV(np);
2842		} else if (NFS_CMPFH(dnp, nfhp, fhlen)) {
2843		    VREF(dvp);
2844		    newvp = dvp;
2845		} else {
2846		    error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE);
2847		    if (error) {
2848			m_freem(mrep);
2849			return (error);
2850		    }
2851		    newvp = NFSTOV(np);
2852		}
2853		if (v3) {
2854			nfsm_postop_attr(newvp, attrflag);
2855			if (!attrflag && *npp == NULL) {
2856				m_freem(mrep);
2857				if (newvp == dvp)
2858					vrele(newvp);
2859				else
2860					vput(newvp);
2861				return (ENOENT);
2862			}
2863		} else
2864			nfsm_loadattr(newvp, NULL);
2865	}
2866	m_freem(mrep);
2867nfsmout:
2868	if (npp && *npp == NULL) {
2869		if (error) {
2870			if (newvp) {
2871				if (newvp == dvp)
2872					vrele(newvp);
2873				else
2874					vput(newvp);
2875			}
2876		} else
2877			*npp = np;
2878	}
2879	return (error);
2880}
2881
2882/*
2883 * Nfs Version 3 commit rpc
2884 */
2885int
2886nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
2887	   struct thread *td)
2888{
2889	u_int32_t *tl;
2890	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2891	caddr_t bpos, dpos;
2892	int error = 0, wccflag = NFSV3_WCCRATTR;
2893	struct mbuf *mreq, *mrep, *md, *mb;
2894
2895	mtx_lock(&nmp->nm_mtx);
2896	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
2897		mtx_unlock(&nmp->nm_mtx);
2898		return (0);
2899	}
2900	mtx_unlock(&nmp->nm_mtx);
2901	nfsstats.rpccnt[NFSPROC_COMMIT]++;
2902	mreq = nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1));
2903	mb = mreq;
2904	bpos = mtod(mb, caddr_t);
2905	nfsm_fhtom(vp, 1);
2906	tl = nfsm_build(u_int32_t *, 3 * NFSX_UNSIGNED);
2907	txdr_hyper(offset, tl);
2908	tl += 2;
2909	*tl = txdr_unsigned(cnt);
2910	nfsm_request(vp, NFSPROC_COMMIT, td, cred);
2911	nfsm_wcc_data(vp, wccflag);
2912	if (!error) {
2913		tl = nfsm_dissect(u_int32_t *, NFSX_V3WRITEVERF);
2914		if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl,
2915			NFSX_V3WRITEVERF)) {
2916			bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
2917				NFSX_V3WRITEVERF);
2918			error = NFSERR_STALEWRITEVERF;
2919		}
2920	}
2921	m_freem(mrep);
2922nfsmout:
2923	return (error);
2924}
2925
2926/*
2927 * Strategy routine.
2928 * For async requests when nfsiod(s) are running, queue the request by
2929 * calling nfs_asyncio(), otherwise just all nfs_doio() to do the
2930 * request.
2931 */
2932static int
2933nfs_strategy(struct vop_strategy_args *ap)
2934{
2935	struct buf *bp = ap->a_bp;
2936	struct ucred *cr;
2937
2938	KASSERT(!(bp->b_flags & B_DONE),
2939	    ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
2940	BUF_ASSERT_HELD(bp);
2941
2942	if (bp->b_iocmd == BIO_READ)
2943		cr = bp->b_rcred;
2944	else
2945		cr = bp->b_wcred;
2946
2947	/*
2948	 * If the op is asynchronous and an i/o daemon is waiting
2949	 * queue the request, wake it up and wait for completion
2950	 * otherwise just do it ourselves.
2951	 */
2952	if ((bp->b_flags & B_ASYNC) == 0 ||
2953	    nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread))
2954		(void)nfs_doio(ap->a_vp, bp, cr, curthread);
2955	return (0);
2956}
2957
2958/*
2959 * fsync vnode op. Just call nfs_flush() with commit == 1.
2960 */
2961/* ARGSUSED */
2962static int
2963nfs_fsync(struct vop_fsync_args *ap)
2964{
2965
2966	return (nfs_flush(ap->a_vp, ap->a_waitfor, 1));
2967}
2968
2969/*
2970 * Flush all the blocks associated with a vnode.
2971 * 	Walk through the buffer pool and push any dirty pages
2972 *	associated with the vnode.
2973 */
2974static int
2975nfs_flush(struct vnode *vp, int waitfor, int commit)
2976{
2977	struct nfsnode *np = VTONFS(vp);
2978	struct buf *bp;
2979	int i;
2980	struct buf *nbp;
2981	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2982	int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
2983	int passone = 1;
2984	u_quad_t off, endoff, toff;
2985	struct ucred* wcred = NULL;
2986	struct buf **bvec = NULL;
2987	struct bufobj *bo;
2988	struct thread *td = curthread;
2989#ifndef NFS_COMMITBVECSIZ
2990#define NFS_COMMITBVECSIZ	20
2991#endif
2992	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
2993	int bvecsize = 0, bveccount;
2994
2995	if (nmp->nm_flag & NFSMNT_INT)
2996		slpflag = NFS_PCATCH;
2997	if (!commit)
2998		passone = 0;
2999	bo = &vp->v_bufobj;
3000	/*
3001	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
3002	 * server, but has not been committed to stable storage on the server
3003	 * yet. On the first pass, the byte range is worked out and the commit
3004	 * rpc is done. On the second pass, nfs_writebp() is called to do the
3005	 * job.
3006	 */
3007again:
3008	off = (u_quad_t)-1;
3009	endoff = 0;
3010	bvecpos = 0;
3011	if (NFS_ISV3(vp) && commit) {
3012		if (bvec != NULL && bvec != bvec_on_stack)
3013			free(bvec, M_TEMP);
3014		/*
3015		 * Count up how many buffers waiting for a commit.
3016		 */
3017		bveccount = 0;
3018		BO_LOCK(bo);
3019		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
3020			if (!BUF_ISLOCKED(bp) &&
3021			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
3022				== (B_DELWRI | B_NEEDCOMMIT))
3023				bveccount++;
3024		}
3025		/*
3026		 * Allocate space to remember the list of bufs to commit.  It is
3027		 * important to use M_NOWAIT here to avoid a race with nfs_write.
3028		 * If we can't get memory (for whatever reason), we will end up
3029		 * committing the buffers one-by-one in the loop below.
3030		 */
3031		if (bveccount > NFS_COMMITBVECSIZ) {
3032			/*
3033			 * Release the vnode interlock to avoid a lock
3034			 * order reversal.
3035			 */
3036			BO_UNLOCK(bo);
3037			bvec = (struct buf **)
3038				malloc(bveccount * sizeof(struct buf *),
3039				       M_TEMP, M_NOWAIT);
3040			BO_LOCK(bo);
3041			if (bvec == NULL) {
3042				bvec = bvec_on_stack;
3043				bvecsize = NFS_COMMITBVECSIZ;
3044			} else
3045				bvecsize = bveccount;
3046		} else {
3047			bvec = bvec_on_stack;
3048			bvecsize = NFS_COMMITBVECSIZ;
3049		}
3050		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
3051			if (bvecpos >= bvecsize)
3052				break;
3053			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
3054				nbp = TAILQ_NEXT(bp, b_bobufs);
3055				continue;
3056			}
3057			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
3058			    (B_DELWRI | B_NEEDCOMMIT)) {
3059				BUF_UNLOCK(bp);
3060				nbp = TAILQ_NEXT(bp, b_bobufs);
3061				continue;
3062			}
3063			BO_UNLOCK(bo);
3064			bremfree(bp);
3065			/*
3066			 * Work out if all buffers are using the same cred
3067			 * so we can deal with them all with one commit.
3068			 *
3069			 * NOTE: we are not clearing B_DONE here, so we have
3070			 * to do it later on in this routine if we intend to
3071			 * initiate I/O on the bp.
3072			 *
3073			 * Note: to avoid loopback deadlocks, we do not
3074			 * assign b_runningbufspace.
3075			 */
3076			if (wcred == NULL)
3077				wcred = bp->b_wcred;
3078			else if (wcred != bp->b_wcred)
3079				wcred = NOCRED;
3080			vfs_busy_pages(bp, 1);
3081
3082			BO_LOCK(bo);
3083			/*
3084			 * bp is protected by being locked, but nbp is not
3085			 * and vfs_busy_pages() may sleep.  We have to
3086			 * recalculate nbp.
3087			 */
3088			nbp = TAILQ_NEXT(bp, b_bobufs);
3089
3090			/*
3091			 * A list of these buffers is kept so that the
3092			 * second loop knows which buffers have actually
3093			 * been committed. This is necessary, since there
3094			 * may be a race between the commit rpc and new
3095			 * uncommitted writes on the file.
3096			 */
3097			bvec[bvecpos++] = bp;
3098			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
3099				bp->b_dirtyoff;
3100			if (toff < off)
3101				off = toff;
3102			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
3103			if (toff > endoff)
3104				endoff = toff;
3105		}
3106		BO_UNLOCK(bo);
3107	}
3108	if (bvecpos > 0) {
3109		/*
3110		 * Commit data on the server, as required.
3111		 * If all bufs are using the same wcred, then use that with
3112		 * one call for all of them, otherwise commit each one
3113		 * separately.
3114		 */
3115		if (wcred != NOCRED)
3116			retv = nfs_commit(vp, off, (int)(endoff - off),
3117					  wcred, td);
3118		else {
3119			retv = 0;
3120			for (i = 0; i < bvecpos; i++) {
3121				off_t off, size;
3122				bp = bvec[i];
3123				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
3124					bp->b_dirtyoff;
3125				size = (u_quad_t)(bp->b_dirtyend
3126						  - bp->b_dirtyoff);
3127				retv = nfs_commit(vp, off, (int)size,
3128						  bp->b_wcred, td);
3129				if (retv) break;
3130			}
3131		}
3132
3133		if (retv == NFSERR_STALEWRITEVERF)
3134			nfs_clearcommit(vp->v_mount);
3135
3136		/*
3137		 * Now, either mark the blocks I/O done or mark the
3138		 * blocks dirty, depending on whether the commit
3139		 * succeeded.
3140		 */
3141		for (i = 0; i < bvecpos; i++) {
3142			bp = bvec[i];
3143			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
3144			if (retv) {
3145				/*
3146				 * Error, leave B_DELWRI intact
3147				 */
3148				vfs_unbusy_pages(bp);
3149				brelse(bp);
3150			} else {
3151				/*
3152				 * Success, remove B_DELWRI ( bundirty() ).
3153				 *
3154				 * b_dirtyoff/b_dirtyend seem to be NFS
3155				 * specific.  We should probably move that
3156				 * into bundirty(). XXX
3157				 */
3158				bufobj_wref(bo);
3159				bp->b_flags |= B_ASYNC;
3160				bundirty(bp);
3161				bp->b_flags &= ~B_DONE;
3162				bp->b_ioflags &= ~BIO_ERROR;
3163				bp->b_dirtyoff = bp->b_dirtyend = 0;
3164				bufdone(bp);
3165			}
3166		}
3167	}
3168
3169	/*
3170	 * Start/do any write(s) that are required.
3171	 */
3172loop:
3173	BO_LOCK(bo);
3174	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
3175		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
3176			if (waitfor != MNT_WAIT || passone)
3177				continue;
3178
3179			error = BUF_TIMELOCK(bp,
3180			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
3181			    BO_MTX(bo), "nfsfsync", slpflag, slptimeo);
3182			if (error == 0) {
3183				BUF_UNLOCK(bp);
3184				goto loop;
3185			}
3186			if (error == ENOLCK) {
3187				error = 0;
3188				goto loop;
3189			}
3190			if (nfs_sigintr(nmp, td)) {
3191				error = EINTR;
3192				goto done;
3193			}
3194			if (slpflag & PCATCH) {
3195				slpflag = 0;
3196				slptimeo = 2 * hz;
3197			}
3198			goto loop;
3199		}
3200		if ((bp->b_flags & B_DELWRI) == 0)
3201			panic("nfs_fsync: not dirty");
3202		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
3203			BUF_UNLOCK(bp);
3204			continue;
3205		}
3206		BO_UNLOCK(bo);
3207		bremfree(bp);
3208		if (passone || !commit)
3209		    bp->b_flags |= B_ASYNC;
3210		else
3211		    bp->b_flags |= B_ASYNC;
3212		bwrite(bp);
3213		if (nfs_sigintr(nmp, td)) {
3214			error = EINTR;
3215			goto done;
3216		}
3217		goto loop;
3218	}
3219	if (passone) {
3220		passone = 0;
3221		BO_UNLOCK(bo);
3222		goto again;
3223	}
3224	if (waitfor == MNT_WAIT) {
3225		while (bo->bo_numoutput) {
3226			error = bufobj_wwait(bo, slpflag, slptimeo);
3227			if (error) {
3228			    BO_UNLOCK(bo);
3229			    error = nfs_sigintr(nmp, td);
3230			    if (error)
3231				goto done;
3232			    if (slpflag & PCATCH) {
3233				slpflag = 0;
3234				slptimeo = 2 * hz;
3235			    }
3236			    BO_LOCK(bo);
3237			}
3238		}
3239		if (bo->bo_dirty.bv_cnt != 0 && commit) {
3240			BO_UNLOCK(bo);
3241			goto loop;
3242		}
3243		/*
3244		 * Wait for all the async IO requests to drain
3245		 */
3246		BO_UNLOCK(bo);
3247		mtx_lock(&np->n_mtx);
3248		while (np->n_directio_asyncwr > 0) {
3249			np->n_flag |= NFSYNCWAIT;
3250			error = nfs_msleep(td, (caddr_t)&np->n_directio_asyncwr,
3251					   &np->n_mtx, slpflag | (PRIBIO + 1),
3252					   "nfsfsync", 0);
3253			if (error) {
3254				if (nfs_sigintr(nmp, td)) {
3255					mtx_unlock(&np->n_mtx);
3256					error = EINTR;
3257					goto done;
3258				}
3259			}
3260		}
3261		mtx_unlock(&np->n_mtx);
3262	} else
3263		BO_UNLOCK(bo);
3264	mtx_lock(&np->n_mtx);
3265	if (np->n_flag & NWRITEERR) {
3266		error = np->n_error;
3267		np->n_flag &= ~NWRITEERR;
3268	}
3269  	if (commit && bo->bo_dirty.bv_cnt == 0 &&
3270	    bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0)
3271  		np->n_flag &= ~NMODIFIED;
3272	mtx_unlock(&np->n_mtx);
3273done:
3274	if (bvec != NULL && bvec != bvec_on_stack)
3275		free(bvec, M_TEMP);
3276	return (error);
3277}
3278
3279/*
3280 * NFS advisory byte-level locks.
3281 */
3282static int
3283nfs_advlock(struct vop_advlock_args *ap)
3284{
3285	struct vnode *vp = ap->a_vp;
3286	u_quad_t size;
3287	int error;
3288
3289	error = vn_lock(vp, LK_SHARED);
3290	if (error)
3291		return (error);
3292	if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
3293		size = VTONFS(vp)->n_size;
3294		VOP_UNLOCK(vp, 0);
3295		error = lf_advlock(ap, &(vp->v_lockf), size);
3296	} else {
3297		if (nfs_advlock_p)
3298			error = nfs_advlock_p(ap);
3299		else
3300			error = ENOLCK;
3301	}
3302
3303	return (error);
3304}
3305
3306/*
3307 * NFS advisory byte-level locks.
3308 */
3309static int
3310nfs_advlockasync(struct vop_advlockasync_args *ap)
3311{
3312	struct vnode *vp = ap->a_vp;
3313	u_quad_t size;
3314	int error;
3315
3316	error = vn_lock(vp, LK_SHARED);
3317	if (error)
3318		return (error);
3319	if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
3320		size = VTONFS(vp)->n_size;
3321		VOP_UNLOCK(vp, 0);
3322		error = lf_advlockasync(ap, &(vp->v_lockf), size);
3323	} else {
3324		VOP_UNLOCK(vp, 0);
3325		error = EOPNOTSUPP;
3326	}
3327	return (error);
3328}
3329
3330/*
3331 * Print out the contents of an nfsnode.
3332 */
3333static int
3334nfs_print(struct vop_print_args *ap)
3335{
3336	struct vnode *vp = ap->a_vp;
3337	struct nfsnode *np = VTONFS(vp);
3338
3339	nfs_printf("\tfileid %ld fsid 0x%x",
3340	   np->n_vattr.va_fileid, np->n_vattr.va_fsid);
3341	if (vp->v_type == VFIFO)
3342		fifo_printinfo(vp);
3343	printf("\n");
3344	return (0);
3345}
3346
3347/*
3348 * This is the "real" nfs::bwrite(struct buf*).
3349 * We set B_CACHE if this is a VMIO buffer.
3350 */
3351int
3352nfs_writebp(struct buf *bp, int force __unused, struct thread *td)
3353{
3354	int s;
3355	int oldflags = bp->b_flags;
3356#if 0
3357	int retv = 1;
3358	off_t off;
3359#endif
3360
3361	BUF_ASSERT_HELD(bp);
3362
3363	if (bp->b_flags & B_INVAL) {
3364		brelse(bp);
3365		return(0);
3366	}
3367
3368	bp->b_flags |= B_CACHE;
3369
3370	/*
3371	 * Undirty the bp.  We will redirty it later if the I/O fails.
3372	 */
3373
3374	s = splbio();
3375	bundirty(bp);
3376	bp->b_flags &= ~B_DONE;
3377	bp->b_ioflags &= ~BIO_ERROR;
3378	bp->b_iocmd = BIO_WRITE;
3379
3380	bufobj_wref(bp->b_bufobj);
3381	curthread->td_ru.ru_oublock++;
3382	splx(s);
3383
3384	/*
3385	 * Note: to avoid loopback deadlocks, we do not
3386	 * assign b_runningbufspace.
3387	 */
3388	vfs_busy_pages(bp, 1);
3389
3390	BUF_KERNPROC(bp);
3391	bp->b_iooffset = dbtob(bp->b_blkno);
3392	bstrategy(bp);
3393
3394	if( (oldflags & B_ASYNC) == 0) {
3395		int rtval = bufwait(bp);
3396
3397		if (oldflags & B_DELWRI) {
3398			s = splbio();
3399			reassignbuf(bp);
3400			splx(s);
3401		}
3402		brelse(bp);
3403		return (rtval);
3404	}
3405
3406	return (0);
3407}
3408
3409/*
3410 * nfs special file access vnode op.
3411 * Essentially just get vattr and then imitate iaccess() since the device is
3412 * local to the client.
3413 */
3414static int
3415nfsspec_access(struct vop_access_args *ap)
3416{
3417	struct vattr *vap;
3418	struct ucred *cred = ap->a_cred;
3419	struct vnode *vp = ap->a_vp;
3420	accmode_t accmode = ap->a_accmode;
3421	struct vattr vattr;
3422	int error;
3423
3424	/*
3425	 * Disallow write attempts on filesystems mounted read-only;
3426	 * unless the file is a socket, fifo, or a block or character
3427	 * device resident on the filesystem.
3428	 */
3429	if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
3430		switch (vp->v_type) {
3431		case VREG:
3432		case VDIR:
3433		case VLNK:
3434			return (EROFS);
3435		default:
3436			break;
3437		}
3438	}
3439	vap = &vattr;
3440	error = VOP_GETATTR(vp, vap, cred);
3441	if (error)
3442		goto out;
3443	error  = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
3444			 accmode, cred, NULL);
3445out:
3446	return error;
3447}
3448
3449/*
3450 * Read wrapper for fifos.
3451 */
3452static int
3453nfsfifo_read(struct vop_read_args *ap)
3454{
3455	struct nfsnode *np = VTONFS(ap->a_vp);
3456	int error;
3457
3458	/*
3459	 * Set access flag.
3460	 */
3461	mtx_lock(&np->n_mtx);
3462	np->n_flag |= NACC;
3463	vfs_timestamp(&np->n_atim);
3464	mtx_unlock(&np->n_mtx);
3465	error = fifo_specops.vop_read(ap);
3466	return error;
3467}
3468
3469/*
3470 * Write wrapper for fifos.
3471 */
3472static int
3473nfsfifo_write(struct vop_write_args *ap)
3474{
3475	struct nfsnode *np = VTONFS(ap->a_vp);
3476
3477	/*
3478	 * Set update flag.
3479	 */
3480	mtx_lock(&np->n_mtx);
3481	np->n_flag |= NUPD;
3482	vfs_timestamp(&np->n_mtim);
3483	mtx_unlock(&np->n_mtx);
3484	return(fifo_specops.vop_write(ap));
3485}
3486
3487/*
3488 * Close wrapper for fifos.
3489 *
3490 * Update the times on the nfsnode then do fifo close.
3491 */
3492static int
3493nfsfifo_close(struct vop_close_args *ap)
3494{
3495	struct vnode *vp = ap->a_vp;
3496	struct nfsnode *np = VTONFS(vp);
3497	struct vattr vattr;
3498	struct timespec ts;
3499
3500	mtx_lock(&np->n_mtx);
3501	if (np->n_flag & (NACC | NUPD)) {
3502		vfs_timestamp(&ts);
3503		if (np->n_flag & NACC)
3504			np->n_atim = ts;
3505		if (np->n_flag & NUPD)
3506			np->n_mtim = ts;
3507		np->n_flag |= NCHG;
3508		if (vrefcnt(vp) == 1 &&
3509		    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
3510			VATTR_NULL(&vattr);
3511			if (np->n_flag & NACC)
3512				vattr.va_atime = np->n_atim;
3513			if (np->n_flag & NUPD)
3514				vattr.va_mtime = np->n_mtim;
3515			mtx_unlock(&np->n_mtx);
3516			(void)VOP_SETATTR(vp, &vattr, ap->a_cred);
3517			goto out;
3518		}
3519	}
3520	mtx_unlock(&np->n_mtx);
3521out:
3522	return (fifo_specops.vop_close(ap));
3523}
3524
3525/*
3526 * Just call nfs_writebp() with the force argument set to 1.
3527 *
3528 * NOTE: B_DONE may or may not be set in a_bp on call.
3529 */
3530static int
3531nfs_bwrite(struct buf *bp)
3532{
3533
3534	return (nfs_writebp(bp, 1, curthread));
3535}
3536
3537struct buf_ops buf_ops_nfs = {
3538	.bop_name	=	"buf_ops_nfs",
3539	.bop_write	=	nfs_bwrite,
3540	.bop_strategy	=	bufstrategy,
3541	.bop_sync	=	bufsync,
3542	.bop_bdflush	=	bufbdflush,
3543};
3544