/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_vnops.c	8.16 (Berkeley) 5/27/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_vnops.c 222187 2011-05-22 18:11:41Z alc $");

/*
 * vnode op calls for Sun NFS version 2 and 3
 */

#include "opt_inet.h"
#include "opt_kdtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/jail.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/signalvar.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>

#include <fs/fifofs/fifo.h>

#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsnode.h>
#include <nfsclient/nfsmount.h>
#include <nfs/nfs_kdtrace.h>
#include <nfs/nfs_lock.h>
#include <nfs/xdr_subs.h>
#include <nfsclient/nfsm_subs.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>

#include <machine/stdarg.h>

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>

dtrace_nfsclient_accesscache_flush_probe_func_t
    dtrace_nfsclient_accesscache_flush_done_probe;
uint32_t nfsclient_accesscache_flush_done_id;

dtrace_nfsclient_accesscache_get_probe_func_t
    dtrace_nfsclient_accesscache_get_hit_probe,
    dtrace_nfsclient_accesscache_get_miss_probe;
uint32_t nfsclient_accesscache_get_hit_id;
uint32_t nfsclient_accesscache_get_miss_id;

dtrace_nfsclient_accesscache_load_probe_func_t
    dtrace_nfsclient_accesscache_load_done_probe;
uint32_t nfsclient_accesscache_load_done_id;
#endif /* KDTRACE_HOOKS */

/* Defs */
#define	TRUE	1
#define	FALSE	0

/*
 * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these
 * calls are not in getblk() and brelse() so that they would not be necessary
 * here.
 */
#ifndef B_VMIO
#define vfs_busy_pages(bp, f)
#endif

static vop_read_t	nfsfifo_read;
static vop_write_t	nfsfifo_write;
static vop_close_t	nfsfifo_close;
static int	nfs_flush(struct vnode *, int, int);
static int	nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *);
static vop_lookup_t	nfs_lookup;
static vop_create_t	nfs_create;
static vop_mknod_t	nfs_mknod;
static vop_open_t	nfs_open;
static vop_close_t	nfs_close;
static vop_access_t	nfs_access;
static vop_getattr_t	nfs_getattr;
static vop_setattr_t	nfs_setattr;
static vop_read_t	nfs_read;
static vop_fsync_t	nfs_fsync;
static vop_remove_t	nfs_remove;
static vop_link_t	nfs_link;
static vop_rename_t	nfs_rename;
static vop_mkdir_t	nfs_mkdir;
static vop_rmdir_t	nfs_rmdir;
static vop_symlink_t	nfs_symlink;
static vop_readdir_t	nfs_readdir;
static vop_strategy_t	nfs_strategy;
static	int	nfs_lookitup(struct vnode *, const char *, int,
		    struct ucred *, struct thread *, struct nfsnode **);
static	int	nfs_sillyrename(struct vnode *, struct vnode *,
		    struct componentname *);
static vop_access_t	nfsspec_access;
static vop_readlink_t	nfs_readlink;
static vop_print_t	nfs_print;
static vop_advlock_t	nfs_advlock;
static vop_advlockasync_t nfs_advlockasync;

/*
 * Global vfs data structures for nfs
 */
struct vop_vector nfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_access =		nfs_access,
	.vop_advlock =		nfs_advlock,
	.vop_advlockasync =	nfs_advlockasync,
	.vop_close =		nfs_close,
	.vop_create =		nfs_create,
	.vop_fsync =		nfs_fsync,
	.vop_getattr =		nfs_getattr,
	.vop_getpages =		nfs_getpages,
	.vop_putpages =		nfs_putpages,
	.vop_inactive =		nfs_inactive,
	.vop_link =		nfs_link,
	.vop_lookup =		nfs_lookup,
	.vop_mkdir =		nfs_mkdir,
	.vop_mknod =		nfs_mknod,
	.vop_open =		nfs_open,
	.vop_print =		nfs_print,
	.vop_read =		nfs_read,
	.vop_readdir =		nfs_readdir,
	.vop_readlink =		nfs_readlink,
	.vop_reclaim =		nfs_reclaim,
	.vop_remove =		nfs_remove,
	.vop_rename =		nfs_rename,
	.vop_rmdir =		nfs_rmdir,
	.vop_setattr =		nfs_setattr,
	.vop_strategy =		nfs_strategy,
	.vop_symlink =		nfs_symlink,
	.vop_write =		nfs_write,
};

struct vop_vector nfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_access =		nfsspec_access,
	.vop_close =		nfsfifo_close,
	.vop_fsync =		nfs_fsync,
	.vop_getattr =		nfs_getattr,
	.vop_inactive =		nfs_inactive,
	.vop_print =		nfs_print,
	.vop_read =		nfsfifo_read,
	.vop_reclaim =		nfs_reclaim,
	.vop_setattr =		nfs_setattr,
	.vop_write =		nfsfifo_write,
};

static int	nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
			     struct componentname *cnp, struct vattr *vap);
static int	nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
			      struct ucred *cred, struct thread *td);
static int	nfs_renamerpc(struct vnode *fdvp, const char *fnameptr,
			      int fnamelen, struct vnode *tdvp,
			      const char *tnameptr, int tnamelen,
			      struct ucred *cred, struct thread *td);
static int	nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
			     struct sillyrename *sp);

/*
 * Global variables
 */
struct mtx 	nfs_iod_mtx;
enum nfsiod_state nfs_iodwant[NFS_MAXASYNCDAEMON];
struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
int		 nfs_numasync = 0;
#define	DIRHDSIZ	(sizeof (struct dirent) - (MAXNAMLEN + 1))

SYSCTL_DECL(_vfs_oldnfs);

static int	nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
	   &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");

static int	nfs_prime_access_cache = 0;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, prime_access_cache, CTLFLAG_RW,
	   &nfs_prime_access_cache, 0,
	   "Prime NFS ACCESS cache when fetching attributes");

static int	nfsv3_commit_on_close = 0;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW,
	   &nfsv3_commit_on_close, 0, "write+commit on close, else only write");

static int	nfs_clean_pages_on_close = 1;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
	   &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");

int nfs_directio_enable = 0;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
	   &nfs_directio_enable, 0, "Enable NFS directio");

/*
 * This sysctl allows other processes to mmap a file that has been opened
 * O_DIRECT by a process.  In general, having processes mmap the file while
 * direct I/O is in progress can lead to data inconsistencies.  Still, we
 * allow it by default to prevent a denial of service: without it, a
 * malicious user could open files O_DIRECT and thereby prevent other users
 * from mmap'ing them.  "Protected" environments where stricter consistency
 * guarantees are required can disable this knob.  The process that opened
 * the file O_DIRECT cannot itself mmap() the file, because mmap'ed I/O on
 * an O_DIRECT open() is not meaningful.
 */
int nfs_directio_allow_mmap = 1;
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
	   &nfs_directio_allow_mmap, 0, "Allow mmap'ed IO on files with O_DIRECT opens");
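
#if 0
/*
 * Illustrative userland sketch (not compiled here): reading the knob above
 * via sysctlbyname(3).  The OID name follows from SYSCTL_DECL(_vfs_oldnfs)
 * plus the leaf name declared in the SYSCTL_INT() above.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int allow;
	size_t len = sizeof(allow);

	if (sysctlbyname("vfs.oldnfs.nfs_directio_allow_mmap",
	    &allow, &len, NULL, 0) == -1)
		return (1);
	printf("mmap of O_DIRECT-opened files is %s\n",
	    allow ? "allowed" : "disallowed");
	return (0);
}
#endif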

#if 0
SYSCTL_INT(_vfs_oldnfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
	   &nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");

SYSCTL_INT(_vfs_oldnfs, OID_AUTO, access_cache_misses, CTLFLAG_RD,
	   &nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count");
#endif

#define	NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY		\
			 | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE	\
			 | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP)

/*
 * SMP Locking Note:
 * The list of locks after the description of each lock is the ordering
 * of other locks acquired with the lock held.
 * np->n_mtx : Protects the fields in the nfsnode.
       VM Object Lock
       VI_MTX (acquired indirectly)
 * nmp->nm_mtx : Protects the fields in the nfsmount.
       rep->r_mtx
 * nfs_iod_mtx : Global lock, protects shared nfsiod state.
 * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
       nmp->nm_mtx
       rep->r_mtx
 * rep->r_mtx : Protects the fields in an nfsreq.
 */

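#if 0
/*
 * Illustrative sketch (not compiled): per the ordering documented above,
 * rep->r_mtx may be acquired while nmp->nm_mtx is held, never the reverse.
 */
static void
nfs_lock_order_example(struct nfsmount *nmp, struct nfsreq *rep)
{

	mtx_lock(&nmp->nm_mtx);		/* outer lock */
	mtx_lock(&rep->r_mtx);		/* inner lock, legal with nm_mtx held */
	mtx_unlock(&rep->r_mtx);
	mtx_unlock(&nmp->nm_mtx);
}
#endif
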
static int
nfs3_access_otw(struct vnode *vp, int wmode, struct thread *td,
    struct ucred *cred, uint32_t *retmode)
{
	const int v3 = 1;
	u_int32_t *tl;
	int error = 0, attrflag, i, lrupos;

	struct mbuf *mreq, *mrep, *md, *mb;
	caddr_t bpos, dpos;
	u_int32_t rmode;
	struct nfsnode *np = VTONFS(vp);

	nfsstats.rpccnt[NFSPROC_ACCESS]++;
	mreq = nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED);
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(vp, v3);
	tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
	*tl = txdr_unsigned(wmode);
	nfsm_request(vp, NFSPROC_ACCESS, td, cred);
	nfsm_postop_attr(vp, attrflag);
	if (!error) {
		lrupos = 0;
		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
		rmode = fxdr_unsigned(u_int32_t, *tl);
		mtx_lock(&np->n_mtx);
		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
			if (np->n_accesscache[i].uid == cred->cr_uid) {
				np->n_accesscache[i].mode = rmode;
				np->n_accesscache[i].stamp = time_second;
				break;
			}
			if (i > 0 && np->n_accesscache[i].stamp <
			    np->n_accesscache[lrupos].stamp)
				lrupos = i;
		}
		if (i == NFS_ACCESSCACHESIZE) {
			np->n_accesscache[lrupos].uid = cred->cr_uid;
			np->n_accesscache[lrupos].mode = rmode;
			np->n_accesscache[lrupos].stamp = time_second;
		}
		mtx_unlock(&np->n_mtx);
		if (retmode != NULL)
			*retmode = rmode;
		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, rmode, 0);
	}
	m_freem(mrep);
nfsmout:
#ifdef KDTRACE_HOOKS
	if (error) {
		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, 0,
		    error);
	}
#endif
	return (error);
}
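
#if 0
/*
 * Minimal sketch (not compiled) of the n_accesscache replacement policy
 * used above: refresh the entry for the uid if one exists, otherwise
 * evict the slot with the oldest stamp.  Types are simplified here.
 */
struct access_entry {
	uid_t		uid;
	u_int32_t	mode;
	time_t		stamp;
};

static void
accesscache_update(struct access_entry *cache, int n, uid_t uid,
    u_int32_t mode, time_t now)
{
	int i, lrupos = 0;

	for (i = 0; i < n; i++) {
		if (cache[i].uid == uid) {
			cache[i].mode = mode;
			cache[i].stamp = now;
			return;
		}
		if (i > 0 && cache[i].stamp < cache[lrupos].stamp)
			lrupos = i;
	}
	cache[lrupos].uid = uid;
	cache[lrupos].mode = mode;
	cache[lrupos].stamp = now;
}
#endif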

/*
 * nfs access vnode op.
 * For nfs version 2, just return ok. File accesses may fail later.
 * For nfs version 3, use the access rpc to check accessibility. If file modes
 * are changed on the server, accesses might still fail later.
 */
static int
nfs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	int error = 0, i, gotahit;
	u_int32_t mode, rmode, wmode;
	int v3 = NFS_ISV3(vp);
	struct nfsnode *np = VTONFS(vp);

	/*
	 * Disallow write attempts on filesystems mounted read-only;
	 * unless the file is a socket, fifo, or a block or character
	 * device resident on the filesystem.
	 */
	if ((ap->a_accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
		switch (vp->v_type) {
		case VREG:
		case VDIR:
		case VLNK:
			return (EROFS);
		default:
			break;
		}
	}
	/*
	 * For NFSv3, check to see if we have done this recently, and if
	 * so return our cached result instead of making an ACCESS call.
	 * If not, do an ACCESS rpc.  For NFSv2 you are stuck emulating
	 * ufs_access() locally using the vattr.  This may not be correct,
	 * since the server may apply other access criteria such as
	 * client uid-->server uid mapping that we do not know about.
	 */
	if (v3) {
		if (ap->a_accmode & VREAD)
			mode = NFSV3ACCESS_READ;
		else
			mode = 0;
		if (vp->v_type != VDIR) {
			if (ap->a_accmode & VWRITE)
				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND);
			if (ap->a_accmode & VEXEC)
				mode |= NFSV3ACCESS_EXECUTE;
		} else {
			if (ap->a_accmode & VWRITE)
				mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND |
					 NFSV3ACCESS_DELETE);
			if (ap->a_accmode & VEXEC)
				mode |= NFSV3ACCESS_LOOKUP;
		}
		/* XXX safety belt, only make blanket request if caching */
		if (nfsaccess_cache_timeout > 0) {
			wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY |
				NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE |
				NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP;
		} else {
			wmode = mode;
		}

		/*
		 * Does our cached result allow us to give a definite yes to
		 * this request?
		 */
		gotahit = 0;
		mtx_lock(&np->n_mtx);
		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
			if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) {
				if (time_second < (np->n_accesscache[i].stamp +
				    nfsaccess_cache_timeout) &&
				    (np->n_accesscache[i].mode & mode) == mode) {
					nfsstats.accesscache_hits++;
					gotahit = 1;
				}
				break;
			}
		}
		mtx_unlock(&np->n_mtx);
#ifdef KDTRACE_HOOKS
		if (gotahit)
			KDTRACE_NFS_ACCESSCACHE_GET_HIT(vp,
			    ap->a_cred->cr_uid, mode);
		else
			KDTRACE_NFS_ACCESSCACHE_GET_MISS(vp,
			    ap->a_cred->cr_uid, mode);
#endif
		if (gotahit == 0) {
			/*
			 * Either a no, or a don't know.  Go to the wire.
			 */
			nfsstats.accesscache_misses++;
			error = nfs3_access_otw(vp, wmode, ap->a_td, ap->a_cred,
			    &rmode);
			if (!error) {
				if ((rmode & mode) != mode)
					error = EACCES;
			}
		}
		return (error);
	} else {
		if ((error = nfsspec_access(ap)) != 0) {
			return (error);
		}
		/*
		 * Attempt to prevent a mapped root from accessing a file
		 * which it shouldn't.  We try to read a byte from the file
		 * if the user is root and the file is not zero length.
		 * After calling nfsspec_access, we should have the correct
		 * file size cached.
		 */
		mtx_lock(&np->n_mtx);
		if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD)
		    && VTONFS(vp)->n_size > 0) {
			struct iovec aiov;
			struct uio auio;
			char buf[1];

			mtx_unlock(&np->n_mtx);
			aiov.iov_base = buf;
			aiov.iov_len = 1;
			auio.uio_iov = &aiov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = 0;
			auio.uio_resid = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_td = ap->a_td;

			if (vp->v_type == VREG)
				error = nfs_readrpc(vp, &auio, ap->a_cred);
			else if (vp->v_type == VDIR) {
				char *bp;
				bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
				aiov.iov_base = bp;
				aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
				error = nfs_readdirrpc(vp, &auio, ap->a_cred);
				free(bp, M_TEMP);
			} else if (vp->v_type == VLNK)
				error = nfs_readlinkrpc(vp, &auio, ap->a_cred);
			else
				error = EACCES;
		} else
			mtx_unlock(&np->n_mtx);
		return (error);
	}
}

int nfs_otw_getattr_avoid = 0;

/*
 * nfs open vnode op
 * Check to see if the type is ok
 * and that deletion is not in progress.
 * For paged-in text files, you will need to flush the page cache
 * if consistency is lost.
 */
/* ARGSUSED */
static int
nfs_open(struct vop_open_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct vattr vattr;
	int error;
	int fmode = ap->a_mode;

	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
		return (EOPNOTSUPP);

	/*
	 * Get a valid lease. If cached data is stale, flush it.
	 */
	mtx_lock(&np->n_mtx);
	if (np->n_flag & NMODIFIED) {
		mtx_unlock(&np->n_mtx);
		error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
		if (error == EINTR || error == EIO)
			return (error);
		mtx_lock(&np->n_mtx);
		np->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
		if (vp->v_type == VDIR)
			np->n_direofoffset = 0;
		mtx_unlock(&np->n_mtx);
		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
		if (error)
			return (error);
		mtx_lock(&np->n_mtx);
		np->n_mtime = vattr.va_mtime;
	} else {
		mtx_unlock(&np->n_mtx);
		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
		if (error)
			return (error);
		mtx_lock(&np->n_mtx);
		if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
			if (vp->v_type == VDIR)
				np->n_direofoffset = 0;
			mtx_unlock(&np->n_mtx);
			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
			if (error == EINTR || error == EIO) {
				return (error);
			}
			mtx_lock(&np->n_mtx);
			np->n_mtime = vattr.va_mtime;
		}
	}
	/*
	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
	 */
	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
		if (np->n_directio_opens == 0) {
			mtx_unlock(&np->n_mtx);
			error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
			if (error)
				return (error);
			mtx_lock(&np->n_mtx);
			np->n_flag |= NNONCACHE;
		}
		np->n_directio_opens++;
	}
	mtx_unlock(&np->n_mtx);
	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
	return (0);
}

/*
 * nfs close vnode op
 * What an NFS client should do upon close after writing is a debatable issue.
 * Most NFS clients push delayed writes to the server upon close, basically for
 * two reasons:
 * 1 - So that any write errors may be reported back to the client process
 *     doing the close system call. By far the two most likely errors are
 *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
 * 2 - To put a worst case upper bound on cache inconsistency between
 *     multiple clients for the file.
 * There is also a consistency problem for Version 2 of the protocol w.r.t.
 * not being able to tell if other clients are writing a file concurrently,
 * since there is no way of knowing if the changed modify time in the reply
 * is only due to the write for this client.
 * (NFS Version 3 provides weak cache consistency data in the reply that
 *  should be sufficient to detect and handle this case.)
 *
 * The current code does the following:
 * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
 * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
 *                     or commit them (this satisfies 1 and 2 except for the
 *                     case where the server crashes after this close but
 *                     before the commit RPC, which is felt to be "good
 *                     enough").  Changing the last argument to nfs_flush()
 *                     to a 1 would force a commit operation, if it is felt
 *                     a commit is necessary now.
 */
/* ARGSUSED */
static int
nfs_close(struct vop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	int error = 0;
	int fmode = ap->a_fflag;

	if (vp->v_type == VREG) {
	    /*
	     * Examine and clean dirty pages, regardless of NMODIFIED.
	     * This closes a major hole in close-to-open consistency.
	     * We want to push out all dirty pages (and buffers) on
	     * close, regardless of whether they were dirtied by
	     * mmap'ed writes or via write().
	     */
	    if (nfs_clean_pages_on_close && vp->v_object) {
		VM_OBJECT_LOCK(vp->v_object);
		vm_object_page_clean(vp->v_object, 0, 0, 0);
		VM_OBJECT_UNLOCK(vp->v_object);
	    }
	    mtx_lock(&np->n_mtx);
	    if (np->n_flag & NMODIFIED) {
		mtx_unlock(&np->n_mtx);
		if (NFS_ISV3(vp)) {
		    /*
		     * Under NFSv3 we have dirty buffers to dispose of.  We
		     * must flush them to the NFS server.  We have the option
		     * of waiting all the way through the commit rpc or just
		     * waiting for the initial write.  The default is to only
		     * wait through the initial write so the data is in the
		     * server's cache, which is roughly similar to the state
		     * a standard disk subsystem leaves the file in on close().
		     *
		     * We cannot clear the NMODIFIED bit in np->n_flag due to
		     * potential races with other processes, and certainly
		     * cannot clear it if we don't commit.
		     */
		    int cm = nfsv3_commit_on_close ? 1 : 0;
		    error = nfs_flush(vp, MNT_WAIT, cm);
		    /* np->n_flag &= ~NMODIFIED; */
		} else
		    error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
		mtx_lock(&np->n_mtx);
	    }
	    if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		error = np->n_error;
	    }
	    mtx_unlock(&np->n_mtx);
	}
	if (nfs_directio_enable)
		KASSERT((np->n_directio_asyncwr == 0),
			("nfs_close: dirty unflushed (%d) directio buffers\n",
			 np->n_directio_asyncwr));
	if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
		mtx_lock(&np->n_mtx);
		KASSERT((np->n_directio_opens > 0),
			("nfs_close: unexpected value (0) of n_directio_opens\n"));
		np->n_directio_opens--;
		if (np->n_directio_opens == 0)
			np->n_flag &= ~NNONCACHE;
		mtx_unlock(&np->n_mtx);
	}
	return (error);
}
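
#if 0
/*
 * Illustrative userland sketch (not compiled here): because dirty data is
 * pushed at close time as described above, a delayed write error such as
 * ENOSPC on the server may only surface from close(2), so its return
 * value must be checked.  The path is hypothetical.
 */
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/mnt/nfs/file", O_WRONLY | O_CREAT, 0644);

	if (fd == -1)
		err(1, "open");
	if (write(fd, "data", 4) == -1)
		err(1, "write");
	if (close(fd) == -1)		/* delayed write errors surface here */
		err(1, "close");
	return (0);
}
#endif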

/*
 * nfs getattr call from vfs.
 */
static int
nfs_getattr(struct vop_getattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct thread *td = curthread;
	struct vattr *vap = ap->a_vap;
	struct vattr vattr;
	caddr_t bpos, dpos;
	int error = 0;
	struct mbuf *mreq, *mrep, *md, *mb;
	int v3 = NFS_ISV3(vp);

	/*
	 * Update local times for special files.
	 */
	mtx_lock(&np->n_mtx);
	if (np->n_flag & (NACC | NUPD))
		np->n_flag |= NCHG;
	mtx_unlock(&np->n_mtx);
	/*
	 * First look in the cache.
	 */
	if (nfs_getattrcache(vp, &vattr) == 0)
		goto nfsmout;
	if (v3 && nfs_prime_access_cache && nfsaccess_cache_timeout > 0) {
		nfsstats.accesscache_misses++;
		nfs3_access_otw(vp, NFSV3ACCESS_ALL, td, ap->a_cred, NULL);
		if (nfs_getattrcache(vp, &vattr) == 0)
			goto nfsmout;
	}
	nfsstats.rpccnt[NFSPROC_GETATTR]++;
	mreq = nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(vp, v3);
	nfsm_request(vp, NFSPROC_GETATTR, td, ap->a_cred);
	if (!error) {
		nfsm_loadattr(vp, &vattr);
	}
	m_freem(mrep);
nfsmout:
	vap->va_type = vattr.va_type;
	vap->va_mode = vattr.va_mode;
	vap->va_nlink = vattr.va_nlink;
	vap->va_uid = vattr.va_uid;
	vap->va_gid = vattr.va_gid;
	vap->va_fsid = vattr.va_fsid;
	vap->va_fileid = vattr.va_fileid;
	vap->va_size = vattr.va_size;
	vap->va_blocksize = vattr.va_blocksize;
	vap->va_atime = vattr.va_atime;
	vap->va_mtime = vattr.va_mtime;
	vap->va_ctime = vattr.va_ctime;
	vap->va_gen = vattr.va_gen;
	vap->va_flags = vattr.va_flags;
	vap->va_rdev = vattr.va_rdev;
	vap->va_bytes = vattr.va_bytes;
	vap->va_filerev = vattr.va_filerev;

	return (error);
}

/*
 * nfs setattr call.
 */
static int
nfs_setattr(struct vop_setattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct vattr *vap = ap->a_vap;
	struct thread *td = curthread;
	int error = 0;
	u_quad_t tsize;

#ifndef nolint
	tsize = (u_quad_t)0;
#endif

	/*
	 * Setting of flags is not supported.
	 */
	if (vap->va_flags != VNOVAL)
		return (EOPNOTSUPP);

	/*
	 * Disallow write attempts if the filesystem is mounted read-only.
	 */
	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY)) {
		error = EROFS;
		goto out;
	}
	if (vap->va_size != VNOVAL) {
		switch (vp->v_type) {
		case VDIR:
			return (EISDIR);
		case VCHR:
		case VBLK:
		case VSOCK:
		case VFIFO:
			if (vap->va_mtime.tv_sec == VNOVAL &&
			    vap->va_atime.tv_sec == VNOVAL &&
			    vap->va_mode == (mode_t)VNOVAL &&
			    vap->va_uid == (uid_t)VNOVAL &&
			    vap->va_gid == (gid_t)VNOVAL)
				return (0);
			vap->va_size = VNOVAL;
			break;
		default:
			/*
			 * Disallow write attempts if the filesystem is
			 * mounted read-only.
			 */
			if (vp->v_mount->mnt_flag & MNT_RDONLY)
				return (EROFS);
			/*
			 * Because we run vnode_pager_setsize() early (why?),
			 * we must set np->n_size now to avoid vinvalbuf
			 * V_SAVE races that might set the size to a lower
			 * value.
			 */
			mtx_lock(&np->n_mtx);
			tsize = np->n_size;
			mtx_unlock(&np->n_mtx);
			error = nfs_meta_setsize(vp, ap->a_cred, td,
			    vap->va_size);
			mtx_lock(&np->n_mtx);
			if (np->n_flag & NMODIFIED) {
			    tsize = np->n_size;
			    mtx_unlock(&np->n_mtx);
			    if (vap->va_size == 0)
				error = nfs_vinvalbuf(vp, 0, td, 1);
			    else
				error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			    if (error) {
				vnode_pager_setsize(vp, tsize);
				goto out;
			    }
			} else
			    mtx_unlock(&np->n_mtx);
			/*
			 * np->n_size has already been set to vap->va_size
			 * in nfs_meta_setsize(). We must set it again since
			 * nfs_loadattrcache() could be called through
			 * nfs_meta_setsize() and could modify np->n_size.
			 */
			mtx_lock(&np->n_mtx);
			np->n_vattr.va_size = np->n_size = vap->va_size;
			mtx_unlock(&np->n_mtx);
		}
	} else {
		mtx_lock(&np->n_mtx);
		if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) &&
		    (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
			mtx_unlock(&np->n_mtx);
			if ((error = nfs_vinvalbuf(vp, V_SAVE, td, 1)) != 0 &&
			    (error == EINTR || error == EIO))
				return (error);
		} else
			mtx_unlock(&np->n_mtx);
	}
	error = nfs_setattrrpc(vp, vap, ap->a_cred);
	if (error && vap->va_size != VNOVAL) {
		mtx_lock(&np->n_mtx);
		np->n_size = np->n_vattr.va_size = tsize;
		vnode_pager_setsize(vp, tsize);
		mtx_unlock(&np->n_mtx);
	}
out:
	return (error);
}

/*
 * Do an nfs setattr rpc.
 */
static int
nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred)
{
	struct nfsv2_sattr *sp;
	struct nfsnode *np = VTONFS(vp);
	caddr_t bpos, dpos;
	u_int32_t *tl;
	int error = 0, i, wccflag = NFSV3_WCCRATTR;
	struct mbuf *mreq, *mrep, *md, *mb;
	int v3 = NFS_ISV3(vp);

	nfsstats.rpccnt[NFSPROC_SETATTR]++;
	mreq = nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(vp, v3);
	if (v3) {
		nfsm_v3attrbuild(vap, TRUE);
		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
		*tl = nfs_false;
	} else {
		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
		if (vap->va_mode == (mode_t)VNOVAL)
			sp->sa_mode = nfs_xdrneg1;
		else
			sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode);
		if (vap->va_uid == (uid_t)VNOVAL)
			sp->sa_uid = nfs_xdrneg1;
		else
			sp->sa_uid = txdr_unsigned(vap->va_uid);
		if (vap->va_gid == (gid_t)VNOVAL)
			sp->sa_gid = nfs_xdrneg1;
		else
			sp->sa_gid = txdr_unsigned(vap->va_gid);
		sp->sa_size = txdr_unsigned(vap->va_size);
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	}
	nfsm_request(vp, NFSPROC_SETATTR, curthread, cred);
	if (v3) {
		mtx_lock(&np->n_mtx);
		for (i = 0; i < NFS_ACCESSCACHESIZE; i++)
			np->n_accesscache[i].stamp = 0;
		mtx_unlock(&np->n_mtx);
		KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp);
		nfsm_wcc_data(vp, wccflag);
	} else
		nfsm_loadattr(vp, NULL);
	m_freem(mrep);
nfsmout:
	return (error);
}

/*
 * nfs lookup call, one step at a time...
 * First look in cache
 * If not found, unlock the directory nfsnode and do the rpc
 */
static int
nfs_lookup(struct vop_lookup_args *ap)
{
	struct componentname *cnp = ap->a_cnp;
	struct vnode *dvp = ap->a_dvp;
	struct vnode **vpp = ap->a_vpp;
	struct mount *mp = dvp->v_mount;
	struct vattr vattr;
	struct timespec dmtime;
	int flags = cnp->cn_flags;
	struct vnode *newvp;
	struct nfsmount *nmp;
	caddr_t bpos, dpos;
	struct mbuf *mreq, *mrep, *md, *mb;
	long len;
	nfsfh_t *fhp;
	struct nfsnode *np, *newnp;
	int error = 0, attrflag, fhsize, ltype;
	int v3 = NFS_ISV3(dvp);
	struct thread *td = cnp->cn_thread;

	*vpp = NULLVP;
	if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);
	if (dvp->v_type != VDIR)
		return (ENOTDIR);
	nmp = VFSTONFS(mp);
	np = VTONFS(dvp);
	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) {
		*vpp = NULLVP;
		return (error);
	}
	error = cache_lookup(dvp, vpp, cnp);
	if (error > 0 && error != ENOENT)
		return (error);
	if (error == -1) {
		/*
		 * We only accept a positive hit in the cache if the
		 * change time of the file matches our cached copy.
		 * Otherwise, we discard the cache entry and fall back
		 * to doing a lookup RPC.
		 *
		 * To better handle stale file handles and attributes,
		 * clear the attribute cache of this node if it is a
		 * leaf component, part of an open() call, and not
		 * locally modified before fetching the attributes.
		 * This should allow stale file handles to be detected
		 * here where we can fall back to a LOOKUP RPC to
		 * recover rather than having nfs_open() detect the
		 * stale file handle and failing open(2) with ESTALE.
		 */
		newvp = *vpp;
		newnp = VTONFS(newvp);
		if (!(nmp->nm_flag & NFSMNT_NOCTO) &&
		    (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
		    !(newnp->n_flag & NMODIFIED)) {
			mtx_lock(&newnp->n_mtx);
			newnp->n_attrstamp = 0;
			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
			mtx_unlock(&newnp->n_mtx);
		}
		if (VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 &&
		    timespeccmp(&vattr.va_ctime, &newnp->n_ctime, ==)) {
			nfsstats.lookupcache_hits++;
			if (cnp->cn_nameiop != LOOKUP &&
			    (flags & ISLASTCN))
				cnp->cn_flags |= SAVENAME;
			return (0);
		}
		cache_purge(newvp);
		if (dvp != newvp)
			vput(newvp);
		else
			vrele(newvp);
		*vpp = NULLVP;
	} else if (error == ENOENT) {
		if (dvp->v_iflag & VI_DOOMED)
			return (ENOENT);
		/*
		 * We only accept a negative hit in the cache if the
		 * modification time of the parent directory matches
		 * our cached copy.  Otherwise, we discard all of the
		 * negative cache entries for this directory. We also
		 * only trust -ve cache entries for less than
		 * nm_negnametimeo seconds.
		 */
		if ((u_int)(ticks - np->n_dmtime_ticks) <
		    (nmp->nm_negnametimeo * hz) &&
		    VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 &&
		    timespeccmp(&vattr.va_mtime, &np->n_dmtime, ==)) {
			nfsstats.lookupcache_hits++;
			return (ENOENT);
		}
		cache_purge_negative(dvp);
		mtx_lock(&np->n_mtx);
		timespecclear(&np->n_dmtime);
		mtx_unlock(&np->n_mtx);
	}

	/*
	 * Cache the modification time of the parent directory in case
	 * the lookup fails and results in adding the first negative
	 * name cache entry for the directory.  Since this is reading
	 * a single time_t, don't bother with locking.  The
	 * modification time may be a bit stale, but it must be read
	 * before performing the lookup RPC to prevent a race where
	 * another lookup updates the timestamp on the directory after
	 * the lookup RPC has been performed on the server but before
	 * n_dmtime is set at the end of this function.
	 */
	dmtime = np->n_vattr.va_mtime;
	error = 0;
	newvp = NULLVP;
	nfsstats.lookupcache_misses++;
	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
	len = cnp->cn_namelen;
	mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
	nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_thread, cnp->cn_cred);
	if (error) {
		if (v3) {
			nfsm_postop_attr(dvp, attrflag);
			m_freem(mrep);
		}
		goto nfsmout;
	}
	nfsm_getfh(fhp, fhsize, v3);

	/*
	 * Handle RENAME case...
	 */
	if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
		if (NFS_CMPFH(np, fhp, fhsize)) {
			m_freem(mrep);
			return (EISDIR);
		}
		error = nfs_nget(mp, fhp, fhsize, &np, LK_EXCLUSIVE);
		if (error) {
			m_freem(mrep);
			return (error);
		}
		newvp = NFSTOV(np);
		if (v3) {
			nfsm_postop_attr(newvp, attrflag);
			nfsm_postop_attr(dvp, attrflag);
		} else
			nfsm_loadattr(newvp, NULL);
		*vpp = newvp;
		m_freem(mrep);
		cnp->cn_flags |= SAVENAME;
		return (0);
	}

	if (flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		error = vfs_busy(mp, MBF_NOWAIT);
		if (error != 0) {
			vfs_ref(mp);
			VOP_UNLOCK(dvp, 0);
			error = vfs_busy(mp, 0);
			vn_lock(dvp, ltype | LK_RETRY);
			vfs_rel(mp);
			if (error == 0 && (dvp->v_iflag & VI_DOOMED)) {
				vfs_unbusy(mp);
				error = ENOENT;
			}
			if (error != 0) {
				m_freem(mrep);
				return (error);
			}
		}
		VOP_UNLOCK(dvp, 0);
		error = nfs_nget(mp, fhp, fhsize, &np, cnp->cn_lkflags);
		if (error == 0)
			newvp = NFSTOV(np);
		vfs_unbusy(mp);
		if (newvp != dvp)
			vn_lock(dvp, ltype | LK_RETRY);
		if (dvp->v_iflag & VI_DOOMED) {
			if (error == 0) {
				if (newvp == dvp)
					vrele(newvp);
				else
					vput(newvp);
			}
			error = ENOENT;
		}
		if (error) {
			m_freem(mrep);
			return (error);
		}
	} else if (NFS_CMPFH(np, fhp, fhsize)) {
		VREF(dvp);
		newvp = dvp;
	} else {
		error = nfs_nget(mp, fhp, fhsize, &np, cnp->cn_lkflags);
		if (error) {
			m_freem(mrep);
			return (error);
		}
		newvp = NFSTOV(np);

		/*
		 * Flush the attribute cache when opening a leaf node
		 * to ensure that fresh attributes are fetched in
		 * nfs_open() if we are unable to fetch attributes
		 * from the LOOKUP reply.
		 */
		if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
		    !(np->n_flag & NMODIFIED)) {
			mtx_lock(&np->n_mtx);
			np->n_attrstamp = 0;
			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
			mtx_unlock(&np->n_mtx);
		}
	}
	if (v3) {
		nfsm_postop_attr(newvp, attrflag);
		nfsm_postop_attr(dvp, attrflag);
	} else
		nfsm_loadattr(newvp, NULL);
	if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
		cnp->cn_flags |= SAVENAME;
	if ((cnp->cn_flags & MAKEENTRY) &&
	    (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) {
		np->n_ctime = np->n_vattr.va_ctime;
		cache_enter(dvp, newvp, cnp);
	}
	*vpp = newvp;
	m_freem(mrep);
nfsmout:
	if (error) {
		if (newvp != NULLVP) {
			vput(newvp);
			*vpp = NULLVP;
		}

		if (error != ENOENT)
			goto done;

		/* The requested file was not found. */
		if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
		    (flags & ISLASTCN)) {
			/*
			 * XXX: UFS does a full VOP_ACCESS(dvp,
			 * VWRITE) here instead of just checking
			 * MNT_RDONLY.
			 */
			if (mp->mnt_flag & MNT_RDONLY)
				return (EROFS);
			cnp->cn_flags |= SAVENAME;
			return (EJUSTRETURN);
		}

		if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE) {
			/*
			 * Maintain n_dmtime as the modification time
			 * of the parent directory when the oldest -ve
			 * name cache entry for this directory was
			 * added.  If a -ve cache entry has already
			 * been added with a newer modification time
			 * by a concurrent lookup, then don't bother
			 * adding a cache entry.  The modification
			 * time of the directory might have changed
			 * due to the file this lookup failed to find
			 * being created.  In that case a subsequent
			 * lookup would incorrectly use the entry
			 * added here instead of doing an extra
			 * lookup.
			 */
			mtx_lock(&np->n_mtx);
			if (timespeccmp(&np->n_dmtime, &dmtime, <=)) {
				if (!timespecisset(&np->n_dmtime)) {
					np->n_dmtime = dmtime;
					np->n_dmtime_ticks = ticks;
				}
				mtx_unlock(&np->n_mtx);
				cache_enter(dvp, NULL, cnp);
			} else
				mtx_unlock(&np->n_mtx);
		}
		return (ENOENT);
	}
done:
	return (error);
}
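
#if 0
/*
 * Minimal sketch (not compiled) of the negative-caching race avoidance
 * used above: the parent's modification time is sampled before the
 * LOOKUP RPC and is only installed as n_dmtime afterwards if no
 * concurrent lookup has already recorded a timestamp.
 */
static void
neg_cache_note(struct nfsnode *np, struct timespec *dmtime)
{

	mtx_lock(&np->n_mtx);
	if (timespeccmp(&np->n_dmtime, dmtime, <=) &&
	    !timespecisset(&np->n_dmtime)) {
		np->n_dmtime = *dmtime;
		np->n_dmtime_ticks = ticks;
	}
	mtx_unlock(&np->n_mtx);
}
#endif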

/*
 * nfs read call.
 * Just call nfs_bioread() to do the work.
 */
static int
nfs_read(struct vop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VREG:
		return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
	case VDIR:
		return (EISDIR);
	default:
		return (EOPNOTSUPP);
	}
}

/*
 * nfs readlink call
 */
static int
nfs_readlink(struct vop_readlink_args *ap)
{
	struct vnode *vp = ap->a_vp;

	if (vp->v_type != VLNK)
		return (EINVAL);
	return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
}

/*
 * Do a readlink rpc.
 * Called by nfs_doio() from below the buffer cache.
 */
int
nfs_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
	caddr_t bpos, dpos;
	int error = 0, len, attrflag;
	struct mbuf *mreq, *mrep, *md, *mb;
	int v3 = NFS_ISV3(vp);

	nfsstats.rpccnt[NFSPROC_READLINK]++;
	mreq = nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(vp, v3);
	nfsm_request(vp, NFSPROC_READLINK, uiop->uio_td, cred);
	if (v3)
		nfsm_postop_attr(vp, attrflag);
	if (!error) {
		nfsm_strsiz(len, NFS_MAXPATHLEN);
		if (len == NFS_MAXPATHLEN) {
			struct nfsnode *np = VTONFS(vp);
			mtx_lock(&np->n_mtx);
			if (np->n_size && np->n_size < NFS_MAXPATHLEN)
				len = np->n_size;
			mtx_unlock(&np->n_mtx);
		}
		nfsm_mtouio(uiop, len);
	}
	m_freem(mrep);
nfsmout:
	return (error);
}

/*
 * nfs read rpc call
 * Ditto above
 */
int
nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
	u_int32_t *tl;
	caddr_t bpos, dpos;
	struct mbuf *mreq, *mrep, *md, *mb;
	struct nfsmount *nmp;
	int error = 0, len, retlen, tsiz, eof, attrflag;
	int v3 = NFS_ISV3(vp);
	int rsize;

#ifndef nolint
	eof = 0;
#endif
	nmp = VFSTONFS(vp->v_mount);
	tsiz = uiop->uio_resid;
	mtx_lock(&nmp->nm_mtx);
	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
		mtx_unlock(&nmp->nm_mtx);
		return (EFBIG);
	}
	rsize = nmp->nm_rsize;
	mtx_unlock(&nmp->nm_mtx);
	while (tsiz > 0) {
		nfsstats.rpccnt[NFSPROC_READ]++;
		len = (tsiz > rsize) ? rsize : tsiz;
		mreq = nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3);
		mb = mreq;
		bpos = mtod(mb, caddr_t);
		nfsm_fhtom(vp, v3);
		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED * 3);
		if (v3) {
			txdr_hyper(uiop->uio_offset, tl);
			*(tl + 2) = txdr_unsigned(len);
		} else {
			*tl++ = txdr_unsigned(uiop->uio_offset);
			*tl++ = txdr_unsigned(len);
			*tl = 0;
		}
		nfsm_request(vp, NFSPROC_READ, uiop->uio_td, cred);
		if (v3) {
			nfsm_postop_attr(vp, attrflag);
			if (error) {
				m_freem(mrep);
				goto nfsmout;
			}
			tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
			eof = fxdr_unsigned(int, *(tl + 1));
		} else {
			nfsm_loadattr(vp, NULL);
		}
		nfsm_strsiz(retlen, rsize);
		nfsm_mtouio(uiop, retlen);
		m_freem(mrep);
		tsiz -= retlen;
		if (v3) {
			if (eof || retlen == 0) {
				tsiz = 0;
			}
		} else if (retlen < len) {
			tsiz = 0;
		}
	}
nfsmout:
	return (error);
}

/*
 * nfs write call
 */
int
nfs_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
	     int *iomode, int *must_commit)
{
	u_int32_t *tl;
	int32_t backup;
	caddr_t bpos, dpos;
	struct mbuf *mreq, *mrep, *md, *mb;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
	int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;
	int wsize;

	KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1"));
	*must_commit = 0;
	tsiz = uiop->uio_resid;
	mtx_lock(&nmp->nm_mtx);
	if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
		mtx_unlock(&nmp->nm_mtx);
		return (EFBIG);
	}
	wsize = nmp->nm_wsize;
	mtx_unlock(&nmp->nm_mtx);
	while (tsiz > 0) {
		nfsstats.rpccnt[NFSPROC_WRITE]++;
		len = (tsiz > wsize) ? wsize : tsiz;
		mreq = nfsm_reqhead(vp, NFSPROC_WRITE,
			NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
		mb = mreq;
		bpos = mtod(mb, caddr_t);
		nfsm_fhtom(vp, v3);
		if (v3) {
			tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
			txdr_hyper(uiop->uio_offset, tl);
			tl += 2;
			*tl++ = txdr_unsigned(len);
			*tl++ = txdr_unsigned(*iomode);
			*tl = txdr_unsigned(len);
		} else {
			u_int32_t x;

			tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
			/* Set both "begin" and "current" to non-garbage. */
			x = txdr_unsigned((u_int32_t)uiop->uio_offset);
			*tl++ = x;	/* "begin offset" */
			*tl++ = x;	/* "current offset" */
			x = txdr_unsigned(len);
			*tl++ = x;	/* total to this offset */
			*tl = x;	/* size of this write */
		}
		nfsm_uiotom(uiop, len);
		nfsm_request(vp, NFSPROC_WRITE, uiop->uio_td, cred);
		if (v3) {
			wccflag = NFSV3_WCCCHK;
			nfsm_wcc_data(vp, wccflag);
			if (!error) {
				tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED
					+ NFSX_V3WRITEVERF);
				rlen = fxdr_unsigned(int, *tl++);
				if (rlen == 0) {
					error = NFSERR_IO;
					m_freem(mrep);
					break;
				} else if (rlen < len) {
					backup = len - rlen;
					uiop->uio_iov->iov_base =
					    (char *)uiop->uio_iov->iov_base -
					    backup;
					uiop->uio_iov->iov_len += backup;
					uiop->uio_offset -= backup;
					uiop->uio_resid += backup;
					len = rlen;
				}
				commit = fxdr_unsigned(int, *tl++);

				/*
				 * Return the lowest commitment level
				 * obtained by any of the RPCs.
				 */
				if (committed == NFSV3WRITE_FILESYNC)
					committed = commit;
				else if (committed == NFSV3WRITE_DATASYNC &&
					commit == NFSV3WRITE_UNSTABLE)
					committed = commit;
				mtx_lock(&nmp->nm_mtx);
				if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
					NFSX_V3WRITEVERF);
				    nmp->nm_state |= NFSSTA_HASWRITEVERF;
				} else if (bcmp((caddr_t)tl,
				    (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) {
				    *must_commit = 1;
				    bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
					NFSX_V3WRITEVERF);
				}
				mtx_unlock(&nmp->nm_mtx);
			}
		} else {
			nfsm_loadattr(vp, NULL);
		}
		if (wccflag) {
			mtx_lock(&(VTONFS(vp))->n_mtx);
			VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
			mtx_unlock(&(VTONFS(vp))->n_mtx);
		}
		m_freem(mrep);
		if (error)
			break;
		tsiz -= len;
	}
nfsmout:
	if (vp->v_mount->mnt_kern_flag & MNTK_ASYNC)
		committed = NFSV3WRITE_FILESYNC;
	*iomode = committed;
	if (error)
		uiop->uio_resid = tsiz;
	return (error);
}
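
#if 0
/*
 * Sketch (not compiled) of the commitment-level merge performed above:
 * across a series of WRITE RPCs the client keeps the weakest level any
 * reply granted, since that is all it may rely on
 * (FILESYNC > DATASYNC > UNSTABLE).
 */
static int
nfs_commit_merge(int committed, int commit)
{

	if (committed == NFSV3WRITE_FILESYNC)
		return (commit);
	if (committed == NFSV3WRITE_DATASYNC &&
	    commit == NFSV3WRITE_UNSTABLE)
		return (commit);
	return (committed);
}
#endif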

/*
 * nfs mknod rpc
 * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
 * mode set to specify the file type and the size field for rdev.
 */
static int
nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct vattr *vap)
{
	struct nfsv2_sattr *sp;
	u_int32_t *tl;
	struct vnode *newvp = NULL;
	struct nfsnode *np = NULL;
	struct vattr vattr;
	caddr_t bpos, dpos;
	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0;
	struct mbuf *mreq, *mrep, *md, *mb;
	u_int32_t rdev;
	int v3 = NFS_ISV3(dvp);

	if (vap->va_type == VCHR || vap->va_type == VBLK)
		rdev = txdr_unsigned(vap->va_rdev);
	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
		rdev = nfs_xdrneg1;
	else {
		return (EOPNOTSUPP);
	}
	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
		return (error);
	nfsstats.rpccnt[NFSPROC_MKNOD]++;
	mreq = nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED +
		nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
	if (v3) {
		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
		*tl++ = vtonfsv3_type(vap->va_type);
		nfsm_v3attrbuild(vap, FALSE);
		if (vap->va_type == VCHR || vap->va_type == VBLK) {
			tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(major(vap->va_rdev));
			*tl = txdr_unsigned(minor(vap->va_rdev));
		}
	} else {
		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
		sp->sa_uid = nfs_xdrneg1;
		sp->sa_gid = nfs_xdrneg1;
		sp->sa_size = rdev;
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	}
	nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_thread, cnp->cn_cred);
	if (!error) {
		nfsm_mtofh(dvp, newvp, v3, gotvp);
		if (!gotvp) {
			if (newvp) {
				vput(newvp);
				newvp = NULL;
			}
			error = nfs_lookitup(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
			if (!error)
				newvp = NFSTOV(np);
		}
	}
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	m_freem(mrep);
nfsmout:
	if (error) {
		if (newvp)
			vput(newvp);
	} else {
		if (cnp->cn_flags & MAKEENTRY)
			cache_enter(dvp, newvp, cnp);
		*vpp = newvp;
	}
	mtx_lock(&(VTONFS(dvp))->n_mtx);
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag) {
		VTONFS(dvp)->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
	}
	mtx_unlock(&(VTONFS(dvp))->n_mtx);
	return (error);
}

/*
 * nfs mknod vop
 * just call nfs_mknodrpc() to do the work.
 */
/* ARGSUSED */
static int
nfs_mknod(struct vop_mknod_args *ap)
{
	return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
}

static u_long create_verf;
/*
 * nfs file create call
 */
static int
nfs_create(struct vop_create_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vattr *vap = ap->a_vap;
	struct componentname *cnp = ap->a_cnp;
	struct nfsv2_sattr *sp;
	u_int32_t *tl;
	struct nfsnode *np = NULL;
	struct vnode *newvp = NULL;
	caddr_t bpos, dpos;
	int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0;
	struct mbuf *mreq, *mrep, *md, *mb;
	struct vattr vattr;
	int v3 = NFS_ISV3(dvp);

	/*
	 * Oops, not for me..
	 */
	if (vap->va_type == VSOCK) {
		error = nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap);
		return (error);
	}

	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0) {
		return (error);
	}
	if (vap->va_vaflags & VA_EXCLUSIVE)
		fmode |= O_EXCL;
again:
	nfsstats.rpccnt[NFSPROC_CREATE]++;
	mreq = nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED +
		nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
	mb = mreq;
	bpos = mtod(mb, caddr_t);
	nfsm_fhtom(dvp, v3);
	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
	if (v3) {
		tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
		if (fmode & O_EXCL) {
			*tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE);
			tl = nfsm_build(u_int32_t *, NFSX_V3CREATEVERF);
#ifdef INET
			CURVNET_SET(CRED_TO_VNET(cnp->cn_cred));
			IN_IFADDR_RLOCK();
			if (!TAILQ_EMPTY(&V_in_ifaddrhead))
				*tl++ = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr.s_addr;
			else
#endif
				*tl++ = create_verf;
#ifdef INET
			IN_IFADDR_RUNLOCK();
			CURVNET_RESTORE();
#endif
			*tl = ++create_verf;
		} else {
			*tl = txdr_unsigned(NFSV3CREATE_UNCHECKED);
			nfsm_v3attrbuild(vap, FALSE);
		}
	} else {
		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
		sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
		sp->sa_uid = nfs_xdrneg1;
		sp->sa_gid = nfs_xdrneg1;
		sp->sa_size = 0;
		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
	}
	nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_thread, cnp->cn_cred);
	if (!error) {
		nfsm_mtofh(dvp, newvp, v3, gotvp);
		if (!gotvp) {
			if (newvp) {
				vput(newvp);
				newvp = NULL;
			}
			error = nfs_lookitup(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
			if (!error)
				newvp = NFSTOV(np);
		}
	}
	if (v3)
		nfsm_wcc_data(dvp, wccflag);
	m_freem(mrep);
nfsmout:
	if (error) {
		if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) {
			fmode &= ~O_EXCL;
			goto again;
		}
		if (newvp)
			vput(newvp);
	} else if (v3 && (fmode & O_EXCL)) {
		/*
		 * We are normally called with only a partially initialized
		 * VAP.  Since the NFSv3 spec says that the server may use
		 * the file attributes to store the verifier, the spec
		 * requires us to do a SETATTR RPC.  FreeBSD servers store
		 * the verifier in atime, but we can't really assume that
		 * all servers will, so we ensure that our SETATTR sets
		 * both atime and mtime.
		 */
		if (vap->va_mtime.tv_sec == VNOVAL)
			vfs_timestamp(&vap->va_mtime);
		if (vap->va_atime.tv_sec == VNOVAL)
			vap->va_atime = vap->va_mtime;
		error = nfs_setattrrpc(newvp, vap, cnp->cn_cred);
		if (error)
			vput(newvp);
	}
	if (!error) {
		if (cnp->cn_flags & MAKEENTRY)
			cache_enter(dvp, newvp, cnp);
		*ap->a_vpp = newvp;
	}
	mtx_lock(&(VTONFS(dvp))->n_mtx);
	VTONFS(dvp)->n_flag |= NMODIFIED;
	if (!wccflag) {
		VTONFS(dvp)->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
	}
	mtx_unlock(&(VTONFS(dvp))->n_mtx);
	return (error);
}
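
#if 0
/*
 * Illustrative userland sketch (not compiled here): an O_EXCL create,
 * which is what maps to the NFSV3CREATE_EXCLUSIVE request built above.
 * The path is hypothetical.
 */
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/mnt/nfs/newfile", O_WRONLY | O_CREAT | O_EXCL, 0644);

	if (fd == -1)
		err(1, "open");		/* EEXIST if the name already exists */
	close(fd);
	return (0);
}
#endif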

/*
 * nfs file remove call
 * To try and make nfs semantics closer to ufs semantics, a file that has
 * other processes using the vnode is renamed instead of removed and then
 * removed later on the last close.
 * - If v_usecount > 1
 *	if a rename is not already in the works
 *	    call nfs_sillyrename() to set it up
 * - else
 *	do the remove rpc
 */
static int
nfs_remove(struct vop_remove_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vnode *dvp = ap->a_dvp;
	struct componentname *cnp = ap->a_cnp;
	struct nfsnode *np = VTONFS(vp);
	int error = 0;
	struct vattr vattr;

	KASSERT((cnp->cn_flags & HASBUF) != 0, ("nfs_remove: no name"));
	KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount"));
	if (vp->v_type == VDIR)
		error = EPERM;
	else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
	    !VOP_GETATTR(vp, &vattr, cnp->cn_cred) && vattr.va_nlink > 1)) {
		/*
		 * Purge the name cache so that the chance of a lookup for
		 * the name succeeding while the remove is in progress is
		 * minimized.  Without node locking it can still happen,
		 * such that an I/O op returns ESTALE, but that is tolerable
		 * since you get the same behavior if another host removes
		 * the file.
		 */
		cache_purge(vp);
		/*
		 * throw away biocache buffers, mainly to avoid
		 * unnecessary delayed writes later.
		 */
		error = nfs_vinvalbuf(vp, 0, cnp->cn_thread, 1);
		/* Do the rpc */
		if (error != EINTR && error != EIO)
			error = nfs_removerpc(dvp, cnp->cn_nameptr,
				cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
		/*
		 * Kludge City: if the first reply to the remove rpc is lost,
		 * the reply to the retransmitted request will be ENOENT
		 * since the file was in fact removed.
		 * Therefore, we cheat and return success.
		 */
		if (error == ENOENT)
			error = 0;
	} else if (!np->n_sillyrename)
		error = nfs_sillyrename(dvp, vp, cnp);
	mtx_lock(&np->n_mtx);
	np->n_attrstamp = 0;
	mtx_unlock(&np->n_mtx);
	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
	return (error);
}
1751
1752/*
1753 * nfs file remove rpc called from nfs_inactive
1754 */
1755int
1756nfs_removeit(struct sillyrename *sp)
1757{
1758	/*
1759	 * Make sure that the directory vnode is still valid.
1760	 * XXX we should lock sp->s_dvp here.
1761	 */
1762	if (sp->s_dvp->v_type == VBAD)
1763		return (0);
1764	return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred,
1765		NULL));
1766}
1767
1768/*
1769 * Nfs remove rpc, called from nfs_remove() and nfs_removeit().
1770 */
1771static int
1772nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
1773    struct ucred *cred, struct thread *td)
1774{
1775	caddr_t bpos, dpos;
1776	int error = 0, wccflag = NFSV3_WCCRATTR;
1777	struct mbuf *mreq, *mrep, *md, *mb;
1778	int v3 = NFS_ISV3(dvp);
1779
1780	nfsstats.rpccnt[NFSPROC_REMOVE]++;
1781	mreq = nfsm_reqhead(dvp, NFSPROC_REMOVE,
1782		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen));
1783	mb = mreq;
1784	bpos = mtod(mb, caddr_t);
1785	nfsm_fhtom(dvp, v3);
1786	nfsm_strtom(name, namelen, NFS_MAXNAMLEN);
1787	nfsm_request(dvp, NFSPROC_REMOVE, td, cred);
1788	if (v3)
1789		nfsm_wcc_data(dvp, wccflag);
1790	m_freem(mrep);
1791nfsmout:
1792	mtx_lock(&(VTONFS(dvp))->n_mtx);
1793	VTONFS(dvp)->n_flag |= NMODIFIED;
1794	if (!wccflag) {
1795		VTONFS(dvp)->n_attrstamp = 0;
1796		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
1797	}
1798	mtx_unlock(&(VTONFS(dvp))->n_mtx);
1799	return (error);
1800}
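/*
 * The epilogue above is a pattern repeated throughout this file: the
 * directory is marked NMODIFIED under n_mtx and, when the v3 reply did not
 * carry usable wcc (weak cache consistency) attributes (wccflag ends up
 * zero), n_attrstamp is cleared so that the next VOP_GETATTR() refetches
 * the attributes from the server instead of trusting a stale cache.
 */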
1801
1802/*
1803 * nfs file rename call
1804 */
1805static int
1806nfs_rename(struct vop_rename_args *ap)
1807{
1808	struct vnode *fvp = ap->a_fvp;
1809	struct vnode *tvp = ap->a_tvp;
1810	struct vnode *fdvp = ap->a_fdvp;
1811	struct vnode *tdvp = ap->a_tdvp;
1812	struct componentname *tcnp = ap->a_tcnp;
1813	struct componentname *fcnp = ap->a_fcnp;
1814	int error;
1815
1816	KASSERT((tcnp->cn_flags & HASBUF) != 0 &&
1817	    (fcnp->cn_flags & HASBUF) != 0, ("nfs_rename: no name"));
1818	/* Check for cross-device rename */
1819	if ((fvp->v_mount != tdvp->v_mount) ||
1820	    (tvp && (fvp->v_mount != tvp->v_mount))) {
1821		error = EXDEV;
1822		goto out;
1823	}
1824
1825	if (fvp == tvp) {
1826		nfs_printf("nfs_rename: fvp == tvp (can't happen)\n");
1827		error = 0;
1828		goto out;
1829	}
1830	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
1831		goto out;
1832
1833	/*
1834	 * We have to flush B_DELWRI data prior to renaming
1835	 * the file.  If we don't, the delayed-write buffers
1836	 * can be flushed out later after the file has gone stale
1837	 * under NFSV3.  NFSV2 does not have this problem because
1838	 * (as far as I can tell) it flushes dirty buffers more
1839	 * often.
1840	 *
1841	 * Skip the rename operation if the fsync fails; this can happen
1842	 * due to the server's volume being full when we push out data
1843	 * that was written into our cache earlier. Not checking for
1844	 * this condition can result in potential (silent) data loss.
1845	 */
1846	error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
1847	VOP_UNLOCK(fvp, 0);
1848	if (!error && tvp)
1849		error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
1850	if (error)
1851		goto out;
1852
1853	/*
1854	 * If the tvp exists and is in use, sillyrename it before doing the
1855	 * rename of the new file over it.
1856	 * XXX Can't sillyrename a directory.
1857	 */
1858	if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
1859		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
1860		vput(tvp);
1861		tvp = NULL;
1862	}
1863
1864	error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
1865		tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
1866		tcnp->cn_thread);
1867
1868	if (fvp->v_type == VDIR) {
1869		if (tvp != NULL && tvp->v_type == VDIR)
1870			cache_purge(tdvp);
1871		cache_purge(fdvp);
1872	}
1873
1874out:
1875	if (tdvp == tvp)
1876		vrele(tdvp);
1877	else
1878		vput(tdvp);
1879	if (tvp)
1880		vput(tvp);
1881	vrele(fdvp);
1882	vrele(fvp);
1883	/*
1884	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
1885	 */
1886	if (error == ENOENT)
1887		error = 0;
1888	return (error);
1889}
1890
1891/*
1892 * nfs file rename rpc called from nfs_sillyrename() below
1893 */
1894static int
1895nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
1896    struct sillyrename *sp)
1897{
1898
1899	return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp,
1900	    sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread));
1901}
1902
1903/*
1904 * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
1905 */
1906static int
1907nfs_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen,
1908    struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred,
1909    struct thread *td)
1910{
1911	caddr_t bpos, dpos;
1912	int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR;
1913	struct mbuf *mreq, *mrep, *md, *mb;
1914	int v3 = NFS_ISV3(fdvp);
1915
1916	nfsstats.rpccnt[NFSPROC_RENAME]++;
1917	mreq = nfsm_reqhead(fdvp, NFSPROC_RENAME,
1918		(NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) +
1919		nfsm_rndup(tnamelen));
1920	mb = mreq;
1921	bpos = mtod(mb, caddr_t);
1922	nfsm_fhtom(fdvp, v3);
1923	nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN);
1924	nfsm_fhtom(tdvp, v3);
1925	nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN);
1926	nfsm_request(fdvp, NFSPROC_RENAME, td, cred);
1927	if (v3) {
1928		nfsm_wcc_data(fdvp, fwccflag);
1929		nfsm_wcc_data(tdvp, twccflag);
1930	}
1931	m_freem(mrep);
1932nfsmout:
1933	mtx_lock(&(VTONFS(fdvp))->n_mtx);
1934	VTONFS(fdvp)->n_flag |= NMODIFIED;
1935	mtx_unlock(&(VTONFS(fdvp))->n_mtx);
1936	mtx_lock(&(VTONFS(tdvp))->n_mtx);
1937	VTONFS(tdvp)->n_flag |= NMODIFIED;
1938	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
1939	if (!fwccflag) {
1940		VTONFS(fdvp)->n_attrstamp = 0;
1941		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(fdvp);
1942	}
1943	if (!twccflag) {
1944		VTONFS(tdvp)->n_attrstamp = 0;
1945		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
1946	}
1947	return (error);
1948}
1949
1950/*
1951 * nfs hard link create call
1952 */
1953static int
1954nfs_link(struct vop_link_args *ap)
1955{
1956	struct vnode *vp = ap->a_vp;
1957	struct vnode *tdvp = ap->a_tdvp;
1958	struct componentname *cnp = ap->a_cnp;
1959	caddr_t bpos, dpos;
1960	int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0;
1961	struct mbuf *mreq, *mrep, *md, *mb;
1962	int v3;
1963
1964	if (vp->v_mount != tdvp->v_mount) {
1965		return (EXDEV);
1966	}
1967
1968	/*
1969	 * Push all writes to the server, so that the attribute cache
1970	 * doesn't get "out of sync" with the server.
1971	 * XXX There should be a better way!
1972	 */
1973	VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
1974
1975	v3 = NFS_ISV3(vp);
1976	nfsstats.rpccnt[NFSPROC_LINK]++;
1977	mreq = nfsm_reqhead(vp, NFSPROC_LINK,
1978		NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
1979	mb = mreq;
1980	bpos = mtod(mb, caddr_t);
1981	nfsm_fhtom(vp, v3);
1982	nfsm_fhtom(tdvp, v3);
1983	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
1984	nfsm_request(vp, NFSPROC_LINK, cnp->cn_thread, cnp->cn_cred);
1985	if (v3) {
1986		nfsm_postop_attr(vp, attrflag);
1987		nfsm_wcc_data(tdvp, wccflag);
1988	}
1989	m_freem(mrep);
1990nfsmout:
1991	mtx_lock(&(VTONFS(tdvp))->n_mtx);
1992	VTONFS(tdvp)->n_flag |= NMODIFIED;
1993	mtx_unlock(&(VTONFS(tdvp))->n_mtx);
1994	if (!attrflag) {
1995		VTONFS(vp)->n_attrstamp = 0;
1996		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
1997	}
1998	if (!wccflag) {
1999		VTONFS(tdvp)->n_attrstamp = 0;
2000		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
2001	}
2002	return (error);
2003}
2004
2005/*
2006 * nfs symbolic link create call
2007 */
2008static int
2009nfs_symlink(struct vop_symlink_args *ap)
2010{
2011	struct vnode *dvp = ap->a_dvp;
2012	struct vattr *vap = ap->a_vap;
2013	struct componentname *cnp = ap->a_cnp;
2014	struct nfsv2_sattr *sp;
2015	caddr_t bpos, dpos;
2016	int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp;
2017	struct mbuf *mreq, *mrep, *md, *mb;
2018	struct vnode *newvp = NULL;
2019	int v3 = NFS_ISV3(dvp);
2020
2021	nfsstats.rpccnt[NFSPROC_SYMLINK]++;
2022	slen = strlen(ap->a_target);
2023	mreq = nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED +
2024	    nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3));
2025	mb = mreq;
2026	bpos = mtod(mb, caddr_t);
2027	nfsm_fhtom(dvp, v3);
2028	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
2029	if (v3) {
2030		nfsm_v3attrbuild(vap, FALSE);
2031	}
2032	nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN);
2033	if (!v3) {
2034		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
2035		sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode);
2036		sp->sa_uid = nfs_xdrneg1;
2037		sp->sa_gid = nfs_xdrneg1;
2038		sp->sa_size = nfs_xdrneg1;
2039		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
2040		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
2041	}
2042
2043	/*
2044	 * Issue the NFS request and get the rpc response.
2045	 *
2046	 * Only NFSv3 responses returning an error of 0 actually return
2047	 * a file handle that can be converted into newvp without having
2048	 * to do an extra lookup rpc.
2049	 */
2050	nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_thread, cnp->cn_cred);
2051	if (v3) {
2052		if (error == 0)
2053			nfsm_mtofh(dvp, newvp, v3, gotvp);
2054		nfsm_wcc_data(dvp, wccflag);
2055	}
2056
2057	/*
2058	 * nfsm_* error exits jump to nfsmout below; mrep is freed there too.
2059	 */
2060
2061	m_freem(mrep);
2062nfsmout:
2063
2064	/*
2065	 * If we do not have an error and we could not extract the newvp from
2066	 * the response due to the request being NFSv2, we have to do a
2067	 * lookup in order to obtain a newvp to return.
2068	 */
2069	if (error == 0 && newvp == NULL) {
2070		struct nfsnode *np = NULL;
2071
2072		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
2073		    cnp->cn_cred, cnp->cn_thread, &np);
2074		if (!error)
2075			newvp = NFSTOV(np);
2076	}
2077	if (error) {
2078		if (newvp)
2079			vput(newvp);
2080	} else {
2081		*ap->a_vpp = newvp;
2082	}
2083	mtx_lock(&(VTONFS(dvp))->n_mtx);
2084	VTONFS(dvp)->n_flag |= NMODIFIED;
2085	mtx_unlock(&(VTONFS(dvp))->n_mtx);
2086	if (!wccflag) {
2087		VTONFS(dvp)->n_attrstamp = 0;
2088		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
2089	}
2090	return (error);
2091}
2092
2093/*
2094 * nfs make dir call
2095 */
2096static int
2097nfs_mkdir(struct vop_mkdir_args *ap)
2098{
2099	struct vnode *dvp = ap->a_dvp;
2100	struct vattr *vap = ap->a_vap;
2101	struct componentname *cnp = ap->a_cnp;
2102	struct nfsv2_sattr *sp;
2103	int len;
2104	struct nfsnode *np = NULL;
2105	struct vnode *newvp = NULL;
2106	caddr_t bpos, dpos;
2107	int error = 0, wccflag = NFSV3_WCCRATTR;
2108	int gotvp = 0;
2109	struct mbuf *mreq, *mrep, *md, *mb;
2110	struct vattr vattr;
2111	int v3 = NFS_ISV3(dvp);
2112
2113	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
2114		return (error);
2115	len = cnp->cn_namelen;
2116	nfsstats.rpccnt[NFSPROC_MKDIR]++;
2117	mreq = nfsm_reqhead(dvp, NFSPROC_MKDIR,
2118	  NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3));
2119	mb = mreq;
2120	bpos = mtod(mb, caddr_t);
2121	nfsm_fhtom(dvp, v3);
2122	nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
2123	if (v3) {
2124		nfsm_v3attrbuild(vap, FALSE);
2125	} else {
2126		sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
2127		sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode);
2128		sp->sa_uid = nfs_xdrneg1;
2129		sp->sa_gid = nfs_xdrneg1;
2130		sp->sa_size = nfs_xdrneg1;
2131		txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
2132		txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
2133	}
2134	nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_thread, cnp->cn_cred);
2135	if (!error)
2136		nfsm_mtofh(dvp, newvp, v3, gotvp);
2137	if (v3)
2138		nfsm_wcc_data(dvp, wccflag);
2139	m_freem(mrep);
2140nfsmout:
2141	mtx_lock(&(VTONFS(dvp))->n_mtx);
2142	VTONFS(dvp)->n_flag |= NMODIFIED;
2143	mtx_unlock(&(VTONFS(dvp))->n_mtx);
2144	if (!wccflag) {
2145		VTONFS(dvp)->n_attrstamp = 0;
2146		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
2147	}
2148	if (error == 0 && newvp == NULL) {
2149		error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred,
2150			cnp->cn_thread, &np);
2151		if (!error) {
2152			newvp = NFSTOV(np);
2153			if (newvp->v_type != VDIR)
2154				error = EEXIST;
2155		}
2156	}
2157	if (error) {
2158		if (newvp)
2159			vput(newvp);
2160	} else
2161		*ap->a_vpp = newvp;
2162	return (error);
2163}
2164
2165/*
2166 * nfs remove directory call
2167 */
2168static int
2169nfs_rmdir(struct vop_rmdir_args *ap)
2170{
2171	struct vnode *vp = ap->a_vp;
2172	struct vnode *dvp = ap->a_dvp;
2173	struct componentname *cnp = ap->a_cnp;
2174	caddr_t bpos, dpos;
2175	int error = 0, wccflag = NFSV3_WCCRATTR;
2176	struct mbuf *mreq, *mrep, *md, *mb;
2177	int v3 = NFS_ISV3(dvp);
2178
2179	if (dvp == vp)
2180		return (EINVAL);
2181	nfsstats.rpccnt[NFSPROC_RMDIR]++;
2182	mreq = nfsm_reqhead(dvp, NFSPROC_RMDIR,
2183		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
2184	mb = mreq;
2185	bpos = mtod(mb, caddr_t);
2186	nfsm_fhtom(dvp, v3);
2187	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
2188	nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_thread, cnp->cn_cred);
2189	if (v3)
2190		nfsm_wcc_data(dvp, wccflag);
2191	m_freem(mrep);
2192nfsmout:
2193	mtx_lock(&(VTONFS(dvp))->n_mtx);
2194	VTONFS(dvp)->n_flag |= NMODIFIED;
2195	mtx_unlock(&(VTONFS(dvp))->n_mtx);
2196	if (!wccflag) {
2197		VTONFS(dvp)->n_attrstamp = 0;
2198		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
2199	}
2200	cache_purge(dvp);
2201	cache_purge(vp);
2202	/*
2203	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
2204	 */
2205	if (error == ENOENT)
2206		error = 0;
2207	return (error);
2208}
2209
2210/*
2211 * nfs readdir call
2212 */
2213static int
2214nfs_readdir(struct vop_readdir_args *ap)
2215{
2216	struct vnode *vp = ap->a_vp;
2217	struct nfsnode *np = VTONFS(vp);
2218	struct uio *uio = ap->a_uio;
2219	int tresid, error = 0;
2220	struct vattr vattr;
2221
2222	if (vp->v_type != VDIR)
2223		return (EPERM);
2224
2225	/*
2226	 * First, check for hit on the EOF offset cache
2227	 */
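	/*
	 * n_direofoffset remembers the offset at which EOF was last seen
	 * for this directory.  If the requested offset is at or beyond it,
	 * the directory has no local modifications (NMODIFIED clear) and
	 * the server-side mtime is unchanged, the read is satisfied with
	 * zero entries and no readdir rpc at all.
	 */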
2228	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
2229	    (np->n_flag & NMODIFIED) == 0) {
2230		if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) {
2231			mtx_lock(&np->n_mtx);
2232			if (!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
2233				mtx_unlock(&np->n_mtx);
2234				nfsstats.direofcache_hits++;
2235				goto out;
2236			} else
2237				mtx_unlock(&np->n_mtx);
2238		}
2239	}
2240
2241	/*
2242	 * Call nfs_bioread() to do the real work.
2243	 */
2244	tresid = uio->uio_resid;
2245	error = nfs_bioread(vp, uio, 0, ap->a_cred);
2246
2247	if (!error && uio->uio_resid == tresid) {
2248		nfsstats.direofcache_misses++;
2249	}
2250out:
2251	return (error);
2252}
2253
2254/*
2255 * Readdir rpc call.
2256 * Called from below the buffer cache by nfs_doio().
2257 */
2258int
2259nfs_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
2260{
2261	int len, left;
2262	struct dirent *dp = NULL;
2263	u_int32_t *tl;
2264	caddr_t cp;
2265	nfsuint64 *cookiep;
2266	caddr_t bpos, dpos;
2267	struct mbuf *mreq, *mrep, *md, *mb;
2268	nfsuint64 cookie;
2269	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2270	struct nfsnode *dnp = VTONFS(vp);
2271	u_quad_t fileno;
2272	int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
2273	int attrflag;
2274	int v3 = NFS_ISV3(vp);
2275
2276	KASSERT(uiop->uio_iovcnt == 1 &&
2277	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
2278	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
2279	    ("nfs readdirrpc bad uio"));
2280
2281	/*
2282	 * If there is no cookie, assume directory was stale.
2283	 */
2284	nfs_dircookie_lock(dnp);
2285	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
2286	if (cookiep) {
2287		cookie = *cookiep;
2288		nfs_dircookie_unlock(dnp);
2289	} else {
2290		nfs_dircookie_unlock(dnp);
2291		return (NFSERR_BAD_COOKIE);
2292	}
2293
2294	/*
2295	 * Loop around doing readdir rpc's of size nm_readdirsize
2296	 * truncated to a multiple of DIRBLKSIZ.
2297	 * The stopping criterion is EOF or a full buffer.
2298	 */
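	/*
	 * Worked example of the repacking arithmetic used in the loop
	 * below: for a 6 byte name, len = 6 and tlen = nfsm_rndup(6) = 8;
	 * since tlen != len there is already room for the terminating NUL
	 * and d_reclen = DIRHDSIZ + 8.  For a 4 byte name, tlen == len, so
	 * 4 extra bytes are added (tlen = 8) to guarantee NUL space.  A
	 * record never straddles a DIRBLKSIZ boundary; instead the previous
	 * record's d_reclen is grown to cover the leftover space.
	 */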
2299	while (more_dirs && bigenough) {
2300		nfsstats.rpccnt[NFSPROC_READDIR]++;
2301		mreq = nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) +
2302			NFSX_READDIR(v3));
2303		mb = mreq;
2304		bpos = mtod(mb, caddr_t);
2305		nfsm_fhtom(vp, v3);
2306		if (v3) {
2307			tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
2308			*tl++ = cookie.nfsuquad[0];
2309			*tl++ = cookie.nfsuquad[1];
2310			mtx_lock(&dnp->n_mtx);
2311			*tl++ = dnp->n_cookieverf.nfsuquad[0];
2312			*tl++ = dnp->n_cookieverf.nfsuquad[1];
2313			mtx_unlock(&dnp->n_mtx);
2314		} else {
2315			tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
2316			*tl++ = cookie.nfsuquad[0];
2317		}
2318		*tl = txdr_unsigned(nmp->nm_readdirsize);
2319		nfsm_request(vp, NFSPROC_READDIR, uiop->uio_td, cred);
2320		if (v3) {
2321			nfsm_postop_attr(vp, attrflag);
2322			if (!error) {
2323				tl = nfsm_dissect(u_int32_t *,
2324				    2 * NFSX_UNSIGNED);
2325				mtx_lock(&dnp->n_mtx);
2326				dnp->n_cookieverf.nfsuquad[0] = *tl++;
2327				dnp->n_cookieverf.nfsuquad[1] = *tl;
2328				mtx_unlock(&dnp->n_mtx);
2329			} else {
2330				m_freem(mrep);
2331				goto nfsmout;
2332			}
2333		}
2334		tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2335		more_dirs = fxdr_unsigned(int, *tl);
2336
2337		/* loop thru the dir entries, doctoring them to 4bsd form */
2338		while (more_dirs && bigenough) {
2339			if (v3) {
2340				tl = nfsm_dissect(u_int32_t *,
2341				    3 * NFSX_UNSIGNED);
2342				fileno = fxdr_hyper(tl);
2343				len = fxdr_unsigned(int, *(tl + 2));
2344			} else {
2345				tl = nfsm_dissect(u_int32_t *,
2346				    2 * NFSX_UNSIGNED);
2347				fileno = fxdr_unsigned(u_quad_t, *tl++);
2348				len = fxdr_unsigned(int, *tl);
2349			}
2350			if (len <= 0 || len > NFS_MAXNAMLEN) {
2351				error = EBADRPC;
2352				m_freem(mrep);
2353				goto nfsmout;
2354			}
2355			tlen = nfsm_rndup(len);
2356			if (tlen == len)
2357				tlen += 4;	/* To ensure null termination */
2358			left = DIRBLKSIZ - blksiz;
2359			if ((tlen + DIRHDSIZ) > left) {
2360				dp->d_reclen += left;
2361				uiop->uio_iov->iov_base =
2362				    (char *)uiop->uio_iov->iov_base + left;
2363				uiop->uio_iov->iov_len -= left;
2364				uiop->uio_offset += left;
2365				uiop->uio_resid -= left;
2366				blksiz = 0;
2367			}
2368			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
2369				bigenough = 0;
2370			if (bigenough) {
2371				dp = (struct dirent *)uiop->uio_iov->iov_base;
2372				dp->d_fileno = (int)fileno;
2373				dp->d_namlen = len;
2374				dp->d_reclen = tlen + DIRHDSIZ;
2375				dp->d_type = DT_UNKNOWN;
2376				blksiz += dp->d_reclen;
2377				if (blksiz == DIRBLKSIZ)
2378					blksiz = 0;
2379				uiop->uio_offset += DIRHDSIZ;
2380				uiop->uio_resid -= DIRHDSIZ;
2381				uiop->uio_iov->iov_base =
2382				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
2383				uiop->uio_iov->iov_len -= DIRHDSIZ;
2384				nfsm_mtouio(uiop, len);
2385				cp = uiop->uio_iov->iov_base;
2386				tlen -= len;
2387				*cp = '\0';	/* null terminate */
2388				uiop->uio_iov->iov_base =
2389				    (char *)uiop->uio_iov->iov_base + tlen;
2390				uiop->uio_iov->iov_len -= tlen;
2391				uiop->uio_offset += tlen;
2392				uiop->uio_resid -= tlen;
2393			} else
2394				nfsm_adv(nfsm_rndup(len));
2395			if (v3) {
2396				tl = nfsm_dissect(u_int32_t *,
2397				    3 * NFSX_UNSIGNED);
2398			} else {
2399				tl = nfsm_dissect(u_int32_t *,
2400				    2 * NFSX_UNSIGNED);
2401			}
2402			if (bigenough) {
2403				cookie.nfsuquad[0] = *tl++;
2404				if (v3)
2405					cookie.nfsuquad[1] = *tl++;
2406			} else if (v3)
2407				tl += 2;
2408			else
2409				tl++;
2410			more_dirs = fxdr_unsigned(int, *tl);
2411		}
2412		/*
2413		 * If at end of rpc data, get the eof boolean
2414		 */
2415		if (!more_dirs) {
2416			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2417			more_dirs = (fxdr_unsigned(int, *tl) == 0);
2418		}
2419		m_freem(mrep);
2420	}
2421	/*
2422	 * Fill the last record, if any, out to a multiple of DIRBLKSIZ
2423	 * by increasing d_reclen for the last record.
2424	 */
2425	if (blksiz > 0) {
2426		left = DIRBLKSIZ - blksiz;
2427		dp->d_reclen += left;
2428		uiop->uio_iov->iov_base =
2429		    (char *)uiop->uio_iov->iov_base + left;
2430		uiop->uio_iov->iov_len -= left;
2431		uiop->uio_offset += left;
2432		uiop->uio_resid -= left;
2433	}
2434
2435	/*
2436	 * We are now either at the end of the directory or have filled the
2437	 * block.
2438	 */
2439	if (bigenough)
2440		dnp->n_direofoffset = uiop->uio_offset;
2441	else {
2442		if (uiop->uio_resid > 0)
2443			nfs_printf("EEK! readdirrpc resid > 0\n");
2444		nfs_dircookie_lock(dnp);
2445		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
2446		*cookiep = cookie;
2447		nfs_dircookie_unlock(dnp);
2448	}
2449nfsmout:
2450	return (error);
2451}
2452
2453/*
2454 * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc().
2455 */
2456int
2457nfs_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
2458{
2459	int len, left;
2460	struct dirent *dp;
2461	u_int32_t *tl;
2462	caddr_t cp;
2463	struct vnode *newvp;
2464	nfsuint64 *cookiep;
2465	caddr_t bpos, dpos, dpossav1, dpossav2;
2466	struct mbuf *mreq, *mrep, *md, *mb, *mdsav1, *mdsav2;
2467	struct nameidata nami, *ndp = &nami;
2468	struct componentname *cnp = &ndp->ni_cnd;
2469	nfsuint64 cookie;
2470	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2471	struct nfsnode *dnp = VTONFS(vp), *np;
2472	nfsfh_t *fhp;
2473	u_quad_t fileno;
2474	int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
2475	int attrflag, fhsize;
2476
2477#ifndef nolint
2478	dp = NULL;
2479#endif
2480	KASSERT(uiop->uio_iovcnt == 1 &&
2481	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
2482	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
2483	    ("nfs readdirplusrpc bad uio"));
2484	ndp->ni_dvp = vp;
2485	newvp = NULLVP;
2486
2487	/*
2488	 * If there is no cookie, assume directory was stale.
2489	 */
2490	nfs_dircookie_lock(dnp);
2491	cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
2492	if (cookiep) {
2493		cookie = *cookiep;
2494		nfs_dircookie_unlock(dnp);
2495	} else {
2496		nfs_dircookie_unlock(dnp);
2497		return (NFSERR_BAD_COOKIE);
2498	}
2499	/*
2500	 * Loop around doing readdir rpc's of size nm_readdirsize
2501	 * truncated to a multiple of DIRBLKSIZ.
2502	 * The stopping criterion is EOF or a full buffer.
2503	 */
2504	while (more_dirs && bigenough) {
2505		nfsstats.rpccnt[NFSPROC_READDIRPLUS]++;
2506		mreq = nfsm_reqhead(vp, NFSPROC_READDIRPLUS,
2507			NFSX_FH(1) + 6 * NFSX_UNSIGNED);
2508		mb = mreq;
2509		bpos = mtod(mb, caddr_t);
2510		nfsm_fhtom(vp, 1);
2511		tl = nfsm_build(u_int32_t *, 6 * NFSX_UNSIGNED);
2512		*tl++ = cookie.nfsuquad[0];
2513		*tl++ = cookie.nfsuquad[1];
2514		mtx_lock(&dnp->n_mtx);
2515		*tl++ = dnp->n_cookieverf.nfsuquad[0];
2516		*tl++ = dnp->n_cookieverf.nfsuquad[1];
2517		mtx_unlock(&dnp->n_mtx);
2518		*tl++ = txdr_unsigned(nmp->nm_readdirsize);
2519		*tl = txdr_unsigned(nmp->nm_rsize);
2520		nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_td, cred);
2521		nfsm_postop_attr(vp, attrflag);
2522		if (error) {
2523			m_freem(mrep);
2524			goto nfsmout;
2525		}
2526		tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
2527		mtx_lock(&dnp->n_mtx);
2528		dnp->n_cookieverf.nfsuquad[0] = *tl++;
2529		dnp->n_cookieverf.nfsuquad[1] = *tl++;
2530		mtx_unlock(&dnp->n_mtx);
2531		more_dirs = fxdr_unsigned(int, *tl);
2532
2533		/* loop thru the dir entries, doctoring them to 4bsd form */
2534		while (more_dirs && bigenough) {
2535			tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
2536			fileno = fxdr_hyper(tl);
2537			len = fxdr_unsigned(int, *(tl + 2));
2538			if (len <= 0 || len > NFS_MAXNAMLEN) {
2539				error = EBADRPC;
2540				m_freem(mrep);
2541				goto nfsmout;
2542			}
2543			tlen = nfsm_rndup(len);
2544			if (tlen == len)
2545				tlen += 4;	/* To ensure null termination */
2546			left = DIRBLKSIZ - blksiz;
2547			if ((tlen + DIRHDSIZ) > left) {
2548				dp->d_reclen += left;
2549				uiop->uio_iov->iov_base =
2550				    (char *)uiop->uio_iov->iov_base + left;
2551				uiop->uio_iov->iov_len -= left;
2552				uiop->uio_offset += left;
2553				uiop->uio_resid -= left;
2554				blksiz = 0;
2555			}
2556			if ((tlen + DIRHDSIZ) > uiop->uio_resid)
2557				bigenough = 0;
2558			if (bigenough) {
2559				dp = (struct dirent *)uiop->uio_iov->iov_base;
2560				dp->d_fileno = (int)fileno;
2561				dp->d_namlen = len;
2562				dp->d_reclen = tlen + DIRHDSIZ;
2563				dp->d_type = DT_UNKNOWN;
2564				blksiz += dp->d_reclen;
2565				if (blksiz == DIRBLKSIZ)
2566					blksiz = 0;
2567				uiop->uio_offset += DIRHDSIZ;
2568				uiop->uio_resid -= DIRHDSIZ;
2569				uiop->uio_iov->iov_base =
2570				    (char *)uiop->uio_iov->iov_base + DIRHDSIZ;
2571				uiop->uio_iov->iov_len -= DIRHDSIZ;
2572				cnp->cn_nameptr = uiop->uio_iov->iov_base;
2573				cnp->cn_namelen = len;
2574				nfsm_mtouio(uiop, len);
2575				cp = uiop->uio_iov->iov_base;
2576				tlen -= len;
2577				*cp = '\0';
2578				uiop->uio_iov->iov_base =
2579				    (char *)uiop->uio_iov->iov_base + tlen;
2580				uiop->uio_iov->iov_len -= tlen;
2581				uiop->uio_offset += tlen;
2582				uiop->uio_resid -= tlen;
2583			} else
2584				nfsm_adv(nfsm_rndup(len));
2585			tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
2586			if (bigenough) {
2587				cookie.nfsuquad[0] = *tl++;
2588				cookie.nfsuquad[1] = *tl++;
2589			} else
2590				tl += 2;
2591
2592			/*
2593			 * Since the attributes are before the file handle
2594			 * (sigh), we must skip over the attributes and then
2595			 * come back and get them.
2596			 */
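			/*
			 * Sketch of the mechanism used below: the current
			 * dissect position is saved (dpossav1/mdsav1), the
			 * NFSX_V3FATTR attribute block is skipped so that
			 * the file handle can be parsed, and the saved
			 * position is then restored temporarily so that
			 * nfsm_loadattr() can consume the attributes into
			 * the freshly acquired vnode.
			 */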
2597			attrflag = fxdr_unsigned(int, *tl);
2598			if (attrflag) {
2599			    dpossav1 = dpos;
2600			    mdsav1 = md;
2601			    nfsm_adv(NFSX_V3FATTR);
2602			    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2603			    doit = fxdr_unsigned(int, *tl);
2604			    /*
2605			     * Skip loading the attrs for "..".  There is a
2606			     * deadlock between loading the attrs here and
2607			     * lookups for the directory currently being read
2608			     * (in the parent): loading the attrs would take
2609			     * the exclusive lock on ".." while we own the
2610			     * lock on the directory being read, whereas a
2611			     * lookup holds ".." and tries to acquire the
2612			     * directory being read.
2613			     *
2614			     * There are other ways of fixing this; one would
2615			     * be to do a trylock on the ".." vnode and skip
2616			     * loading the attrs on ".." if it happens to be
2617			     * locked by another process.  But skipping the
2618			     * attrload on ".." seems the easiest option.
2619			     */
2620			    if (strcmp(dp->d_name, "..") == 0) {
2621				    doit = 0;
2622				    /*
2623				     * We've already skipped over the attrs;
2624				     * skip over the filehandle, and store
2625				     * d_type as VDIR.
2626				     */
2627				    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2628				    i = fxdr_unsigned(int, *tl);
2629				    nfsm_adv(nfsm_rndup(i));
2630				    dp->d_type = IFTODT(VTTOIF(VDIR));
2631			    }
2632			    if (doit) {
2633				nfsm_getfh(fhp, fhsize, 1);
2634				if (NFS_CMPFH(dnp, fhp, fhsize)) {
2635				    VREF(vp);
2636				    newvp = vp;
2637				    np = dnp;
2638				} else {
2639				    error = nfs_nget(vp->v_mount, fhp,
2640					fhsize, &np, LK_EXCLUSIVE);
2641				    if (error)
2642					doit = 0;
2643				    else
2644					newvp = NFSTOV(np);
2645				}
2646			    }
2647			    if (doit && bigenough) {
2648				dpossav2 = dpos;
2649				dpos = dpossav1;
2650				mdsav2 = md;
2651				md = mdsav1;
2652				nfsm_loadattr(newvp, NULL);
2653				dpos = dpossav2;
2654				md = mdsav2;
2655				dp->d_type =
2656				    IFTODT(VTTOIF(np->n_vattr.va_type));
2657				ndp->ni_vp = newvp;
2658				/*
2659				 * Update n_ctime so subsequent lookup
2660				 * doesn't purge entry.
2661				 */
2662				np->n_ctime = np->n_vattr.va_ctime;
2663			        cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp);
2664			    }
2665			} else {
2666			    /* Just skip over the file handle */
2667			    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2668			    i = fxdr_unsigned(int, *tl);
2669			    if (i) {
2670				    tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2671				    fhsize = fxdr_unsigned(int, *tl);
2672				    nfsm_adv(nfsm_rndup(fhsize));
2673			    }
2674			}
2675			if (newvp != NULLVP) {
2676			    if (newvp == vp)
2677				vrele(newvp);
2678			    else
2679				vput(newvp);
2680			    newvp = NULLVP;
2681			}
2682			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2683			more_dirs = fxdr_unsigned(int, *tl);
2684		}
2685		/*
2686		 * If at end of rpc data, get the eof boolean
2687		 */
2688		if (!more_dirs) {
2689			tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
2690			more_dirs = (fxdr_unsigned(int, *tl) == 0);
2691		}
2692		m_freem(mrep);
2693	}
2694	/*
2695	 * Fill the last record, if any, out to a multiple of DIRBLKSIZ
2696	 * by increasing d_reclen for the last record.
2697	 */
2698	if (blksiz > 0) {
2699		left = DIRBLKSIZ - blksiz;
2700		dp->d_reclen += left;
2701		uiop->uio_iov->iov_base =
2702		    (char *)uiop->uio_iov->iov_base + left;
2703		uiop->uio_iov->iov_len -= left;
2704		uiop->uio_offset += left;
2705		uiop->uio_resid -= left;
2706	}
2707
2708	/*
2709	 * We are now either at the end of the directory or have filled the
2710	 * block.
2711	 */
2712	if (bigenough)
2713		dnp->n_direofoffset = uiop->uio_offset;
2714	else {
2715		if (uiop->uio_resid > 0)
2716			nfs_printf("EEK! readdirplusrpc resid > 0\n");
2717		nfs_dircookie_lock(dnp);
2718		cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
2719		*cookiep = cookie;
2720		nfs_dircookie_unlock(dnp);
2721	}
2722nfsmout:
2723	if (newvp != NULLVP) {
2724	        if (newvp == vp)
2725			vrele(newvp);
2726		else
2727			vput(newvp);
2728		newvp = NULLVP;
2729	}
2730	return (error);
2731}
2732
2733/*
2734 * Silly rename. To make the stateless NFS filesystem look a little more
2735 * like "ufs", a remove of an active vnode is translated to a rename to a
2736 * funny looking filename that is removed later by nfs_inactive on the
2737 * nfsnode. There is the potential for another process on a different
2738 * client to create the same funny name between the time nfs_lookitup()
2739 * fails and the nfs_rename() completes, but...
2740 */
2741static int
2742nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2743{
2744	struct sillyrename *sp;
2745	struct nfsnode *np;
2746	int error;
2747	short pid;
2748	unsigned int lticks;
2749
2750	cache_purge(dvp);
2751	np = VTONFS(vp);
2752	KASSERT(vp->v_type != VDIR, ("nfs: sillyrename dir"));
2753	sp = malloc(sizeof (struct sillyrename),
2754		M_NFSREQ, M_WAITOK);
2755	sp->s_cred = crhold(cnp->cn_cred);
2756	sp->s_dvp = dvp;
2757	sp->s_removeit = nfs_removeit;
2758	VREF(dvp);
2759
2760	/*
2761	 * Fudge together a funny name.
2762	 * The format of the funny name was changed to accommodate more
2763	 * sillynames per directory.
2764	 * The name is now .nfs.<ticks>.<pid>4.4 (see the sprintf() format
2765	 * below), where ticks is the CPU tick count since boot.
2766	 */
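	/*
	 * Worked example: with ticks == 0x0003ad4f and pid == 0x01a4, the
	 * sprintf() below yields ".nfs.0003ad4f.01a44.4" (21 characters,
	 * well under NFS_MAXNAMLEN).  If that name already exists on the
	 * server, lticks is bumped and the lookup is retried.
	 */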
2767	pid = cnp->cn_thread->td_proc->p_pid;
2768	lticks = (unsigned int)ticks;
2769	for ( ; ; ) {
2770		sp->s_namlen = sprintf(sp->s_name,
2771				       ".nfs.%08x.%04x4.4", lticks,
2772				       pid);
2773		if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
2774				 cnp->cn_thread, NULL))
2775			break;
2776		lticks++;
2777	}
2778	error = nfs_renameit(dvp, cnp, sp);
2779	if (error)
2780		goto bad;
2781	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
2782		cnp->cn_thread, &np);
2783	np->n_sillyrename = sp;
2784	return (0);
2785bad:
2786	vrele(sp->s_dvp);
2787	crfree(sp->s_cred);
2788	free((caddr_t)sp, M_NFSREQ);
2789	return (error);
2790}
2791
2792/*
2793 * Look up a file name and optionally either update the file handle or
2794 * allocate an nfsnode, depending on the value of npp.
2795 * npp == NULL	--> just do the lookup
2796 * *npp == NULL --> allocate a new nfsnode and make sure attributes are
2797 *			handled too
2798 * *npp != NULL --> update the file handle in the vnode
2799 */
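/*
 * Examples from callers in this file: nfs_sillyrename() probes for name
 * collisions with npp == NULL, nfs_symlink() and nfs_mkdir() pass a pointer
 * to a NULL np to allocate a fresh nfsnode, and nfs_sillyrename() finally
 * passes the existing np so that its file handle is updated to the renamed
 * file.
 */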
2800static int
2801nfs_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred,
2802    struct thread *td, struct nfsnode **npp)
2803{
2804	struct vnode *newvp = NULL;
2805	struct nfsnode *np, *dnp = VTONFS(dvp);
2806	caddr_t bpos, dpos;
2807	int error = 0, fhlen, attrflag;
2808	struct mbuf *mreq, *mrep, *md, *mb;
2809	nfsfh_t *nfhp;
2810	int v3 = NFS_ISV3(dvp);
2811
2812	nfsstats.rpccnt[NFSPROC_LOOKUP]++;
2813	mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
2814		NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
2815	mb = mreq;
2816	bpos = mtod(mb, caddr_t);
2817	nfsm_fhtom(dvp, v3);
2818	nfsm_strtom(name, len, NFS_MAXNAMLEN);
2819	nfsm_request(dvp, NFSPROC_LOOKUP, td, cred);
2820	if (npp && !error) {
2821		nfsm_getfh(nfhp, fhlen, v3);
2822		if (*npp) {
2823		    np = *npp;
2824		    if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) {
2825			free((caddr_t)np->n_fhp, M_NFSBIGFH);
2826			np->n_fhp = &np->n_fh;
2827		    } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH)
2828			np->n_fhp =(nfsfh_t *)malloc(fhlen, M_NFSBIGFH, M_WAITOK);
2829		    bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen);
2830		    np->n_fhsize = fhlen;
2831		    newvp = NFSTOV(np);
2832		} else if (NFS_CMPFH(dnp, nfhp, fhlen)) {
2833		    VREF(dvp);
2834		    newvp = dvp;
2835		} else {
2836		    error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE);
2837		    if (error) {
2838			m_freem(mrep);
2839			return (error);
2840		    }
2841		    newvp = NFSTOV(np);
2842		}
2843		if (v3) {
2844			nfsm_postop_attr(newvp, attrflag);
2845			if (!attrflag && *npp == NULL) {
2846				m_freem(mrep);
2847				if (newvp == dvp)
2848					vrele(newvp);
2849				else
2850					vput(newvp);
2851				return (ENOENT);
2852			}
2853		} else
2854			nfsm_loadattr(newvp, NULL);
2855	}
2856	m_freem(mrep);
2857nfsmout:
2858	if (npp && *npp == NULL) {
2859		if (error) {
2860			if (newvp) {
2861				if (newvp == dvp)
2862					vrele(newvp);
2863				else
2864					vput(newvp);
2865			}
2866		} else
2867			*npp = np;
2868	}
2869	return (error);
2870}
2871
2872/*
2873 * Nfs Version 3 commit rpc
2874 */
2875int
2876nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
2877	   struct thread *td)
2878{
2879	u_int32_t *tl;
2880	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2881	caddr_t bpos, dpos;
2882	int error = 0, wccflag = NFSV3_WCCRATTR;
2883	struct mbuf *mreq, *mrep, *md, *mb;
2884
2885	mtx_lock(&nmp->nm_mtx);
2886	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
2887		mtx_unlock(&nmp->nm_mtx);
2888		return (0);
2889	}
2890	mtx_unlock(&nmp->nm_mtx);
2891	nfsstats.rpccnt[NFSPROC_COMMIT]++;
2892	mreq = nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1));
2893	mb = mreq;
2894	bpos = mtod(mb, caddr_t);
2895	nfsm_fhtom(vp, 1);
2896	tl = nfsm_build(u_int32_t *, 3 * NFSX_UNSIGNED);
2897	txdr_hyper(offset, tl);
2898	tl += 2;
2899	*tl = txdr_unsigned(cnt);
2900	nfsm_request(vp, NFSPROC_COMMIT, td, cred);
2901	nfsm_wcc_data(vp, wccflag);
2902	if (!error) {
2903		tl = nfsm_dissect(u_int32_t *, NFSX_V3WRITEVERF);
2904		if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl,
2905			NFSX_V3WRITEVERF)) {
2906			bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
2907				NFSX_V3WRITEVERF);
2908			error = NFSERR_STALEWRITEVERF;
2909		}
2910	}
2911	m_freem(mrep);
2912nfsmout:
2913	return (error);
2914}
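/*
 * The verifier comparison above implements the v3 commit protocol: the
 * server returns an opaque verf cookie that changes whenever it reboots and
 * loses uncommitted data.  When the cookie in a commit reply differs from
 * the one cached in nm_verf, NFSERR_STALEWRITEVERF is returned and the
 * caller (see nfs_flush() below) calls nfs_clearcommit() so the affected
 * buffers are written again rather than wrongly treated as stable.
 */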
2915
2916/*
2917 * Strategy routine.
2918 * For async requests when nfsiod(s) are running, queue the request by
2919 * calling nfs_asyncio(); otherwise just call nfs_doio() to do the
2920 * request.
2921 */
2922static int
2923nfs_strategy(struct vop_strategy_args *ap)
2924{
2925	struct buf *bp = ap->a_bp;
2926	struct ucred *cr;
2927
2928	KASSERT(!(bp->b_flags & B_DONE),
2929	    ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
2930	BUF_ASSERT_HELD(bp);
2931
2932	if (bp->b_iocmd == BIO_READ)
2933		cr = bp->b_rcred;
2934	else
2935		cr = bp->b_wcred;
2936
2937	/*
2938	 * If the op is asynchronous and an i/o daemon is waiting,
2939	 * queue the request, wake it up and wait for completion;
2940	 * otherwise just do it ourselves.
2941	 */
2942	if ((bp->b_flags & B_ASYNC) == 0 ||
2943	    nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread))
2944		(void)nfs_doio(ap->a_vp, bp, cr, curthread);
2945	return (0);
2946}
2947
2948/*
2949 * fsync vnode op. Just call nfs_flush() with commit == 1.
2950 */
2951/* ARGSUSED */
2952static int
2953nfs_fsync(struct vop_fsync_args *ap)
2954{
2955
2956	return (nfs_flush(ap->a_vp, ap->a_waitfor, 1));
2957}
2958
2959/*
2960 * Flush all the blocks associated with a vnode.
2961 * 	Walk through the buffer pool and push any dirty pages
2962 *	associated with the vnode.
2963 */
2964static int
2965nfs_flush(struct vnode *vp, int waitfor, int commit)
2966{
2967	struct nfsnode *np = VTONFS(vp);
2968	struct buf *bp;
2969	int i;
2970	struct buf *nbp;
2971	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2972	int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
2973	int passone = 1;
2974	u_quad_t off, endoff, toff;
2975	struct ucred* wcred = NULL;
2976	struct buf **bvec = NULL;
2977	struct bufobj *bo;
2978	struct thread *td = curthread;
2979#ifndef NFS_COMMITBVECSIZ
2980#define NFS_COMMITBVECSIZ	20
2981#endif
2982	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
2983	int bvecsize = 0, bveccount;
2984
2985	if (nmp->nm_flag & NFSMNT_INT)
2986		slpflag = NFS_PCATCH;
2987	if (!commit)
2988		passone = 0;
2989	bo = &vp->v_bufobj;
2990	/*
2991	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
2992	 * server, but has not been committed to stable storage on the server
2993	 * yet. On the first pass, the byte range is worked out and the commit
2994	 * rpc is done. On the second pass, nfs_writebp() is called to do the
2995	 * job.
2996	 */
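	/*
	 * Sketch of the two passes when commit != 0: pass one collects the
	 * lockable (B_DELWRI | B_NEEDCOMMIT) buffers into bvec and commits
	 * them, while the write loop skips B_NEEDCOMMIT buffers; then the
	 * code jumps back to "again" with passone == 0 and bwrite()s
	 * whatever is still dirty, which reaches nfs_writebp() via
	 * buf_ops_nfs.
	 */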
2997again:
2998	off = (u_quad_t)-1;
2999	endoff = 0;
3000	bvecpos = 0;
3001	if (NFS_ISV3(vp) && commit) {
3002		if (bvec != NULL && bvec != bvec_on_stack)
3003			free(bvec, M_TEMP);
3004		/*
3005		 * Count up how many buffers waiting for a commit.
3006		 * Count up how many buffers are waiting for a commit.
3007		bveccount = 0;
3008		BO_LOCK(bo);
3009		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
3010			if (!BUF_ISLOCKED(bp) &&
3011			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
3012				== (B_DELWRI | B_NEEDCOMMIT))
3013				bveccount++;
3014		}
3015		/*
3016		 * Allocate space to remember the list of bufs to commit.  It is
3017		 * important to use M_NOWAIT here to avoid a race with nfs_write.
3018		 * If we can't get memory (for whatever reason), we will end up
3019		 * committing the buffers one-by-one in the loop below.
3020		 */
3021		if (bveccount > NFS_COMMITBVECSIZ) {
3022			/*
3023			 * Release the vnode interlock to avoid a lock
3024			 * order reversal.
3025			 */
3026			BO_UNLOCK(bo);
3027			bvec = (struct buf **)
3028				malloc(bveccount * sizeof(struct buf *),
3029				       M_TEMP, M_NOWAIT);
3030			BO_LOCK(bo);
3031			if (bvec == NULL) {
3032				bvec = bvec_on_stack;
3033				bvecsize = NFS_COMMITBVECSIZ;
3034			} else
3035				bvecsize = bveccount;
3036		} else {
3037			bvec = bvec_on_stack;
3038			bvecsize = NFS_COMMITBVECSIZ;
3039		}
3040		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
3041			if (bvecpos >= bvecsize)
3042				break;
3043			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
3044				nbp = TAILQ_NEXT(bp, b_bobufs);
3045				continue;
3046			}
3047			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
3048			    (B_DELWRI | B_NEEDCOMMIT)) {
3049				BUF_UNLOCK(bp);
3050				nbp = TAILQ_NEXT(bp, b_bobufs);
3051				continue;
3052			}
3053			BO_UNLOCK(bo);
3054			bremfree(bp);
3055			/*
3056			 * Work out if all buffers are using the same cred
3057			 * so we can deal with them all with one commit.
3058			 *
3059			 * NOTE: we are not clearing B_DONE here, so we have
3060			 * to do it later on in this routine if we intend to
3061			 * initiate I/O on the bp.
3062			 *
3063			 * Note: to avoid loopback deadlocks, we do not
3064			 * assign b_runningbufspace.
3065			 */
3066			if (wcred == NULL)
3067				wcred = bp->b_wcred;
3068			else if (wcred != bp->b_wcred)
3069				wcred = NOCRED;
3070			vfs_busy_pages(bp, 1);
3071
3072			BO_LOCK(bo);
3073			/*
3074			 * bp is protected by being locked, but nbp is not
3075			 * and vfs_busy_pages() may sleep.  We have to
3076			 * recalculate nbp.
3077			 */
3078			nbp = TAILQ_NEXT(bp, b_bobufs);
3079
3080			/*
3081			 * A list of these buffers is kept so that the
3082			 * second loop knows which buffers have actually
3083			 * been committed. This is necessary, since there
3084			 * may be a race between the commit rpc and new
3085			 * uncommitted writes on the file.
3086			 */
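			/*
			 * Worked example of the range arithmetic below,
			 * assuming DEV_BSIZE == 512: a buffer at b_blkno 16
			 * dirty from 0 to 4096 gives toff = 8192, so off =
			 * 8192 and endoff = 12288; a second buffer at
			 * b_blkno 24 dirty from 0 to 2048 extends endoff to
			 * 14336, and a single commit rpc then covers bytes
			 * [8192, 14336).
			 */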
3087			bvec[bvecpos++] = bp;
3088			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
3089				bp->b_dirtyoff;
3090			if (toff < off)
3091				off = toff;
3092			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
3093			if (toff > endoff)
3094				endoff = toff;
3095		}
3096		BO_UNLOCK(bo);
3097	}
3098	if (bvecpos > 0) {
3099		/*
3100		 * Commit data on the server, as required.
3101		 * If all bufs are using the same wcred, then use that with
3102		 * one call for all of them, otherwise commit each one
3103		 * separately.
3104		 */
3105		if (wcred != NOCRED)
3106			retv = nfs_commit(vp, off, (int)(endoff - off),
3107					  wcred, td);
3108		else {
3109			retv = 0;
3110			for (i = 0; i < bvecpos; i++) {
3111				off_t off, size;
3112				bp = bvec[i];
3113				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
3114					bp->b_dirtyoff;
3115				size = (u_quad_t)(bp->b_dirtyend
3116						  - bp->b_dirtyoff);
3117				retv = nfs_commit(vp, off, (int)size,
3118						  bp->b_wcred, td);
3119				if (retv) break;
3120			}
3121		}
3122
3123		if (retv == NFSERR_STALEWRITEVERF)
3124			nfs_clearcommit(vp->v_mount);
3125
3126		/*
3127		 * Now, either mark the blocks I/O done or mark the
3128		 * blocks dirty, depending on whether the commit
3129		 * succeeded.
3130		 */
3131		for (i = 0; i < bvecpos; i++) {
3132			bp = bvec[i];
3133			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
3134			if (retv) {
3135				/*
3136				 * Error, leave B_DELWRI intact
3137				 */
3138				vfs_unbusy_pages(bp);
3139				brelse(bp);
3140			} else {
3141				/*
3142				 * Success, remove B_DELWRI ( bundirty() ).
3143				 *
3144				 * b_dirtyoff/b_dirtyend seem to be NFS
3145				 * specific.  We should probably move that
3146				 * into bundirty(). XXX
3147				 */
3148				bufobj_wref(bo);
3149				bp->b_flags |= B_ASYNC;
3150				bundirty(bp);
3151				bp->b_flags &= ~B_DONE;
3152				bp->b_ioflags &= ~BIO_ERROR;
3153				bp->b_dirtyoff = bp->b_dirtyend = 0;
3154				bufdone(bp);
3155			}
3156		}
3157	}
3158
3159	/*
3160	 * Start/do any write(s) that are required.
3161	 */
3162loop:
3163	BO_LOCK(bo);
3164	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
3165		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
3166			if (waitfor != MNT_WAIT || passone)
3167				continue;
3168
3169			error = BUF_TIMELOCK(bp,
3170			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
3171			    BO_MTX(bo), "nfsfsync", slpflag, slptimeo);
3172			if (error == 0) {
3173				BUF_UNLOCK(bp);
3174				goto loop;
3175			}
3176			if (error == ENOLCK) {
3177				error = 0;
3178				goto loop;
3179			}
3180			if (nfs_sigintr(nmp, td)) {
3181				error = EINTR;
3182				goto done;
3183			}
3184			if (slpflag & PCATCH) {
3185				slpflag = 0;
3186				slptimeo = 2 * hz;
3187			}
3188			goto loop;
3189		}
3190		if ((bp->b_flags & B_DELWRI) == 0)
3191			panic("nfs_fsync: not dirty");
3192		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
3193			BUF_UNLOCK(bp);
3194			continue;
3195		}
3196		BO_UNLOCK(bo);
3197		bremfree(bp);
3198		bp->b_flags |= B_ASYNC;
3202		bwrite(bp);
3203		if (nfs_sigintr(nmp, td)) {
3204			error = EINTR;
3205			goto done;
3206		}
3207		goto loop;
3208	}
3209	if (passone) {
3210		passone = 0;
3211		BO_UNLOCK(bo);
3212		goto again;
3213	}
3214	if (waitfor == MNT_WAIT) {
3215		while (bo->bo_numoutput) {
3216			error = bufobj_wwait(bo, slpflag, slptimeo);
3217			if (error) {
3218			    BO_UNLOCK(bo);
3219			    error = nfs_sigintr(nmp, td);
3220			    if (error)
3221				goto done;
3222			    if (slpflag & PCATCH) {
3223				slpflag = 0;
3224				slptimeo = 2 * hz;
3225			    }
3226			    BO_LOCK(bo);
3227			}
3228		}
3229		if (bo->bo_dirty.bv_cnt != 0 && commit) {
3230			BO_UNLOCK(bo);
3231			goto loop;
3232		}
3233		/*
3234		 * Wait for all the async IO requests to drain
3235		 */
3236		BO_UNLOCK(bo);
3237		mtx_lock(&np->n_mtx);
3238		while (np->n_directio_asyncwr > 0) {
3239			np->n_flag |= NFSYNCWAIT;
3240			error = nfs_msleep(td, (caddr_t)&np->n_directio_asyncwr,
3241					   &np->n_mtx, slpflag | (PRIBIO + 1),
3242					   "nfsfsync", 0);
3243			if (error) {
3244				if (nfs_sigintr(nmp, td)) {
3245					mtx_unlock(&np->n_mtx);
3246					error = EINTR;
3247					goto done;
3248				}
3249			}
3250		}
3251		mtx_unlock(&np->n_mtx);
3252	} else
3253		BO_UNLOCK(bo);
3254	mtx_lock(&np->n_mtx);
3255	if (np->n_flag & NWRITEERR) {
3256		error = np->n_error;
3257		np->n_flag &= ~NWRITEERR;
3258	}
3259	if (commit && bo->bo_dirty.bv_cnt == 0 &&
3260	    bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0)
3261		np->n_flag &= ~NMODIFIED;
3262	mtx_unlock(&np->n_mtx);
3263done:
3264	if (bvec != NULL && bvec != bvec_on_stack)
3265		free(bvec, M_TEMP);
3266	return (error);
3267}
3268
3269/*
3270 * NFS advisory byte-level locks.
3271 */
3272static int
3273nfs_advlock(struct vop_advlock_args *ap)
3274{
3275	struct vnode *vp = ap->a_vp;
3276	u_quad_t size;
3277	int error;
3278
3279	error = vn_lock(vp, LK_SHARED);
3280	if (error)
3281		return (error);
3282	if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
3283		size = VTONFS(vp)->n_size;
3284		VOP_UNLOCK(vp, 0);
3285		error = lf_advlock(ap, &(vp->v_lockf), size);
3286	} else {
3287		if (nfs_advlock_p)
3288			error = nfs_advlock_p(ap);
3289		else
3290			error = ENOLCK;
3291	}
3292
3293	return (error);
3294}
3295
3296/*
3297 * Asynchronous NFS advisory byte-level locks.
3298 */
3299static int
3300nfs_advlockasync(struct vop_advlockasync_args *ap)
3301{
3302	struct vnode *vp = ap->a_vp;
3303	u_quad_t size;
3304	int error;
3305
3306	error = vn_lock(vp, LK_SHARED);
3307	if (error)
3308		return (error);
3309	if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
3310		size = VTONFS(vp)->n_size;
3311		VOP_UNLOCK(vp, 0);
3312		error = lf_advlockasync(ap, &(vp->v_lockf), size);
3313	} else {
3314		VOP_UNLOCK(vp, 0);
3315		error = EOPNOTSUPP;
3316	}
3317	return (error);
3318}
3319
3320/*
3321 * Print out the contents of an nfsnode.
3322 */
3323static int
3324nfs_print(struct vop_print_args *ap)
3325{
3326	struct vnode *vp = ap->a_vp;
3327	struct nfsnode *np = VTONFS(vp);
3328
3329	nfs_printf("\tfileid %ld fsid 0x%x",
3330	   np->n_vattr.va_fileid, np->n_vattr.va_fsid);
3331	if (vp->v_type == VFIFO)
3332		fifo_printinfo(vp);
3333	printf("\n");
3334	return (0);
3335}
3336
3337/*
3338 * This is the "real" nfs::bwrite(struct buf*).
3339 * We set B_CACHE if this is a VMIO buffer.
3340 */
3341int
3342nfs_writebp(struct buf *bp, int force __unused, struct thread *td)
3343{
3344	int s;
3345	int oldflags = bp->b_flags;
3346#if 0
3347	int retv = 1;
3348	off_t off;
3349#endif
3350
3351	BUF_ASSERT_HELD(bp);
3352
3353	if (bp->b_flags & B_INVAL) {
3354		brelse(bp);
3355		return(0);
3356	}
3357
3358	bp->b_flags |= B_CACHE;
3359
3360	/*
3361	 * Undirty the bp.  We will redirty it later if the I/O fails.
3362	 */
3363
3364	s = splbio();
3365	bundirty(bp);
3366	bp->b_flags &= ~B_DONE;
3367	bp->b_ioflags &= ~BIO_ERROR;
3368	bp->b_iocmd = BIO_WRITE;
3369
3370	bufobj_wref(bp->b_bufobj);
3371	curthread->td_ru.ru_oublock++;
3372	splx(s);
3373
3374	/*
3375	 * Note: to avoid loopback deadlocks, we do not
3376	 * assign b_runningbufspace.
3377	 */
3378	vfs_busy_pages(bp, 1);
3379
3380	BUF_KERNPROC(bp);
3381	bp->b_iooffset = dbtob(bp->b_blkno);
3382	bstrategy(bp);
3383
3384	if ((oldflags & B_ASYNC) == 0) {
3385		int rtval = bufwait(bp);
3386
3387		if (oldflags & B_DELWRI) {
3388			s = splbio();
3389			reassignbuf(bp);
3390			splx(s);
3391		}
3392		brelse(bp);
3393		return (rtval);
3394	}
3395
3396	return (0);
3397}
3398
3399/*
3400 * nfs special file access vnode op.
3401 * Essentially just get vattr and then imitate iaccess() since the device is
3402 * local to the client.
3403 */
3404static int
3405nfsspec_access(struct vop_access_args *ap)
3406{
3407	struct vattr *vap;
3408	struct ucred *cred = ap->a_cred;
3409	struct vnode *vp = ap->a_vp;
3410	accmode_t accmode = ap->a_accmode;
3411	struct vattr vattr;
3412	int error;
3413
3414	/*
3415	 * Disallow write attempts on filesystems mounted read-only,
3416	 * unless the file is a socket, fifo, or a block or character
3417	 * device resident on the filesystem.
3418	 */
3419	if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
3420		switch (vp->v_type) {
3421		case VREG:
3422		case VDIR:
3423		case VLNK:
3424			return (EROFS);
3425		default:
3426			break;
3427		}
3428	}
3429	vap = &vattr;
3430	error = VOP_GETATTR(vp, vap, cred);
3431	if (error)
3432		goto out;
3433	error = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
3434			accmode, cred, NULL);
3435out:
3436	return (error);
3437}
3438
3439/*
3440 * Read wrapper for fifos.
3441 */
3442static int
3443nfsfifo_read(struct vop_read_args *ap)
3444{
3445	struct nfsnode *np = VTONFS(ap->a_vp);
3446	int error;
3447
3448	/*
3449	 * Set access flag.
3450	 */
3451	mtx_lock(&np->n_mtx);
3452	np->n_flag |= NACC;
3453	getnanotime(&np->n_atim);
3454	mtx_unlock(&np->n_mtx);
3455	error = fifo_specops.vop_read(ap);
3456	return (error);
3457}
3458
3459/*
3460 * Write wrapper for fifos.
3461 */
3462static int
3463nfsfifo_write(struct vop_write_args *ap)
3464{
3465	struct nfsnode *np = VTONFS(ap->a_vp);
3466
3467	/*
3468	 * Set update flag.
3469	 */
3470	mtx_lock(&np->n_mtx);
3471	np->n_flag |= NUPD;
3472	getnanotime(&np->n_mtim);
3473	mtx_unlock(&np->n_mtx);
3474	return (fifo_specops.vop_write(ap));
3475}
3476
3477/*
3478 * Close wrapper for fifos.
3479 *
3480 * Update the times on the nfsnode then do fifo close.
3481 */
3482static int
3483nfsfifo_close(struct vop_close_args *ap)
3484{
3485	struct vnode *vp = ap->a_vp;
3486	struct nfsnode *np = VTONFS(vp);
3487	struct vattr vattr;
3488	struct timespec ts;
3489
3490	mtx_lock(&np->n_mtx);
3491	if (np->n_flag & (NACC | NUPD)) {
3492		getnanotime(&ts);
3493		if (np->n_flag & NACC)
3494			np->n_atim = ts;
3495		if (np->n_flag & NUPD)
3496			np->n_mtim = ts;
3497		np->n_flag |= NCHG;
3498		if (vrefcnt(vp) == 1 &&
3499		    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
3500			VATTR_NULL(&vattr);
3501			if (np->n_flag & NACC)
3502				vattr.va_atime = np->n_atim;
3503			if (np->n_flag & NUPD)
3504				vattr.va_mtime = np->n_mtim;
3505			mtx_unlock(&np->n_mtx);
3506			(void)VOP_SETATTR(vp, &vattr, ap->a_cred);
3507			goto out;
3508		}
3509	}
3510	mtx_unlock(&np->n_mtx);
3511out:
3512	return (fifo_specops.vop_close(ap));
3513}
3514
3515/*
3516 * Just call nfs_writebp() with the force argument set to 1.
3517 *
3518 * NOTE: B_DONE may or may not be set in a_bp on call.
3519 */
3520static int
3521nfs_bwrite(struct buf *bp)
3522{
3523
3524	return (nfs_writebp(bp, 1, curthread));
3525}
3526
3527struct buf_ops buf_ops_nfs = {
3528	.bop_name	=	"buf_ops_nfs",
3529	.bop_write	=	nfs_bwrite,
3530	.bop_strategy	=	bufstrategy,
3531	.bop_sync	=	bufsync,
3532	.bop_bdflush	=	bufbdflush,
3533};
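/*
 * These buf_ops are attached to each nfs vnode's buffer object when the
 * nfsnode's vnode is created (nfs_nget() sets bo_ops), so that a generic
 * bwrite() on a dirty nfs buffer is routed through nfs_bwrite() above
 * rather than the default bufwrite().
 */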
3534