nfs_clvnops.c revision 222719
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from nfs_vnops.c	8.16 (Berkeley) 5/27/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clvnops.c 222719 2011-06-05 18:17:37Z rmacklem $");

/*
 * vnode op calls for Sun NFS version 2, 3 and 4
 */

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/jail.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/signalvar.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>

#include <fs/nfs/nfsport.h>
#include <fs/nfsclient/nfsnode.h>
#include <fs/nfsclient/nfsmount.h>
#include <fs/nfsclient/nfs.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>

#include <nfs/nfs_lock.h>

/* Defs */
#define	TRUE	1
#define	FALSE	0

extern struct nfsstats newnfsstats;
extern int nfsrv_useacl;
MALLOC_DECLARE(M_NEWNFSREQ);

/*
 * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these
 * calls are not in getblk() and brelse() so that they would not be necessary
 * here.
 */
#ifndef B_VMIO
#define	vfs_busy_pages(bp, f)
#endif
static vop_read_t	nfsfifo_read;
static vop_write_t	nfsfifo_write;
static vop_close_t	nfsfifo_close;
static int	nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *,
		    struct thread *);
static vop_lookup_t	nfs_lookup;
static vop_create_t	nfs_create;
static vop_mknod_t	nfs_mknod;
static vop_open_t	nfs_open;
static vop_pathconf_t	nfs_pathconf;
static vop_close_t	nfs_close;
static vop_access_t	nfs_access;
static vop_getattr_t	nfs_getattr;
static vop_setattr_t	nfs_setattr;
static vop_read_t	nfs_read;
static vop_fsync_t	nfs_fsync;
static vop_remove_t	nfs_remove;
static vop_link_t	nfs_link;
static vop_rename_t	nfs_rename;
static vop_mkdir_t	nfs_mkdir;
static vop_rmdir_t	nfs_rmdir;
static vop_symlink_t	nfs_symlink;
static vop_readdir_t	nfs_readdir;
static vop_strategy_t	nfs_strategy;
static vop_lock1_t	nfs_lock1;
static	int	nfs_lookitup(struct vnode *, char *, int,
		    struct ucred *, struct thread *, struct nfsnode **);
static	int	nfs_sillyrename(struct vnode *, struct vnode *,
		    struct componentname *);
static vop_access_t	nfsspec_access;
static vop_readlink_t	nfs_readlink;
static vop_print_t	nfs_print;
static vop_advlock_t	nfs_advlock;
static vop_advlockasync_t nfs_advlockasync;
static vop_getacl_t nfs_getacl;
static vop_setacl_t nfs_setacl;

/*
 * Global vfs data structures for nfs
 */
struct vop_vector newnfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_access =		nfs_access,
	.vop_advlock =		nfs_advlock,
	.vop_advlockasync =	nfs_advlockasync,
	.vop_close =		nfs_close,
	.vop_create =		nfs_create,
	.vop_fsync =		nfs_fsync,
	.vop_getattr =		nfs_getattr,
	.vop_getpages =		ncl_getpages,
	.vop_putpages =		ncl_putpages,
	.vop_inactive =		ncl_inactive,
	.vop_link =		nfs_link,
	.vop_lock1 =		nfs_lock1,
	.vop_lookup =		nfs_lookup,
	.vop_mkdir =		nfs_mkdir,
	.vop_mknod =		nfs_mknod,
	.vop_open =		nfs_open,
	.vop_pathconf =		nfs_pathconf,
	.vop_print =		nfs_print,
	.vop_read =		nfs_read,
	.vop_readdir =		nfs_readdir,
	.vop_readlink =		nfs_readlink,
	.vop_reclaim =		ncl_reclaim,
	.vop_remove =		nfs_remove,
	.vop_rename =		nfs_rename,
	.vop_rmdir =		nfs_rmdir,
	.vop_setattr =		nfs_setattr,
	.vop_strategy =		nfs_strategy,
	.vop_symlink =		nfs_symlink,
	.vop_write =		ncl_write,
	.vop_getacl =		nfs_getacl,
	.vop_setacl =		nfs_setacl,
};

struct vop_vector newnfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_access =		nfsspec_access,
	.vop_close =		nfsfifo_close,
	.vop_fsync =		nfs_fsync,
	.vop_getattr =		nfs_getattr,
	.vop_inactive =		ncl_inactive,
	.vop_print =		nfs_print,
	.vop_read =		nfsfifo_read,
	.vop_reclaim =		ncl_reclaim,
	.vop_setattr =		nfs_setattr,
	.vop_write =		nfsfifo_write,
};

static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct vattr *vap);
static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name,
    int namelen, struct ucred *cred, struct thread *td);
static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp,
    char *fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp,
    char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td);
static int nfs_renameit(struct vnode *sdvp, struct vnode *svp,
    struct componentname *scnp, struct sillyrename *sp);

/*
 * Global variables
 */
#define	DIRHDSIZ	(sizeof (struct dirent) - (MAXNAMLEN + 1))

SYSCTL_DECL(_vfs_nfs);

static int	nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
	   &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");

static int	nfs_prime_access_cache = 0;
SYSCTL_INT(_vfs_nfs, OID_AUTO, prime_access_cache, CTLFLAG_RW,
	   &nfs_prime_access_cache, 0,
	   "Prime NFS ACCESS cache when fetching attributes");

static int	newnfs_commit_on_close = 0;
SYSCTL_INT(_vfs_nfs, OID_AUTO, commit_on_close, CTLFLAG_RW,
    &newnfs_commit_on_close, 0, "write+commit on close, else only write");

static int	nfs_clean_pages_on_close = 1;
SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
	   &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");

int newnfs_directio_enable = 0;
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
	   &newnfs_directio_enable, 0, "Enable NFS directio");

/*
 * This sysctl allows other processes to mmap a file that has been opened
 * O_DIRECT by a process.  In general, having processes mmap the file while
 * direct I/O is in progress can lead to data inconsistencies.  We allow
 * it by default, however, to prevent a denial of service: otherwise a
 * malicious user could open files O_DIRECT and thereby prevent other users
 * from mmap'ing them.  "Protected" environments where stricter consistency
 * guarantees are required can disable this knob.  The process that opened
 * the file O_DIRECT cannot itself mmap() the file, because mmap'ed I/O on
 * an O_DIRECT open() is not meaningful.
 */
int newnfs_directio_allow_mmap = 1;
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
	   &newnfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");

#if 0
SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
	   &newnfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");

SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD,
	   &newnfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count");
#endif

#define	NFSACCESS_ALL (NFSACCESS_READ | NFSACCESS_MODIFY		\
			 | NFSACCESS_EXTEND | NFSACCESS_EXECUTE	\
			 | NFSACCESS_DELETE | NFSACCESS_LOOKUP)

/*
 * SMP Locking Note:
 * For each lock, the indented list following its description gives the
 * other locks that may be acquired while that lock is held, in order.
 * np->n_mtx : Protects the fields in the nfsnode.
 *     VM Object Lock
 *     VI_MTX (acquired indirectly)
 * nmp->nm_mtx : Protects the fields in the nfsmount.
 *     rep->r_mtx
 * ncl_iod_mutex : Global lock, protects shared nfsiod state.
 * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
 *     nmp->nm_mtx
 *     rep->r_mtx
 * rep->r_mtx : Protects the fields in an nfsreq.
 */

static int
nfs34_access_otw(struct vnode *vp, int wmode, struct thread *td,
    struct ucred *cred, u_int32_t *retmode)
{
	int error = 0, attrflag, i, lrupos;
	u_int32_t rmode;
	struct nfsnode *np = VTONFS(vp);
	struct nfsvattr nfsva;

	error = nfsrpc_accessrpc(vp, wmode, cred, td, &nfsva, &attrflag,
	    &rmode, NULL);
	if (attrflag)
		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
	if (!error) {
		lrupos = 0;
		mtx_lock(&np->n_mtx);
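		/*
		 * Scan the access cache for an entry matching this cred,
		 * updating it in place if one is found.  While scanning,
		 * remember the least recently stamped slot so that, if no
		 * entry matches, the block below can recycle it for this
		 * cred.
		 */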
		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
			if (np->n_accesscache[i].uid == cred->cr_uid) {
				np->n_accesscache[i].mode = rmode;
				np->n_accesscache[i].stamp = time_second;
				break;
			}
			if (i > 0 && np->n_accesscache[i].stamp <
			    np->n_accesscache[lrupos].stamp)
				lrupos = i;
		}
		if (i == NFS_ACCESSCACHESIZE) {
			np->n_accesscache[lrupos].uid = cred->cr_uid;
			np->n_accesscache[lrupos].mode = rmode;
			np->n_accesscache[lrupos].stamp = time_second;
		}
		mtx_unlock(&np->n_mtx);
		if (retmode != NULL)
			*retmode = rmode;
	} else if (NFS_ISV4(vp)) {
		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
	}
	return (error);
}

/*
 * nfs access vnode op.
 * For NFS version 2, just return ok; file accesses may fail later.
 * For NFS versions 3 and 4, use the ACCESS RPC to check accessibility.
 * If file modes are changed on the server, accesses might still fail later.
 */
static int
nfs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	int error = 0, i, gotahit;
	u_int32_t mode, wmode, rmode;
	int v34 = NFS_ISV34(vp);
	struct nfsnode *np = VTONFS(vp);

	/*
	 * Disallow write attempts on filesystems mounted read-only;
	 * unless the file is a socket, fifo, or a block or character
	 * device resident on the filesystem.
	 */
	if ((ap->a_accmode & (VWRITE | VAPPEND | VWRITE_NAMED_ATTRS |
	    VDELETE_CHILD | VWRITE_ATTRIBUTES | VDELETE | VWRITE_ACL |
	    VWRITE_OWNER)) != 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0) {
		switch (vp->v_type) {
		case VREG:
		case VDIR:
		case VLNK:
			return (EROFS);
		default:
			break;
		}
	}
	/*
	 * For NFSv3 or v4, check whether we have done this recently and, if
	 * so, return our cached result instead of making an ACCESS call.
	 * If not, do an ACCESS RPC.  Otherwise (NFSv2), we are stuck
	 * emulating ufs_access() locally using the vattr, which may not be
	 * correct since the server may apply other access criteria, such as
	 * client-uid-to-server-uid mapping, that we do not know about.
	 */
	if (v34) {
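		/*
		 * Map the VOP_ACCESS() mode bits to NFSv3/v4 ACCESS bits.
		 * Note that directories use NFSACCESS_LOOKUP rather than
		 * NFSACCESS_EXECUTE, and that VDELETE_CHILD implies
		 * permission to modify the directory itself.
		 */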
		if (ap->a_accmode & VREAD)
			mode = NFSACCESS_READ;
		else
			mode = 0;
		if (vp->v_type != VDIR) {
			if (ap->a_accmode & VWRITE)
				mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND);
			if (ap->a_accmode & VAPPEND)
				mode |= NFSACCESS_EXTEND;
			if (ap->a_accmode & VEXEC)
				mode |= NFSACCESS_EXECUTE;
			if (ap->a_accmode & VDELETE)
				mode |= NFSACCESS_DELETE;
		} else {
			if (ap->a_accmode & VWRITE)
				mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND);
			if (ap->a_accmode & VAPPEND)
				mode |= NFSACCESS_EXTEND;
			if (ap->a_accmode & VEXEC)
				mode |= NFSACCESS_LOOKUP;
			if (ap->a_accmode & VDELETE)
				mode |= NFSACCESS_DELETE;
			if (ap->a_accmode & VDELETE_CHILD)
				mode |= NFSACCESS_MODIFY;
		}
		/* XXX safety belt, only make blanket request if caching */
		if (nfsaccess_cache_timeout > 0) {
			wmode = NFSACCESS_READ | NFSACCESS_MODIFY |
				NFSACCESS_EXTEND | NFSACCESS_EXECUTE |
				NFSACCESS_DELETE | NFSACCESS_LOOKUP;
		} else {
			wmode = mode;
		}

		/*
		 * Does our cached result allow us to give a definite yes to
		 * this request?
		 */
		gotahit = 0;
		mtx_lock(&np->n_mtx);
		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
			if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) {
			    if (time_second < (np->n_accesscache[i].stamp
				+ nfsaccess_cache_timeout) &&
				(np->n_accesscache[i].mode & mode) == mode) {
				NFSINCRGLOBAL(newnfsstats.accesscache_hits);
				gotahit = 1;
			    }
			    break;
			}
		}
		mtx_unlock(&np->n_mtx);
		if (gotahit == 0) {
			/*
			 * Either a no, or a don't know.  Go to the wire.
			 */
			NFSINCRGLOBAL(newnfsstats.accesscache_misses);
			error = nfs34_access_otw(vp, wmode, ap->a_td,
			    ap->a_cred, &rmode);
			if (!error &&
			    (rmode & mode) != mode)
				error = EACCES;
		}
		return (error);
	} else {
		if ((error = nfsspec_access(ap)) != 0) {
			return (error);
		}
		/*
		 * Attempt to prevent a mapped root from accessing a file
		 * which it shouldn't.  We try to read a byte from the file
		 * if the user is root and the file is not zero length.
		 * After calling nfsspec_access, we should have the correct
		 * file size cached.
		 */
		mtx_lock(&np->n_mtx);
		if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD)
		    && VTONFS(vp)->n_size > 0) {
			struct iovec aiov;
			struct uio auio;
			char buf[1];

			mtx_unlock(&np->n_mtx);
			aiov.iov_base = buf;
			aiov.iov_len = 1;
			auio.uio_iov = &aiov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = 0;
			auio.uio_resid = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_td = ap->a_td;

			if (vp->v_type == VREG)
				error = ncl_readrpc(vp, &auio, ap->a_cred);
			else if (vp->v_type == VDIR) {
				char *bp;
				bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
				aiov.iov_base = bp;
				aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
				error = ncl_readdirrpc(vp, &auio, ap->a_cred,
				    ap->a_td);
				free(bp, M_TEMP);
			} else if (vp->v_type == VLNK)
				error = ncl_readlinkrpc(vp, &auio, ap->a_cred);
			else
				error = EACCES;
		} else
			mtx_unlock(&np->n_mtx);
		return (error);
	}
}


/*
 * nfs open vnode op
 * Check to see if the type is ok
 * and that deletion is not in progress.
 * For paged in text files, you will need to flush the page cache
 * if consistency is lost.
 */
/* ARGSUSED */
static int
nfs_open(struct vop_open_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct vattr vattr;
	int error;
	int fmode = ap->a_mode;

	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
		return (EOPNOTSUPP);

	/*
	 * For NFSv4, we need to do the Open Op before cache validation,
	 * so that we conform to RFC3530 Sec. 9.3.1.
	 */
	if (NFS_ISV4(vp)) {
		error = nfsrpc_open(vp, fmode, ap->a_cred, ap->a_td);
		if (error) {
			error = nfscl_maperr(ap->a_td, error, (uid_t)0,
			    (gid_t)0);
			return (error);
		}
	}

	/*
	 * Now, if this Open will be doing reading, re-validate/flush the
	 * cache, so that Close/Open coherency is maintained.
	 */
	mtx_lock(&np->n_mtx);
	if (np->n_flag & NMODIFIED) {
		mtx_unlock(&np->n_mtx);
		error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
		if (error == EINTR || error == EIO) {
			if (NFS_ISV4(vp))
				(void) nfsrpc_close(vp, 0, ap->a_td);
			return (error);
		}
		mtx_lock(&np->n_mtx);
		np->n_attrstamp = 0;
		if (vp->v_type == VDIR)
			np->n_direofoffset = 0;
		mtx_unlock(&np->n_mtx);
		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
		if (error) {
			if (NFS_ISV4(vp))
				(void) nfsrpc_close(vp, 0, ap->a_td);
			return (error);
		}
		mtx_lock(&np->n_mtx);
		np->n_mtime = vattr.va_mtime;
		if (NFS_ISV4(vp))
			np->n_change = vattr.va_filerev;
	} else {
		mtx_unlock(&np->n_mtx);
		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
		if (error) {
			if (NFS_ISV4(vp))
				(void) nfsrpc_close(vp, 0, ap->a_td);
			return (error);
		}
		mtx_lock(&np->n_mtx);
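		/*
		 * If the change attribute (NFSv4) or modify time (v2, v3)
		 * no longer matches what we cached, another client has
		 * changed the file, so toss the cached buffers to maintain
		 * close/open consistency.
		 */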
		if ((NFS_ISV4(vp) && np->n_change != vattr.va_filerev) ||
		    NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
			if (vp->v_type == VDIR)
				np->n_direofoffset = 0;
			mtx_unlock(&np->n_mtx);
			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
			if (error == EINTR || error == EIO) {
				if (NFS_ISV4(vp))
					(void) nfsrpc_close(vp, 0, ap->a_td);
				return (error);
			}
			mtx_lock(&np->n_mtx);
			np->n_mtime = vattr.va_mtime;
			if (NFS_ISV4(vp))
				np->n_change = vattr.va_filerev;
		}
	}

	/*
	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
	 */
	if (newnfs_directio_enable && (fmode & O_DIRECT) &&
	    (vp->v_type == VREG)) {
		if (np->n_directio_opens == 0) {
			mtx_unlock(&np->n_mtx);
			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
			if (error) {
				if (NFS_ISV4(vp))
					(void) nfsrpc_close(vp, 0, ap->a_td);
				return (error);
			}
			mtx_lock(&np->n_mtx);
			np->n_flag |= NNONCACHE;
		}
		np->n_directio_opens++;
	}
	mtx_unlock(&np->n_mtx);
	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
	return (0);
}

/*
 * nfs close vnode op
 * What an NFS client should do upon close after writing is a debatable issue.
 * Most NFS clients push delayed writes to the server upon close, basically for
 * two reasons:
 * 1 - So that any write errors may be reported back to the client process
 *     doing the close system call. By far the two most likely errors are
 *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
 * 2 - To put a worst case upper bound on cache inconsistency between
 *     multiple clients for the file.
 * There is also a consistency problem for Version 2 of the protocol w.r.t.
 * not being able to tell if other clients are writing a file concurrently,
 * since there is no way of knowing if the changed modify time in the reply
 * is only due to the write for this client.
 * (NFS Version 3 provides weak cache consistency data in the reply that
 *  should be sufficient to detect and handle this case.)
 *
 * The current code does the following:
 * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
 * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
 *                     or commit them (this satisfies 1 and 2 except for the
 *                     case where the server crashes after this close but
 *                     before the commit RPC, which is felt to be "good
 *                     enough").  Changing the last argument to ncl_flush()
 *                     to a 1 would force a commit operation, if it is felt
 *                     a commit is necessary now.
 * for NFS Version 4 - flush the dirty buffers and commit them, if
 *		       nfscl_mustflush() says this is necessary.
 *                     It is necessary if there is no write delegation held,
 *                     in order to satisfy open/close coherency.
 *                     If the file isn't cached on local stable storage,
 *                     it may be necessary in order to detect "out of space"
 *                     errors from the server, if the write delegation
 *                     issued by the server doesn't allow the file to grow.
 */
/* ARGSUSED */
static int
nfs_close(struct vop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct nfsvattr nfsva;
	struct ucred *cred;
	int error = 0, ret, localcred = 0;
	int fmode = ap->a_fflag;

	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF))
		return (0);
	/*
	 * During shutdown, a_cred isn't valid, so just use root.
	 */
	if (ap->a_cred == NOCRED) {
		cred = newnfs_getcred();
		localcred = 1;
	} else {
		cred = ap->a_cred;
	}
	if (vp->v_type == VREG) {
	    /*
	     * Examine and clean dirty pages, regardless of NMODIFIED.
	     * This closes a major hole in close-to-open consistency.
	     * We want to push out all dirty pages (and buffers) on
	     * close, regardless of whether they were dirtied by
	     * mmap'ed writes or via write().
	     */
	    if (nfs_clean_pages_on_close && vp->v_object) {
		VM_OBJECT_LOCK(vp->v_object);
		vm_object_page_clean(vp->v_object, 0, 0, 0);
		VM_OBJECT_UNLOCK(vp->v_object);
	    }
	    mtx_lock(&np->n_mtx);
	    if (np->n_flag & NMODIFIED) {
		mtx_unlock(&np->n_mtx);
		if (NFS_ISV3(vp)) {
		    /*
		     * Under NFSv3 we have dirty buffers to dispose of.  We
		     * must flush them to the NFS server.  We have the option
		     * of waiting all the way through the commit rpc or just
		     * waiting for the initial write.  The default is to only
		     * wait through the initial write so the data is in the
		     * server's cache, which is roughly similar to the state
		     * a standard disk subsystem leaves the file in on close().
		     *
		     * We cannot clear the NMODIFIED bit in np->n_flag due to
		     * potential races with other processes, and certainly
		     * cannot clear it if we don't commit.
		     * These races occur when there is no longer the old
		     * traditional vnode locking implemented for Vnode Ops.
		     */
		    int cm = newnfs_commit_on_close ? 1 : 0;
		    error = ncl_flush(vp, MNT_WAIT, cred, ap->a_td, cm, 0);
		    /* np->n_flag &= ~NMODIFIED; */
		} else if (NFS_ISV4(vp)) {
			if (nfscl_mustflush(vp) != 0) {
				int cm = newnfs_commit_on_close ? 1 : 0;
				error = ncl_flush(vp, MNT_WAIT, cred, ap->a_td,
				    cm, 0);
				/*
				 * as above w.r.t races when clearing
				 * NMODIFIED.
				 * np->n_flag &= ~NMODIFIED;
				 */
			}
		} else
		    error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
		mtx_lock(&np->n_mtx);
	    }
	    /*
	     * Invalidate the attribute cache in all cases.
	     * An open is going to fetch fresh attrs anyway; other procs
	     * on this node that have the file open will be forced to do an
	     * over-the-wire attr fetch, but this is safe.
	     * --> A user found that their RPC count dropped by 20% when
	     *     this was commented out and I can't see any requirement
	     *     for it, so I've disabled it when negative lookups are
	     *     enabled.  (What does this have to do with negative lookup
	     *     caching?  Nothing, except it was reported by the same
	     *     user who needed negative lookup caching, and I wanted a
	     *     way to disable it in case it is the cause of some
	     *     caching/coherency issue that might crop up.)
	     */
	    if (VFSTONFS(vp->v_mount)->nm_negnametimeo == 0)
		    np->n_attrstamp = 0;
	    if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		error = np->n_error;
	    }
	    mtx_unlock(&np->n_mtx);
	}

	if (NFS_ISV4(vp)) {
		/*
		 * Get attributes so "change" is up to date.
		 */
		if (error == 0 && nfscl_mustflush(vp) != 0) {
			ret = nfsrpc_getattr(vp, cred, ap->a_td, &nfsva,
			    NULL);
			if (!ret) {
				np->n_change = nfsva.na_filerev;
				(void) nfscl_loadattrcache(&vp, &nfsva, NULL,
				    NULL, 0, 0);
			}
		}

		/*
		 * and do the close.
		 */
		ret = nfsrpc_close(vp, 0, ap->a_td);
		if (!error && ret)
			error = ret;
		if (error)
			error = nfscl_maperr(ap->a_td, error, (uid_t)0,
			    (gid_t)0);
	}
	if (newnfs_directio_enable)
		KASSERT((np->n_directio_asyncwr == 0),
			("nfs_close: dirty unflushed (%d) directio buffers\n",
			 np->n_directio_asyncwr));
	if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
		mtx_lock(&np->n_mtx);
		KASSERT((np->n_directio_opens > 0),
			("nfs_close: unexpected value (0) of n_directio_opens\n"));
		np->n_directio_opens--;
		if (np->n_directio_opens == 0)
			np->n_flag &= ~NNONCACHE;
		mtx_unlock(&np->n_mtx);
	}
	if (localcred)
		NFSFREECRED(cred);
	return (error);
}

/*
 * nfs getattr call from vfs.
 */
static int
nfs_getattr(struct vop_getattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct thread *td = curthread;	/* XXX */
	struct nfsnode *np = VTONFS(vp);
	int error = 0;
	struct nfsvattr nfsva;
	struct vattr *vap = ap->a_vap;
	struct vattr vattr;

	/*
	 * Update local times for special files.
	 */
	mtx_lock(&np->n_mtx);
	if (np->n_flag & (NACC | NUPD))
		np->n_flag |= NCHG;
	mtx_unlock(&np->n_mtx);
	/*
	 * First look in the cache.
	 */
	if (ncl_getattrcache(vp, &vattr) == 0) {
		vap->va_type = vattr.va_type;
		vap->va_mode = vattr.va_mode;
		vap->va_nlink = vattr.va_nlink;
		vap->va_uid = vattr.va_uid;
		vap->va_gid = vattr.va_gid;
		vap->va_fsid = vattr.va_fsid;
		vap->va_fileid = vattr.va_fileid;
		vap->va_size = vattr.va_size;
		vap->va_blocksize = vattr.va_blocksize;
		vap->va_atime = vattr.va_atime;
		vap->va_mtime = vattr.va_mtime;
		vap->va_ctime = vattr.va_ctime;
		vap->va_gen = vattr.va_gen;
		vap->va_flags = vattr.va_flags;
		vap->va_rdev = vattr.va_rdev;
		vap->va_bytes = vattr.va_bytes;
		vap->va_filerev = vattr.va_filerev;
		/*
		 * Get the local modify time for the case of a write
		 * delegation.
		 */
		nfscl_deleggetmodtime(vp, &vap->va_mtime);
		return (0);
	}

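	/*
	 * An ACCESS RPC carries post-op attributes, so priming the access
	 * cache may refill the attribute cache as well and save a separate
	 * GETATTR RPC.
	 */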
	if (NFS_ISV34(vp) && nfs_prime_access_cache &&
	    nfsaccess_cache_timeout > 0) {
		NFSINCRGLOBAL(newnfsstats.accesscache_misses);
		nfs34_access_otw(vp, NFSACCESS_ALL, td, ap->a_cred, NULL);
		if (ncl_getattrcache(vp, ap->a_vap) == 0) {
			nfscl_deleggetmodtime(vp, &ap->a_vap->va_mtime);
			return (0);
		}
	}
	error = nfsrpc_getattr(vp, ap->a_cred, td, &nfsva, NULL);
	if (!error)
		error = nfscl_loadattrcache(&vp, &nfsva, vap, NULL, 0, 0);
	if (!error) {
		/*
		 * Get the local modify time for the case of a write
		 * delegation.
		 */
		nfscl_deleggetmodtime(vp, &vap->va_mtime);
	} else if (NFS_ISV4(vp)) {
		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
	}
	return (error);
}

/*
 * nfs setattr call.
 */
static int
nfs_setattr(struct vop_setattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct thread *td = curthread;	/* XXX */
	struct vattr *vap = ap->a_vap;
	int error = 0;
	u_quad_t tsize;

#ifndef nolint
	tsize = (u_quad_t)0;
#endif

	/*
	 * Setting of flags and marking of atimes are not supported.
	 */
	if (vap->va_flags != VNOVAL)
		return (EOPNOTSUPP);

	/*
	 * Disallow write attempts if the filesystem is mounted read-only.
	 */
	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	if (vap->va_size != VNOVAL) {
		switch (vp->v_type) {
		case VDIR:
			return (EISDIR);
		case VCHR:
		case VBLK:
		case VSOCK:
		case VFIFO:
			if (vap->va_mtime.tv_sec == VNOVAL &&
			    vap->va_atime.tv_sec == VNOVAL &&
			    vap->va_mode == (mode_t)VNOVAL &&
			    vap->va_uid == (uid_t)VNOVAL &&
			    vap->va_gid == (gid_t)VNOVAL)
				return (0);
			vap->va_size = VNOVAL;
			break;
		default:
			/*
			 * Disallow write attempts if the filesystem is
			 * mounted read-only.
			 */
			if (vp->v_mount->mnt_flag & MNT_RDONLY)
				return (EROFS);
			/*
			 * We run vnode_pager_setsize() early (why?), so
			 * we must set np->n_size now to avoid vinvalbuf
			 * V_SAVE races that might set the size to a lower
			 * value.
			 */
			mtx_lock(&np->n_mtx);
			tsize = np->n_size;
			mtx_unlock(&np->n_mtx);
			error = ncl_meta_setsize(vp, ap->a_cred, td,
			    vap->va_size);
			mtx_lock(&np->n_mtx);
			if (np->n_flag & NMODIFIED) {
			    tsize = np->n_size;
			    mtx_unlock(&np->n_mtx);
			    if (vap->va_size == 0)
				error = ncl_vinvalbuf(vp, 0, td, 1);
			    else
				error = ncl_vinvalbuf(vp, V_SAVE, td, 1);
			    if (error) {
				vnode_pager_setsize(vp, tsize);
				return (error);
			    }
			    /*
			     * Call nfscl_delegmodtime() to set the modify time
			     * locally, as required.
			     */
			    nfscl_delegmodtime(vp);
			} else
			    mtx_unlock(&np->n_mtx);
			/*
			 * np->n_size has already been set to vap->va_size
			 * in ncl_meta_setsize(). We must set it again since
			 * nfs_loadattrcache() could be called through
			 * ncl_meta_setsize() and could modify np->n_size.
			 */
			mtx_lock(&np->n_mtx);
			np->n_vattr.na_size = np->n_size = vap->va_size;
			mtx_unlock(&np->n_mtx);
		}
	} else {
		mtx_lock(&np->n_mtx);
		if ((vap->va_mtime.tv_sec != VNOVAL ||
		    vap->va_atime.tv_sec != VNOVAL) &&
		    (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
			mtx_unlock(&np->n_mtx);
			if ((error = ncl_vinvalbuf(vp, V_SAVE, td, 1)) != 0 &&
			    (error == EINTR || error == EIO))
				return (error);
		} else
			mtx_unlock(&np->n_mtx);
	}
	error = nfs_setattrrpc(vp, vap, ap->a_cred, td);
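	/*
	 * If a size-changing SETATTR failed, roll the local size back to
	 * its pre-truncation value so the nfsnode and pager remain
	 * consistent with the file on the server.
	 */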
	if (error && vap->va_size != VNOVAL) {
		mtx_lock(&np->n_mtx);
		np->n_size = np->n_vattr.na_size = tsize;
		vnode_pager_setsize(vp, tsize);
		mtx_unlock(&np->n_mtx);
	}
	return (error);
}

/*
 * Do an nfs setattr rpc.
 */
static int
nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred,
    struct thread *td)
{
	struct nfsnode *np = VTONFS(vp);
	int error, ret, attrflag, i;
	struct nfsvattr nfsva;

	if (NFS_ISV34(vp)) {
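		/*
		 * A SETATTR can change the mode or ownership, so any cached
		 * ACCESS replies may no longer be valid.  Invalidate them
		 * and mark the node as modified for delegation purposes.
		 */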
		mtx_lock(&np->n_mtx);
		for (i = 0; i < NFS_ACCESSCACHESIZE; i++)
			np->n_accesscache[i].stamp = 0;
		np->n_flag |= NDELEGMOD;
		mtx_unlock(&np->n_mtx);
	}
	error = nfsrpc_setattr(vp, vap, NULL, cred, td, &nfsva, &attrflag,
	    NULL);
	if (attrflag) {
		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
		if (ret && !error)
			error = ret;
	}
	if (error && NFS_ISV4(vp))
		error = nfscl_maperr(td, error, vap->va_uid, vap->va_gid);
	return (error);
}

/*
 * nfs lookup call, one step at a time...
 * First look in cache
 * If not found, unlock the directory nfsnode and do the rpc
 */
static int
nfs_lookup(struct vop_lookup_args *ap)
{
	struct componentname *cnp = ap->a_cnp;
	struct vnode *dvp = ap->a_dvp;
	struct vnode **vpp = ap->a_vpp;
	struct mount *mp = dvp->v_mount;
	int flags = cnp->cn_flags;
	struct vnode *newvp;
	struct nfsmount *nmp;
	struct nfsnode *np, *newnp;
	int error = 0, attrflag, dattrflag, ltype;
	struct thread *td = cnp->cn_thread;
	struct nfsfh *nfhp;
	struct nfsvattr dnfsva, nfsva;
	struct vattr vattr;
	struct timespec dmtime;

	*vpp = NULLVP;
	if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);
	if (dvp->v_type != VDIR)
		return (ENOTDIR);
	nmp = VFSTONFS(mp);
	np = VTONFS(dvp);

	/* For NFSv4, wait until any remove is done. */
	mtx_lock(&np->n_mtx);
	while (NFSHASNFSV4(nmp) && (np->n_flag & NREMOVEINPROG)) {
		np->n_flag |= NREMOVEWANT;
		(void) msleep((caddr_t)np, &np->n_mtx, PZERO, "nfslkup", 0);
	}
	mtx_unlock(&np->n_mtx);

	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0)
		return (error);
	error = cache_lookup(dvp, vpp, cnp);
	if (error > 0 && error != ENOENT)
		return (error);
	if (error == -1) {
		/*
		 * We only accept a positive hit in the cache if the
		 * change time of the file matches our cached copy.
		 * Otherwise, we discard the cache entry and fallback
		 * to doing a lookup RPC.
		 *
		 * To better handle stale file handles and attributes,
		 * clear the attribute cache of this node if it is a
		 * leaf component, part of an open() call, and not
		 * locally modified before fetching the attributes.
		 * This should allow stale file handles to be detected
		 * here where we can fall back to a LOOKUP RPC to
		 * recover rather than having nfs_open() detect the
		 * stale file handle and failing open(2) with ESTALE.
		 */
		newvp = *vpp;
		newnp = VTONFS(newvp);
		if (!(nmp->nm_flag & NFSMNT_NOCTO) &&
		    (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
		    !(newnp->n_flag & NMODIFIED)) {
			mtx_lock(&newnp->n_mtx);
			newnp->n_attrstamp = 0;
			mtx_unlock(&newnp->n_mtx);
		}
		if (nfscl_nodeleg(newvp, 0) == 0 ||
		    (VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 &&
		    timespeccmp(&vattr.va_ctime, &newnp->n_ctime, ==))) {
			NFSINCRGLOBAL(newnfsstats.lookupcache_hits);
			if (cnp->cn_nameiop != LOOKUP &&
			    (flags & ISLASTCN))
				cnp->cn_flags |= SAVENAME;
			return (0);
		}
		cache_purge(newvp);
		if (dvp != newvp)
			vput(newvp);
		else
			vrele(newvp);
		*vpp = NULLVP;
	} else if (error == ENOENT) {
		if (dvp->v_iflag & VI_DOOMED)
			return (ENOENT);
		/*
		 * We only accept a negative hit in the cache if the
		 * modification time of the parent directory matches
		 * our cached copy.  Otherwise, we discard all of the
		 * negative cache entries for this directory.  We also
		 * only trust negative cache entries for less than
		 * nm_negnametimeo seconds.
		 */
		if ((u_int)(ticks - np->n_dmtime_ticks) <
		    (nmp->nm_negnametimeo * hz) &&
		    VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 &&
		    timespeccmp(&vattr.va_mtime, &np->n_dmtime, ==)) {
			NFSINCRGLOBAL(newnfsstats.lookupcache_hits);
			return (ENOENT);
		}
		cache_purge_negative(dvp);
		mtx_lock(&np->n_mtx);
		timespecclear(&np->n_dmtime);
		mtx_unlock(&np->n_mtx);
	}

	/*
	 * Cache the modification time of the parent directory in case
	 * the lookup fails and results in adding the first negative
	 * name cache entry for the directory.  Since this is reading
	 * a single time_t, don't bother with locking.  The
	 * modification time may be a bit stale, but it must be read
	 * before performing the lookup RPC to prevent a race where
	 * another lookup updates the timestamp on the directory after
	 * the lookup RPC has been performed on the server but before
	 * n_dmtime is set at the end of this function.
	 */
	dmtime = np->n_vattr.na_mtime;
	error = 0;
	newvp = NULLVP;
	NFSINCRGLOBAL(newnfsstats.lookupcache_misses);
	error = nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
	    cnp->cn_cred, td, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
	    NULL);
	if (dattrflag)
		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
	if (error) {
		if (newvp != NULLVP) {
			vput(newvp);
			*vpp = NULLVP;
		}

		if (error != ENOENT) {
			if (NFS_ISV4(dvp))
				error = nfscl_maperr(td, error, (uid_t)0,
				    (gid_t)0);
			return (error);
		}

		/* The requested file was not found. */
		if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
		    (flags & ISLASTCN)) {
			/*
			 * XXX: UFS does a full VOP_ACCESS(dvp,
			 * VWRITE) here instead of just checking
			 * MNT_RDONLY.
			 */
			if (mp->mnt_flag & MNT_RDONLY)
				return (EROFS);
			cnp->cn_flags |= SAVENAME;
			return (EJUSTRETURN);
		}

		if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE) {
			/*
			 * Maintain n_dmtime as the modification time
			 * of the parent directory when the oldest -ve
			 * name cache entry for this directory was
			 * added.  If a -ve cache entry has already
			 * been added with a newer modification time
			 * by a concurrent lookup, then don't bother
			 * adding a cache entry.  The modification
			 * time of the directory might have changed
			 * due to the file this lookup failed to find
			 * being created.  In that case a subsequent
			 * lookup would incorrectly use the entry
			 * added here instead of doing an extra
			 * lookup.
			 */
			mtx_lock(&np->n_mtx);
			if (timespeccmp(&np->n_dmtime, &dmtime, <=)) {
				if (!timespecisset(&np->n_dmtime)) {
					np->n_dmtime = dmtime;
					np->n_dmtime_ticks = ticks;
				}
				mtx_unlock(&np->n_mtx);
				cache_enter(dvp, NULL, cnp);
			} else
				mtx_unlock(&np->n_mtx);
		}
		return (ENOENT);
	}

	/*
	 * Handle RENAME case...
	 */
	if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
		if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) {
			FREE((caddr_t)nfhp, M_NFSFH);
			return (EISDIR);
		}
		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
		    LK_EXCLUSIVE);
		if (error)
			return (error);
		newvp = NFSTOV(np);
		if (attrflag)
			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
			    0, 1);
		*vpp = newvp;
		cnp->cn_flags |= SAVENAME;
		return (0);
	}

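	/*
	 * For "..", the child is locked before its parent, which is the
	 * reverse of the usual lock order, so dvp must be unlocked before
	 * the ".." vnode is acquired.  vfs_busy() keeps the mount from
	 * being unmounted while dvp is unlocked.
	 */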
	if (flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		error = vfs_busy(mp, MBF_NOWAIT);
		if (error != 0) {
			vfs_ref(mp);
			VOP_UNLOCK(dvp, 0);
			error = vfs_busy(mp, 0);
			vn_lock(dvp, ltype | LK_RETRY);
			vfs_rel(mp);
			if (error == 0 && (dvp->v_iflag & VI_DOOMED)) {
				vfs_unbusy(mp);
				error = ENOENT;
			}
			if (error != 0)
				return (error);
		}
		VOP_UNLOCK(dvp, 0);
		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
		    cnp->cn_lkflags);
		if (error == 0)
			newvp = NFSTOV(np);
		vfs_unbusy(mp);
		if (newvp != dvp)
			vn_lock(dvp, ltype | LK_RETRY);
		if (dvp->v_iflag & VI_DOOMED) {
			if (error == 0) {
				if (newvp == dvp)
					vrele(newvp);
				else
					vput(newvp);
			}
			error = ENOENT;
		}
		if (error != 0)
			return (error);
		if (attrflag)
			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
			    0, 1);
	} else if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) {
		FREE((caddr_t)nfhp, M_NFSFH);
		VREF(dvp);
		newvp = dvp;
		if (attrflag)
			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
			    0, 1);
	} else {
		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
		    cnp->cn_lkflags);
		if (error)
			return (error);
		newvp = NFSTOV(np);
		if (attrflag)
			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
			    0, 1);
		else if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
		    !(np->n_flag & NMODIFIED)) {
			/*
			 * Flush the attribute cache when opening a
			 * leaf node to ensure that fresh attributes
			 * are fetched in nfs_open() since we did not
			 * fetch attributes from the LOOKUP reply.
			 */
			mtx_lock(&np->n_mtx);
			np->n_attrstamp = 0;
			mtx_unlock(&np->n_mtx);
		}
	}
	if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
		cnp->cn_flags |= SAVENAME;
	if ((cnp->cn_flags & MAKEENTRY) &&
	    (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) {
		np->n_ctime = np->n_vattr.na_vattr.va_ctime;
		cache_enter(dvp, newvp, cnp);
	}
	*vpp = newvp;
	return (0);
}

/*
 * nfs read call.
 * Just call ncl_bioread() to do the work.
 */
static int
nfs_read(struct vop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;

	switch (vp->v_type) {
	case VREG:
		return (ncl_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
	case VDIR:
		return (EISDIR);
	default:
		return (EOPNOTSUPP);
	}
}

/*
 * nfs readlink call
 */
static int
nfs_readlink(struct vop_readlink_args *ap)
{
	struct vnode *vp = ap->a_vp;

	if (vp->v_type != VLNK)
		return (EINVAL);
	return (ncl_bioread(vp, ap->a_uio, 0, ap->a_cred));
}

/*
 * Do a readlink rpc.
 * Called by ncl_doio() from below the buffer cache.
 */
int
ncl_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
	int error, ret, attrflag;
	struct nfsvattr nfsva;

	error = nfsrpc_readlink(vp, uiop, cred, uiop->uio_td, &nfsva,
	    &attrflag, NULL);
	if (attrflag) {
		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
		if (ret && !error)
			error = ret;
	}
	if (error && NFS_ISV4(vp))
		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
	return (error);
}

/*
 * nfs read rpc call
 * Ditto above
 */
int
ncl_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
	int error, ret, attrflag;
	struct nfsvattr nfsva;

	error = nfsrpc_read(vp, uiop, cred, uiop->uio_td, &nfsva, &attrflag,
	    NULL);
	if (attrflag) {
		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
		if (ret && !error)
			error = ret;
	}
	if (error && NFS_ISV4(vp))
		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
	return (error);
}

/*
 * nfs write call
 */
int
ncl_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
    int *iomode, int *must_commit, int called_from_strategy)
{
	struct nfsvattr nfsva;
	int error = 0, attrflag, ret;

	error = nfsrpc_write(vp, uiop, iomode, must_commit, cred,
	    uiop->uio_td, &nfsva, &attrflag, NULL, called_from_strategy);
	if (attrflag) {
		if (VTONFS(vp)->n_flag & ND_NFSV4)
			ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 1,
			    1);
		else
			ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
			    1);
		if (ret && !error)
			error = ret;
	}
	if (vp->v_mount->mnt_kern_flag & MNTK_ASYNC)
		*iomode = NFSWRITE_FILESYNC;
	if (error && NFS_ISV4(vp))
		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
	return (error);
}

/*
 * nfs mknod rpc
 * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
 * mode set to specify the file type and the size field for rdev.
 */
static int
nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct vattr *vap)
{
	struct nfsvattr nfsva, dnfsva;
	struct vnode *newvp = NULL;
	struct nfsnode *np = NULL, *dnp;
	struct nfsfh *nfhp;
	struct vattr vattr;
	int error = 0, attrflag, dattrflag;
	u_int32_t rdev;

	if (vap->va_type == VCHR || vap->va_type == VBLK)
		rdev = vap->va_rdev;
	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
		rdev = 0xffffffff;
	else
		return (EOPNOTSUPP);
	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)))
		return (error);
	error = nfsrpc_mknod(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap,
	    rdev, vap->va_type, cnp->cn_cred, cnp->cn_thread, &dnfsva,
	    &nfsva, &nfhp, &attrflag, &dattrflag, NULL);
	if (!error) {
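		/*
		 * If the server did not return a file handle in the MKNOD
		 * reply, do a LOOKUP RPC by name to fetch one for the new
		 * node.
		 */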
		if (!nfhp)
			(void) nfsrpc_lookup(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread,
			    &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
			    NULL);
		if (nfhp)
			error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp,
			    cnp->cn_thread, &np, NULL, LK_EXCLUSIVE);
	}
	if (dattrflag)
		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
	if (!error) {
		newvp = NFSTOV(np);
		if (attrflag != 0) {
			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
			    0, 1);
			if (error != 0)
				vput(newvp);
		}
	}
	if (!error) {
		if ((cnp->cn_flags & MAKEENTRY))
			cache_enter(dvp, newvp, cnp);
		*vpp = newvp;
	} else if (NFS_ISV4(dvp)) {
		error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid,
		    vap->va_gid);
	}
	dnp = VTONFS(dvp);
	mtx_lock(&dnp->n_mtx);
	dnp->n_flag |= NMODIFIED;
	if (!dattrflag)
		dnp->n_attrstamp = 0;
	mtx_unlock(&dnp->n_mtx);
	return (error);
}

/*
 * nfs mknod vop
 * just call nfs_mknodrpc() to do the work.
 */
/* ARGSUSED */
static int
nfs_mknod(struct vop_mknod_args *ap)
{
	return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
}

static struct mtx nfs_cverf_mtx;
MTX_SYSINIT(nfs_cverf_mtx, &nfs_cverf_mtx, "NFS create verifier mutex",
    MTX_DEF);

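/*
 * Return a monotonically increasing exclusive-create verifier, seeded
 * with random bits at first use so that verifiers are unlikely to be
 * reused across reboots.
 */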
static nfsquad_t
nfs_get_cverf(void)
{
	static nfsquad_t cverf;
	nfsquad_t ret;
	static int cverf_initialized = 0;

	mtx_lock(&nfs_cverf_mtx);
	if (cverf_initialized == 0) {
		cverf.lval[0] = arc4random();
		cverf.lval[1] = arc4random();
		cverf_initialized = 1;
	} else
		cverf.qval++;
	ret = cverf;
	mtx_unlock(&nfs_cverf_mtx);

	return (ret);
}

/*
 * nfs file create call
 */
static int
nfs_create(struct vop_create_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vattr *vap = ap->a_vap;
	struct componentname *cnp = ap->a_cnp;
	struct nfsnode *np = NULL, *dnp;
	struct vnode *newvp = NULL;
	struct nfsmount *nmp;
	struct nfsvattr dnfsva, nfsva;
	struct nfsfh *nfhp;
	nfsquad_t cverf;
	int error = 0, attrflag, dattrflag, fmode = 0;
	struct vattr vattr;

	/*
	 * Oops, not for me..
	 */
	if (vap->va_type == VSOCK)
		return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap));

	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)))
		return (error);
	if (vap->va_vaflags & VA_EXCLUSIVE)
		fmode |= O_EXCL;
	dnp = VTONFS(dvp);
	nmp = VFSTONFS(vnode_mount(dvp));
again:
	/* For NFSv4, wait until any remove is done. */
	mtx_lock(&dnp->n_mtx);
	while (NFSHASNFSV4(nmp) && (dnp->n_flag & NREMOVEINPROG)) {
		dnp->n_flag |= NREMOVEWANT;
		(void) msleep((caddr_t)dnp, &dnp->n_mtx, PZERO, "nfscrt", 0);
	}
	mtx_unlock(&dnp->n_mtx);

	cverf = nfs_get_cverf();
	error = nfsrpc_create(dvp, cnp->cn_nameptr, cnp->cn_namelen,
	    vap, cverf, fmode, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva,
	    &nfhp, &attrflag, &dattrflag, NULL);
	if (!error) {
		if (nfhp == NULL)
			(void) nfsrpc_lookup(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread,
			    &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
			    NULL);
		if (nfhp != NULL)
			error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp,
			    cnp->cn_thread, &np, NULL, LK_EXCLUSIVE);
	}
	if (dattrflag)
		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
	if (!error) {
		newvp = NFSTOV(np);
		if (attrflag)
			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
			    0, 1);
	}
	if (error) {
		if (newvp != NULL) {
			vput(newvp);
			newvp = NULL;
		}
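		/*
		 * If the server does not support exclusive create
		 * (NFSERR_NOTSUPP), retry the create without O_EXCL.
		 */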
		if (NFS_ISV34(dvp) && (fmode & O_EXCL) &&
		    error == NFSERR_NOTSUPP) {
			fmode &= ~O_EXCL;
			goto again;
		}
	} else if (NFS_ISV34(dvp) && (fmode & O_EXCL)) {
		if (nfscl_checksattr(vap, &nfsva)) {
			/*
			 * We are normally called with only a partially
			 * initialized VAP. Since the NFSv3 spec says that
			 * the server may use the file attributes to
			 * store the verifier, the spec requires us to do a
			 * SETATTR RPC. FreeBSD servers store the verifier in
			 * atime, but we can't really assume that all servers
			 * will so we ensure that our SETATTR sets both atime
			 * and mtime.
			 */
			if (vap->va_mtime.tv_sec == VNOVAL)
				vfs_timestamp(&vap->va_mtime);
			if (vap->va_atime.tv_sec == VNOVAL)
				vap->va_atime = vap->va_mtime;
			error = nfsrpc_setattr(newvp, vap, NULL, cnp->cn_cred,
			    cnp->cn_thread, &nfsva, &attrflag, NULL);
			if (error && (vap->va_uid != (uid_t)VNOVAL ||
			    vap->va_gid != (gid_t)VNOVAL)) {
				/* try again without setting uid/gid */
				vap->va_uid = (uid_t)VNOVAL;
				vap->va_gid = (gid_t)VNOVAL;
				error = nfsrpc_setattr(newvp, vap, NULL,
				    cnp->cn_cred, cnp->cn_thread, &nfsva,
				    &attrflag, NULL);
			}
			if (attrflag)
				(void) nfscl_loadattrcache(&newvp, &nfsva, NULL,
				    NULL, 0, 1);
		}
	}
	if (!error) {
		if (cnp->cn_flags & MAKEENTRY)
			cache_enter(dvp, newvp, cnp);
		*ap->a_vpp = newvp;
	} else if (NFS_ISV4(dvp)) {
		error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid,
		    vap->va_gid);
	}
	mtx_lock(&dnp->n_mtx);
	dnp->n_flag |= NMODIFIED;
	if (!dattrflag)
		dnp->n_attrstamp = 0;
	mtx_unlock(&dnp->n_mtx);
	return (error);
}

/*
 * nfs file remove call
 * To try and make nfs semantics closer to ufs semantics, a file that has
 * other processes using the vnode is renamed instead of removed and then
 * removed later on the last close.
 * - If v_usecount > 1
 *	if a rename is not already in the works
 *	    call nfs_sillyrename() to set it up
 * - else
 *	do the remove RPC
 */
static int
nfs_remove(struct vop_remove_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vnode *dvp = ap->a_dvp;
	struct componentname *cnp = ap->a_cnp;
	struct nfsnode *np = VTONFS(vp);
	int error = 0;
	struct vattr vattr;

	KASSERT((cnp->cn_flags & HASBUF) != 0, ("nfs_remove: no name"));
	KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount"));
	if (vp->v_type == VDIR)
		error = EPERM;
	else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
	    VOP_GETATTR(vp, &vattr, cnp->cn_cred) == 0 &&
	    vattr.va_nlink > 1)) {
		/*
		 * Purge the name cache so that the chance of a lookup for
		 * the name succeeding while the remove is in progress is
		 * minimized.  Without node locking it can still happen, such
		 * that an I/O op returns ESTALE, but you can get that anyway
		 * when another host removes the file, so it is not worth
		 * working around here.
		 */
		cache_purge(vp);
		/*
		 * throw away biocache buffers, mainly to avoid
		 * unnecessary delayed writes later.
		 */
		error = ncl_vinvalbuf(vp, 0, cnp->cn_thread, 1);
		/* Do the rpc */
		if (error != EINTR && error != EIO)
			error = nfs_removerpc(dvp, vp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
		/*
		 * Kludge City: If the first reply to the remove RPC is lost,
		 *   the reply to the retransmitted request will be ENOENT,
		 *   since the file was in fact removed.
		 *   Therefore, we cheat and return success.
		 */
		if (error == ENOENT)
			error = 0;
	} else if (!np->n_sillyrename)
		error = nfs_sillyrename(dvp, vp, cnp);
	mtx_lock(&np->n_mtx);
	np->n_attrstamp = 0;
	mtx_unlock(&np->n_mtx);
	return (error);
}

/*
 * nfs file remove rpc called from nfs_inactive
 */
int
ncl_removeit(struct sillyrename *sp, struct vnode *vp)
{
	/*
	 * Make sure that the directory vnode is still valid.
	 * XXX we should lock sp->s_dvp here.
	 */
	if (sp->s_dvp->v_type == VBAD)
		return (0);
	return (nfs_removerpc(sp->s_dvp, vp, sp->s_name, sp->s_namlen,
	    sp->s_cred, NULL));
}

/*
 * Nfs remove rpc, called from nfs_remove() and ncl_removeit().
 */
static int
nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name,
    int namelen, struct ucred *cred, struct thread *td)
{
	struct nfsvattr dnfsva;
	struct nfsnode *dnp = VTONFS(dvp);
	int error = 0, dattrflag;

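	/*
	 * Set NREMOVEINPROG so that an NFSv4 lookup or create in this
	 * directory will wait for the remove to complete, and wake any
	 * such waiter once the RPC is done.
	 */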
1663	mtx_lock(&dnp->n_mtx);
1664	dnp->n_flag |= NREMOVEINPROG;
1665	mtx_unlock(&dnp->n_mtx);
1666	error = nfsrpc_remove(dvp, name, namelen, vp, cred, td, &dnfsva,
1667	    &dattrflag, NULL);
1668	mtx_lock(&dnp->n_mtx);
1669	if ((dnp->n_flag & NREMOVEWANT)) {
1670		dnp->n_flag &= ~(NREMOVEWANT | NREMOVEINPROG);
1671		mtx_unlock(&dnp->n_mtx);
1672		wakeup((caddr_t)dnp);
1673	} else {
1674		dnp->n_flag &= ~NREMOVEINPROG;
1675		mtx_unlock(&dnp->n_mtx);
1676	}
1677	if (dattrflag)
1678		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
1679	mtx_lock(&dnp->n_mtx);
1680	dnp->n_flag |= NMODIFIED;
1681	if (!dattrflag)
1682		dnp->n_attrstamp = 0;
1683	mtx_unlock(&dnp->n_mtx);
1684	if (error && NFS_ISV4(dvp))
1685		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
1686	return (error);
1687}
1688
1689/*
1690 * nfs file rename call
1691 */
1692static int
1693nfs_rename(struct vop_rename_args *ap)
1694{
1695	struct vnode *fvp = ap->a_fvp;
1696	struct vnode *tvp = ap->a_tvp;
1697	struct vnode *fdvp = ap->a_fdvp;
1698	struct vnode *tdvp = ap->a_tdvp;
1699	struct componentname *tcnp = ap->a_tcnp;
1700	struct componentname *fcnp = ap->a_fcnp;
1701	struct nfsnode *fnp = VTONFS(ap->a_fvp);
1702	struct nfsnode *tdnp = VTONFS(ap->a_tdvp);
1703	struct nfsv4node *newv4 = NULL;
1704	int error;
1705
1706	KASSERT((tcnp->cn_flags & HASBUF) != 0 &&
1707	    (fcnp->cn_flags & HASBUF) != 0, ("nfs_rename: no name"));
1708	/* Check for cross-device rename */
1709	if ((fvp->v_mount != tdvp->v_mount) ||
1710	    (tvp && (fvp->v_mount != tvp->v_mount))) {
1711		error = EXDEV;
1712		goto out;
1713	}
1714
1715	if (fvp == tvp) {
1716		ncl_printf("nfs_rename: fvp == tvp (can't happen)\n");
1717		error = 0;
1718		goto out;
1719	}
1720	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
1721		goto out;
1722
1723	/*
1724	 * We have to flush B_DELWRI data prior to renaming
1725	 * the file.  If we don't, the delayed-write buffers
1726	 * can be flushed out later after the file has gone stale
1727	 * under NFSV3.  NFSV2 does not have this problem because
1728	 * ( as far as I can tell ) it flushes dirty buffers more
1729	 * often.
1730	 *
1731	 * Skip the rename operation if the fsync fails, this can happen
1732	 * due to the server's volume being full, when we pushed out data
1733	 * that was written back to our cache earlier. Not checking for
1734	 * this condition can result in potential (silent) data loss.
1735	 */
1736	error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
1737	VOP_UNLOCK(fvp, 0);
1738	if (!error && tvp)
1739		error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
1740	if (error)
1741		goto out;
1742
1743	/*
1744	 * If the tvp exists and is in use, sillyrename it before doing the
1745	 * rename of the new file over it.
1746	 * XXX Can't sillyrename a directory.
1747	 */
1748	if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
1749		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
1750		vput(tvp);
1751		tvp = NULL;
1752	}
1753
1754	error = nfs_renamerpc(fdvp, fvp, fcnp->cn_nameptr, fcnp->cn_namelen,
1755	    tdvp, tvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
1756	    tcnp->cn_thread);
1757
1758	if (error == 0 && NFS_ISV4(tdvp)) {
1759		/*
1760		 * For NFSv4, check whether the name and directory file
1761		 * handle cached in n_v4 still match and, if not, replace them.
1762		 */
1763		MALLOC(newv4, struct nfsv4node *,
1764		    sizeof (struct nfsv4node) +
1765		    tdnp->n_fhp->nfh_len + tcnp->cn_namelen - 1,
1766		    M_NFSV4NODE, M_WAITOK);
1767		mtx_lock(&tdnp->n_mtx);
1768		mtx_lock(&fnp->n_mtx);
1769		if (fnp->n_v4 != NULL && fvp->v_type == VREG &&
1770		    (fnp->n_v4->n4_namelen != tcnp->cn_namelen ||
1771		      NFSBCMP(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4),
1772		      tcnp->cn_namelen) ||
1773		      tdnp->n_fhp->nfh_len != fnp->n_v4->n4_fhlen ||
1774		      NFSBCMP(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data,
1775			tdnp->n_fhp->nfh_len))) {
1776#ifdef notdef
1777{ char nnn[100]; int nnnl;
1778nnnl = (tcnp->cn_namelen < 100) ? tcnp->cn_namelen : 99;
1779bcopy(tcnp->cn_nameptr, nnn, nnnl);
1780nnn[nnnl] = '\0';
1781printf("ren replace=%s\n",nnn);
1782}
1783#endif
1784			FREE((caddr_t)fnp->n_v4, M_NFSV4NODE);
1785			fnp->n_v4 = newv4;
1786			newv4 = NULL;
1787			fnp->n_v4->n4_fhlen = tdnp->n_fhp->nfh_len;
1788			fnp->n_v4->n4_namelen = tcnp->cn_namelen;
1789			NFSBCOPY(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data,
1790			    tdnp->n_fhp->nfh_len);
1791			NFSBCOPY(tcnp->cn_nameptr,
1792			    NFS4NODENAME(fnp->n_v4), tcnp->cn_namelen);
1793		}
1794		mtx_unlock(&tdnp->n_mtx);
1795		mtx_unlock(&fnp->n_mtx);
1796		if (newv4 != NULL)
1797			FREE((caddr_t)newv4, M_NFSV4NODE);
1798	}
1799
1800	if (fvp->v_type == VDIR) {
1801		if (tvp != NULL && tvp->v_type == VDIR)
1802			cache_purge(tdvp);
1803		cache_purge(fdvp);
1804	}
1805
1806out:
1807	if (tdvp == tvp)
1808		vrele(tdvp);
1809	else
1810		vput(tdvp);
1811	if (tvp)
1812		vput(tvp);
1813	vrele(fdvp);
1814	vrele(fvp);
1815	/*
1816	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
1817	 */
1818	if (error == ENOENT)
1819		error = 0;
1820	return (error);
1821}
1822
1823/*
1824 * nfs file rename rpc called from nfs_sillyrename() below
1825 */
1826static int
1827nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp,
1828    struct sillyrename *sp)
1829{
1830
1831	return (nfs_renamerpc(sdvp, svp, scnp->cn_nameptr, scnp->cn_namelen,
1832	    sdvp, NULL, sp->s_name, sp->s_namlen, scnp->cn_cred,
1833	    scnp->cn_thread));
1834}
1835
1836/*
1837 * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
1838 */
1839static int
1840nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr,
1841    int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr,
1842    int tnamelen, struct ucred *cred, struct thread *td)
1843{
1844	struct nfsvattr fnfsva, tnfsva;
1845	struct nfsnode *fdnp = VTONFS(fdvp);
1846	struct nfsnode *tdnp = VTONFS(tdvp);
1847	int error = 0, fattrflag, tattrflag;
1848
1849	error = nfsrpc_rename(fdvp, fvp, fnameptr, fnamelen, tdvp, tvp,
1850	    tnameptr, tnamelen, cred, td, &fnfsva, &tnfsva, &fattrflag,
1851	    &tattrflag, NULL, NULL);
1852	mtx_lock(&fdnp->n_mtx);
1853	fdnp->n_flag |= NMODIFIED;
1854	if (fattrflag != 0) {
1855		mtx_unlock(&fdnp->n_mtx);
1856		(void) nfscl_loadattrcache(&fdvp, &fnfsva, NULL, NULL, 0, 1);
1857	} else {
1858		fdnp->n_attrstamp = 0;
1859		mtx_unlock(&fdnp->n_mtx);
1860	}
1861	mtx_lock(&tdnp->n_mtx);
1862	tdnp->n_flag |= NMODIFIED;
1863	if (tattrflag != 0) {
1864		mtx_unlock(&tdnp->n_mtx);
1865		(void) nfscl_loadattrcache(&tdvp, &tnfsva, NULL, NULL, 0, 1);
1866	} else {
1867		tdnp->n_attrstamp = 0;
1868		mtx_unlock(&tdnp->n_mtx);
1869	}
1870	if (error && NFS_ISV4(fdvp))
1871		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
1872	return (error);
1873}
1874
1875/*
1876 * nfs hard link create call
1877 */
1878static int
1879nfs_link(struct vop_link_args *ap)
1880{
1881	struct vnode *vp = ap->a_vp;
1882	struct vnode *tdvp = ap->a_tdvp;
1883	struct componentname *cnp = ap->a_cnp;
1884	struct nfsnode *np, *tdnp;
1885	struct nfsvattr nfsva, dnfsva;
1886	int error = 0, attrflag, dattrflag;
1887
1888	if (vp->v_mount != tdvp->v_mount) {
1889		return (EXDEV);
1890	}
1891
1892	/*
1893	 * Push all writes to the server, so that the attribute cache
1894	 * doesn't get "out of sync" with the server.
1895	 * XXX There should be a better way!
1896	 */
1897	VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
1898
1899	error = nfsrpc_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_namelen,
1900	    cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &attrflag,
1901	    &dattrflag, NULL);
1902	tdnp = VTONFS(tdvp);
1903	mtx_lock(&tdnp->n_mtx);
1904	tdnp->n_flag |= NMODIFIED;
1905	if (dattrflag != 0) {
1906		mtx_unlock(&tdnp->n_mtx);
1907		(void) nfscl_loadattrcache(&tdvp, &dnfsva, NULL, NULL, 0, 1);
1908	} else {
1909		tdnp->n_attrstamp = 0;
1910		mtx_unlock(&tdnp->n_mtx);
1911	}
1912	if (attrflag)
1913		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
1914	else {
1915		np = VTONFS(vp);
1916		mtx_lock(&np->n_mtx);
1917		np->n_attrstamp = 0;
1918		mtx_unlock(&np->n_mtx);
1919	}
1920	/*
1921	 * If negative lookup caching is enabled, I might as well
1922	 * add an entry for this node. Not necessary for correctness,
1923	 * but if negative caching is enabled, then the system
1924	 * must care about lookup caching hit rate, so...
1925	 */
1926	if (VFSTONFS(vp->v_mount)->nm_negnametimeo != 0 &&
1927	    (cnp->cn_flags & MAKEENTRY))
1928		cache_enter(tdvp, vp, cnp);
1929	if (error && NFS_ISV4(vp))
1930		error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0,
1931		    (gid_t)0);
1932	return (error);
1933}
1934
1935/*
1936 * nfs symbolic link create call
1937 */
1938static int
1939nfs_symlink(struct vop_symlink_args *ap)
1940{
1941	struct vnode *dvp = ap->a_dvp;
1942	struct vattr *vap = ap->a_vap;
1943	struct componentname *cnp = ap->a_cnp;
1944	struct nfsvattr nfsva, dnfsva;
1945	struct nfsfh *nfhp;
1946	struct nfsnode *np = NULL, *dnp;
1947	struct vnode *newvp = NULL;
1948	int error = 0, attrflag, dattrflag, ret;
1949
1950	vap->va_type = VLNK;
1951	error = nfsrpc_symlink(dvp, cnp->cn_nameptr, cnp->cn_namelen,
1952	    ap->a_target, vap, cnp->cn_cred, cnp->cn_thread, &dnfsva,
1953	    &nfsva, &nfhp, &attrflag, &dattrflag, NULL);
1954	if (nfhp) {
1955		ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread,
1956		    &np, NULL, LK_EXCLUSIVE);
1957		if (!ret)
1958			newvp = NFSTOV(np);
1959		else if (!error)
1960			error = ret;
1961	}
1962	if (newvp != NULL) {
1963		if (attrflag)
1964			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
1965			    0, 1);
1966	} else if (!error) {
1967		/*
1968		 * If we do not have an error and we could not extract the
1969		 * newvp from the response due to the request being NFSv2, we
1970		 * have to do a lookup in order to obtain a newvp to return.
1971		 */
1972		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
1973		    cnp->cn_cred, cnp->cn_thread, &np);
1974		if (!error)
1975			newvp = NFSTOV(np);
1976	}
1977	if (error) {
1978		if (newvp)
1979			vput(newvp);
1980		if (NFS_ISV4(dvp))
1981			error = nfscl_maperr(cnp->cn_thread, error,
1982			    vap->va_uid, vap->va_gid);
1983	} else {
1984		/*
1985		 * If negative lookup caching is enabled, I might as well
1986		 * add an entry for this node. Not necessary for correctness,
1987		 * but if negative caching is enabled, then the system
1988		 * must care about lookup caching hit rate, so...
1989		 */
1990		if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 &&
1991		    (cnp->cn_flags & MAKEENTRY))
1992			cache_enter(dvp, newvp, cnp);
1993		*ap->a_vpp = newvp;
1994	}
1995
1996	dnp = VTONFS(dvp);
1997	mtx_lock(&dnp->n_mtx);
1998	dnp->n_flag |= NMODIFIED;
1999	if (dattrflag != 0) {
2000		mtx_unlock(&dnp->n_mtx);
2001		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
2002	} else {
2003		dnp->n_attrstamp = 0;
2004		mtx_unlock(&dnp->n_mtx);
2005	}
2006	return (error);
2007}
2008
2009/*
2010 * nfs make dir call
2011 */
2012static int
2013nfs_mkdir(struct vop_mkdir_args *ap)
2014{
2015	struct vnode *dvp = ap->a_dvp;
2016	struct vattr *vap = ap->a_vap;
2017	struct componentname *cnp = ap->a_cnp;
2018	struct nfsnode *np = NULL, *dnp;
2019	struct vnode *newvp = NULL;
2020	struct vattr vattr;
2021	struct nfsfh *nfhp;
2022	struct nfsvattr nfsva, dnfsva;
2023	int error = 0, attrflag, dattrflag, ret;
2024
2025	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
2026		return (error);
2027	vap->va_type = VDIR;
2028	error = nfsrpc_mkdir(dvp, cnp->cn_nameptr, cnp->cn_namelen,
2029	    vap, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp,
2030	    &attrflag, &dattrflag, NULL);
2031	dnp = VTONFS(dvp);
2032	mtx_lock(&dnp->n_mtx);
2033	dnp->n_flag |= NMODIFIED;
2034	if (dattrflag != 0) {
2035		mtx_unlock(&dnp->n_mtx);
2036		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
2037	} else {
2038		dnp->n_attrstamp = 0;
2039		mtx_unlock(&dnp->n_mtx);
2040	}
2041	if (nfhp) {
2042		ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread,
2043		    &np, NULL, LK_EXCLUSIVE);
2044		if (!ret) {
2045			newvp = NFSTOV(np);
2046			if (attrflag)
2047			   (void) nfscl_loadattrcache(&newvp, &nfsva, NULL,
2048				NULL, 0, 1);
2049		} else if (!error)
2050			error = ret;
2051	}
2052	if (!error && newvp == NULL) {
2053		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
2054		    cnp->cn_cred, cnp->cn_thread, &np);
2055		if (!error) {
2056			newvp = NFSTOV(np);
2057			if (newvp->v_type != VDIR)
2058				error = EEXIST;
2059		}
2060	}
2061	if (error) {
2062		if (newvp)
2063			vput(newvp);
2064		if (NFS_ISV4(dvp))
2065			error = nfscl_maperr(cnp->cn_thread, error,
2066			    vap->va_uid, vap->va_gid);
2067	} else {
2068		/*
2069		 * If negative lookup caching is enabled, I might as well
2070		 * add an entry for this node. Not necessary for correctness,
2071		 * but if negative caching is enabled, then the system
2072		 * must care about lookup caching hit rate, so...
2073		 */
2074		if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 &&
2075		    (cnp->cn_flags & MAKEENTRY))
2076			cache_enter(dvp, newvp, cnp);
2077		*ap->a_vpp = newvp;
2078	}
2079	return (error);
2080}
2081
2082/*
2083 * nfs remove directory call
2084 */
2085static int
2086nfs_rmdir(struct vop_rmdir_args *ap)
2087{
2088	struct vnode *vp = ap->a_vp;
2089	struct vnode *dvp = ap->a_dvp;
2090	struct componentname *cnp = ap->a_cnp;
2091	struct nfsnode *dnp;
2092	struct nfsvattr dnfsva;
2093	int error, dattrflag;
2094
2095	if (dvp == vp)
2096		return (EINVAL);
2097	error = nfsrpc_rmdir(dvp, cnp->cn_nameptr, cnp->cn_namelen,
2098	    cnp->cn_cred, cnp->cn_thread, &dnfsva, &dattrflag, NULL);
2099	dnp = VTONFS(dvp);
2100	mtx_lock(&dnp->n_mtx);
2101	dnp->n_flag |= NMODIFIED;
2102	if (dattrflag != 0) {
2103		mtx_unlock(&dnp->n_mtx);
2104		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
2105	} else {
2106		dnp->n_attrstamp = 0;
2107		mtx_unlock(&dnp->n_mtx);
2108	}
2109
2110	cache_purge(dvp);
2111	cache_purge(vp);
2112	if (error && NFS_ISV4(dvp))
2113		error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0,
2114		    (gid_t)0);
2115	/*
2116	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
2117	 */
2118	if (error == ENOENT)
2119		error = 0;
2120	return (error);
2121}
2122
2123/*
2124 * nfs readdir call
2125 */
2126static int
2127nfs_readdir(struct vop_readdir_args *ap)
2128{
2129	struct vnode *vp = ap->a_vp;
2130	struct nfsnode *np = VTONFS(vp);
2131	struct uio *uio = ap->a_uio;
2132	int tresid, error = 0;
2133	struct vattr vattr;
2134
2135	if (vp->v_type != VDIR)
2136		return (EPERM);
2137
2138	/*
2139	 * First, check for hit on the EOF offset cache
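	 * n_direofoffset caches the offset at which EOF was last seen;
	 * if the caller is at or beyond it and the directory is
	 * unchanged on the server (same change attribute for NFSv4,
	 * same mtime otherwise), the read is satisfied with no RPC.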
2140	 */
2141	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
2142	    (np->n_flag & NMODIFIED) == 0) {
2143		if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) {
2144			mtx_lock(&np->n_mtx);
2145			if ((NFS_ISV4(vp) && np->n_change == vattr.va_filerev) ||
2146			    !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
2147				mtx_unlock(&np->n_mtx);
2148				NFSINCRGLOBAL(newnfsstats.direofcache_hits);
2149				return (0);
2150			} else
2151				mtx_unlock(&np->n_mtx);
2152		}
2153	}
2154
2155	/*
2156	 * Call ncl_bioread() to do the real work.
2157	 */
2158	tresid = uio->uio_resid;
2159	error = ncl_bioread(vp, uio, 0, ap->a_cred);
2160
2161	if (!error && uio->uio_resid == tresid)
2162		NFSINCRGLOBAL(newnfsstats.direofcache_misses);
2163	return (error);
2164}
2165
2166/*
2167 * Readdir rpc call.
2168 * Called from below the buffer cache by ncl_doio().
2169 */
2170int
2171ncl_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
2172    struct thread *td)
2173{
2174	struct nfsvattr nfsva;
2175	nfsuint64 *cookiep, cookie;
2176	struct nfsnode *dnp = VTONFS(vp);
2177	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2178	int error = 0, eof, attrflag;
2179
2180	KASSERT(uiop->uio_iovcnt == 1 &&
2181	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
2182	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
2183	    ("nfs readdirrpc bad uio"));
2184
2185	/*
2186	 * If there is no cookie, assume directory was stale.
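	 * (The dircookie cache maps block-aligned uio offsets to the
	 * opaque directory cookies previously returned by the server,
	 * so a missing entry means our cached view of the directory
	 * can no longer be trusted.)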
2187	 */
2188	ncl_dircookie_lock(dnp);
2189	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0);
2190	if (cookiep) {
2191		cookie = *cookiep;
2192		ncl_dircookie_unlock(dnp);
2193	} else {
2194		ncl_dircookie_unlock(dnp);
2195		return (NFSERR_BAD_COOKIE);
2196	}
2197
2198	if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp))
2199		(void)ncl_fsinfo(nmp, vp, cred, td);
2200
2201	error = nfsrpc_readdir(vp, uiop, &cookie, cred, td, &nfsva,
2202	    &attrflag, &eof, NULL);
2203	if (attrflag)
2204		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
2205
2206	if (!error) {
2207		/*
2208		 * We are now either at the end of the directory or have filled
2209		 * the block.
2210		 */
2211		if (eof)
2212			dnp->n_direofoffset = uiop->uio_offset;
2213		else {
2214			if (uiop->uio_resid > 0)
2215				ncl_printf("EEK! readdirrpc resid > 0\n");
2216			ncl_dircookie_lock(dnp);
2217			cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1);
2218			*cookiep = cookie;
2219			ncl_dircookie_unlock(dnp);
2220		}
2221	} else if (NFS_ISV4(vp)) {
2222		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
2223	}
2224	return (error);
2225}
2226
2227/*
2228 * NFS V3 readdir plus RPC. Used in place of ncl_readdirrpc().
2229 */
2230int
2231ncl_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
2232    struct thread *td)
2233{
2234	struct nfsvattr nfsva;
2235	nfsuint64 *cookiep, cookie;
2236	struct nfsnode *dnp = VTONFS(vp);
2237	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2238	int error = 0, attrflag, eof;
2239
2240	KASSERT(uiop->uio_iovcnt == 1 &&
2241	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
2242	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
2243	    ("nfs readdirplusrpc bad uio"));
2244
2245	/*
2246	 * If there is no cookie, assume directory was stale.
2247	 */
2248	ncl_dircookie_lock(dnp);
2249	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0);
2250	if (cookiep) {
2251		cookie = *cookiep;
2252		ncl_dircookie_unlock(dnp);
2253	} else {
2254		ncl_dircookie_unlock(dnp);
2255		return (NFSERR_BAD_COOKIE);
2256	}
2257
2258	if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp))
2259		(void)ncl_fsinfo(nmp, vp, cred, td);
2260	error = nfsrpc_readdirplus(vp, uiop, &cookie, cred, td, &nfsva,
2261	    &attrflag, &eof, NULL);
2262	if (attrflag)
2263		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
2264
2265	if (!error) {
2266		/*
2267		 * We are now either at the end of the directory or have
2268		 * filled the block.
2269		 */
2270		if (eof)
2271			dnp->n_direofoffset = uiop->uio_offset;
2272		else {
2273			if (uiop->uio_resid > 0)
2274				ncl_printf("EEK! readdirplusrpc resid > 0\n");
2275			ncl_dircookie_lock(dnp);
2276			cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1);
2277			*cookiep = cookie;
2278			ncl_dircookie_unlock(dnp);
2279		}
2280	} else if (NFS_ISV4(vp)) {
2281		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
2282	}
2283	return (error);
2284}
2285
2286/*
2287 * Silly rename. To make the stateless NFS filesystem look a little
2288 * more like "ufs", a remove of an active vnode is translated to a
2289 * rename to a funny looking filename that is removed by nfs_inactive
2290 * on the nfsnode. There is the potential for another process on a
2291 * different client to create the same funny name between the time
2292 * the nfs_lookitup() fails and the nfs_rename() completes, but...
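 * The sillyrename state is hung off the nfsnode as n_sillyrename and
 * the renamed file is finally removed by ncl_removeit(), called from
 * nfs_inactive once the last reference goes away.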
2293 */
2294static int
2295nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2296{
2297	struct sillyrename *sp;
2298	struct nfsnode *np;
2299	int error;
2300	short pid;
2301	unsigned int lticks;
2302
2303	cache_purge(dvp);
2304	np = VTONFS(vp);
2305	KASSERT(vp->v_type != VDIR, ("nfs: sillyrename dir"));
2306	MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename),
2307	    M_NEWNFSREQ, M_WAITOK);
2308	sp->s_cred = crhold(cnp->cn_cred);
2309	sp->s_dvp = dvp;
2310	VREF(dvp);
2311
2312	/*
2313	 * Fudge together a funny name.
2314	 * Changing the format of the funny name to accommodate more
2315	 * sillynames per directory.
2316	 * The name is now changed to .nfs.<ticks>.<pid>.4, where ticks is
2317	 * CPU ticks since boot.
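	 * For example (hypothetical values), lticks 0x0001f4a3 and
	 * pid 0x01a4 would produce the name ".nfs.0001f4a3.01a4.4".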
2318	 */
2319	pid = cnp->cn_thread->td_proc->p_pid;
2320	lticks = (unsigned int)ticks;
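	/*
	 * Keep bumping lticks until the lookup of the candidate name
	 * fails, i.e. until the name is not already in use.
	 */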
2321	for ( ; ; ) {
2322		sp->s_namlen = sprintf(sp->s_name,
2323				       ".nfs.%08x.%04x4.4", lticks,
2324				       pid);
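#ifdef notdef
{ /* hypothetical debug aid: show each candidate sillyname */
printf("silly candidate=%s\n", sp->s_name);
}
#endif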
2325		if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
2326				 cnp->cn_thread, NULL))
2327			break;
2328		lticks++;
2329	}
2330	error = nfs_renameit(dvp, vp, cnp, sp);
2331	if (error)
2332		goto bad;
2333	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
2334		cnp->cn_thread, &np);
2335	np->n_sillyrename = sp;
2336	return (0);
2337bad:
2338	vrele(sp->s_dvp);
2339	crfree(sp->s_cred);
2340	free((caddr_t)sp, M_NEWNFSREQ);
2341	return (error);
2342}
2343
2344/*
2345 * Look up a file name and optionally either update the file handle or
2346 * allocate an nfsnode, depending on the value of npp.
2347 * npp == NULL	--> just do the lookup
2348 * *npp == NULL --> allocate a new nfsnode and make sure attributes are
2349 *			handled too
2350 * *npp != NULL --> update the file handle in the vnode
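 * For example, nfs_sillyrename() below uses npp == NULL to probe
 * whether a candidate sillyname is already in use and *npp != NULL to
 * refresh the renamed file's handle afterwards.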
2351 */
2352static int
2353nfs_lookitup(struct vnode *dvp, char *name, int len, struct ucred *cred,
2354    struct thread *td, struct nfsnode **npp)
2355{
2356	struct vnode *newvp = NULL, *vp;
2357	struct nfsnode *np, *dnp = VTONFS(dvp);
2358	struct nfsfh *nfhp, *onfhp;
2359	struct nfsvattr nfsva, dnfsva;
2360	struct componentname cn;
2361	int error = 0, attrflag, dattrflag;
2362	u_int hash;
2363
2364	error = nfsrpc_lookup(dvp, name, len, cred, td, &dnfsva, &nfsva,
2365	    &nfhp, &attrflag, &dattrflag, NULL);
2366	if (dattrflag)
2367		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
2368	if (npp && !error) {
2369		if (*npp != NULL) {
2370		    np = *npp;
2371		    vp = NFSTOV(np);
2372		    /*
2373		     * For NFSv4, check whether the name and directory file
2374		     * handle cached in n_v4 still match and, if not, replace them.
2375		     */
2376		    if (np->n_v4 != NULL && nfsva.na_type == VREG &&
2377			(np->n_v4->n4_namelen != len ||
2378			 NFSBCMP(name, NFS4NODENAME(np->n_v4), len) ||
2379			 dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
2380			 NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
2381			 dnp->n_fhp->nfh_len))) {
2382#ifdef notdef
2383{ char nnn[100]; int nnnl;
2384nnnl = (len < 100) ? len : 99;
2385bcopy(name, nnn, nnnl);
2386nnn[nnnl] = '\0';
2387printf("replace=%s\n",nnn);
2388}
2389#endif
2390			    FREE((caddr_t)np->n_v4, M_NFSV4NODE);
2391			    MALLOC(np->n_v4, struct nfsv4node *,
2392				sizeof (struct nfsv4node) +
2393				dnp->n_fhp->nfh_len + len - 1,
2394				M_NFSV4NODE, M_WAITOK);
2395			    np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
2396			    np->n_v4->n4_namelen = len;
2397			    NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
2398				dnp->n_fhp->nfh_len);
2399			    NFSBCOPY(name, NFS4NODENAME(np->n_v4), len);
2400		    }
2401		    hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len,
2402			FNV1_32_INIT);
2403		    onfhp = np->n_fhp;
2404		    /*
2405		     * Rehash node for new file handle.
2406		     */
2407		    vfs_hash_rehash(vp, hash);
2408		    np->n_fhp = nfhp;
2409		    if (onfhp != NULL)
2410			FREE((caddr_t)onfhp, M_NFSFH);
2411		    newvp = NFSTOV(np);
2412		} else if (NFS_CMPFH(dnp, nfhp->nfh_fh, nfhp->nfh_len)) {
2413		    FREE((caddr_t)nfhp, M_NFSFH);
2414		    VREF(dvp);
2415		    newvp = dvp;
2416		} else {
2417		    cn.cn_nameptr = name;
2418		    cn.cn_namelen = len;
2419		    error = nfscl_nget(dvp->v_mount, dvp, nfhp, &cn, td,
2420			&np, NULL, LK_EXCLUSIVE);
2421		    if (error)
2422			return (error);
2423		    newvp = NFSTOV(np);
2424		}
2425		if (!attrflag && *npp == NULL) {
2426			if (newvp == dvp)
2427				vrele(newvp);
2428			else
2429				vput(newvp);
2430			return (ENOENT);
2431		}
2432		if (attrflag)
2433			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
2434			    0, 1);
2435	}
2436	if (npp && *npp == NULL) {
2437		if (error) {
2438			if (newvp) {
2439				if (newvp == dvp)
2440					vrele(newvp);
2441				else
2442					vput(newvp);
2443			}
2444		} else
2445			*npp = np;
2446	}
2447	if (error && NFS_ISV4(dvp))
2448		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
2449	return (error);
2450}
2451
2452/*
2453 * Nfs Version 3 and 4 commit rpc
2454 */
2455int
2456ncl_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
2457   struct thread *td)
2458{
2459	struct nfsvattr nfsva;
2460	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2461	int error, attrflag;
2462	u_char verf[NFSX_VERF];
2463
2464	mtx_lock(&nmp->nm_mtx);
2465	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
2466		mtx_unlock(&nmp->nm_mtx);
2467		return (0);
2468	}
2469	mtx_unlock(&nmp->nm_mtx);
2470	error = nfsrpc_commit(vp, offset, cnt, cred, td, verf, &nfsva,
2471	    &attrflag, NULL);
2472	if (!error) {
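		/*
		 * If the write verifier returned by the server differs
		 * from the one cached for the mount, the server has
		 * rebooted; return NFSERR_STALEWRITEVERF so the caller
		 * knows previously written data must be redone.
		 */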
2473		mtx_lock(&nmp->nm_mtx);
2474		if (NFSBCMP((caddr_t)nmp->nm_verf, verf, NFSX_VERF)) {
2475			NFSBCOPY(verf, (caddr_t)nmp->nm_verf, NFSX_VERF);
2476			error = NFSERR_STALEWRITEVERF;
2477		}
2478		mtx_unlock(&nmp->nm_mtx);
2479		if (!error && attrflag)
2480			(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL,
2481			    0, 1);
2482	} else if (NFS_ISV4(vp)) {
2483		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
2484	}
2485	return (error);
2486}
2487
2488/*
2489 * Strategy routine.
2490 * For async requests when nfsiod(s) are running, queue the request by
2491 * calling ncl_asyncio(); otherwise just call ncl_doio() to do the
2492 * request.
2493 */
2494static int
2495nfs_strategy(struct vop_strategy_args *ap)
2496{
2497	struct buf *bp = ap->a_bp;
2498	struct ucred *cr;
2499
2500	KASSERT(!(bp->b_flags & B_DONE),
2501	    ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
2502	BUF_ASSERT_HELD(bp);
2503
2504	if (bp->b_iocmd == BIO_READ)
2505		cr = bp->b_rcred;
2506	else
2507		cr = bp->b_wcred;
2508
2509	/*
2510	 * If the op is asynchronous and an i/o daemon is waiting,
2511	 * queue the request, wake it up and wait for completion;
2512	 * otherwise just do it ourselves.
2513	 */
2514	if ((bp->b_flags & B_ASYNC) == 0 ||
2515	    ncl_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread))
2516		(void) ncl_doio(ap->a_vp, bp, cr, curthread, 1);
2517	return (0);
2518}
2519
2520/*
2521 * fsync vnode op. Just call ncl_flush() with commit == 1.
2522 */
2523/* ARGSUSED */
2524static int
2525nfs_fsync(struct vop_fsync_args *ap)
2526{
2527	return (ncl_flush(ap->a_vp, ap->a_waitfor, NULL, ap->a_td, 1, 0));
2528}
2529
2530/*
2531 * Flush all the blocks associated with a vnode.
2532 * 	Walk through the buffer pool and push any dirty pages
2533 *	associated with the vnode.
2534 * If the called_from_renewthread argument is TRUE, it has been called
2535 * from the NFSv4 renew thread and, as such, cannot block indefinitely
2536 * waiting for a buffer write to complete.
2537 */
2538int
2539ncl_flush(struct vnode *vp, int waitfor, struct ucred *cred, struct thread *td,
2540    int commit, int called_from_renewthread)
2541{
2542	struct nfsnode *np = VTONFS(vp);
2543	struct buf *bp;
2544	int i;
2545	struct buf *nbp;
2546	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2547	int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
2548	int passone = 1, trycnt = 0;
2549	u_quad_t off, endoff, toff;
2550	struct ucred* wcred = NULL;
2551	struct buf **bvec = NULL;
2552	struct bufobj *bo;
2553#ifndef NFS_COMMITBVECSIZ
2554#define	NFS_COMMITBVECSIZ	20
2555#endif
2556	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
2557	int bvecsize = 0, bveccount;
2558
2559	if (called_from_renewthread != 0)
2560		slptimeo = hz;
2561	if (nmp->nm_flag & NFSMNT_INT)
2562		slpflag = NFS_PCATCH;
2563	if (!commit)
2564		passone = 0;
2565	bo = &vp->v_bufobj;
2566	/*
2567	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
2568	 * server, but has not been committed to stable storage on the server
2569	 * yet. On the first pass, the byte range is worked out and the commit
2570	 * rpc is done. On the second pass, ncl_writebp() is called to do the
2571	 * job.
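	 * (When commit == 0, pass one is skipped and every dirty
	 * buffer is simply written asynchronously.)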
2572	 */
2573again:
2574	off = (u_quad_t)-1;
2575	endoff = 0;
2576	bvecpos = 0;
2577	if (NFS_ISV34(vp) && commit) {
2578		if (bvec != NULL && bvec != bvec_on_stack)
2579			free(bvec, M_TEMP);
2580		/*
2581		 * Count up how many buffers waiting for a commit.
2582		 */
2583		bveccount = 0;
2584		BO_LOCK(bo);
2585		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
2586			if (!BUF_ISLOCKED(bp) &&
2587			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
2588				== (B_DELWRI | B_NEEDCOMMIT))
2589				bveccount++;
2590		}
2591		/*
2592		 * Allocate space to remember the list of bufs to commit.  It is
2593		 * important to use M_NOWAIT here to avoid a race with nfs_write.
2594		 * If we can't get memory (for whatever reason), we will end up
2595		 * committing the buffers one-by-one in the loop below.
2596		 */
2597		if (bveccount > NFS_COMMITBVECSIZ) {
2598			/*
2599			 * Release the vnode interlock to avoid a lock
2600			 * order reversal.
2601			 */
2602			BO_UNLOCK(bo);
2603			bvec = (struct buf **)
2604				malloc(bveccount * sizeof(struct buf *),
2605				       M_TEMP, M_NOWAIT);
2606			BO_LOCK(bo);
2607			if (bvec == NULL) {
2608				bvec = bvec_on_stack;
2609				bvecsize = NFS_COMMITBVECSIZ;
2610			} else
2611				bvecsize = bveccount;
2612		} else {
2613			bvec = bvec_on_stack;
2614			bvecsize = NFS_COMMITBVECSIZ;
2615		}
2616		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
2617			if (bvecpos >= bvecsize)
2618				break;
2619			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
2620				nbp = TAILQ_NEXT(bp, b_bobufs);
2621				continue;
2622			}
2623			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
2624			    (B_DELWRI | B_NEEDCOMMIT)) {
2625				BUF_UNLOCK(bp);
2626				nbp = TAILQ_NEXT(bp, b_bobufs);
2627				continue;
2628			}
2629			BO_UNLOCK(bo);
2630			bremfree(bp);
2631			/*
2632			 * Work out if all buffers are using the same cred
2633			 * so we can deal with them all with one commit.
2634			 *
2635			 * NOTE: we are not clearing B_DONE here, so we have
2636			 * to do it later on in this routine if we intend to
2637			 * initiate I/O on the bp.
2638			 *
2639			 * Note: to avoid loopback deadlocks, we do not
2640			 * assign b_runningbufspace.
2641			 */
2642			if (wcred == NULL)
2643				wcred = bp->b_wcred;
2644			else if (wcred != bp->b_wcred)
2645				wcred = NOCRED;
2646			vfs_busy_pages(bp, 1);
2647
2648			BO_LOCK(bo);
2649			/*
2650			 * bp is protected by being locked, but nbp is not
2651			 * and vfs_busy_pages() may sleep.  We have to
2652			 * recalculate nbp.
2653			 */
2654			nbp = TAILQ_NEXT(bp, b_bobufs);
2655
2656			/*
2657			 * A list of these buffers is kept so that the
2658			 * second loop knows which buffers have actually
2659			 * been committed. This is necessary, since there
2660			 * may be a race between the commit rpc and new
2661			 * uncommitted writes on the file.
2662			 */
2663			bvec[bvecpos++] = bp;
2664			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
2665				bp->b_dirtyoff;
2666			if (toff < off)
2667				off = toff;
2668			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
2669			if (toff > endoff)
2670				endoff = toff;
2671		}
2672		BO_UNLOCK(bo);
2673	}
2674	if (bvecpos > 0) {
2675		/*
2676		 * Commit data on the server, as required.
2677		 * If all bufs are using the same wcred, then use that with
2678		 * one call for all of them, otherwise commit each one
2679		 * separately.
2680		 */
2681		if (wcred != NOCRED)
2682			retv = ncl_commit(vp, off, (int)(endoff - off),
2683					  wcred, td);
2684		else {
2685			retv = 0;
2686			for (i = 0; i < bvecpos; i++) {
2687				off_t off, size;
2688				bp = bvec[i];
2689				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
2690					bp->b_dirtyoff;
2691				size = (u_quad_t)(bp->b_dirtyend
2692						  - bp->b_dirtyoff);
2693				retv = ncl_commit(vp, off, (int)size,
2694						  bp->b_wcred, td);
2695				if (retv) break;
2696			}
2697		}
2698
2699		if (retv == NFSERR_STALEWRITEVERF)
2700			ncl_clearcommit(vp->v_mount);
2701
2702		/*
2703		 * Now, either mark the blocks I/O done or mark the
2704		 * blocks dirty, depending on whether the commit
2705		 * succeeded.
2706		 */
2707		for (i = 0; i < bvecpos; i++) {
2708			bp = bvec[i];
2709			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
2710			if (retv) {
2711				/*
2712				 * Error, leave B_DELWRI intact
2713				 */
2714				vfs_unbusy_pages(bp);
2715				brelse(bp);
2716			} else {
2717				/*
2718				 * Success, remove B_DELWRI ( bundirty() ).
2719				 *
2720				 * b_dirtyoff/b_dirtyend seem to be NFS
2721				 * specific.  We should probably move that
2722				 * into bundirty(). XXX
2723				 */
2724				bufobj_wref(bo);
2725				bp->b_flags |= B_ASYNC;
2726				bundirty(bp);
2727				bp->b_flags &= ~B_DONE;
2728				bp->b_ioflags &= ~BIO_ERROR;
2729				bp->b_dirtyoff = bp->b_dirtyend = 0;
2730				bufdone(bp);
2731			}
2732		}
2733	}
2734
2735	/*
2736	 * Start/do any write(s) that are required.
2737	 */
2738loop:
2739	BO_LOCK(bo);
2740	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
2741		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
2742			if (waitfor != MNT_WAIT || passone)
2743				continue;
2744
2745			error = BUF_TIMELOCK(bp,
2746			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
2747			    BO_MTX(bo), "nfsfsync", slpflag, slptimeo);
2748			if (error == 0) {
2749				BUF_UNLOCK(bp);
2750				goto loop;
2751			}
2752			if (error == ENOLCK) {
2753				error = 0;
2754				goto loop;
2755			}
2756			if (called_from_renewthread != 0) {
2757				/*
2758				 * Return EIO so the flush will be retried
2759				 * later.
2760				 */
2761				error = EIO;
2762				goto done;
2763			}
2764			if (newnfs_sigintr(nmp, td)) {
2765				error = EINTR;
2766				goto done;
2767			}
2768			if (slpflag & PCATCH) {
2769				slpflag = 0;
2770				slptimeo = 2 * hz;
2771			}
2772			goto loop;
2773		}
2774		if ((bp->b_flags & B_DELWRI) == 0)
2775			panic("nfs_fsync: not dirty");
2776		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
2777			BUF_UNLOCK(bp);
2778			continue;
2779		}
2780		BO_UNLOCK(bo);
2781		bremfree(bp);
2782		bp->b_flags |= B_ASYNC;
2786		bwrite(bp);
2787		if (newnfs_sigintr(nmp, td)) {
2788			error = EINTR;
2789			goto done;
2790		}
2791		goto loop;
2792	}
2793	if (passone) {
2794		passone = 0;
2795		BO_UNLOCK(bo);
2796		goto again;
2797	}
2798	if (waitfor == MNT_WAIT) {
2799		while (bo->bo_numoutput) {
2800			error = bufobj_wwait(bo, slpflag, slptimeo);
2801			if (error) {
2802			    BO_UNLOCK(bo);
2803			    if (called_from_renewthread != 0) {
2804				/*
2805				 * Return EIO so that the flush will be
2806				 * retried later.
2807				 */
2808				error = EIO;
2809				goto done;
2810			    }
2811			    error = newnfs_sigintr(nmp, td);
2812			    if (error)
2813				goto done;
2814			    if (slpflag & PCATCH) {
2815				slpflag = 0;
2816				slptimeo = 2 * hz;
2817			    }
2818			    BO_LOCK(bo);
2819			}
2820		}
2821		if (bo->bo_dirty.bv_cnt != 0 && commit) {
2822			BO_UNLOCK(bo);
2823			goto loop;
2824		}
2825		/*
2826		 * Wait for all the async IO requests to drain
2827		 */
2828		BO_UNLOCK(bo);
2829		mtx_lock(&np->n_mtx);
2830		while (np->n_directio_asyncwr > 0) {
2831			np->n_flag |= NFSYNCWAIT;
2832			error = newnfs_msleep(td, &np->n_directio_asyncwr,
2833			    &np->n_mtx, slpflag | (PRIBIO + 1),
2834			    "nfsfsync", 0);
2835			if (error) {
2836				if (newnfs_sigintr(nmp, td)) {
2837					mtx_unlock(&np->n_mtx);
2838					error = EINTR;
2839					goto done;
2840				}
2841			}
2842		}
2843		mtx_unlock(&np->n_mtx);
2844	} else
2845		BO_UNLOCK(bo);
2846	mtx_lock(&np->n_mtx);
2847	if (np->n_flag & NWRITEERR) {
2848		error = np->n_error;
2849		np->n_flag &= ~NWRITEERR;
2850	}
2851	if (commit && bo->bo_dirty.bv_cnt == 0 &&
2852	    bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0)
2853		np->n_flag &= ~NMODIFIED;
2854	mtx_unlock(&np->n_mtx);
2855done:
2856	if (bvec != NULL && bvec != bvec_on_stack)
2857		free(bvec, M_TEMP);
2858	if (error == 0 && commit != 0 && waitfor == MNT_WAIT &&
2859	    (bo->bo_dirty.bv_cnt != 0 || bo->bo_numoutput != 0 ||
2860	     np->n_directio_asyncwr != 0) && trycnt++ < 5) {
2861		/* try, try again... */
2862		passone = 1;
2863		wcred = NULL;
2864		bvec = NULL;
2865		bvecsize = 0;
2866		printf("try%d\n", trycnt);
2867		goto again;
2868	}
2869	return (error);
2870}
2871
2872/*
2873 * NFS advisory byte-level locks.
2874 */
2875static int
2876nfs_advlock(struct vop_advlock_args *ap)
2877{
2878	struct vnode *vp = ap->a_vp;
2879	struct ucred *cred;
2880	struct nfsnode *np = VTONFS(ap->a_vp);
2881	struct proc *p = (struct proc *)ap->a_id;
2882	struct thread *td = curthread;	/* XXX */
2883	struct vattr va;
2884	int ret, error = EOPNOTSUPP;
2885	u_quad_t size;
2886
2887	if (NFS_ISV4(vp) && (ap->a_flags & F_POSIX)) {
2888		cred = p->p_ucred;
2889		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2890		if (vp->v_iflag & VI_DOOMED) {
2891			VOP_UNLOCK(vp, 0);
2892			return (EBADF);
2893		}
2894
2895		/*
2896		 * If this is unlocking a write locked region, flush and
2897		 * commit the dirty buffers before unlocking. This is
2898		 * required by RFC3530 Sec. 9.3.2.
2899		 */
2900		if (ap->a_op == F_UNLCK &&
2901		    nfscl_checkwritelocked(vp, ap->a_fl, cred, td, ap->a_id,
2902		    ap->a_flags))
2903			(void) ncl_flush(vp, MNT_WAIT, cred, td, 1, 0);
2904
2905		/*
2906		 * Loop around doing the lock op, since a blocking lock
2907		 * must wait for the lock op to succeed.
2908		 */
2909		do {
2910			ret = nfsrpc_advlock(vp, np->n_size, ap->a_op,
2911			    ap->a_fl, 0, cred, td, ap->a_id, ap->a_flags);
2912			if (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) &&
2913			    ap->a_op == F_SETLK) {
2914				VOP_UNLOCK(vp, 0);
2915				error = nfs_catnap(PZERO | PCATCH, ret,
2916				    "ncladvl");
2917				if (error)
2918					return (EINTR);
2919				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2920				if (vp->v_iflag & VI_DOOMED) {
2921					VOP_UNLOCK(vp, 0);
2922					return (EBADF);
2923				}
2924			}
2925		} while (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) &&
2926		     ap->a_op == F_SETLK);
2927		if (ret == NFSERR_DENIED) {
2928			VOP_UNLOCK(vp, 0);
2929			return (EAGAIN);
2930		} else if (ret == EINVAL || ret == EBADF || ret == EINTR) {
2931			VOP_UNLOCK(vp, 0);
2932			return (ret);
2933		} else if (ret != 0) {
2934			VOP_UNLOCK(vp, 0);
2935			return (EACCES);
2936		}
2937
2938		/*
2939		 * Now, if we just got a lock, invalidate data in the buffer
2940		 * cache, as required, so that the coherency conforms with
2941		 * RFC3530 Sec. 9.3.2.
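		 * The cache is assumed consistent if NMODIFIED is clear
		 * and the change attribute (or mtime) is unchanged;
		 * otherwise any dirty data is written back and the
		 * buffer cache invalidated.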
2942		 */
2943		if (ap->a_op == F_SETLK) {
2944			if ((np->n_flag & NMODIFIED) == 0) {
2945				np->n_attrstamp = 0;
2946				ret = VOP_GETATTR(vp, &va, cred);
2947			}
2948			if ((np->n_flag & NMODIFIED) || ret ||
2949			    np->n_change != va.va_filerev) {
2950				(void) ncl_vinvalbuf(vp, V_SAVE, td, 1);
2951				np->n_attrstamp = 0;
2952				ret = VOP_GETATTR(vp, &va, cred);
2953				if (!ret) {
2954					np->n_mtime = va.va_mtime;
2955					np->n_change = va.va_filerev;
2956				}
2957			}
2958		}
2959		VOP_UNLOCK(vp, 0);
2960		return (0);
2961	} else if (!NFS_ISV4(vp)) {
2962		error = vn_lock(vp, LK_SHARED);
2963		if (error)
2964			return (error);
2965		if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
2966			size = VTONFS(vp)->n_size;
2967			VOP_UNLOCK(vp, 0);
2968			error = lf_advlock(ap, &(vp->v_lockf), size);
2969		} else {
2970			if (nfs_advlock_p != NULL)
2971				error = nfs_advlock_p(ap);
2972			else {
2973				VOP_UNLOCK(vp, 0);
2974				error = ENOLCK;
2975			}
2976		}
2977	}
2978	return (error);
2979}
2980
2981/*
2982 * NFS advisory byte-level locks.
2983 */
2984static int
2985nfs_advlockasync(struct vop_advlockasync_args *ap)
2986{
2987	struct vnode *vp = ap->a_vp;
2988	u_quad_t size;
2989	int error;
2990
2991	if (NFS_ISV4(vp))
2992		return (EOPNOTSUPP);
2993	error = vn_lock(vp, LK_SHARED);
2994	if (error)
2995		return (error);
2996	if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
2997		size = VTONFS(vp)->n_size;
2998		VOP_UNLOCK(vp, 0);
2999		error = lf_advlockasync(ap, &(vp->v_lockf), size);
3000	} else {
3001		VOP_UNLOCK(vp, 0);
3002		error = EOPNOTSUPP;
3003	}
3004	return (error);
3005}
3006
3007/*
3008 * Print out the contents of an nfsnode.
3009 */
3010static int
3011nfs_print(struct vop_print_args *ap)
3012{
3013	struct vnode *vp = ap->a_vp;
3014	struct nfsnode *np = VTONFS(vp);
3015
3016	ncl_printf("\tfileid %ld fsid 0x%x",
3017	   np->n_vattr.na_fileid, np->n_vattr.na_fsid);
3018	if (vp->v_type == VFIFO)
3019		fifo_printinfo(vp);
3020	printf("\n");
3021	return (0);
3022}
3023
3024/*
3025 * This is the "real" nfs::bwrite(struct buf*).
3026 * We set B_CACHE if this is a VMIO buffer.
3027 */
3028int
3029ncl_writebp(struct buf *bp, int force __unused, struct thread *td)
3030{
3031	int s;
3032	int oldflags = bp->b_flags;
3033#if 0
3034	int retv = 1;
3035	off_t off;
3036#endif
3037
3038	BUF_ASSERT_HELD(bp);
3039
3040	if (bp->b_flags & B_INVAL) {
3041		brelse(bp);
3042		return (0);
3043	}
3044
3045	bp->b_flags |= B_CACHE;
3046
3047	/*
3048	 * Undirty the bp.  We will redirty it later if the I/O fails.
3049	 */
3050
3051	s = splbio();
3052	bundirty(bp);
3053	bp->b_flags &= ~B_DONE;
3054	bp->b_ioflags &= ~BIO_ERROR;
3055	bp->b_iocmd = BIO_WRITE;
3056
3057	bufobj_wref(bp->b_bufobj);
3058	curthread->td_ru.ru_oublock++;
3059	splx(s);
3060
3061	/*
3062	 * Note: to avoid loopback deadlocks, we do not
3063	 * assign b_runningbufspace.
3064	 */
3065	vfs_busy_pages(bp, 1);
3066
3067	BUF_KERNPROC(bp);
3068	bp->b_iooffset = dbtob(bp->b_blkno);
3069	bstrategy(bp);
3070
3071	if ((oldflags & B_ASYNC) == 0) {
3072		int rtval = bufwait(bp);
3073
3074		if (oldflags & B_DELWRI) {
3075			s = splbio();
3076			reassignbuf(bp);
3077			splx(s);
3078		}
3079		brelse(bp);
3080		return (rtval);
3081	}
3082
3083	return (0);
3084}
3085
3086/*
3087 * nfs special file access vnode op.
3088 * Essentially just get vattr and then call vaccess() since the device is
3089 * local to the client.
3090 */
3091static int
3092nfsspec_access(struct vop_access_args *ap)
3093{
3094	struct vattr *vap;
3095	struct ucred *cred = ap->a_cred;
3096	struct vnode *vp = ap->a_vp;
3097	accmode_t accmode = ap->a_accmode;
3098	struct vattr vattr;
3099	int error;
3100
3101	/*
3102	 * Disallow write attempts on filesystems mounted read-only,
3103	 * unless the file is a socket, fifo, or a block or character
3104	 * device resident on the filesystem.
3105	 */
3106	if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
3107		switch (vp->v_type) {
3108		case VREG:
3109		case VDIR:
3110		case VLNK:
3111			return (EROFS);
3112		default:
3113			break;
3114		}
3115	}
3116	vap = &vattr;
3117	error = VOP_GETATTR(vp, vap, cred);
3118	if (error)
3119		goto out;
3120	error = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
3121	    accmode, cred, NULL);
3122out:
3123	return (error);
3124}
3125
3126/*
3127 * Read wrapper for fifos.
3128 */
3129static int
3130nfsfifo_read(struct vop_read_args *ap)
3131{
3132	struct nfsnode *np = VTONFS(ap->a_vp);
3133	int error;
3134
3135	/*
3136	 * Set access flag.
3137	 */
3138	mtx_lock(&np->n_mtx);
3139	np->n_flag |= NACC;
3140	getnanotime(&np->n_atim);
3141	mtx_unlock(&np->n_mtx);
3142	error = fifo_specops.vop_read(ap);
3143	return (error);
3144}
3145
3146/*
3147 * Write wrapper for fifos.
3148 */
3149static int
3150nfsfifo_write(struct vop_write_args *ap)
3151{
3152	struct nfsnode *np = VTONFS(ap->a_vp);
3153
3154	/*
3155	 * Set update flag.
3156	 */
3157	mtx_lock(&np->n_mtx);
3158	np->n_flag |= NUPD;
3159	getnanotime(&np->n_mtim);
3160	mtx_unlock(&np->n_mtx);
3161	return (fifo_specops.vop_write(ap));
3162}
3163
3164/*
3165 * Close wrapper for fifos.
3166 *
3167 * Update the times on the nfsnode then do fifo close.
3168 */
3169static int
3170nfsfifo_close(struct vop_close_args *ap)
3171{
3172	struct vnode *vp = ap->a_vp;
3173	struct nfsnode *np = VTONFS(vp);
3174	struct vattr vattr;
3175	struct timespec ts;
3176
3177	mtx_lock(&np->n_mtx);
3178	if (np->n_flag & (NACC | NUPD)) {
3179		getnanotime(&ts);
3180		if (np->n_flag & NACC)
3181			np->n_atim = ts;
3182		if (np->n_flag & NUPD)
3183			np->n_mtim = ts;
3184		np->n_flag |= NCHG;
3185		if (vrefcnt(vp) == 1 &&
3186		    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
3187			VATTR_NULL(&vattr);
3188			if (np->n_flag & NACC)
3189				vattr.va_atime = np->n_atim;
3190			if (np->n_flag & NUPD)
3191				vattr.va_mtime = np->n_mtim;
3192			mtx_unlock(&np->n_mtx);
3193			(void)VOP_SETATTR(vp, &vattr, ap->a_cred);
3194			goto out;
3195		}
3196	}
3197	mtx_unlock(&np->n_mtx);
3198out:
3199	return (fifo_specops.vop_close(ap));
3200}
3201
3202/*
3203 * Just call ncl_writebp() with the force argument set to 1.
3204 *
3205 * NOTE: B_DONE may or may not be set in a_bp on call.
3206 */
3207static int
3208nfs_bwrite(struct buf *bp)
3209{
3210
3211	return (ncl_writebp(bp, 1, curthread));
3212}
3213
3214struct buf_ops buf_ops_newnfs = {
3215	.bop_name	=	"buf_ops_nfs",
3216	.bop_write	=	nfs_bwrite,
3217	.bop_strategy	=	bufstrategy,
3218	.bop_sync	=	bufsync,
3219	.bop_bdflush	=	bufbdflush,
3220};
3221
3222/*
3223 * Cloned from vop_stdlock(), and then the ugly hack added.
3224 */
3225static int
3226nfs_lock1(struct vop_lock1_args *ap)
3227{
3228	struct vnode *vp = ap->a_vp;
3229	int error = 0;
3230
3231	/*
3232	 * Since vfs_hash_get() calls vget() and it will no longer work
3233	 * for FreeBSD8 with flags == 0, I can only think of this horrible
3234	 * hack to work around it. I call vfs_hash_get() with LK_EXCLOTHER
3235	 * and then handle it here. All I want for this case is a v_usecount
3236	 * on the vnode to use for recovery, while another thread might
3237	 * hold a lock on the vnode. I have the other threads blocked, so
3238	 * there isn't any race problem.
3239	 */
3240	if ((ap->a_flags & LK_TYPE_MASK) == LK_EXCLOTHER) {
3241		if ((ap->a_flags & LK_INTERLOCK) == 0)
3242			panic("ncllock1");
3243		if ((vp->v_iflag & VI_DOOMED))
3244			error = ENOENT;
3245		VI_UNLOCK(vp);
3246		return (error);
3247	}
3248	return (_lockmgr_args(vp->v_vnlock, ap->a_flags, VI_MTX(vp),
3249	    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file,
3250	    ap->a_line));
3251}
3252
3253static int
3254nfs_getacl(struct vop_getacl_args *ap)
3255{
3256	int error;
3257
3258	if (ap->a_type != ACL_TYPE_NFS4)
3259		return (EOPNOTSUPP);
3260	error = nfsrpc_getacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp,
3261	    NULL);
3262	if (error > NFSERR_STALE) {
3263		(void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0);
3264		error = EPERM;
3265	}
3266	return (error);
3267}
3268
3269static int
3270nfs_setacl(struct vop_setacl_args *ap)
3271{
3272	int error;
3273
3274	if (ap->a_type != ACL_TYPE_NFS4)
3275		return (EOPNOTSUPP);
3276	error = nfsrpc_setacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp,
3277	    NULL);
3278	if (error > NFSERR_STALE) {
3279		(void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0);
3280		error = EPERM;
3281	}
3282	return (error);
3283}
3284
3285/*
3286 * Return POSIX pathconf information applicable to nfs filesystems.
3287 */
3288static int
3289nfs_pathconf(struct vop_pathconf_args *ap)
3290{
3291	struct nfsv3_pathconf pc;
3292	struct nfsvattr nfsva;
3293	struct vnode *vp = ap->a_vp;
3294	struct thread *td = curthread;
3295	int attrflag, error;
3296
3297	if (NFS_ISV4(vp) || (NFS_ISV3(vp) && (ap->a_name == _PC_LINK_MAX ||
3298	    ap->a_name == _PC_NAME_MAX || ap->a_name == _PC_CHOWN_RESTRICTED ||
3299	    ap->a_name == _PC_NO_TRUNC))) {
3300		/*
3301		 * Since only the above 4 a_names are returned by the NFSv3
3302		 * Pathconf RPC, there is no point in doing it for others.
3303		 */
3304		error = nfsrpc_pathconf(vp, &pc, td->td_ucred, td, &nfsva,
3305		    &attrflag, NULL);
3306		if (attrflag != 0)
3307			(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
3308			    1);
3309		if (error != 0)
3310			return (error);
3311	} else {
3312		/*
3313		 * For NFSv2 (or NFSv3 when not one of the above 4 a_names),
3314		 * just fake them.
3315		 */
3316		pc.pc_linkmax = LINK_MAX;
3317		pc.pc_namemax = NFS_MAXNAMLEN;
3318		pc.pc_notrunc = 1;
3319		pc.pc_chownrestricted = 1;
3320		pc.pc_caseinsensitive = 0;
3321		pc.pc_casepreserving = 1;
3322		error = 0;
3323	}
3324	switch (ap->a_name) {
3325	case _PC_LINK_MAX:
3326		*ap->a_retval = pc.pc_linkmax;
3327		break;
3328	case _PC_NAME_MAX:
3329		*ap->a_retval = pc.pc_namemax;
3330		break;
3331	case _PC_PATH_MAX:
3332		*ap->a_retval = PATH_MAX;
3333		break;
3334	case _PC_PIPE_BUF:
3335		*ap->a_retval = PIPE_BUF;
3336		break;
3337	case _PC_CHOWN_RESTRICTED:
3338		*ap->a_retval = pc.pc_chownrestricted;
3339		break;
3340	case _PC_NO_TRUNC:
3341		*ap->a_retval = pc.pc_notrunc;
3342		break;
3343	case _PC_ACL_EXTENDED:
3344		*ap->a_retval = 0;
3345		break;
3346	case _PC_ACL_NFS4:
3347		if (NFS_ISV4(vp) && nfsrv_useacl != 0 && attrflag != 0 &&
3348		    NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL))
3349			*ap->a_retval = 1;
3350		else
3351			*ap->a_retval = 0;
3352		break;
3353	case _PC_ACL_PATH_MAX:
3354		if (NFS_ISV4(vp))
3355			*ap->a_retval = ACL_MAX_ENTRIES;
3356		else
3357			*ap->a_retval = 3;
3358		break;
3359	case _PC_MAC_PRESENT:
3360		*ap->a_retval = 0;
3361		break;
3362	case _PC_ASYNC_IO:
3363		/* _PC_ASYNC_IO should have been handled by upper layers. */
3364		KASSERT(0, ("_PC_ASYNC_IO should not get here"));
3365		error = EINVAL;
3366		break;
3367	case _PC_PRIO_IO:
3368		*ap->a_retval = 0;
3369		break;
3370	case _PC_SYNC_IO:
3371		*ap->a_retval = 0;
3372		break;
3373	case _PC_ALLOC_SIZE_MIN:
3374		*ap->a_retval = vp->v_mount->mnt_stat.f_bsize;
3375		break;
3376	case _PC_FILESIZEBITS:
3377		if (NFS_ISV34(vp))
3378			*ap->a_retval = 64;
3379		else
3380			*ap->a_retval = 32;
3381		break;
3382	case _PC_REC_INCR_XFER_SIZE:
3383		*ap->a_retval = vp->v_mount->mnt_stat.f_iosize;
3384		break;
3385	case _PC_REC_MAX_XFER_SIZE:
3386		*ap->a_retval = -1; /* means ``unlimited'' */
3387		break;
3388	case _PC_REC_MIN_XFER_SIZE:
3389		*ap->a_retval = vp->v_mount->mnt_stat.f_iosize;
3390		break;
3391	case _PC_REC_XFER_ALIGN:
3392		*ap->a_retval = PAGE_SIZE;
3393		break;
3394	case _PC_SYMLINK_MAX:
3395		*ap->a_retval = NFS_MAXPATHLEN;
3396		break;
3397
3398	default:
3399		error = EINVAL;
3400		break;
3401	}
3402	return (error);
3403}
3404
3405