1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 *
24 *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
25 *	All rights reserved.
26 */
27
28#include <sys/param.h>
29#include <sys/types.h>
30#include <sys/systm.h>
31#include <sys/thread.h>
32#include <sys/t_lock.h>
33#include <sys/time.h>
34#include <sys/vnode.h>
35#include <sys/vfs.h>
36#include <sys/errno.h>
37#include <sys/buf.h>
38#include <sys/stat.h>
39#include <sys/cred.h>
40#include <sys/kmem.h>
41#include <sys/debug.h>
42#include <sys/dnlc.h>
43#include <sys/vmsystm.h>
44#include <sys/flock.h>
45#include <sys/share.h>
46#include <sys/cmn_err.h>
47#include <sys/tiuser.h>
48#include <sys/sysmacros.h>
49#include <sys/callb.h>
50#include <sys/acl.h>
51#include <sys/kstat.h>
52#include <sys/signal.h>
53#include <sys/list.h>
54#include <sys/zone.h>
55
56#include <rpc/types.h>
57#include <rpc/xdr.h>
58#include <rpc/auth.h>
59#include <rpc/clnt.h>
60
61#include <nfs/nfs.h>
62#include <nfs/nfs_clnt.h>
63
64#include <nfs/rnode.h>
65#include <nfs/nfs_acl.h>
66#include <nfs/lm.h>
67
68#include <vm/hat.h>
69#include <vm/as.h>
70#include <vm/page.h>
71#include <vm/pvn.h>
72#include <vm/seg.h>
73#include <vm/seg_map.h>
74#include <vm/seg_vn.h>
75
76static void	nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
77			cred_t *);
78static int	nfs_getattr_cache(vnode_t *, struct vattr *);
79static int	nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
80
81struct mi_globals {
82	kmutex_t	mig_lock;  /* lock protecting mig_list */
83	list_t		mig_list;  /* list of NFS v2 or v3 mounts in zone */
84	boolean_t	mig_destructor_called;
85};
86
87static zone_key_t mi_list_key;
88
89/* Debugging flag for PC file shares. */
90extern int	share_debug;
91
92/*
93 * Attributes caching:
94 *
95 * Attributes are cached in the rnode in struct vattr form.
96 * There is a time associated with the cached attributes (r_attrtime)
97 * which tells whether the attributes are valid. The time is initialized
98 * to the difference between current time and the modify time of the vnode
99 * when new attributes are cached. This allows the attributes for
100 * files that have changed recently to be timed out sooner than for files
101 * that have not changed for a long time. There are minimum and maximum
102 * timeout values that can be set per mount point.
103 */
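
/*
 * Illustration (based on the clamping done in nfs_attrcache_va() below,
 * with assumed tunable values): if acregmin is 3 seconds and acregmax is
 * 60 seconds, a regular file that was seen to change 5 seconds ago has
 * its attributes cached for roughly 5 seconds, while a file that has not
 * changed in 10 minutes gets the full 60 second maximum.
 */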
104
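/*
 * Wait for any cache purge being done under the r_serial pseudo lock by
 * another thread to complete.  Returns EINTR if the wait is interrupted
 * by a signal (only possible when the mount is interruptible, MI_INT),
 * otherwise 0.
 */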
105int
106nfs_waitfor_purge_complete(vnode_t *vp)
107{
108	rnode_t *rp;
109	k_sigset_t smask;
110
111	rp = VTOR(vp);
112	if (rp->r_serial != NULL && rp->r_serial != curthread) {
113		mutex_enter(&rp->r_statelock);
114		sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
115		while (rp->r_serial != NULL) {
116			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
117				sigunintr(&smask);
118				mutex_exit(&rp->r_statelock);
119				return (EINTR);
120			}
121		}
122		sigunintr(&smask);
123		mutex_exit(&rp->r_statelock);
124	}
125	return (0);
126}
127
128/*
129 * Validate caches by checking cached attributes. If the cached
130 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
132 * have changed.
133 *
134 * If the attributes have not timed out and if there is a cache
135 * invalidation being done by some other thread, then wait until that
136 * thread has completed the cache invalidation.
137 */
138int
139nfs_validate_caches(vnode_t *vp, cred_t *cr)
140{
141	int error;
142	struct vattr va;
143
144	if (ATTRCACHE_VALID(vp)) {
145		error = nfs_waitfor_purge_complete(vp);
146		if (error)
147			return (error);
148		return (0);
149	}
150
151	va.va_mask = AT_ALL;
152	return (nfs_getattr_otw(vp, &va, cr));
153}
154
155/*
156 * Validate caches by checking cached attributes. If the cached
157 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
159 * have changed.
160 *
161 * If the attributes have not timed out and if there is a cache
162 * invalidation being done by some other thread, then wait until that
163 * thread has completed the cache invalidation.
164 */
165int
166nfs3_validate_caches(vnode_t *vp, cred_t *cr)
167{
168	int error;
169	struct vattr va;
170
171	if (ATTRCACHE_VALID(vp)) {
172		error = nfs_waitfor_purge_complete(vp);
173		if (error)
174			return (error);
175		return (0);
176	}
177
178	va.va_mask = AT_ALL;
179	return (nfs3_getattr_otw(vp, &va, cr));
180}
181
182/*
183 * Purge all of the various NFS `data' caches.
184 */
185void
186nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
187{
188	rnode_t *rp;
189	char *contents;
190	int size;
191	int error;
192
193	/*
194	 * Purge the DNLC for any entries which refer to this file.
195	 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
196	 */
197	rp = VTOR(vp);
198	mutex_enter(&rp->r_statelock);
199	if (vp->v_count > 1 &&
200	    (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
201	    !(rp->r_flags & RINDNLCPURGE)) {
202		/*
203		 * Set the RINDNLCPURGE flag to prevent recursive entry
204		 * into dnlc_purge_vp()
205		 */
206		if (vp->v_type == VDIR)
207			rp->r_flags |= RINDNLCPURGE;
208		mutex_exit(&rp->r_statelock);
209		dnlc_purge_vp(vp);
210		mutex_enter(&rp->r_statelock);
211		if (rp->r_flags & RINDNLCPURGE)
212			rp->r_flags &= ~RINDNLCPURGE;
213	}
214
215	/*
	 * Purge the readlink response cache.
217	 */
218	contents = rp->r_symlink.contents;
219	size = rp->r_symlink.size;
220	rp->r_symlink.contents = NULL;
221	mutex_exit(&rp->r_statelock);
222
	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}
227
228	/*
229	 * Flush the page cache.
230	 */
231	if (vn_has_cached_data(vp)) {
232		error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
233		if (error && (error == ENOSPC || error == EDQUOT)) {
234			mutex_enter(&rp->r_statelock);
235			if (!rp->r_error)
236				rp->r_error = error;
237			mutex_exit(&rp->r_statelock);
238		}
239	}
240
241	/*
242	 * Flush the readdir response cache.
243	 */
244	if (HAVE_RDDIR_CACHE(rp))
245		nfs_purge_rddir_cache(vp);
246}
247
248/*
249 * Purge the readdir cache of all entries
250 */
251void
252nfs_purge_rddir_cache(vnode_t *vp)
253{
254	rnode_t *rp;
255	rddir_cache *rdc;
256	rddir_cache *nrdc;
257
258	rp = VTOR(vp);
259top:
260	mutex_enter(&rp->r_statelock);
261	rp->r_direof = NULL;
262	rp->r_flags &= ~RLOOKUP;
263	rp->r_flags |= RREADDIRPLUS;
264	rdc = avl_first(&rp->r_dir);
265	while (rdc != NULL) {
266		nrdc = AVL_NEXT(&rp->r_dir, rdc);
267		avl_remove(&rp->r_dir, rdc);
268		rddir_cache_rele(rdc);
269		rdc = nrdc;
270	}
271	mutex_exit(&rp->r_statelock);
272}
273
274/*
275 * Do a cache check based on the post-operation attributes.
276 * Then make them the new cached attributes.  If no attributes
277 * were returned, then mark the attributes as timed out.
278 */
279void
280nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
281{
282	vattr_t attr;
283
284	if (!poap->attributes) {
285		PURGE_ATTRCACHE(vp);
286		return;
287	}
288	(void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
289}
290
291/*
292 * Same as above, but using a vattr
293 */
294void
295nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
296    cred_t *cr)
297{
298	if (!poap->attributes) {
299		PURGE_ATTRCACHE(vp);
300		return;
301	}
302	nfs_attr_cache(vp, poap->fres.vap, t, cr);
303}
304
305/*
306 * Do a cache check based on the weak cache consistency attributes.
307 * These consist of a small set of pre-operation attributes and the
308 * full set of post-operation attributes.
309 *
310 * If we are given the pre-operation attributes, then use them to
311 * check the validity of the various caches.  Then, if we got the
312 * post-operation attributes, make them the new cached attributes.
313 * If we didn't get the post-operation attributes, then mark the
314 * attribute cache as timed out so that the next reference will
315 * cause a GETATTR to the server to refresh with the current
316 * attributes.
317 *
318 * Otherwise, if we didn't get the pre-operation attributes, but
319 * we did get the post-operation attributes, then use these
320 * attributes to check the validity of the various caches.  This
321 * will probably cause a flush of the caches because if the
322 * operation succeeded, the attributes of the object were changed
323 * in some way from the old post-operation attributes.  This
324 * should be okay because it is the safe thing to do.  After
325 * checking the data caches, then we make these the new cached
326 * attributes.
327 *
328 * Otherwise, we didn't get either the pre- or post-operation
329 * attributes.  Simply mark the attribute cache as timed out so
330 * the next reference will cause a GETATTR to the server to
331 * refresh with the current attributes.
332 *
333 * If an error occurred trying to convert the over the wire
334 * attributes to a vattr, then simply mark the attribute cache as
335 * timed out.
336 */
337void
338nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
339{
340	vattr_t bva;
341	vattr_t ava;
342
343	if (wccp->after.attributes) {
344		if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
345			PURGE_ATTRCACHE(vp);
346			return;
347		}
348		if (wccp->before.attributes) {
349			bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
350			bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
351			bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
352			bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
353			bva.va_size = wccp->before.attr.size;
354			nfs3_attr_cache(vp, &bva, &ava, t, cr);
355		} else
356			nfs_attr_cache(vp, &ava, t, cr);
357	} else {
358		PURGE_ATTRCACHE(vp);
359	}
360}
361
362/*
363 * Set attributes cache for given vnode using nfsattr.
364 *
365 * This routine does not do cache validation with the attributes.
366 *
367 * If an error occurred trying to convert the over the wire
368 * attributes to a vattr, then simply mark the attribute cache as
369 * timed out.
370 */
371void
372nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
373{
374	rnode_t *rp;
375	struct vattr va;
376
377	if (!nattr_to_vattr(vp, na, &va)) {
378		rp = VTOR(vp);
379		mutex_enter(&rp->r_statelock);
380		if (rp->r_mtime <= t)
381			nfs_attrcache_va(vp, &va);
382		mutex_exit(&rp->r_statelock);
383	} else {
384		PURGE_ATTRCACHE(vp);
385	}
386}
387
388/*
389 * Set attributes cache for given vnode using fattr3.
390 *
391 * This routine does not do cache validation with the attributes.
392 *
393 * If an error occurred trying to convert the over the wire
394 * attributes to a vattr, then simply mark the attribute cache as
395 * timed out.
396 */
397void
398nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
399{
400	rnode_t *rp;
401	struct vattr va;
402
403	if (!fattr3_to_vattr(vp, na, &va)) {
404		rp = VTOR(vp);
405		mutex_enter(&rp->r_statelock);
406		if (rp->r_mtime <= t)
407			nfs_attrcache_va(vp, &va);
408		mutex_exit(&rp->r_statelock);
409	} else {
410		PURGE_ATTRCACHE(vp);
411	}
412}
413
414/*
415 * Do a cache check based on attributes returned over the wire.  The
416 * new attributes are cached.
417 *
418 * If an error occurred trying to convert the over the wire attributes
419 * to a vattr, then just return that error.
420 *
 * As a side effect, the vattr argument is filled in with the converted
422 * attributes.
423 */
424int
425nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
426    cred_t *cr)
427{
428	int error;
429
430	error = nattr_to_vattr(vp, na, vap);
431	if (error)
432		return (error);
433	nfs_attr_cache(vp, vap, t, cr);
434	return (0);
435}
436
437/*
438 * Do a cache check based on attributes returned over the wire.  The
439 * new attributes are cached.
440 *
441 * If an error occurred trying to convert the over the wire attributes
442 * to a vattr, then just return that error.
443 *
 * As a side effect, the vattr argument is filled in with the converted
445 * attributes.
446 */
447int
448nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
449{
450	int error;
451
452	error = fattr3_to_vattr(vp, na, vap);
453	if (error)
454		return (error);
455	nfs_attr_cache(vp, vap, t, cr);
456	return (0);
457}
458
459/*
460 * Use the passed in virtual attributes to check to see whether the
461 * data and metadata caches are valid, cache the new attributes, and
462 * then do the cache invalidation if required.
463 *
464 * The cache validation and caching of the new attributes is done
465 * atomically via the use of the mutex, r_statelock.  If required,
466 * the cache invalidation is done atomically w.r.t. the cache
467 * validation and caching of the attributes via the pseudo lock,
468 * r_serial.
469 *
470 * This routine is used to do cache validation and attributes caching
471 * for operations with a single set of post operation attributes.
472 */
473void
474nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
475{
476	rnode_t *rp;
477	int mtime_changed = 0;
478	int ctime_changed = 0;
479	vsecattr_t *vsp;
480	int was_serial;
481	len_t preattr_rsize;
482	boolean_t writeattr_set = B_FALSE;
483	boolean_t cachepurge_set = B_FALSE;
484
485	rp = VTOR(vp);
486
487	mutex_enter(&rp->r_statelock);
488
489	if (rp->r_serial != curthread) {
490		klwp_t *lwp = ttolwp(curthread);
491
492		was_serial = 0;
493		if (lwp != NULL)
494			lwp->lwp_nostop++;
495		while (rp->r_serial != NULL) {
496			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
497				mutex_exit(&rp->r_statelock);
498				if (lwp != NULL)
499					lwp->lwp_nostop--;
500				return;
501			}
502		}
503		if (lwp != NULL)
504			lwp->lwp_nostop--;
505	} else
506		was_serial = 1;
507
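	/*
	 * If a change to the file was detected after these attributes were
	 * fetched over the wire (at time t), they are potentially stale;
	 * don't cache them.  If they don't even match what is already
	 * cached, purge the attribute cache so the next reference forces a
	 * fresh GETATTR.
	 */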
508	if (rp->r_mtime > t) {
509		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
510			PURGE_ATTRCACHE_LOCKED(rp);
511		mutex_exit(&rp->r_statelock);
512		return;
513	}
514
515	/*
	 * After writing data to the file on the remote server, the write
	 * thread always sets RWRITEATTR to indicate that the file was
	 * modified by a WRITE operation and that the attribute cache has
	 * been marked as timed out.  If RWRITEATTR is set, do not check
	 * for mtime and ctime changes.
521	 */
522	if (!(rp->r_flags & RWRITEATTR)) {
523		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
524			mtime_changed = 1;
525
526		if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
527		    rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
528			ctime_changed = 1;
529	} else {
530		writeattr_set = B_TRUE;
531	}
532
533	preattr_rsize = rp->r_size;
534
535	nfs_attrcache_va(vp, vap);
536
537	/*
	 * If we have updated the file size in nfs_attrcache_va, then as
	 * soon as we drop r_statelock we will be in the middle of purging
	 * and updating our caches.  Another thread could pick up the new
	 * file size and read in zeroed data, so stall other threads until
	 * the cache purge is complete.
543	 */
544	if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
545		/*
		 * If RWRITEATTR was set and we have updated the file
		 * size, the size returned by the server is not necessarily
		 * the result of this client's WRITE, so we need to purge
		 * all caches.
550		 */
551		if (writeattr_set)
552			mtime_changed = 1;
553
554		if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
555			rp->r_flags |= RINCACHEPURGE;
556			cachepurge_set = B_TRUE;
557		}
558	}
559
560	if (!mtime_changed && !ctime_changed) {
561		mutex_exit(&rp->r_statelock);
562		return;
563	}
564
565	rp->r_serial = curthread;
566
567	mutex_exit(&rp->r_statelock);
568
569	if (mtime_changed)
570		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
571
572	if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
573		mutex_enter(&rp->r_statelock);
574		rp->r_flags &= ~RINCACHEPURGE;
575		cv_broadcast(&rp->r_cv);
576		mutex_exit(&rp->r_statelock);
577		cachepurge_set = B_FALSE;
578	}
579
580	if (ctime_changed) {
581		(void) nfs_access_purge_rp(rp);
582		if (rp->r_secattr != NULL) {
583			mutex_enter(&rp->r_statelock);
584			vsp = rp->r_secattr;
585			rp->r_secattr = NULL;
586			mutex_exit(&rp->r_statelock);
587			if (vsp != NULL)
588				nfs_acl_free(vsp);
589		}
590	}
591
592	if (!was_serial) {
593		mutex_enter(&rp->r_statelock);
594		rp->r_serial = NULL;
595		cv_broadcast(&rp->r_cv);
596		mutex_exit(&rp->r_statelock);
597	}
598}
599
600/*
601 * Use the passed in "before" virtual attributes to check to see
602 * whether the data and metadata caches are valid, cache the "after"
603 * new attributes, and then do the cache invalidation if required.
604 *
605 * The cache validation and caching of the new attributes is done
606 * atomically via the use of the mutex, r_statelock.  If required,
607 * the cache invalidation is done atomically w.r.t. the cache
608 * validation and caching of the attributes via the pseudo lock,
609 * r_serial.
610 *
611 * This routine is used to do cache validation and attributes caching
612 * for operations with both pre operation attributes and post operation
613 * attributes.
614 */
615static void
616nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
617    cred_t *cr)
618{
619	rnode_t *rp;
620	int mtime_changed = 0;
621	int ctime_changed = 0;
622	vsecattr_t *vsp;
623	int was_serial;
624	len_t preattr_rsize;
625	boolean_t writeattr_set = B_FALSE;
626	boolean_t cachepurge_set = B_FALSE;
627
628	rp = VTOR(vp);
629
630	mutex_enter(&rp->r_statelock);
631
632	if (rp->r_serial != curthread) {
633		klwp_t *lwp = ttolwp(curthread);
634
635		was_serial = 0;
636		if (lwp != NULL)
637			lwp->lwp_nostop++;
638		while (rp->r_serial != NULL) {
639			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
640				mutex_exit(&rp->r_statelock);
641				if (lwp != NULL)
642					lwp->lwp_nostop--;
643				return;
644			}
645		}
646		if (lwp != NULL)
647			lwp->lwp_nostop--;
648	} else
649		was_serial = 1;
650
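	/*
	 * If a change to the file was detected after these attributes were
	 * fetched over the wire (at time t), they are potentially stale;
	 * don't cache them.  If they don't even match what is already
	 * cached, purge the attribute cache so the next reference forces a
	 * fresh GETATTR.
	 */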
651	if (rp->r_mtime > t) {
652		if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
653			PURGE_ATTRCACHE_LOCKED(rp);
654		mutex_exit(&rp->r_statelock);
655		return;
656	}
657
658	/*
	 * After writing data to the file on the remote server, the write
	 * thread always sets RWRITEATTR to indicate that the file was
	 * modified by a WRITE operation and that the attribute cache has
	 * been marked as timed out.  If RWRITEATTR is set, do not check
	 * for mtime and ctime changes.
664	 */
665	if (!(rp->r_flags & RWRITEATTR)) {
666		if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
667			mtime_changed = 1;
668
669		if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
670		    rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
671			ctime_changed = 1;
672	} else {
673		writeattr_set = B_TRUE;
674	}
675
676	preattr_rsize = rp->r_size;
677
678	nfs_attrcache_va(vp, avap);
679
680	/*
	 * If we have updated the file size in nfs_attrcache_va, then as
	 * soon as we drop r_statelock we will be in the middle of purging
	 * and updating our caches.  Another thread could pick up the new
	 * file size and read in zeroed data, so stall other threads until
	 * the cache purge is complete.
686	 */
687	if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
688		/*
		 * If RWRITEATTR was set and we have updated the file
		 * size, the size returned by the server is not necessarily
		 * the result of this client's WRITE, so we need to purge
		 * all caches.
693		 */
694		if (writeattr_set)
695			mtime_changed = 1;
696
697		if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
698			rp->r_flags |= RINCACHEPURGE;
699			cachepurge_set = B_TRUE;
700		}
701	}
702
703	if (!mtime_changed && !ctime_changed) {
704		mutex_exit(&rp->r_statelock);
705		return;
706	}
707
708	rp->r_serial = curthread;
709
710	mutex_exit(&rp->r_statelock);
711
712	if (mtime_changed)
713		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
714
715	if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
716		mutex_enter(&rp->r_statelock);
717		rp->r_flags &= ~RINCACHEPURGE;
718		cv_broadcast(&rp->r_cv);
719		mutex_exit(&rp->r_statelock);
720		cachepurge_set = B_FALSE;
721	}
722
723	if (ctime_changed) {
724		(void) nfs_access_purge_rp(rp);
725		if (rp->r_secattr != NULL) {
726			mutex_enter(&rp->r_statelock);
727			vsp = rp->r_secattr;
728			rp->r_secattr = NULL;
729			mutex_exit(&rp->r_statelock);
730			if (vsp != NULL)
731				nfs_acl_free(vsp);
732		}
733	}
734
735	if (!was_serial) {
736		mutex_enter(&rp->r_statelock);
737		rp->r_serial = NULL;
738		cv_broadcast(&rp->r_cv);
739		mutex_exit(&rp->r_statelock);
740	}
741}
742
743/*
744 * Set attributes cache for given vnode using virtual attributes.
745 *
746 * Set the timeout value on the attribute cache and fill it
747 * with the passed in attributes.
748 *
749 * The caller must be holding r_statelock.
750 */
751void
752nfs_attrcache_va(vnode_t *vp, struct vattr *va)
753{
754	rnode_t *rp;
755	mntinfo_t *mi;
756	hrtime_t delta;
757	hrtime_t now;
758
759	rp = VTOR(vp);
760
761	ASSERT(MUTEX_HELD(&rp->r_statelock));
762
763	now = gethrtime();
764
765	mi = VTOMI(vp);
766
767	/*
768	 * Delta is the number of nanoseconds that we will
769	 * cache the attributes of the file.  It is based on
770	 * the number of nanoseconds since the last time that
771	 * we detected a change.  The assumption is that files
772	 * that changed recently are likely to change again.
	 * Minimum and maximum bounds are enforced, however, with
	 * separate values for regular files and for directories.
775	 *
776	 * Using the time since last change was detected
777	 * eliminates direct comparison or calculation
778	 * using mixed client and server times.  NFS does
779	 * not make any assumptions regarding the client
780	 * and server clocks being synchronized.
781	 */
782	if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
783	    va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
784	    va->va_size != rp->r_attr.va_size)
785		rp->r_mtime = now;
786
787	if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
788		delta = 0;
789	else {
790		delta = now - rp->r_mtime;
791		if (vp->v_type == VDIR) {
792			if (delta < mi->mi_acdirmin)
793				delta = mi->mi_acdirmin;
794			else if (delta > mi->mi_acdirmax)
795				delta = mi->mi_acdirmax;
796		} else {
797			if (delta < mi->mi_acregmin)
798				delta = mi->mi_acregmin;
799			else if (delta > mi->mi_acregmax)
800				delta = mi->mi_acregmax;
801		}
802	}
803	rp->r_attrtime = now + delta;
804	rp->r_attr = *va;
805	/*
806	 * Update the size of the file if there is no cached data or if
807	 * the cached data is clean and there is no data being written
808	 * out.
809	 */
810	if (rp->r_size != va->va_size &&
811	    (!vn_has_cached_data(vp) ||
812	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
813		rp->r_size = va->va_size;
814	nfs_setswaplike(vp, va);
815	rp->r_flags &= ~RWRITEATTR;
816}
817
818/*
819 * Fill in attribute from the cache.
820 * If valid, then return 0 to indicate that no error occurred,
821 * otherwise return 1 to indicate that an error occurred.
822 */
823static int
824nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
825{
826	rnode_t *rp;
827	uint_t mask = vap->va_mask;
828
829	rp = VTOR(vp);
830	mutex_enter(&rp->r_statelock);
831	if (ATTRCACHE_VALID(vp)) {
832		/*
833		 * Cached attributes are valid
834		 */
835		*vap = rp->r_attr;
836		/*
837		 * Set the caller's va_mask to the set of attributes
838		 * that were requested ANDed with the attributes that
839		 * are available.  If attributes were requested that
840		 * are not available, those bits must be turned off
841		 * in the callers va_mask.
842		 */
843		vap->va_mask &= mask;
844		mutex_exit(&rp->r_statelock);
845		return (0);
846	}
847	mutex_exit(&rp->r_statelock);
848	return (1);
849}
850
851/*
852 * Get attributes over-the-wire and update attributes cache
853 * if no error occurred in the over-the-wire operation.
854 * Return 0 if successful, otherwise error.
855 */
856int
857nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
858{
859	int error;
860	struct nfsattrstat ns;
861	int douprintf;
862	mntinfo_t *mi;
863	failinfo_t fi;
864	hrtime_t t;
865
866	mi = VTOMI(vp);
867	fi.vp = vp;
868	fi.fhp = NULL;		/* no need to update, filehandle not copied */
869	fi.copyproc = nfscopyfh;
870	fi.lookupproc = nfslookup;
871	fi.xattrdirproc = acl_getxattrdir2;
872
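	/*
	 * If the ACL protocol is believed to be supported, try the ACL
	 * version of GETATTR first.  MI_ACL is re-checked afterwards
	 * because acl_getattr2_otw() may clear it (presumably when the
	 * server turns out not to support the ACL protocol), in which
	 * case we fall through to the standard RFS_GETATTR below.
	 */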
873	if (mi->mi_flags & MI_ACL) {
874		error = acl_getattr2_otw(vp, vap, cr);
875		if (mi->mi_flags & MI_ACL)
876			return (error);
877	}
878
879	douprintf = 1;
880
881	t = gethrtime();
882
883	error = rfs2call(mi, RFS_GETATTR,
884	    xdr_fhandle, (caddr_t)VTOFH(vp),
885	    xdr_attrstat, (caddr_t)&ns, cr,
886	    &douprintf, &ns.ns_status, 0, &fi);
887
888	if (!error) {
889		error = geterrno(ns.ns_status);
890		if (!error)
891			error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
892		else {
893			PURGE_STALE_FH(error, vp, cr);
894		}
895	}
896
897	return (error);
898}
899
900/*
 * Return either cached or remote attributes.  If we get remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
903 */
904int
905nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
906{
907	int error;
908	rnode_t *rp;
909
910	/*
911	 * If we've got cached attributes, we're done, otherwise go
912	 * to the server to get attributes, which will update the cache
913	 * in the process.
914	 */
915	error = nfs_getattr_cache(vp, vap);
916	if (error)
917		error = nfs_getattr_otw(vp, vap, cr);
918
919	/* Return the client's view of file size */
920	rp = VTOR(vp);
921	mutex_enter(&rp->r_statelock);
922	vap->va_size = rp->r_size;
923	mutex_exit(&rp->r_statelock);
924
925	return (error);
926}
927
928/*
929 * Get attributes over-the-wire and update attributes cache
930 * if no error occurred in the over-the-wire operation.
931 * Return 0 if successful, otherwise error.
932 */
933int
934nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
935{
936	int error;
937	GETATTR3args args;
938	GETATTR3vres res;
939	int douprintf;
940	failinfo_t fi;
941	hrtime_t t;
942
943	args.object = *VTOFH3(vp);
944	fi.vp = vp;
945	fi.fhp = (caddr_t)&args.object;
946	fi.copyproc = nfs3copyfh;
947	fi.lookupproc = nfs3lookup;
948	fi.xattrdirproc = acl_getxattrdir3;
949	res.fres.vp = vp;
950	res.fres.vap = vap;
951
952	douprintf = 1;
953
954	t = gethrtime();
955
956	error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
957	    xdr_nfs_fh3, (caddr_t)&args,
958	    xdr_GETATTR3vres, (caddr_t)&res, cr,
959	    &douprintf, &res.status, 0, &fi);
960
961	if (error)
962		return (error);
963
964	error = geterrno3(res.status);
965	if (error) {
966		PURGE_STALE_FH(error, vp, cr);
967		return (error);
968	}
969
970	/*
971	 * Catch status codes that indicate fattr3 to vattr translation failure
972	 */
973	if (res.fres.status)
974		return (res.fres.status);
975
976	nfs_attr_cache(vp, vap, t, cr);
977	return (0);
978}
979
980/*
 * Return either cached or remote attributes.  If we get remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
983 */
984int
985nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
986{
987	int error;
988	rnode_t *rp;
989
990	/*
991	 * If we've got cached attributes, we're done, otherwise go
992	 * to the server to get attributes, which will update the cache
993	 * in the process.
994	 */
995	error = nfs_getattr_cache(vp, vap);
996	if (error)
997		error = nfs3_getattr_otw(vp, vap, cr);
998
999	/* Return the client's view of file size */
1000	rp = VTOR(vp);
1001	mutex_enter(&rp->r_statelock);
1002	vap->va_size = rp->r_size;
1003	mutex_exit(&rp->r_statelock);
1004
1005	return (error);
1006}
1007
1008vtype_t nf_to_vt[] = {
1009	VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
1010};
1011/*
1012 * Convert NFS Version 2 over the network attributes to the local
1013 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
1014 * network representation and the local representation is done here.
1015 * Returns 0 for success, error if failed due to overflow.
1016 */
1017int
1018nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
1019{
1020	/* overflow in time attributes? */
1021#ifndef _LP64
1022	if (!NFS2_FATTR_TIME_OK(na))
1023		return (EOVERFLOW);
1024#endif
1025
1026	vap->va_mask = AT_ALL;
1027
1028	if (na->na_type < NFNON || na->na_type > NFSOC)
1029		vap->va_type = VBAD;
1030	else
1031		vap->va_type = nf_to_vt[na->na_type];
1032	vap->va_mode = na->na_mode;
1033	vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
1034	vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
1035	vap->va_fsid = vp->v_vfsp->vfs_dev;
1036	vap->va_nodeid = na->na_nodeid;
1037	vap->va_nlink = na->na_nlink;
1038	vap->va_size = na->na_size;	/* keep for cache validation */
1039	/*
1040	 * nfs protocol defines times as unsigned so don't extend sign,
1041	 * unless sysadmin set nfs_allow_preepoch_time.
1042	 */
1043	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
1044	vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
1045	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
1046	vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
1047	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
1048	vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
1049	/*
1050	 * Shannon's law - uncompress the received dev_t
	 * if the top half is zero, indicating a response
1052	 * from an `older style' OS. Except for when it is a
1053	 * `new style' OS sending the maj device of zero,
1054	 * in which case the algorithm still works because the
1055	 * fact that it is a new style server
1056	 * is hidden by the minor device not being greater
1057	 * than 255 (a requirement in this case).
1058	 */
1059	if ((na->na_rdev & 0xffff0000) == 0)
1060		vap->va_rdev = nfsv2_expdev(na->na_rdev);
1061	else
1062		vap->va_rdev = expldev(na->na_rdev);
1063
1064	vap->va_nblocks = na->na_blocks;
1065	switch (na->na_type) {
1066	case NFBLK:
1067		vap->va_blksize = DEV_BSIZE;
1068		break;
1069
1070	case NFCHR:
1071		vap->va_blksize = MAXBSIZE;
1072		break;
1073
1074	case NFSOC:
1075	default:
1076		vap->va_blksize = na->na_blocksize;
1077		break;
1078	}
1079	/*
1080	 * This bit of ugliness is a hack to preserve the
1081	 * over-the-wire protocols for named-pipe vnodes.
1082	 * It remaps the special over-the-wire type to the
1083	 * VFIFO type. (see note in nfs.h)
1084	 */
1085	if (NA_ISFIFO(na)) {
1086		vap->va_type = VFIFO;
1087		vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1088		vap->va_rdev = 0;
1089		vap->va_blksize = na->na_blocksize;
1090	}
1091	vap->va_seq = 0;
1092	return (0);
1093}
1094
1095/*
1096 * Convert NFS Version 3 over the network attributes to the local
1097 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
1098 * network representation and the local representation is done here.
1099 */
1100vtype_t nf3_to_vt[] = {
1101	VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1102};
1103
1104int
1105fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1106{
1107
1108#ifndef _LP64
1109	/* overflow in time attributes? */
1110	if (!NFS3_FATTR_TIME_OK(na))
1111		return (EOVERFLOW);
1112#endif
1113	if (!NFS3_SIZE_OK(na->size))
1114		/* file too big */
1115		return (EFBIG);
1116
1117	vap->va_mask = AT_ALL;
1118
1119	if (na->type < NF3REG || na->type > NF3FIFO)
1120		vap->va_type = VBAD;
1121	else
1122		vap->va_type = nf3_to_vt[na->type];
1123	vap->va_mode = na->mode;
1124	vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1125	vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1126	vap->va_fsid = vp->v_vfsp->vfs_dev;
1127	vap->va_nodeid = na->fileid;
1128	vap->va_nlink = na->nlink;
1129	vap->va_size = na->size;
1130
1131	/*
1132	 * nfs protocol defines times as unsigned so don't extend sign,
1133	 * unless sysadmin set nfs_allow_preepoch_time.
1134	 */
1135	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1136	vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1137	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1138	vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1139	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1140	vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1141
1142	switch (na->type) {
1143	case NF3BLK:
1144		vap->va_rdev = makedevice(na->rdev.specdata1,
1145		    na->rdev.specdata2);
1146		vap->va_blksize = DEV_BSIZE;
1147		vap->va_nblocks = 0;
1148		break;
1149	case NF3CHR:
1150		vap->va_rdev = makedevice(na->rdev.specdata1,
1151		    na->rdev.specdata2);
1152		vap->va_blksize = MAXBSIZE;
1153		vap->va_nblocks = 0;
1154		break;
1155	case NF3REG:
1156	case NF3DIR:
1157	case NF3LNK:
1158		vap->va_rdev = 0;
1159		vap->va_blksize = MAXBSIZE;
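		/*
		 * Convert the server-reported bytes used into DEV_BSIZE
		 * (512-byte) blocks, rounding up.
		 */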
1160		vap->va_nblocks = (u_longlong_t)
1161		    ((na->used + (size3)DEV_BSIZE - (size3)1) /
1162		    (size3)DEV_BSIZE);
1163		break;
1164	case NF3SOCK:
1165	case NF3FIFO:
1166	default:
1167		vap->va_rdev = 0;
1168		vap->va_blksize = MAXBSIZE;
1169		vap->va_nblocks = 0;
1170		break;
1171	}
1172	vap->va_seq = 0;
1173	return (0);
1174}
1175
1176/*
1177 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1178 * for the demand-based allocation of async threads per-mount.  The
1179 * nfs_async_timeout is the amount of time a thread will live after it
1180 * becomes idle, unless new I/O requests are received before the thread
1181 * dies.  See nfs_async_putpage and nfs_async_start.
1182 */
1183
1184int nfs_async_timeout = -1;	/* uninitialized */
1185
1186static void	nfs_async_start(struct vfs *);
1187static void	nfs_async_pgops_start(struct vfs *);
1188static void	nfs_async_common_start(struct vfs *, int);
1189
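/*
 * Release the resources associated with an async request once it has been
 * processed: drop the r_count (and, for putpage/pageio, r_awcount) reference
 * and the vnode hold taken when the request was queued, wake up any waiters,
 * and free the credential and the request structure itself.  NFS_INACTIVE
 * requests do not hold r_count or the vnode, so only the credential and the
 * request structure are freed for those.
 */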
1190static void
1191free_async_args(struct nfs_async_reqs *args)
1192{
1193	rnode_t *rp;
1194
1195	if (args->a_io != NFS_INACTIVE) {
1196		rp = VTOR(args->a_vp);
1197		mutex_enter(&rp->r_statelock);
1198		rp->r_count--;
1199		if (args->a_io == NFS_PUTAPAGE ||
1200		    args->a_io == NFS_PAGEIO)
1201			rp->r_awcount--;
1202		cv_broadcast(&rp->r_cv);
1203		mutex_exit(&rp->r_statelock);
1204		VN_RELE(args->a_vp);
1205	}
1206	crfree(args->a_cred);
1207	kmem_free(args, sizeof (*args));
1208}
1209
1210/*
1211 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1212 * pageout(), running in the global zone, have legitimate reasons to do
1213 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
1215 * signaled by the various asynchronous work routines when there is
1216 * asynchronous work to be done.  It is responsible for creating new
1217 * worker threads if necessary, and notifying existing worker threads
1218 * that there is work to be done.
1219 *
1220 * In other words, it will "take the specifications from the customers and
1221 * give them to the engineers."
1222 *
1223 * Worker threads die off of their own accord if they are no longer
1224 * needed.
1225 *
1226 * This thread is killed when the zone is going away or the filesystem
1227 * is being unmounted.
1228 */
1229void
1230nfs_async_manager(vfs_t *vfsp)
1231{
1232	callb_cpr_t cprinfo;
1233	mntinfo_t *mi;
1234	uint_t max_threads;
1235
1236	mi = VFTOMI(vfsp);
1237
1238	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1239	    "nfs_async_manager");
1240
1241	mutex_enter(&mi->mi_async_lock);
1242	/*
1243	 * We want to stash the max number of threads that this mount was
1244	 * allowed so we can use it later when the variable is set to zero as
1245	 * part of the zone/mount going away.
1246	 *
1247	 * We want to be able to create at least one thread to handle
1248	 * asynchronous inactive calls.
1249	 */
1250	max_threads = MAX(mi->mi_max_threads, 1);
1251	/*
1252	 * We don't want to wait for mi_max_threads to go to zero, since that
1253	 * happens as part of a failed unmount, but this thread should only
1254	 * exit when the mount/zone is really going away.
1255	 *
1256	 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1257	 * attempted: the various _async_*() functions know to do things
1258	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1259	 * outstanding requests.
1260	 *
1261	 * Note that we still create zthreads even if we notice the zone is
1262	 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1263	 * shutdown sequence to take slightly longer in some cases, but
1264	 * doesn't violate the protocol, as all threads will exit as soon as
1265	 * they're done processing the remaining requests.
1266	 */
1267	for (;;) {
1268		while (mi->mi_async_req_count > 0) {
1269			/*
1270			 * Paranoia: If the mount started out having
1271			 * (mi->mi_max_threads == 0), and the value was
1272			 * later changed (via a debugger or somesuch),
1273			 * we could be confused since we will think we
1274			 * can't create any threads, and the calling
1275			 * code (which looks at the current value of
1276			 * mi->mi_max_threads, now non-zero) thinks we
1277			 * can.
1278			 *
1279			 * So, because we're paranoid, we create threads
1280			 * up to the maximum of the original and the
1281			 * current value. This means that future
1282			 * (debugger-induced) lowerings of
1283			 * mi->mi_max_threads are ignored for our
1284			 * purposes, but who told them they could change
1285			 * random values on a live kernel anyhow?
1286			 */
1287			if (mi->mi_threads[NFS_ASYNC_QUEUE] <
1288			    MAX(mi->mi_max_threads, max_threads)) {
1289				mi->mi_threads[NFS_ASYNC_QUEUE]++;
1290				mutex_exit(&mi->mi_async_lock);
1291				VFS_HOLD(vfsp);	/* hold for new thread */
1292				(void) zthread_create(NULL, 0, nfs_async_start,
1293				    vfsp, 0, minclsyspri);
1294				mutex_enter(&mi->mi_async_lock);
1295			} else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
1296			    NUM_ASYNC_PGOPS_THREADS) {
1297				mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
1298				mutex_exit(&mi->mi_async_lock);
1299				VFS_HOLD(vfsp); /* hold for new thread */
1300				(void) zthread_create(NULL, 0,
1301				    nfs_async_pgops_start, vfsp, 0,
1302				    minclsyspri);
1303				mutex_enter(&mi->mi_async_lock);
1304			}
1305			NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1306			ASSERT(mi->mi_async_req_count != 0);
1307			mi->mi_async_req_count--;
1308		}
1309
1310		mutex_enter(&mi->mi_lock);
1311		if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
1312			mutex_exit(&mi->mi_lock);
1313			break;
1314		}
1315		mutex_exit(&mi->mi_lock);
1316
1317		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1318		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1319		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1320	}
1321	/*
1322	 * Let everyone know we're done.
1323	 */
1324	mi->mi_manager_thread = NULL;
1325	cv_broadcast(&mi->mi_async_cv);
1326
1327	/*
1328	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1329	 * since CALLB_CPR_EXIT is actually responsible for releasing
1330	 * 'mi_async_lock'.
1331	 */
1332	CALLB_CPR_EXIT(&cprinfo);
1333	VFS_RELE(vfsp);	/* release thread's hold */
1334	zthread_exit();
1335}
1336
1337/*
1338 * Signal (and wait for) the async manager thread to clean up and go away.
1339 */
1340void
1341nfs_async_manager_stop(vfs_t *vfsp)
1342{
1343	mntinfo_t *mi = VFTOMI(vfsp);
1344
1345	mutex_enter(&mi->mi_async_lock);
1346	mutex_enter(&mi->mi_lock);
1347	mi->mi_flags |= MI_ASYNC_MGR_STOP;
1348	mutex_exit(&mi->mi_lock);
1349	cv_broadcast(&mi->mi_async_reqs_cv);
1350	while (mi->mi_manager_thread != NULL)
1351		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1352	mutex_exit(&mi->mi_async_lock);
1353}
1354
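/*
 * Queue an asynchronous readahead request.  Returns 0 if the request was
 * queued successfully and -1 if the caller should skip the readahead
 * (the address lies outside the segment, the request structure could not
 * be allocated, a lock operation is pending, or async i/o is disabled).
 */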
1355int
1356nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1357	struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1358	u_offset_t, caddr_t, struct seg *, cred_t *))
1359{
1360	rnode_t *rp;
1361	mntinfo_t *mi;
1362	struct nfs_async_reqs *args;
1363
1364	rp = VTOR(vp);
1365	ASSERT(rp->r_freef == NULL);
1366
1367	mi = VTOMI(vp);
1368
1369	/*
1370	 * If addr falls in a different segment, don't bother doing readahead.
1371	 */
1372	if (addr >= seg->s_base + seg->s_size)
1373		return (-1);
1374
1375	/*
1376	 * If we can't allocate a request structure, punt on the readahead.
1377	 */
1378	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1379		return (-1);
1380
1381	/*
1382	 * If a lock operation is pending, don't initiate any new
1383	 * readaheads.  Otherwise, bump r_count to indicate the new
1384	 * asynchronous I/O.
1385	 */
1386	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1387		kmem_free(args, sizeof (*args));
1388		return (-1);
1389	}
1390	mutex_enter(&rp->r_statelock);
1391	rp->r_count++;
1392	mutex_exit(&rp->r_statelock);
1393	nfs_rw_exit(&rp->r_lkserlock);
1394
1395	args->a_next = NULL;
1396#ifdef DEBUG
1397	args->a_queuer = curthread;
1398#endif
1399	VN_HOLD(vp);
1400	args->a_vp = vp;
1401	ASSERT(cr != NULL);
1402	crhold(cr);
1403	args->a_cred = cr;
1404	args->a_io = NFS_READ_AHEAD;
1405	args->a_nfs_readahead = readahead;
1406	args->a_nfs_blkoff = blkoff;
1407	args->a_nfs_seg = seg;
1408	args->a_nfs_addr = addr;
1409
1410	mutex_enter(&mi->mi_async_lock);
1411
1412	/*
	 * If asyncio has been disabled, don't bother with the readahead.
1414	 */
1415	if (mi->mi_max_threads == 0) {
1416		mutex_exit(&mi->mi_async_lock);
1417		goto noasync;
1418	}
1419
1420	/*
1421	 * Link request structure into the async list and
1422	 * wakeup async thread to do the i/o.
1423	 */
1424	if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1425		mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1426		mi->mi_async_tail[NFS_READ_AHEAD] = args;
1427	} else {
1428		mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1429		mi->mi_async_tail[NFS_READ_AHEAD] = args;
1430	}
1431
1432	if (mi->mi_io_kstats) {
1433		mutex_enter(&mi->mi_lock);
1434		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1435		mutex_exit(&mi->mi_lock);
1436	}
1437
1438	mi->mi_async_req_count++;
1439	ASSERT(mi->mi_async_req_count != 0);
1440	cv_signal(&mi->mi_async_reqs_cv);
1441	mutex_exit(&mi->mi_async_lock);
1442	return (0);
1443
1444noasync:
1445	mutex_enter(&rp->r_statelock);
1446	rp->r_count--;
1447	cv_broadcast(&rp->r_cv);
1448	mutex_exit(&rp->r_statelock);
1449	VN_RELE(vp);
1450	crfree(cr);
1451	kmem_free(args, sizeof (*args));
1452	return (-1);
1453}
1454
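/*
 * Queue an asynchronous putpage request.  If the request cannot be queued
 * (no memory or async i/o disabled), the putpage is done synchronously in
 * the caller's context, except for pageout/fsflush and cross-zone callers,
 * where the pages are re-marked dirty and unlocked via pvn_write_done()
 * instead.
 */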
1455int
1456nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1457	int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1458	u_offset_t, size_t, int, cred_t *))
1459{
1460	rnode_t *rp;
1461	mntinfo_t *mi;
1462	struct nfs_async_reqs *args;
1463
1464	ASSERT(flags & B_ASYNC);
1465	ASSERT(vp->v_vfsp != NULL);
1466
1467	rp = VTOR(vp);
1468	ASSERT(rp->r_count > 0);
1469
1470	mi = VTOMI(vp);
1471
1472	/*
1473	 * If we can't allocate a request structure, do the putpage
1474	 * operation synchronously in this thread's context.
1475	 */
1476	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1477		goto noasync;
1478
1479	args->a_next = NULL;
1480#ifdef DEBUG
1481	args->a_queuer = curthread;
1482#endif
1483	VN_HOLD(vp);
1484	args->a_vp = vp;
1485	ASSERT(cr != NULL);
1486	crhold(cr);
1487	args->a_cred = cr;
1488	args->a_io = NFS_PUTAPAGE;
1489	args->a_nfs_putapage = putapage;
1490	args->a_nfs_pp = pp;
1491	args->a_nfs_off = off;
1492	args->a_nfs_len = (uint_t)len;
1493	args->a_nfs_flags = flags;
1494
1495	mutex_enter(&mi->mi_async_lock);
1496
1497	/*
1498	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
1500	 * while this thread was blocked waiting for memory pressure to
1501	 * reduce or for the queue to drain.
1502	 */
1503	if (mi->mi_max_threads == 0) {
1504		mutex_exit(&mi->mi_async_lock);
1505		goto noasync;
1506	}
1507
1508	/*
1509	 * Link request structure into the async list and
1510	 * wakeup async thread to do the i/o.
1511	 */
1512	if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1513		mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1514		mi->mi_async_tail[NFS_PUTAPAGE] = args;
1515	} else {
1516		mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1517		mi->mi_async_tail[NFS_PUTAPAGE] = args;
1518	}
1519
1520	mutex_enter(&rp->r_statelock);
1521	rp->r_count++;
1522	rp->r_awcount++;
1523	mutex_exit(&rp->r_statelock);
1524
1525	if (mi->mi_io_kstats) {
1526		mutex_enter(&mi->mi_lock);
1527		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1528		mutex_exit(&mi->mi_lock);
1529	}
1530
1531	mi->mi_async_req_count++;
1532	ASSERT(mi->mi_async_req_count != 0);
1533	cv_signal(&mi->mi_async_reqs_cv);
1534	mutex_exit(&mi->mi_async_lock);
1535	return (0);
1536
1537noasync:
1538	if (args != NULL) {
1539		VN_RELE(vp);
1540		crfree(cr);
1541		kmem_free(args, sizeof (*args));
1542	}
1543
1544	if (curproc == proc_pageout || curproc == proc_fsflush) {
1545		/*
1546		 * If we get here in the context of the pageout/fsflush,
1547		 * we refuse to do a sync write, because this may hang
1548		 * pageout (and the machine). In this case, we just
1549		 * re-mark the page as dirty and punt on the page.
1550		 *
1551		 * Make sure B_FORCE isn't set.  We can re-mark the
1552		 * pages as dirty and unlock the pages in one swoop by
1553		 * passing in B_ERROR to pvn_write_done().  However,
1554		 * we should make sure B_FORCE isn't set - we don't
1555		 * want the page tossed before it gets written out.
1556		 */
1557		if (flags & B_FORCE)
1558			flags &= ~(B_INVAL | B_FORCE);
1559		pvn_write_done(pp, flags | B_ERROR);
1560		return (0);
1561	}
1562	if (nfs_zone() != mi->mi_zone) {
1563		/*
1564		 * So this was a cross-zone sync putpage.  We pass in B_ERROR
1565		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1566		 * them.
1567		 *
1568		 * We don't want to clear B_FORCE here as the caller presumably
1569		 * knows what they're doing if they set it.
1570		 */
1571		pvn_write_done(pp, flags | B_ERROR);
1572		return (EPERM);
1573	}
1574	return ((*putapage)(vp, pp, off, len, flags, cr));
1575}
1576
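/*
 * Queue an asynchronous pageio request.  If the request cannot be queued,
 * reads simply have their page list cleaned up, while writes are handled
 * the same way as in nfs_async_putapage() above: synchronously, unless the
 * caller is pageout/fsflush or in another zone.
 */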
1577int
1578nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1579	int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1580	size_t, int, cred_t *))
1581{
1582	rnode_t *rp;
1583	mntinfo_t *mi;
1584	struct nfs_async_reqs *args;
1585
1586	ASSERT(flags & B_ASYNC);
1587	ASSERT(vp->v_vfsp != NULL);
1588
1589	rp = VTOR(vp);
1590	ASSERT(rp->r_count > 0);
1591
1592	mi = VTOMI(vp);
1593
1594	/*
1595	 * If we can't allocate a request structure, do the pageio
1596	 * request synchronously in this thread's context.
1597	 */
1598	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1599		goto noasync;
1600
1601	args->a_next = NULL;
1602#ifdef DEBUG
1603	args->a_queuer = curthread;
1604#endif
1605	VN_HOLD(vp);
1606	args->a_vp = vp;
1607	ASSERT(cr != NULL);
1608	crhold(cr);
1609	args->a_cred = cr;
1610	args->a_io = NFS_PAGEIO;
1611	args->a_nfs_pageio = pageio;
1612	args->a_nfs_pp = pp;
1613	args->a_nfs_off = io_off;
1614	args->a_nfs_len = (uint_t)io_len;
1615	args->a_nfs_flags = flags;
1616
1617	mutex_enter(&mi->mi_async_lock);
1618
1619	/*
1620	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
1622	 * while this thread was blocked waiting for memory pressure to
1623	 * reduce or for the queue to drain.
1624	 */
1625	if (mi->mi_max_threads == 0) {
1626		mutex_exit(&mi->mi_async_lock);
1627		goto noasync;
1628	}
1629
1630	/*
1631	 * Link request structure into the async list and
1632	 * wakeup async thread to do the i/o.
1633	 */
1634	if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1635		mi->mi_async_reqs[NFS_PAGEIO] = args;
1636		mi->mi_async_tail[NFS_PAGEIO] = args;
1637	} else {
1638		mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1639		mi->mi_async_tail[NFS_PAGEIO] = args;
1640	}
1641
1642	mutex_enter(&rp->r_statelock);
1643	rp->r_count++;
1644	rp->r_awcount++;
1645	mutex_exit(&rp->r_statelock);
1646
1647	if (mi->mi_io_kstats) {
1648		mutex_enter(&mi->mi_lock);
1649		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1650		mutex_exit(&mi->mi_lock);
1651	}
1652
1653	mi->mi_async_req_count++;
1654	ASSERT(mi->mi_async_req_count != 0);
1655	cv_signal(&mi->mi_async_reqs_cv);
1656	mutex_exit(&mi->mi_async_lock);
1657	return (0);
1658
1659noasync:
1660	if (args != NULL) {
1661		VN_RELE(vp);
1662		crfree(cr);
1663		kmem_free(args, sizeof (*args));
1664	}
1665
1666	/*
1667	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1668	 * the page list), for writes we do it synchronously, except for
1669	 * proc_pageout/proc_fsflush as described below.
1670	 */
1671	if (flags & B_READ) {
1672		pvn_read_done(pp, flags | B_ERROR);
1673		return (0);
1674	}
1675
1676	if (curproc == proc_pageout || curproc == proc_fsflush) {
1677		/*
1678		 * If we get here in the context of the pageout/fsflush,
1679		 * we refuse to do a sync write, because this may hang
1680		 * pageout/fsflush (and the machine). In this case, we just
1681		 * re-mark the page as dirty and punt on the page.
1682		 *
1683		 * Make sure B_FORCE isn't set.  We can re-mark the
1684		 * pages as dirty and unlock the pages in one swoop by
1685		 * passing in B_ERROR to pvn_write_done().  However,
1686		 * we should make sure B_FORCE isn't set - we don't
1687		 * want the page tossed before it gets written out.
1688		 */
1689		if (flags & B_FORCE)
1690			flags &= ~(B_INVAL | B_FORCE);
1691		pvn_write_done(pp, flags | B_ERROR);
1692		return (0);
1693	}
1694
1695	if (nfs_zone() != mi->mi_zone) {
1696		/*
1697		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1698		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1699		 * them.
1700		 *
1701		 * We don't want to clear B_FORCE here as the caller presumably
1702		 * knows what they're doing if they set it.
1703		 */
1704		pvn_write_done(pp, flags | B_ERROR);
1705		return (EPERM);
1706	}
1707	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1708}
1709
1710void
1711nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1712	int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1713{
1714	rnode_t *rp;
1715	mntinfo_t *mi;
1716	struct nfs_async_reqs *args;
1717
1718	rp = VTOR(vp);
1719	ASSERT(rp->r_freef == NULL);
1720
1721	mi = VTOMI(vp);
1722
1723	/*
1724	 * If we can't allocate a request structure, do the readdir
1725	 * operation synchronously in this thread's context.
1726	 */
1727	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1728		goto noasync;
1729
1730	args->a_next = NULL;
1731#ifdef DEBUG
1732	args->a_queuer = curthread;
1733#endif
1734	VN_HOLD(vp);
1735	args->a_vp = vp;
1736	ASSERT(cr != NULL);
1737	crhold(cr);
1738	args->a_cred = cr;
1739	args->a_io = NFS_READDIR;
1740	args->a_nfs_readdir = readdir;
1741	args->a_nfs_rdc = rdc;
1742
1743	mutex_enter(&mi->mi_async_lock);
1744
1745	/*
1746	 * If asyncio has been disabled, then make a synchronous request.
1747	 */
1748	if (mi->mi_max_threads == 0) {
1749		mutex_exit(&mi->mi_async_lock);
1750		goto noasync;
1751	}
1752
1753	/*
1754	 * Link request structure into the async list and
1755	 * wakeup async thread to do the i/o.
1756	 */
1757	if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1758		mi->mi_async_reqs[NFS_READDIR] = args;
1759		mi->mi_async_tail[NFS_READDIR] = args;
1760	} else {
1761		mi->mi_async_tail[NFS_READDIR]->a_next = args;
1762		mi->mi_async_tail[NFS_READDIR] = args;
1763	}
1764
1765	mutex_enter(&rp->r_statelock);
1766	rp->r_count++;
1767	mutex_exit(&rp->r_statelock);
1768
1769	if (mi->mi_io_kstats) {
1770		mutex_enter(&mi->mi_lock);
1771		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1772		mutex_exit(&mi->mi_lock);
1773	}
1774
1775	mi->mi_async_req_count++;
1776	ASSERT(mi->mi_async_req_count != 0);
1777	cv_signal(&mi->mi_async_reqs_cv);
1778	mutex_exit(&mi->mi_async_lock);
1779	return;
1780
1781noasync:
1782	if (args != NULL) {
1783		VN_RELE(vp);
1784		crfree(cr);
1785		kmem_free(args, sizeof (*args));
1786	}
1787
1788	rdc->entries = NULL;
1789	mutex_enter(&rp->r_statelock);
1790	ASSERT(rdc->flags & RDDIR);
1791	rdc->flags &= ~RDDIR;
1792	rdc->flags |= RDDIRREQ;
1793	/*
	 * If RDDIRWAIT is set, wake up the thread sleeping in
	 * cv_wait_sig().  The woken thread will reset the flag to
	 * RDDIR and will continue with the readdir operation.
1798	 */
1799	if (rdc->flags & RDDIRWAIT) {
1800		rdc->flags &= ~RDDIRWAIT;
1801		cv_broadcast(&rdc->cv);
1802	}
1803	mutex_exit(&rp->r_statelock);
1804	rddir_cache_rele(rdc);
1805}
1806
1807void
1808nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1809	cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1810	cred_t *))
1811{
1812	rnode_t *rp;
1813	mntinfo_t *mi;
1814	struct nfs_async_reqs *args;
1815	page_t *pp;
1816
1817	rp = VTOR(vp);
1818	mi = VTOMI(vp);
1819
1820	/*
1821	 * If we can't allocate a request structure, do the commit
1822	 * operation synchronously in this thread's context.
1823	 */
1824	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1825		goto noasync;
1826
1827	args->a_next = NULL;
1828#ifdef DEBUG
1829	args->a_queuer = curthread;
1830#endif
1831	VN_HOLD(vp);
1832	args->a_vp = vp;
1833	ASSERT(cr != NULL);
1834	crhold(cr);
1835	args->a_cred = cr;
1836	args->a_io = NFS_COMMIT;
1837	args->a_nfs_commit = commit;
1838	args->a_nfs_plist = plist;
1839	args->a_nfs_offset = offset;
1840	args->a_nfs_count = count;
1841
1842	mutex_enter(&mi->mi_async_lock);
1843
1844	/*
1845	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
1847	 * while this thread was blocked waiting for memory pressure to
1848	 * reduce or for the queue to drain.
1849	 */
1850	if (mi->mi_max_threads == 0) {
1851		mutex_exit(&mi->mi_async_lock);
1852		goto noasync;
1853	}
1854
1855	/*
1856	 * Link request structure into the async list and
1857	 * wakeup async thread to do the i/o.
1858	 */
1859	if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1860		mi->mi_async_reqs[NFS_COMMIT] = args;
1861		mi->mi_async_tail[NFS_COMMIT] = args;
1862	} else {
1863		mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1864		mi->mi_async_tail[NFS_COMMIT] = args;
1865	}
1866
1867	mutex_enter(&rp->r_statelock);
1868	rp->r_count++;
1869	mutex_exit(&rp->r_statelock);
1870
1871	if (mi->mi_io_kstats) {
1872		mutex_enter(&mi->mi_lock);
1873		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1874		mutex_exit(&mi->mi_lock);
1875	}
1876
1877	mi->mi_async_req_count++;
1878	ASSERT(mi->mi_async_req_count != 0);
1879	cv_signal(&mi->mi_async_reqs_cv);
1880	mutex_exit(&mi->mi_async_lock);
1881	return;
1882
1883noasync:
1884	if (args != NULL) {
1885		VN_RELE(vp);
1886		crfree(cr);
1887		kmem_free(args, sizeof (*args));
1888	}
1889
1890	if (curproc == proc_pageout || curproc == proc_fsflush ||
1891	    nfs_zone() != mi->mi_zone) {
1892		while (plist != NULL) {
1893			pp = plist;
1894			page_sub(&plist, pp);
1895			pp->p_fsdata = C_COMMIT;
1896			page_unlock(pp);
1897		}
1898		return;
1899	}
1900	(*commit)(vp, plist, offset, count, cr);
1901}
1902
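/*
 * Queue an asynchronous inactive request.  Unlike the other async routines
 * this always queues (the allocation sleeps), since the vnode must be
 * released regardless of mi_max_threads.  If the async manager thread is
 * already gone (e.g., the zone is going away), clean up any pending unlink
 * state directly and free the rnode without going over the wire.
 */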
1903void
1904nfs_async_inactive(vnode_t *vp, cred_t *cr,
1905    void (*inactive)(vnode_t *, cred_t *, caller_context_t *))
1906{
1907	mntinfo_t *mi;
1908	struct nfs_async_reqs *args;
1909
1910	mi = VTOMI(vp);
1911
1912	args = kmem_alloc(sizeof (*args), KM_SLEEP);
1913	args->a_next = NULL;
1914#ifdef DEBUG
1915	args->a_queuer = curthread;
1916#endif
1917	args->a_vp = vp;
1918	ASSERT(cr != NULL);
1919	crhold(cr);
1920	args->a_cred = cr;
1921	args->a_io = NFS_INACTIVE;
1922	args->a_nfs_inactive = inactive;
1923
1924	/*
1925	 * Note that we don't check mi->mi_max_threads here, since we
1926	 * *need* to get rid of this vnode regardless of whether someone
1927	 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1928	 *
1929	 * The manager thread knows about this and is willing to create
1930	 * at least one thread to accommodate us.
1931	 */
1932	mutex_enter(&mi->mi_async_lock);
1933	if (mi->mi_manager_thread == NULL) {
1934		rnode_t *rp = VTOR(vp);
1935
1936		mutex_exit(&mi->mi_async_lock);
1937		crfree(cr);	/* drop our reference */
1938		kmem_free(args, sizeof (*args));
1939		/*
1940		 * We can't do an over-the-wire call since we're in the wrong
1941		 * zone, so we need to clean up state as best we can and then
1942		 * throw away the vnode.
1943		 */
1944		mutex_enter(&rp->r_statelock);
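		/*
		 * If a deferred unlink (file removed while still open) is
		 * pending on this rnode, simply drop the directory vnode,
		 * name buffer, and credential references that were held
		 * for it, since the over-the-wire remove can't be issued
		 * from this zone.
		 */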
1945		if (rp->r_unldvp != NULL) {
1946			vnode_t *unldvp;
1947			char *unlname;
1948			cred_t *unlcred;
1949
1950			unldvp = rp->r_unldvp;
1951			rp->r_unldvp = NULL;
1952			unlname = rp->r_unlname;
1953			rp->r_unlname = NULL;
1954			unlcred = rp->r_unlcred;
1955			rp->r_unlcred = NULL;
1956			mutex_exit(&rp->r_statelock);
1957
1958			VN_RELE(unldvp);
1959			kmem_free(unlname, MAXNAMELEN);
1960			crfree(unlcred);
1961		} else {
1962			mutex_exit(&rp->r_statelock);
1963		}
1964		/*
1965		 * No need to explicitly throw away any cached pages.  The
1966		 * eventual rinactive() will attempt a synchronous
1967		 * VOP_PUTPAGE() which will immediately fail since the request
1968		 * is coming from the wrong zone, and then will proceed to call
1969		 * nfs_invalidate_pages() which will clean things up for us.
1970		 */
1971		rp_addfree(VTOR(vp), cr);
1972		return;
1973	}
1974
1975	if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1976		mi->mi_async_reqs[NFS_INACTIVE] = args;
1977	} else {
1978		mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1979	}
1980	mi->mi_async_tail[NFS_INACTIVE] = args;
1981	/*
1982	 * Don't increment r_count, since we're trying to get rid of the vnode.
1983	 */
1984
1985	mi->mi_async_req_count++;
1986	ASSERT(mi->mi_async_req_count != 0);
1987	cv_signal(&mi->mi_async_reqs_cv);
1988	mutex_exit(&mi->mi_async_lock);
1989}
1990
1991static void
1992nfs_async_start(struct vfs *vfsp)
1993{
1994	nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);
1995}
1996
1997static void
1998nfs_async_pgops_start(struct vfs *vfsp)
1999{
2000	nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
2001}
2002
2003/*
2004 * The async queues for each mounted file system are arranged as a
2005 * set of queues, one for each async i/o type.  Requests are taken
2006 * from the queues in a round-robin fashion.  A number of consecutive
2007 * requests are taken from each queue before moving on to the next
2008 * queue.  This functionality may allow the NFS Version 2 server to do
2009 * write clustering, even if the client is mixing writes and reads
2010 * because it will take multiple write requests from the queue
2011 * before processing any of the other async i/o types.
2012 *
2013 * XXX The nfs_async_common_start thread is unsafe in the light of the present
2014 * model defined by cpr to suspend the system. Specifically over the
2015 * wire calls are cpr-unsafe. The thread should be reevaluated in
2016 * case of future updates to the cpr model.
2017 */
2018static void
2019nfs_async_common_start(struct vfs *vfsp, int async_queue)
2020{
2021	struct nfs_async_reqs *args;
2022	mntinfo_t *mi = VFTOMI(vfsp);
2023	clock_t time_left = 1;
2024	callb_cpr_t cprinfo;
2025	int i;
2026	int async_types;
2027	kcondvar_t *async_work_cv;
2028
2029	if (async_queue == NFS_ASYNC_QUEUE) {
2030		async_types = NFS_ASYNC_TYPES;
2031		async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
2032	} else {
2033		async_types = NFS_ASYNC_PGOPS_TYPES;
2034		async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
2035	}
2036
2037	/*
2038	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
2039	 * built in an implementation independent manner.
2040	 */
2041	if (nfs_async_timeout == -1)
2042		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
2043
2044	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
2045
2046	mutex_enter(&mi->mi_async_lock);
2047	for (;;) {
2048		/*
2049		 * Find the next queue containing an entry.  We start
2050		 * at the current queue pointer and then round robin
2051		 * through all of them until we either find a non-empty
2052		 * queue or have looked through all of them.
2053		 */
2054		for (i = 0; i < async_types; i++) {
2055			args = *mi->mi_async_curr[async_queue];
2056			if (args != NULL)
2057				break;
2058			mi->mi_async_curr[async_queue]++;
2059			if (mi->mi_async_curr[async_queue] ==
2060			    &mi->mi_async_reqs[async_types]) {
2061				mi->mi_async_curr[async_queue] =
2062				    &mi->mi_async_reqs[0];
2063			}
2064		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
2069		if (args == NULL) {
2070			/*
2071			 * Exiting is considered to be safe for CPR as well
2072			 */
2073			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2074
			/*
			 * Wake up the thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed out and there's nothing to do,
			 * then get rid of this thread.
			 */
2082			if (mi->mi_max_threads == 0 || time_left <= 0) {
2083				--mi->mi_threads[async_queue];
2084
2085				if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
2086				    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
2087					cv_signal(&mi->mi_async_cv);
2088				CALLB_CPR_EXIT(&cprinfo);
2089				VFS_RELE(vfsp);	/* release thread's hold */
2090				zthread_exit();
2091				/* NOTREACHED */
2092			}
2093			time_left = cv_reltimedwait(async_work_cv,
2094			    &mi->mi_async_lock, nfs_async_timeout,
2095			    TR_CLOCK_TICK);
2096
2097			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
2098
2099			continue;
2100		}
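		/*
		 * We found work to do, so reset time_left; this keeps the
		 * idle-timeout check above from retiring this thread the
		 * next time it finds the queues empty.
		 */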
2101		time_left = 1;
2102
2103		/*
2104		 * Remove the request from the async queue and then
2105		 * update the current async request queue pointer.  If
2106		 * the current queue is empty or we have removed enough
2107		 * consecutive entries from it, then reset the counter
2108		 * for this queue and then move the current pointer to
2109		 * the next queue.
2110		 */
2111		*mi->mi_async_curr[async_queue] = args->a_next;
2112		if (*mi->mi_async_curr[async_queue] == NULL ||
2113		    --mi->mi_async_clusters[args->a_io] == 0) {
2114			mi->mi_async_clusters[args->a_io] =
2115			    mi->mi_async_init_clusters;
2116			mi->mi_async_curr[async_queue]++;
2117			if (mi->mi_async_curr[async_queue] ==
2118			    &mi->mi_async_reqs[async_types]) {
2119				mi->mi_async_curr[async_queue] =
2120				    &mi->mi_async_reqs[0];
2121			}
2122		}
2123
2124		if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2125			mutex_enter(&mi->mi_lock);
2126			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2127			mutex_exit(&mi->mi_lock);
2128		}
2129
2130		mutex_exit(&mi->mi_async_lock);
2131
		/*
		 * Obtain the arguments from the async request structure
		 * and dispatch the request to the matching handler.
		 */
2135		if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2136			(*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2137			    args->a_nfs_addr, args->a_nfs_seg,
2138			    args->a_cred);
2139		} else if (args->a_io == NFS_PUTAPAGE) {
2140			(void) (*args->a_nfs_putapage)(args->a_vp,
2141			    args->a_nfs_pp, args->a_nfs_off,
2142			    args->a_nfs_len, args->a_nfs_flags,
2143			    args->a_cred);
2144		} else if (args->a_io == NFS_PAGEIO) {
2145			(void) (*args->a_nfs_pageio)(args->a_vp,
2146			    args->a_nfs_pp, args->a_nfs_off,
2147			    args->a_nfs_len, args->a_nfs_flags,
2148			    args->a_cred);
2149		} else if (args->a_io == NFS_READDIR) {
2150			(void) ((*args->a_nfs_readdir)(args->a_vp,
2151			    args->a_nfs_rdc, args->a_cred));
2152		} else if (args->a_io == NFS_COMMIT) {
2153			(*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2154			    args->a_nfs_offset, args->a_nfs_count,
2155			    args->a_cred);
2156		} else if (args->a_io == NFS_INACTIVE) {
2157			(*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
2158		}
2159
2160		/*
2161		 * Now, release the vnode and free the credentials
2162		 * structure.
2163		 */
2164		free_async_args(args);
		/*
		 * Reacquire the mutex because it will be needed at the
		 * top of the loop.
		 */
2168		mutex_enter(&mi->mi_async_lock);
2169	}
2170}
2171
2172void
2173nfs_async_stop(struct vfs *vfsp)
2174{
2175	mntinfo_t *mi = VFTOMI(vfsp);
2176
2177	/*
2178	 * Wait for all outstanding async operations to complete and for the
2179	 * worker threads to exit.
2180	 */
2181	mutex_enter(&mi->mi_async_lock);
2182	mi->mi_max_threads = 0;
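	/*
	 * Setting mi_max_threads to zero tells each worker thread to exit
	 * once it finds its queues empty; wake them all so they notice
	 * promptly.
	 */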
2183	NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2184	while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2185	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
2186		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2187	mutex_exit(&mi->mi_async_lock);
2188}
2189
2190/*
2191 * nfs_async_stop_sig:
 * Wait for all outstanding putpage operations to complete. If a signal
 * is delivered we will abort and return non-zero. If we can put all the
2194 * pages we will return 0. This routine is called from nfs_unmount and
2195 * nfs3_unmount to make these operations interruptible.
2196 */
2197int
2198nfs_async_stop_sig(struct vfs *vfsp)
2199{
2200	mntinfo_t *mi = VFTOMI(vfsp);
2201	ushort_t omax;
2202	int rval;
2203
2204	/*
2205	 * Wait for all outstanding async operations to complete and for the
2206	 * worker threads to exit.
2207	 */
2208	mutex_enter(&mi->mi_async_lock);
2209	omax = mi->mi_max_threads;
2210	mi->mi_max_threads = 0;
2211	/*
2212	 * Tell all the worker threads to exit.
2213	 */
2214	NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2215	while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2216	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
2217		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2218			break;
2219	}
	rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */
2222	if (rval)
2223		mi->mi_max_threads = omax;
2224	mutex_exit(&mi->mi_async_lock);
2225
2226	return (rval);
2227}
2228
2229int
2230writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2231{
2232	int pagecreate;
2233	int n;
2234	int saved_n;
2235	caddr_t saved_base;
2236	u_offset_t offset;
2237	int error;
2238	int sm_error;
2239	vnode_t *vp = RTOV(rp);
2240
2241	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2242	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
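	/*
	 * The MAXBSIZE window check below only matters for the segmap
	 * path; when vpm is enabled the copy is done via vpm_data_copy()
	 * and no segmap address window is used.
	 */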
2243	if (!vpm_enable) {
2244		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2245	}
2246
2247	/*
2248	 * Move bytes in at most PAGESIZE chunks. We must avoid
2249	 * spanning pages in uiomove() because page faults may cause
2250	 * the cache to be invalidated out from under us. The r_size is not
2251	 * updated until after the uiomove. If we push the last page of a
2252	 * file before r_size is correct, we will lose the data written past
2253	 * the current (and invalid) r_size.
2254	 */
2255	do {
2256		offset = uio->uio_loffset;
2257		pagecreate = 0;
2258
2259		/*
2260		 * n is the number of bytes required to satisfy the request
2261		 *   or the number of bytes to fill out the page.
2262		 */
2263		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2264
2265		/*
2266		 * Check to see if we can skip reading in the page
2267		 * and just allocate the memory.  We can do this
2268		 * if we are going to rewrite the entire mapping
2269		 * or if we are going to write to or beyond the current
2270		 * end of file from the beginning of the mapping.
2271		 *
2272		 * The read of r_size is now protected by r_statelock.
2273		 */
2274		mutex_enter(&rp->r_statelock);
2275		/*
2276		 * When pgcreated is nonzero the caller has already done
2277		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2278		 * segkpm this means we already have at least one page
2279		 * created and mapped at base.
2280		 */
2281		pagecreate = pgcreated ||
2282		    ((offset & PAGEOFFSET) == 0 &&
2283		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2284
2285		mutex_exit(&rp->r_statelock);
2286		if (!vpm_enable && pagecreate) {
2287			/*
2288			 * The last argument tells segmap_pagecreate() to
2289			 * always lock the page, as opposed to sometimes
2290			 * returning with the page locked. This way we avoid a
2291			 * fault on the ensuing uiomove(), but also
2292			 * more importantly (to fix bug 1094402) we can
2293			 * call segmap_fault() to unlock the page in all
2294			 * cases. An alternative would be to modify
2295			 * segmap_pagecreate() to tell us when it is
2296			 * locking a page, but that's a fairly major
2297			 * interface change.
2298			 */
2299			if (pgcreated == 0)
2300				(void) segmap_pagecreate(segkmap, base,
2301				    (uint_t)n, 1);
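			/*
			 * Remember the base address and length so that the
			 * segmap_fault() below can unlock the page(s) that
			 * segmap_pagecreate() locked.
			 */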
2302			saved_base = base;
2303			saved_n = n;
2304		}
2305
		/*
		 * The number of bytes of data in the last page cannot
		 * be accurately determined while the page is being
		 * uiomove'd to and the size of the file is being updated.
		 * Thus, inform threads which need to know accurately
		 * how much data is in the last page of the file.  They
		 * will not do the i/o immediately, but will arrange for
		 * the i/o to happen later when this modify operation
		 * has finished.
		 */
2316		ASSERT(!(rp->r_flags & RMODINPROGRESS));
2317		mutex_enter(&rp->r_statelock);
2318		rp->r_flags |= RMODINPROGRESS;
2319		rp->r_modaddr = (offset & MAXBMASK);
2320		mutex_exit(&rp->r_statelock);
2321
2322		if (vpm_enable) {
2323			/*
			 * Copy data. If new pages are created, the part of
			 * the page that is not written will be initialized
			 * with zeros.
2327			 */
2328			error = vpm_data_copy(vp, offset, n, uio,
2329			    !pagecreate, NULL, 0, S_WRITE);
2330		} else {
2331			error = uiomove(base, n, UIO_WRITE, uio);
2332		}
2333
2334		/*
2335		 * r_size is the maximum number of
2336		 * bytes known to be in the file.
2337		 * Make sure it is at least as high as the
2338		 * first unwritten byte pointed to by uio_loffset.
2339		 */
2340		mutex_enter(&rp->r_statelock);
2341		if (rp->r_size < uio->uio_loffset)
2342			rp->r_size = uio->uio_loffset;
2343		rp->r_flags &= ~RMODINPROGRESS;
2344		rp->r_flags |= RDIRTY;
2345		mutex_exit(&rp->r_statelock);
2346
2347		/* n = # of bytes written */
2348		n = (int)(uio->uio_loffset - offset);
2349
2350		if (!vpm_enable) {
2351			base += n;
2352		}
2353		tcount -= n;
2354		/*
2355		 * If we created pages w/o initializing them completely,
2356		 * we need to zero the part that wasn't set up.
		 * This happens in most EOF write cases and if
2358		 * we had some sort of error during the uiomove.
2359		 */
2360		if (!vpm_enable && pagecreate) {
2361			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2362				(void) kzero(base, PAGESIZE - n);
2363
2364			if (pgcreated) {
2365				/*
2366				 * Caller is responsible for this page,
2367				 * it was not created in this loop.
2368				 */
2369				pgcreated = 0;
2370			} else {
2371				/*
2372				 * For bug 1094402: segmap_pagecreate locks
2373				 * page. Unlock it. This also unlocks the
2374				 * pages allocated by page_create_va() in
2375				 * segmap_pagecreate().
2376				 */
2377				sm_error = segmap_fault(kas.a_hat, segkmap,
2378				    saved_base, saved_n,
2379				    F_SOFTUNLOCK, S_WRITE);
2380				if (error == 0)
2381					error = sm_error;
2382			}
2383		}
2384	} while (tcount > 0 && error == 0);
2385
2386	return (error);
2387}
2388
2389int
2390nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2391{
2392	rnode_t *rp;
2393	page_t *pp;
2394	u_offset_t eoff;
2395	u_offset_t io_off;
2396	size_t io_len;
2397	int error;
2398	int rdirty;
2399	int err;
2400
2401	rp = VTOR(vp);
2402	ASSERT(rp->r_count > 0);
2403
2404	if (!vn_has_cached_data(vp))
2405		return (0);
2406
2407	ASSERT(vp->v_type != VCHR);
2408
	/*
	 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
	 * writes.  B_FORCE is set to force the VM system to actually
	 * invalidate the pages, even if the i/o failed.  The pages
	 * need to get invalidated because they can't be written out
	 * because there isn't any space left on either the server's
	 * file system or in the user's disk quota.  The same treatment
	 * applies if the file system has been unmounted, since the pages
	 * can no longer be written back at all.  The B_FREE bit
	 * is cleared to avoid confusion as to whether this is a
	 * request to place the page on the freelist or to destroy
	 * it.
	 */
2420	if ((rp->r_flags & ROUTOFSPACE) ||
2421	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2422		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2423
2424	if (len == 0) {
2425		/*
2426		 * If doing a full file synchronous operation, then clear
2427		 * the RDIRTY bit.  If a page gets dirtied while the flush
2428		 * is happening, then RDIRTY will get set again.  The
2429		 * RDIRTY bit must get cleared before the flush so that
2430		 * we don't lose this information.
2431		 *
2432		 * If there are no full file async write operations
2433		 * pending and RDIRTY bit is set, clear it.
2434		 */
2435		if (off == (u_offset_t)0 &&
2436		    !(flags & B_ASYNC) &&
2437		    (rp->r_flags & RDIRTY)) {
2438			mutex_enter(&rp->r_statelock);
2439			rdirty = (rp->r_flags & RDIRTY);
2440			rp->r_flags &= ~RDIRTY;
2441			mutex_exit(&rp->r_statelock);
2442		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2443			mutex_enter(&rp->r_statelock);
2444			if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
2445				rdirty = (rp->r_flags & RDIRTY);
2446				rp->r_flags &= ~RDIRTY;
2447			}
2448			mutex_exit(&rp->r_statelock);
2449		} else
2450			rdirty = 0;
2451
2452		/*
2453		 * Search the entire vp list for pages >= off, and flush
2454		 * the dirty pages.
2455		 */
2456		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2457		    flags, cr);
2458
2459		/*
2460		 * If an error occurred and the file was marked as dirty
2461		 * before and we aren't forcibly invalidating pages, then
2462		 * reset the RDIRTY flag.
2463		 */
2464		if (error && rdirty &&
2465		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2466			mutex_enter(&rp->r_statelock);
2467			rp->r_flags |= RDIRTY;
2468			mutex_exit(&rp->r_statelock);
2469		}
2470	} else {
2471		/*
2472		 * Do a range from [off...off + len) looking for pages
2473		 * to deal with.
2474		 */
2475		error = 0;
2476#ifdef lint
2477		io_len = 0;
2478#endif
2479		eoff = off + len;
2480		mutex_enter(&rp->r_statelock);
2481		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2482		    io_off += io_len) {
2483			mutex_exit(&rp->r_statelock);
2484			/*
2485			 * If we are not invalidating, synchronously
			 * freeing, or writing pages, use the routine
2487			 * page_lookup_nowait() to prevent reclaiming
2488			 * them from the free list.
2489			 */
2490			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2491				pp = page_lookup(vp, io_off,
2492				    (flags & (B_INVAL | B_FREE)) ?
2493				    SE_EXCL : SE_SHARED);
2494			} else {
2495				pp = page_lookup_nowait(vp, io_off,
2496				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2497			}
2498
2499			if (pp == NULL || !pvn_getdirty(pp, flags))
2500				io_len = PAGESIZE;
2501			else {
2502				err = (*rp->r_putapage)(vp, pp, &io_off,
2503				    &io_len, flags, cr);
2504				if (!error)
2505					error = err;
2506				/*
2507				 * "io_off" and "io_len" are returned as
2508				 * the range of pages we actually wrote.
2509				 * This allows us to skip ahead more quickly
2510				 * since several pages may've been dealt
2511				 * with by this iteration of the loop.
2512				 */
2513			}
2514			mutex_enter(&rp->r_statelock);
2515		}
2516		mutex_exit(&rp->r_statelock);
2517	}
2518
2519	return (error);
2520}
2521
2522void
2523nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2524{
2525	rnode_t *rp;
2526
2527	rp = VTOR(vp);
2528	mutex_enter(&rp->r_statelock);
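	/*
	 * Serialize truncations: wait for any invalidation already in
	 * progress on this rnode (RTRUNCATE set) to finish before
	 * starting ours.
	 */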
2529	while (rp->r_flags & RTRUNCATE)
2530		cv_wait(&rp->r_cv, &rp->r_statelock);
2531	rp->r_flags |= RTRUNCATE;
2532	if (off == (u_offset_t)0) {
2533		rp->r_flags &= ~RDIRTY;
2534		if (!(rp->r_flags & RSTALE))
2535			rp->r_error = 0;
2536	}
2537	rp->r_truncaddr = off;
2538	mutex_exit(&rp->r_statelock);
2539	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2540	    B_INVAL | B_TRUNC, cr);
2541	mutex_enter(&rp->r_statelock);
2542	rp->r_flags &= ~RTRUNCATE;
2543	cv_broadcast(&rp->r_cv);
2544	mutex_exit(&rp->r_statelock);
2545}
2546
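/*
 * Write-error messages are normally built with a leading '^' so that
 * cmn_err() directs them to the console only.  When
 * nfs_write_error_to_cons_only is clear, the MSG() macro skips that
 * leading character (format + 1) so the message is also recorded in
 * the system log.
 */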
2547static int nfs_write_error_to_cons_only = 0;
2548#define	MSG(x)	(nfs_write_error_to_cons_only ? (x) : (x) + 1)
2549
2550/*
2551 * Print a file handle
2552 */
2553void
2554nfs_printfhandle(nfs_fhandle *fhp)
2555{
2556	int *ip;
2557	char *buf;
2558	size_t bufsize;
2559	char *cp;
2560
2561	/*
2562	 * 13 == "(file handle:"
	 * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2564	 *	1 == ' '
2565	 *	8 == maximum strlen of "%x"
2566	 * 3 == ")\n\0"
2567	 */
2568	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2569	buf = kmem_alloc(bufsize, KM_NOSLEEP);
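	/* The allocation may fail (KM_NOSLEEP); just skip printing. */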
2570	if (buf == NULL)
2571		return;
2572
2573	cp = buf;
2574	(void) strcpy(cp, "(file handle:");
2575	while (*cp != '\0')
2576		cp++;
2577	for (ip = (int *)fhp->fh_buf;
2578	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
2579	    ip++) {
2580		(void) sprintf(cp, " %x", *ip);
2581		while (*cp != '\0')
2582			cp++;
2583	}
2584	(void) strcpy(cp, ")\n");
2585
2586	zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2587
2588	kmem_free(buf, bufsize);
2589}
2590
2591/*
2592 * Notify the system administrator that an NFS write error has
2593 * occurred.
2594 */
2595
2596/* seconds between ENOSPC/EDQUOT messages */
2597clock_t nfs_write_error_interval = 5;
2598
2599void
2600nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2601{
2602	mntinfo_t *mi;
2603	clock_t now;
2604
2605	mi = VTOMI(vp);
2606	/*
2607	 * In case of forced unmount or zone shutdown, do not print any
2608	 * messages since it can flood the console with error messages.
2609	 */
2610	if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2611		return;
2612
2613	/*
2614	 * No use in flooding the console with ENOSPC
2615	 * messages from the same file system.
2616	 */
2617	now = ddi_get_lbolt();
2618	if ((error != ENOSPC && error != EDQUOT) ||
2619	    now - mi->mi_printftime > 0) {
2620		zoneid_t zoneid = mi->mi_zone->zone_id;
2621
2622#ifdef DEBUG
2623		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2624		    mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2625#else
2626		nfs_perror(error, "NFS write error on host %s: %m.\n",
2627		    VTOR(vp)->r_server->sv_hostname, NULL);
2628#endif
2629		if (error == ENOSPC || error == EDQUOT) {
2630			zcmn_err(zoneid, CE_CONT,
2631			    MSG("^File: userid=%d, groupid=%d\n"),
2632			    crgetuid(cr), crgetgid(cr));
2633			if (crgetuid(CRED()) != crgetuid(cr) ||
2634			    crgetgid(CRED()) != crgetgid(cr)) {
2635				zcmn_err(zoneid, CE_CONT,
2636				    MSG("^User: userid=%d, groupid=%d\n"),
2637				    crgetuid(CRED()), crgetgid(CRED()));
2638			}
2639			mi->mi_printftime = now +
2640			    nfs_write_error_interval * hz;
2641		}
2642		nfs_printfhandle(&VTOR(vp)->r_fh);
2643#ifdef DEBUG
2644		if (error == EACCES) {
2645			zcmn_err(zoneid, CE_CONT,
2646			    MSG("^nfs_bio: cred is%s kcred\n"),
2647			    cr == kcred ? "" : " not");
2648		}
2649#endif
2650	}
2651}
2652
2653/* ARGSUSED */
2654static void *
2655nfs_mi_init(zoneid_t zoneid)
2656{
2657	struct mi_globals *mig;
2658
2659	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2660	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2661	list_create(&mig->mig_list, sizeof (mntinfo_t),
2662	    offsetof(mntinfo_t, mi_zone_node));
2663	mig->mig_destructor_called = B_FALSE;
2664	return (mig);
2665}
2666
2667/*
2668 * Callback routine to tell all NFS mounts in the zone to stop creating new
2669 * threads.  Existing threads should exit.
2670 */
2671/* ARGSUSED */
2672static void
2673nfs_mi_shutdown(zoneid_t zoneid, void *data)
2674{
2675	struct mi_globals *mig = data;
2676	mntinfo_t *mi;
2677
2678	ASSERT(mig != NULL);
2679again:
2680	mutex_enter(&mig->mig_lock);
2681	for (mi = list_head(&mig->mig_list); mi != NULL;
2682	    mi = list_next(&mig->mig_list, mi)) {
2683
2684		/*
2685		 * If we've done the shutdown work for this FS, skip.
2686		 * Once we go off the end of the list, we're done.
2687		 */
2688		if (mi->mi_flags & MI_DEAD)
2689			continue;
2690
2691		/*
2692		 * We will do work, so not done.  Get a hold on the FS.
2693		 */
2694		VFS_HOLD(mi->mi_vfsp);
2695
2696		/*
2697		 * purge the DNLC for this filesystem
2698		 */
2699		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2700
2701		mutex_enter(&mi->mi_async_lock);
2702		/*
2703		 * Tell existing async worker threads to exit.
2704		 */
2705		mi->mi_max_threads = 0;
2706		NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2707		/*
2708		 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2709		 * getting ready to exit when it's done with its current work.
2710		 * Also set MI_DEAD to note we've acted on this FS.
2711		 */
2712		mutex_enter(&mi->mi_lock);
2713		mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2714		mutex_exit(&mi->mi_lock);
2715		/*
2716		 * Wake up the async manager thread.
2717		 */
2718		cv_broadcast(&mi->mi_async_reqs_cv);
2719		mutex_exit(&mi->mi_async_lock);
2720
2721		/*
2722		 * Drop lock and release FS, which may change list, then repeat.
2723		 * We're done when every mi has been done or the list is empty.
2724		 */
2725		mutex_exit(&mig->mig_lock);
2726		VFS_RELE(mi->mi_vfsp);
2727		goto again;
2728	}
2729	mutex_exit(&mig->mig_lock);
2730}
2731
2732static void
2733nfs_mi_free_globals(struct mi_globals *mig)
2734{
2735	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2736	mutex_destroy(&mig->mig_lock);
2737	kmem_free(mig, sizeof (*mig));
}
2740
2741/* ARGSUSED */
2742static void
2743nfs_mi_destroy(zoneid_t zoneid, void *data)
2744{
2745	struct mi_globals *mig = data;
2746
2747	ASSERT(mig != NULL);
2748	mutex_enter(&mig->mig_lock);
2749	if (list_head(&mig->mig_list) != NULL) {
2750		/* Still waiting for VFS_FREEVFS() */
2751		mig->mig_destructor_called = B_TRUE;
2752		mutex_exit(&mig->mig_lock);
2753		return;
2754	}
2755	nfs_mi_free_globals(mig);
2756}
2757
2758/*
2759 * Add an NFS mount to the per-zone list of NFS mounts.
2760 */
2761void
2762nfs_mi_zonelist_add(mntinfo_t *mi)
2763{
2764	struct mi_globals *mig;
2765
2766	mig = zone_getspecific(mi_list_key, mi->mi_zone);
2767	mutex_enter(&mig->mig_lock);
2768	list_insert_head(&mig->mig_list, mi);
2769	mutex_exit(&mig->mig_lock);
2770}
2771
2772/*
2773 * Remove an NFS mount from the per-zone list of NFS mounts.
2774 */
2775static void
2776nfs_mi_zonelist_remove(mntinfo_t *mi)
2777{
2778	struct mi_globals *mig;
2779
2780	mig = zone_getspecific(mi_list_key, mi->mi_zone);
2781	mutex_enter(&mig->mig_lock);
2782	list_remove(&mig->mig_list, mi);
2783	/*
2784	 * We can be called asynchronously by VFS_FREEVFS() after the zone
2785	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2786	 * mi globals.
2787	 */
2788	if (list_head(&mig->mig_list) == NULL &&
2789	    mig->mig_destructor_called == B_TRUE) {
2790		nfs_mi_free_globals(mig);
2791		return;
2792	}
2793	mutex_exit(&mig->mig_lock);
2794}
2795
2796/*
2797 * NFS Client initialization routine.  This routine should only be called
2798 * once.  It performs the following tasks:
 *	- Initialize all global locks
 *	- Call sub-initialization routines (localize access to variables)
2801 */
2802int
2803nfs_clntinit(void)
2804{
2805#ifdef DEBUG
2806	static boolean_t nfs_clntup = B_FALSE;
2807#endif
2808	int error;
2809
2810#ifdef DEBUG
2811	ASSERT(nfs_clntup == B_FALSE);
2812#endif
2813
2814	error = nfs_subrinit();
2815	if (error)
2816		return (error);
2817
2818	error = nfs_vfsinit();
2819	if (error) {
2820		/*
2821		 * Cleanup nfs_subrinit() work
2822		 */
2823		nfs_subrfini();
2824		return (error);
2825	}
2826	zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2827	    nfs_mi_destroy);
2828
2829	nfs4_clnt_init();
2830
2831#ifdef DEBUG
2832	nfs_clntup = B_TRUE;
2833#endif
2834
2835	return (0);
2836}
2837
2838/*
2839 * This routine is only called if the NFS Client has been initialized but
2840 * the module failed to be installed. This routine will cleanup the previously
2841 * allocated/initialized work.
2842 */
2843void
2844nfs_clntfini(void)
2845{
2846	(void) zone_key_delete(mi_list_key);
2847	nfs_subrfini();
2848	nfs_vfsfini();
2849	nfs4_clnt_fini();
2850}
2851
2852/*
2853 * nfs_lockrelease:
2854 *
2855 * Release any locks on the given vnode that are held by the current
2856 * process.
2857 */
2858void
2859nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2860{
2861	flock64_t ld;
2862	struct shrlock shr;
2863	char *buf;
2864	int remote_lock_possible;
2865	int ret;
2866
2867	ASSERT((uintptr_t)vp > KERNELBASE);
2868
2869	/*
2870	 * Generate an explicit unlock operation for the entire file.  As a
2871	 * partial optimization, only generate the unlock if there is a
2872	 * lock registered for the file.  We could check whether this
2873	 * particular process has any locks on the file, but that would
2874	 * require the local locking code to provide yet another query
2875	 * routine.  Note that no explicit synchronization is needed here.
2876	 * At worst, flk_has_remote_locks() will return a false positive,
2877	 * in which case the unlock call wastes time but doesn't harm
2878	 * correctness.
2879	 *
2880	 * In addition, an unlock request is generated if the process
2881	 * is listed as possibly having a lock on the file because the
2882	 * server and client lock managers may have gotten out of sync.
2883	 * N.B. It is important to make sure nfs_remove_locking_id() is
2884	 * called here even if flk_has_remote_locks(vp) reports true.
2885	 * If it is not called and there is an entry on the process id
2886	 * list, that entry will never get removed.
2887	 */
2888	remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2889	    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2890	if (remote_lock_possible || flk_has_remote_locks(vp)) {
2891		ld.l_type = F_UNLCK;	/* set to unlock entire file */
2892		ld.l_whence = 0;	/* unlock from start of file */
2893		ld.l_start = 0;
2894		ld.l_len = 0;		/* do entire file */
2895		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr,
2896		    NULL);
2897
2898		if (ret != 0) {
2899			/*
2900			 * If VOP_FRLOCK fails, make sure we unregister
2901			 * local locks before we continue.
2902			 */
2903			ld.l_pid = ttoproc(curthread)->p_pid;
2904			lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2905#ifdef DEBUG
2906			nfs_perror(ret,
2907			    "NFS lock release error on vp %p: %m.\n",
2908			    (void *)vp, NULL);
2909#endif
2910		}
2911
2912		/*
2913		 * The call to VOP_FRLOCK may put the pid back on the
2914		 * list.  We need to remove it.
2915		 */
2916		(void) nfs_remove_locking_id(vp, RLMPL_PID,
2917		    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2918	}
2919
2920	/*
2921	 * As long as the vp has a share matching our pid,
2922	 * pluck it off and unshare it.  There are circumstances in
2923	 * which the call to nfs_remove_locking_id() may put the
2924	 * owner back on the list, in which case we simply do a
2925	 * redundant and harmless unshare.
2926	 */
2927	buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2928	while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2929	    (char *)NULL, buf, &shr.s_own_len)) {
2930		shr.s_owner = buf;
2931		shr.s_access = 0;
2932		shr.s_deny = 0;
2933		shr.s_sysid = 0;
2934		shr.s_pid = curproc->p_pid;
2935
2936		ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL);
2937#ifdef DEBUG
2938		if (ret != 0) {
2939			nfs_perror(ret,
2940			    "NFS share release error on vp %p: %m.\n",
2941			    (void *)vp, NULL);
2942		}
2943#endif
2944	}
2945	kmem_free(buf, MAX_SHR_OWNER_LEN);
2946}
2947
2948/*
2949 * nfs_lockcompletion:
2950 *
2951 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2952 * as non cachable (set VNOCACHE bit).
2953 */
2954
2955void
2956nfs_lockcompletion(vnode_t *vp, int cmd)
2957{
2958#ifdef DEBUG
2959	rnode_t *rp = VTOR(vp);
2960
2961	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2962#endif
2963
2964	if (cmd == F_SETLK || cmd == F_SETLKW) {
2965		if (!lm_safemap(vp)) {
2966			mutex_enter(&vp->v_lock);
2967			vp->v_flag |= VNOCACHE;
2968			mutex_exit(&vp->v_lock);
2969		} else {
2970			mutex_enter(&vp->v_lock);
2971			vp->v_flag &= ~VNOCACHE;
2972			mutex_exit(&vp->v_lock);
2973		}
2974	}
2975	/*
2976	 * The cached attributes of the file are stale after acquiring
2977	 * the lock on the file. They were updated when the file was
2978	 * opened, but not updated when the lock was acquired. Therefore the
2979	 * cached attributes are invalidated after the lock is obtained.
2980	 */
2981	PURGE_ATTRCACHE(vp);
2982}
2983
2984/*
2985 * The lock manager holds state making it possible for the client
2986 * and server to be out of sync.  For example, if the response from
2987 * the server granting a lock request is lost, the server will think
2988 * the lock is granted and the client will think the lock is lost.
2989 * The client can tell when it is not positive if it is in sync with
2990 * the server.
2991 *
2992 * To deal with this, a list of processes for which the client is
2993 * not sure if the server holds a lock is attached to the rnode.
2994 * When such a process closes the rnode, an unlock request is sent
2995 * to the server to unlock the entire file.
2996 *
 * The list is kept as a singly linked, NULL-terminated list.
 * Because it is only added to under extreme error conditions, the
 * list shouldn't get very big.  DEBUG kernels print a message if
 * the list gets bigger than nfs_lmpl_high_water.  This threshold is
 * arbitrarily chosen, but can be tuned at runtime.
3002 */
3003#ifdef DEBUG
3004/* int nfs_lmpl_high_water = 8; */
3005int nfs_lmpl_high_water = 128;
3006int nfs_cnt_add_locking_id = 0;
3007int nfs_len_add_locking_id = 0;
3008#endif /* DEBUG */
3009
3010/*
3011 * Record that the nfs lock manager server may be holding a lock on
3012 * a vnode for a process.
3013 *
3014 * Because the nfs lock manager server holds state, it is possible
3015 * for the server to get out of sync with the client.  This routine is called
3016 * from the client when it is no longer sure if the server is in sync
3017 * with the client.  nfs_lockrelease() will then notice this and send
 * an unlock request when the file is closed.
3019 */
3020void
3021nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
3022{
3023	rnode_t *rp;
3024	lmpl_t *new;
3025	lmpl_t *cur;
3026	lmpl_t **lmplp;
3027#ifdef DEBUG
3028	int list_len = 1;
3029#endif /* DEBUG */
3030
3031#ifdef DEBUG
3032	++nfs_cnt_add_locking_id;
3033#endif /* DEBUG */
3034	/*
3035	 * allocate new lmpl_t now so we don't sleep
3036	 * later after grabbing mutexes
3037	 */
3038	ASSERT(len < MAX_SHR_OWNER_LEN);
3039	new = kmem_alloc(sizeof (*new), KM_SLEEP);
3040	new->lmpl_type = type;
3041	new->lmpl_pid = pid;
3042	new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
3043	bcopy(id, new->lmpl_owner, len);
3044	new->lmpl_own_len = len;
3045	new->lmpl_next = (lmpl_t *)NULL;
3046#ifdef DEBUG
3047	if (type == RLMPL_PID) {
3048		ASSERT(len == sizeof (pid_t));
3049		ASSERT(pid == *(pid_t *)new->lmpl_owner);
3050	} else {
3051		ASSERT(type == RLMPL_OWNER);
3052	}
3053#endif
3054
3055	rp = VTOR(vp);
3056	mutex_enter(&rp->r_statelock);
3057
3058	/*
3059	 * Add this id to the list for this rnode only if the
3060	 * rnode is active and the id is not already there.
3061	 */
3062	ASSERT(rp->r_flags & RHASHED);
3063	lmplp = &(rp->r_lmpl);
3064	for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3065		if (cur->lmpl_pid == pid &&
3066		    cur->lmpl_type == type &&
3067		    cur->lmpl_own_len == len &&
3068		    bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
3069			kmem_free(new->lmpl_owner, len);
3070			kmem_free(new, sizeof (*new));
3071			break;
3072		}
3073		lmplp = &cur->lmpl_next;
3074#ifdef DEBUG
3075		++list_len;
3076#endif /* DEBUG */
3077	}
3078	if (cur == (lmpl_t *)NULL) {
3079		*lmplp = new;
3080#ifdef DEBUG
3081		if (list_len > nfs_len_add_locking_id) {
3082			nfs_len_add_locking_id = list_len;
3083		}
3084		if (list_len > nfs_lmpl_high_water) {
3085			cmn_err(CE_WARN, "nfs_add_locking_id: long list "
3086			    "vp=%p is %d", (void *)vp, list_len);
3087		}
3088#endif /* DEBUG */
3089	}
3090
3091#ifdef DEBUG
3092	if (share_debug) {
3093		int nitems = 0;
3094		int npids = 0;
3095		int nowners = 0;
3096
3097		/*
		 * Count the number of things on r_lmpl after the add.
3099		 */
3100		for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3101		    cur = cur->lmpl_next) {
3102			nitems++;
3103			if (cur->lmpl_type == RLMPL_PID) {
3104				npids++;
3105			} else if (cur->lmpl_type == RLMPL_OWNER) {
3106				nowners++;
3107			} else {
3108				cmn_err(CE_PANIC, "nfs_add_locking_id: "
3109				    "unrecognized lmpl_type %d",
3110				    cur->lmpl_type);
3111			}
3112		}
3113
3114		cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
3115		    "OWNs = %d items left on r_lmpl\n",
3116		    (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
3117	}
3118#endif
3119
3120	mutex_exit(&rp->r_statelock);
3121}
3122
3123/*
3124 * Remove an id from the lock manager id list.
3125 *
3126 * If the id is not in the list return 0.  If it was found and
3127 * removed, return 1.
3128 */
3129static int
3130nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3131{
3132	lmpl_t *cur;
3133	lmpl_t **lmplp;
3134	rnode_t *rp;
3135	int rv = 0;
3136
3137	ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3138
3139	rp = VTOR(vp);
3140
3141	mutex_enter(&rp->r_statelock);
3142	ASSERT(rp->r_flags & RHASHED);
3143	lmplp = &(rp->r_lmpl);
3144
3145	/*
3146	 * Search through the list and remove the entry for this id
3147	 * if it is there.  The special case id == NULL allows removal
3148	 * of the first share on the r_lmpl list belonging to the
3149	 * current process (if any), without regard to further details
3150	 * of its identity.
3151	 */
3152	for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3153		if (cur->lmpl_type == type &&
3154		    cur->lmpl_pid == curproc->p_pid &&
3155		    (id == (char *)NULL ||
3156		    bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3157			*lmplp = cur->lmpl_next;
3158			ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3159			if (rid != NULL) {
3160				bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3161				*rlen = cur->lmpl_own_len;
3162			}
3163			kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3164			kmem_free(cur, sizeof (*cur));
3165			rv = 1;
3166			break;
3167		}
3168		lmplp = &cur->lmpl_next;
3169	}
3170
3171#ifdef DEBUG
3172	if (share_debug) {
3173		int nitems = 0;
3174		int npids = 0;
3175		int nowners = 0;
3176
3177		/*
3178		 * Count the number of things left on r_lmpl after the remove.
3179		 */
3180		for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3181		    cur = cur->lmpl_next) {
3182			nitems++;
3183			if (cur->lmpl_type == RLMPL_PID) {
3184				npids++;
3185			} else if (cur->lmpl_type == RLMPL_OWNER) {
3186				nowners++;
3187			} else {
3188				cmn_err(CE_PANIC,
3189				    "nrli: unrecognized lmpl_type %d",
3190				    cur->lmpl_type);
3191			}
3192		}
3193
3194		cmn_err(CE_CONT,
3195		"nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3196		    (type == RLMPL_PID) ? "P" : "O",
3197		    npids,
3198		    nowners,
3199		    nitems);
3200	}
3201#endif
3202
3203	mutex_exit(&rp->r_statelock);
3204	return (rv);
3205}
3206
3207void
3208nfs_free_mi(mntinfo_t *mi)
3209{
3210	ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3211	ASSERT(mi->mi_manager_thread == NULL);
3212	ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
3213	    mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);
3214
	/*
	 * Remove the node from the global list before we start
	 * tearing it down.
	 */
3218	nfs_mi_zonelist_remove(mi);
3219	if (mi->mi_klmconfig) {
3220		lm_free_config(mi->mi_klmconfig);
3221		kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3222	}
3223	mutex_destroy(&mi->mi_lock);
3224	mutex_destroy(&mi->mi_remap_lock);
3225	mutex_destroy(&mi->mi_async_lock);
3226	cv_destroy(&mi->mi_failover_cv);
3227	cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
3228	cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
3229	cv_destroy(&mi->mi_async_reqs_cv);
3230	cv_destroy(&mi->mi_async_cv);
3231	zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
3232	kmem_free(mi, sizeof (*mi));
3233}
3234
3235static int
3236mnt_kstat_update(kstat_t *ksp, int rw)
3237{
3238	mntinfo_t *mi;
3239	struct mntinfo_kstat *mik;
3240	vfs_t *vfsp;
3241	int i;
3242
3243	/* this is a read-only kstat. Bail out on a write */
3244	if (rw == KSTAT_WRITE)
3245		return (EACCES);
3246
3247	/*
3248	 * We don't want to wait here as kstat_chain_lock could be held by
3249	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
3250	 * and thus could lead to a deadlock.
3251	 */
3252	vfsp = (struct vfs *)ksp->ks_private;
3253
3254
3255	mi = VFTOMI(vfsp);
3256
3257	mik = (struct mntinfo_kstat *)ksp->ks_data;
3258
3259	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3260	mik->mik_vers = (uint32_t)mi->mi_vers;
3261	mik->mik_flags = mi->mi_flags;
3262	mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3263	mik->mik_curread = (uint32_t)mi->mi_curread;
3264	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3265	mik->mik_retrans = mi->mi_retrans;
3266	mik->mik_timeo = mi->mi_timeo;
3267	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3268	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3269	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3270	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3271	for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3272		mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3273		mik->mik_timers[i].deviate =
3274		    (uint32_t)mi->mi_timers[i].rt_deviate;
3275		mik->mik_timers[i].rtxcur =
3276		    (uint32_t)mi->mi_timers[i].rt_rtxcur;
3277	}
3278	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3279	mik->mik_failover = (uint32_t)mi->mi_failover;
3280	mik->mik_remap = (uint32_t)mi->mi_remap;
3281	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3282
3283	return (0);
3284}
3285
3286void
3287nfs_mnt_kstat_init(struct vfs *vfsp)
3288{
3289	mntinfo_t *mi = VFTOMI(vfsp);
3290
3291	/*
3292	 * Create the version specific kstats.
3293	 *
3294	 * PSARC 2001/697 Contract Private Interface
3295	 * All nfs kstats are under SunMC contract
3296	 * Please refer to the PSARC listed above and contact
3297	 * SunMC before making any changes!
3298	 *
3299	 * Changes must be reviewed by Solaris File Sharing
3300	 * Changes must be communicated to contract-2001-697@sun.com
3301	 *
3302	 */
3303
3304	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3305	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3306	if (mi->mi_io_kstats) {
3307		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3308			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3309		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3310		kstat_install(mi->mi_io_kstats);
3311	}
3312
3313	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3314	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3315	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3316		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3317			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3318		mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3319		mi->mi_ro_kstats->ks_private = (void *)vfsp;
3320		kstat_install(mi->mi_ro_kstats);
3321	}
3322}
3323
3324nfs_delmapcall_t *
3325nfs_init_delmapcall()
3326{
3327	nfs_delmapcall_t	*delmap_call;
3328
3329	delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3330	delmap_call->call_id = curthread;
3331	delmap_call->error = 0;
3332
3333	return (delmap_call);
3334}
3335
3336void
3337nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3338{
3339	kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3340}
3341
3342/*
3343 * Searches for the current delmap caller (based on curthread) in the list of
3344 * callers.  If it is found, we remove it and free the delmap caller.
3345 * Returns:
3346 *	0 if the caller wasn't found
3347 *	1 if the caller was found, removed and freed.  *errp is set to what
3348 * 	the result of the delmap was.
3349 */
3350int
3351nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3352{
3353	nfs_delmapcall_t	*delmap_call;
3354
3355	/*
3356	 * If the list doesn't exist yet, we create it and return
3357	 * that the caller wasn't found.  No list = no callers.
3358	 */
3359	mutex_enter(&rp->r_statelock);
3360	if (!(rp->r_flags & RDELMAPLIST)) {
3361		/* The list does not exist */
3362		list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3363		    offsetof(nfs_delmapcall_t, call_node));
3364		rp->r_flags |= RDELMAPLIST;
3365		mutex_exit(&rp->r_statelock);
3366		return (0);
3367	} else {
3368		/* The list exists so search it */
3369		for (delmap_call = list_head(&rp->r_indelmap);
3370		    delmap_call != NULL;
3371		    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3372			if (delmap_call->call_id == curthread) {
3373				/* current caller is in the list */
3374				*errp = delmap_call->error;
3375				list_remove(&rp->r_indelmap, delmap_call);
3376				mutex_exit(&rp->r_statelock);
3377				nfs_free_delmapcall(delmap_call);
3378				return (1);
3379			}
3380		}
3381	}
3382	mutex_exit(&rp->r_statelock);
3383	return (0);
3384}
3385