1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/param.h>
27#include <sys/types.h>
28#include <sys/systm.h>
29#include <sys/cred.h>
30#include <sys/proc.h>
31#include <sys/user.h>
32#include <sys/time.h>
33#include <sys/buf.h>
34#include <sys/vfs.h>
35#include <sys/vnode.h>
36#include <sys/socket.h>
37#include <sys/uio.h>
38#include <sys/tiuser.h>
39#include <sys/swap.h>
40#include <sys/errno.h>
41#include <sys/debug.h>
42#include <sys/kmem.h>
43#include <sys/kstat.h>
44#include <sys/cmn_err.h>
45#include <sys/vtrace.h>
46#include <sys/session.h>
47#include <sys/dnlc.h>
48#include <sys/bitmap.h>
49#include <sys/acl.h>
50#include <sys/ddi.h>
51#include <sys/pathname.h>
52#include <sys/flock.h>
53#include <sys/dirent.h>
54#include <sys/flock.h>
55#include <sys/callb.h>
56#include <sys/atomic.h>
57#include <sys/list.h>
58#include <sys/tsol/tnet.h>
59#include <sys/priv.h>
60#include <sys/sdt.h>
61#include <sys/attr.h>
62
63#include <inet/ip6.h>
64
65#include <rpc/types.h>
66#include <rpc/xdr.h>
67#include <rpc/auth.h>
68#include <rpc/clnt.h>
69
70#include <nfs/nfs.h>
71#include <nfs/nfs4.h>
72#include <nfs/nfs_clnt.h>
73#include <nfs/rnode.h>
74#include <nfs/nfs_acl.h>
75
76#include <sys/tsol/label.h>
77
78/*
79 * The hash queues for the access to active and cached rnodes
80 * are organized as doubly linked lists.  A reader/writer lock
81 * for each hash bucket is used to control access and to synchronize
82 * lookups, additions, and deletions from the hash queue.
83 *
84 * The rnode freelist is organized as a doubly linked list with
85 * a head pointer.  Additions and deletions are synchronized via
86 * a single mutex.
87 *
88 * In order to add an rnode to the free list, it must be hashed into
89 * a hash queue and the exclusive lock to the hash queue be held.
90 * If an rnode is not hashed into a hash queue, then it is destroyed
91 * because it represents no valuable information that can be reused
92 * about the file.  The exclusive lock to the hash queue must be
93 * held in order to prevent a lookup in the hash queue from finding
94 * the rnode and using it and assuming that the rnode is not on the
95 * freelist.  The lookup in the hash queue will have the hash queue
96 * locked, either exclusive or shared.
97 *
98 * The vnode reference count for each rnode is not allowed to drop
99 * below 1.  This prevents external entities, such as the VM
100 * subsystem, from acquiring references to vnodes already on the
101 * freelist and then trying to place them back on the freelist
102 * when their reference is released.  This means that the when an
103 * rnode is looked up in the hash queues, then either the rnode
104 * is removed from the freelist and that reference is transferred to
105 * the new reference or the vnode reference count must be incremented
106 * accordingly.  The mutex for the freelist must be held in order to
107 * accurately test to see if the rnode is on the freelist or not.
108 * The hash queue lock might be held shared and it is possible that
109 * two different threads may race to remove the rnode from the
110 * freelist.  This race can be resolved by holding the mutex for the
111 * freelist.  Please note that the mutex for the freelist does not
112 * need to held if the rnode is not on the freelist.  It can not be
113 * placed on the freelist due to the requirement that the thread
114 * putting the rnode on the freelist must hold the exclusive lock
115 * to the hash queue and the thread doing the lookup in the hash
116 * queue is holding either a shared or exclusive lock to the hash
117 * queue.
118 *
119 * The lock ordering is:
120 *
121 *	hash bucket lock -> vnode lock
122 *	hash bucket lock -> freelist lock
123 */
124static rhashq_t *rtable;
125
126static kmutex_t rpfreelist_lock;
127static rnode_t *rpfreelist = NULL;
128static long rnew = 0;
129long nrnode = 0;
130
131static int rtablesize;
132static int rtablemask;
133
134static int hashlen = 4;
135
136static struct kmem_cache *rnode_cache;
137
138/*
139 * Mutex to protect the following variables:
140 *	nfs_major
141 *	nfs_minor
142 */
143kmutex_t nfs_minor_lock;
144int nfs_major;
145int nfs_minor;
146
147/* Do we allow preepoch (negative) time values otw? */
148bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
149
150/*
151 * Access cache
152 */
153static acache_hash_t *acache;
154static long nacache;	/* used strictly to size the number of hash queues */
155
156static int acachesize;
157static int acachemask;
158static struct kmem_cache *acache_cache;
159
160/*
161 * Client side utilities
162 */
163
164/*
165 * client side statistics
166 */
167static const struct clstat clstat_tmpl = {
168	{ "calls",	KSTAT_DATA_UINT64 },
169	{ "badcalls",	KSTAT_DATA_UINT64 },
170	{ "clgets",	KSTAT_DATA_UINT64 },
171	{ "cltoomany",	KSTAT_DATA_UINT64 },
172#ifdef DEBUG
173	{ "clalloc",	KSTAT_DATA_UINT64 },
174	{ "noresponse",	KSTAT_DATA_UINT64 },
175	{ "failover",	KSTAT_DATA_UINT64 },
176	{ "remap",	KSTAT_DATA_UINT64 },
177#endif
178};
179
180/*
181 * The following are statistics that describe behavior of the system as a whole
182 * and doesn't correspond to any one particular zone.
183 */
184#ifdef DEBUG
185static struct clstat_debug {
186	kstat_named_t	nrnode;			/* number of allocated rnodes */
187	kstat_named_t	access;			/* size of access cache */
188	kstat_named_t	dirent;			/* size of readdir cache */
189	kstat_named_t	dirents;		/* size of readdir buf cache */
190	kstat_named_t	reclaim;		/* number of reclaims */
191	kstat_named_t	clreclaim;		/* number of cl reclaims */
192	kstat_named_t	f_reclaim;		/* number of free reclaims */
193	kstat_named_t	a_reclaim;		/* number of active reclaims */
194	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
195	kstat_named_t	rpath;			/* bytes used to store rpaths */
196} clstat_debug = {
197	{ "nrnode",	KSTAT_DATA_UINT64 },
198	{ "access",	KSTAT_DATA_UINT64 },
199	{ "dirent",	KSTAT_DATA_UINT64 },
200	{ "dirents",	KSTAT_DATA_UINT64 },
201	{ "reclaim",	KSTAT_DATA_UINT64 },
202	{ "clreclaim",	KSTAT_DATA_UINT64 },
203	{ "f_reclaim",	KSTAT_DATA_UINT64 },
204	{ "a_reclaim",	KSTAT_DATA_UINT64 },
205	{ "r_reclaim",	KSTAT_DATA_UINT64 },
206	{ "r_path",	KSTAT_DATA_UINT64 },
207};
208#endif	/* DEBUG */
209
210/*
211 * We keep a global list of per-zone client data, so we can clean up all zones
212 * if we get low on memory.
213 */
214static list_t nfs_clnt_list;
215static kmutex_t nfs_clnt_list_lock;
216static zone_key_t nfsclnt_zone_key;
217
218static struct kmem_cache *chtab_cache;
219
220/*
221 * Some servers do not properly update the attributes of the
222 * directory when changes are made.  To allow interoperability
223 * with these broken servers, the nfs_disable_rddir_cache
224 * parameter must be set in /etc/system
225 */
226int nfs_disable_rddir_cache = 0;
227
228int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
229		    struct chtab **);
230void		clfree(CLIENT *, struct chtab *);
231static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
232		    struct chtab **, struct nfs_clnt *);
233static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234		    struct chtab **, struct nfs_clnt *);
235static void	clreclaim(void *);
236static int	nfs_feedback(int, int, mntinfo_t *);
237static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
238		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
239		    failinfo_t *);
240static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
241		    caddr_t, cred_t *, int *, int, failinfo_t *);
242static void	rinactive(rnode_t *, cred_t *);
243static int	rtablehash(nfs_fhandle *);
244static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
245		    struct vnodeops *,
246		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
247			cred_t *),
248		    int (*)(const void *, const void *), int *, cred_t *,
249		    char *, char *);
250static void	rp_rmfree(rnode_t *);
251static void	rp_addhash(rnode_t *);
252static void	rp_rmhash_locked(rnode_t *);
253static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
254static void	destroy_rnode(rnode_t *);
255static void	rddir_cache_free(rddir_cache *);
256static int	nfs_free_data_reclaim(rnode_t *);
257static int	nfs_active_data_reclaim(rnode_t *);
258static int	nfs_free_reclaim(void);
259static int	nfs_active_reclaim(void);
260static int	nfs_rnode_reclaim(void);
261static void	nfs_reclaim(void *);
262static int	failover_safe(failinfo_t *);
263static void	failover_newserver(mntinfo_t *mi);
264static void	failover_thread(mntinfo_t *mi);
265static int	failover_wait(mntinfo_t *);
266static int	failover_remap(failinfo_t *);
267static int	failover_lookup(char *, vnode_t *,
268		    int (*)(vnode_t *, char *, vnode_t **,
269			struct pathname *, int, vnode_t *, cred_t *, int),
270		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
271		    vnode_t **);
272static void	nfs_free_r_path(rnode_t *);
273static void	nfs_set_vroot(vnode_t *);
274static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
275
276/*
277 * from rpcsec module (common/rpcsec)
278 */
279extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
280extern void sec_clnt_freeh(AUTH *);
281extern void sec_clnt_freeinfo(struct sec_data *);
282
283/*
284 * used in mount policy
285 */
286extern ts_label_t *getflabel_cipso(vfs_t *);
287
288/*
289 * EIO or EINTR are not recoverable errors.
290 */
291#define	IS_RECOVERABLE_ERROR(error)	!((error == EINTR) || (error == EIO))
292
293#ifdef DEBUG
294#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
295#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
296#else
297#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
298#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
299#endif
300/*
301 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
302 */
303static int
304clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
305    struct chtab **chp, struct nfs_clnt *nfscl)
306{
307	struct chhead *ch, *newch;
308	struct chhead **plistp;
309	struct chtab *cp;
310	int error;
311	k_sigset_t smask;
312
313	if (newcl == NULL || chp == NULL || ci == NULL)
314		return (EINVAL);
315
316	*newcl = NULL;
317	*chp = NULL;
318
319	/*
320	 * Find an unused handle or create one
321	 */
322	newch = NULL;
323	nfscl->nfscl_stat.clgets.value.ui64++;
324top:
325	/*
326	 * Find the correct entry in the cache to check for free
327	 * client handles.  The search is based on the RPC program
328	 * number, program version number, dev_t for the transport
329	 * device, and the protocol family.
330	 */
331	mutex_enter(&nfscl->nfscl_chtable_lock);
332	plistp = &nfscl->nfscl_chtable;
333	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
334		if (ch->ch_prog == ci->cl_prog &&
335		    ch->ch_vers == ci->cl_vers &&
336		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
337		    (strcmp(ch->ch_protofmly,
338		    svp->sv_knconf->knc_protofmly) == 0))
339			break;
340		plistp = &ch->ch_next;
341	}
342
343	/*
344	 * If we didn't find a cache entry for this quadruple, then
345	 * create one.  If we don't have one already preallocated,
346	 * then drop the cache lock, create one, and then start over.
347	 * If we did have a preallocated entry, then just add it to
348	 * the front of the list.
349	 */
350	if (ch == NULL) {
351		if (newch == NULL) {
352			mutex_exit(&nfscl->nfscl_chtable_lock);
353			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
354			newch->ch_timesused = 0;
355			newch->ch_prog = ci->cl_prog;
356			newch->ch_vers = ci->cl_vers;
357			newch->ch_dev = svp->sv_knconf->knc_rdev;
358			newch->ch_protofmly = kmem_alloc(
359			    strlen(svp->sv_knconf->knc_protofmly) + 1,
360			    KM_SLEEP);
361			(void) strcpy(newch->ch_protofmly,
362			    svp->sv_knconf->knc_protofmly);
363			newch->ch_list = NULL;
364			goto top;
365		}
366		ch = newch;
367		newch = NULL;
368		ch->ch_next = nfscl->nfscl_chtable;
369		nfscl->nfscl_chtable = ch;
370	/*
371	 * We found a cache entry, but if it isn't on the front of the
372	 * list, then move it to the front of the list to try to take
373	 * advantage of locality of operations.
374	 */
375	} else if (ch != nfscl->nfscl_chtable) {
376		*plistp = ch->ch_next;
377		ch->ch_next = nfscl->nfscl_chtable;
378		nfscl->nfscl_chtable = ch;
379	}
380
381	/*
382	 * If there was a free client handle cached, then remove it
383	 * from the list, init it, and use it.
384	 */
385	if (ch->ch_list != NULL) {
386		cp = ch->ch_list;
387		ch->ch_list = cp->ch_list;
388		mutex_exit(&nfscl->nfscl_chtable_lock);
389		if (newch != NULL) {
390			kmem_free(newch->ch_protofmly,
391			    strlen(newch->ch_protofmly) + 1);
392			kmem_free(newch, sizeof (*newch));
393		}
394		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
395		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
396		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
397		    &cp->ch_client->cl_auth);
398		if (error || cp->ch_client->cl_auth == NULL) {
399			CLNT_DESTROY(cp->ch_client);
400			kmem_cache_free(chtab_cache, cp);
401			return ((error != 0) ? error : EINTR);
402		}
403		ch->ch_timesused++;
404		*newcl = cp->ch_client;
405		*chp = cp;
406		return (0);
407	}
408
409	/*
410	 * There weren't any free client handles which fit, so allocate
411	 * a new one and use that.
412	 */
413#ifdef DEBUG
414	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
415#endif
416	mutex_exit(&nfscl->nfscl_chtable_lock);
417
418	nfscl->nfscl_stat.cltoomany.value.ui64++;
419	if (newch != NULL) {
420		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
421		kmem_free(newch, sizeof (*newch));
422	}
423
424	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
425	cp->ch_head = ch;
426
427	sigintr(&smask, (int)ci->cl_flags & MI_INT);
428	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
429	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
430	sigunintr(&smask);
431
432	if (error != 0) {
433		kmem_cache_free(chtab_cache, cp);
434#ifdef DEBUG
435		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
436#endif
437		/*
438		 * Warning is unnecessary if error is EINTR.
439		 */
440		if (error != EINTR) {
441			nfs_cmn_err(error, CE_WARN,
442			    "clget: couldn't create handle: %m\n");
443		}
444		return (error);
445	}
446	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
447	auth_destroy(cp->ch_client->cl_auth);
448	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
449	    &cp->ch_client->cl_auth);
450	if (error || cp->ch_client->cl_auth == NULL) {
451		CLNT_DESTROY(cp->ch_client);
452		kmem_cache_free(chtab_cache, cp);
453#ifdef DEBUG
454		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
455#endif
456		return ((error != 0) ? error : EINTR);
457	}
458	ch->ch_timesused++;
459	*newcl = cp->ch_client;
460	ASSERT(cp->ch_client->cl_nosignal == FALSE);
461	*chp = cp;
462	return (0);
463}
464
465int
466clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
467    struct chtab **chp)
468{
469	struct nfs_clnt *nfscl;
470
471	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
472	ASSERT(nfscl != NULL);
473
474	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
475}
476
477static int
478acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
479    struct chtab **chp, struct nfs_clnt *nfscl)
480{
481	clinfo_t ci;
482	int error;
483
484	/*
485	 * Set read buffer size to rsize
486	 * and add room for RPC headers.
487	 */
488	ci.cl_readsize = mi->mi_tsize;
489	if (ci.cl_readsize != 0)
490		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
491
492	/*
493	 * If soft mount and server is down just try once.
494	 * meaning: do not retransmit.
495	 */
496	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
497		ci.cl_retrans = 0;
498	else
499		ci.cl_retrans = mi->mi_retrans;
500
501	ci.cl_prog = NFS_ACL_PROGRAM;
502	ci.cl_vers = mi->mi_vers;
503	ci.cl_flags = mi->mi_flags;
504
505	/*
506	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
507	 * security flavor, the client tries to establish a security context
508	 * by contacting the server. If the connection is timed out or reset,
509	 * e.g. server reboot, we will try again.
510	 */
511	do {
512		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
513
514		if (error == 0)
515			break;
516
517		/*
518		 * For forced unmount or zone shutdown, bail out, no retry.
519		 */
520		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
521			error = EIO;
522			break;
523		}
524
525		/* do not retry for softmount */
526		if (!(mi->mi_flags & MI_HARD))
527			break;
528
529		/* let the caller deal with the failover case */
530		if (FAILOVER_MOUNT(mi))
531			break;
532
533	} while (error == ETIMEDOUT || error == ECONNRESET);
534
535	return (error);
536}
537
538static int
539nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
540    struct chtab **chp, struct nfs_clnt *nfscl)
541{
542	clinfo_t ci;
543	int error;
544
545	/*
546	 * Set read buffer size to rsize
547	 * and add room for RPC headers.
548	 */
549	ci.cl_readsize = mi->mi_tsize;
550	if (ci.cl_readsize != 0)
551		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
552
553	/*
554	 * If soft mount and server is down just try once.
555	 * meaning: do not retransmit.
556	 */
557	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
558		ci.cl_retrans = 0;
559	else
560		ci.cl_retrans = mi->mi_retrans;
561
562	ci.cl_prog = mi->mi_prog;
563	ci.cl_vers = mi->mi_vers;
564	ci.cl_flags = mi->mi_flags;
565
566	/*
567	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
568	 * security flavor, the client tries to establish a security context
569	 * by contacting the server. If the connection is timed out or reset,
570	 * e.g. server reboot, we will try again.
571	 */
572	do {
573		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
574
575		if (error == 0)
576			break;
577
578		/*
579		 * For forced unmount or zone shutdown, bail out, no retry.
580		 */
581		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
582			error = EIO;
583			break;
584		}
585
586		/* do not retry for softmount */
587		if (!(mi->mi_flags & MI_HARD))
588			break;
589
590		/* let the caller deal with the failover case */
591		if (FAILOVER_MOUNT(mi))
592			break;
593
594	} while (error == ETIMEDOUT || error == ECONNRESET);
595
596	return (error);
597}
598
599static void
600clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
601{
602	if (cl->cl_auth != NULL) {
603		sec_clnt_freeh(cl->cl_auth);
604		cl->cl_auth = NULL;
605	}
606
607	/*
608	 * Timestamp this cache entry so that we know when it was last
609	 * used.
610	 */
611	cp->ch_freed = gethrestime_sec();
612
613	/*
614	 * Add the free client handle to the front of the list.
615	 * This way, the list will be sorted in youngest to oldest
616	 * order.
617	 */
618	mutex_enter(&nfscl->nfscl_chtable_lock);
619	cp->ch_list = cp->ch_head->ch_list;
620	cp->ch_head->ch_list = cp;
621	mutex_exit(&nfscl->nfscl_chtable_lock);
622}
623
624void
625clfree(CLIENT *cl, struct chtab *cp)
626{
627	struct nfs_clnt *nfscl;
628
629	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
630	ASSERT(nfscl != NULL);
631
632	clfree_impl(cl, cp, nfscl);
633}
634
635#define	CL_HOLDTIME	60	/* time to hold client handles */
636
637static void
638clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
639{
640	struct chhead *ch;
641	struct chtab *cp;	/* list of objects that can be reclaimed */
642	struct chtab *cpe;
643	struct chtab *cpl;
644	struct chtab **cpp;
645#ifdef DEBUG
646	int n = 0;
647#endif
648
649	/*
650	 * Need to reclaim some memory, so step through the cache
651	 * looking through the lists for entries which can be freed.
652	 */
653	cp = NULL;
654
655	mutex_enter(&nfscl->nfscl_chtable_lock);
656
657	/*
658	 * Here we step through each non-NULL quadruple and start to
659	 * construct the reclaim list pointed to by cp.  Note that
660	 * cp will contain all eligible chtab entries.  When this traversal
661	 * completes, chtab entries from the last quadruple will be at the
662	 * front of cp and entries from previously inspected quadruples have
663	 * been appended to the rear of cp.
664	 */
665	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
666		if (ch->ch_list == NULL)
667			continue;
668		/*
669		 * Search each list for entries older then
670		 * cl_holdtime seconds.  The lists are maintained
671		 * in youngest to oldest order so that when the
672		 * first entry is found which is old enough, then
673		 * all of the rest of the entries on the list will
674		 * be old enough as well.
675		 */
676		cpl = ch->ch_list;
677		cpp = &ch->ch_list;
678		while (cpl != NULL &&
679		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
680			cpp = &cpl->ch_list;
681			cpl = cpl->ch_list;
682		}
683		if (cpl != NULL) {
684			*cpp = NULL;
685			if (cp != NULL) {
686				cpe = cpl;
687				while (cpe->ch_list != NULL)
688					cpe = cpe->ch_list;
689				cpe->ch_list = cp;
690			}
691			cp = cpl;
692		}
693	}
694
695	mutex_exit(&nfscl->nfscl_chtable_lock);
696
697	/*
698	 * If cp is empty, then there is nothing to reclaim here.
699	 */
700	if (cp == NULL)
701		return;
702
703	/*
704	 * Step through the list of entries to free, destroying each client
705	 * handle and kmem_free'ing the memory for each entry.
706	 */
707	while (cp != NULL) {
708#ifdef DEBUG
709		n++;
710#endif
711		CLNT_DESTROY(cp->ch_client);
712		cpl = cp->ch_list;
713		kmem_cache_free(chtab_cache, cp);
714		cp = cpl;
715	}
716
717#ifdef DEBUG
718	/*
719	 * Update clalloc so that nfsstat shows the current number
720	 * of allocated client handles.
721	 */
722	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
723#endif
724}
725
726/* ARGSUSED */
727static void
728clreclaim(void *all)
729{
730	struct nfs_clnt *nfscl;
731
732#ifdef DEBUG
733	clstat_debug.clreclaim.value.ui64++;
734#endif
735	/*
736	 * The system is low on memory; go through and try to reclaim some from
737	 * every zone on the system.
738	 */
739	mutex_enter(&nfs_clnt_list_lock);
740	nfscl = list_head(&nfs_clnt_list);
741	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
742		clreclaim_zone(nfscl, CL_HOLDTIME);
743	mutex_exit(&nfs_clnt_list_lock);
744}
745
746/*
747 * Minimum time-out values indexed by call type
748 * These units are in "eights" of a second to avoid multiplies
749 */
750static unsigned int minimum_timeo[] = {
751	6, 7, 10
752};
753
754/*
755 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
756 */
757#define	MAXTIMO	(20*hz)
758#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
759#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
760
761#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
762#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
763#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
764
765/*
766 * Function called when rfscall notices that we have been
767 * re-transmitting, or when we get a response without retransmissions.
768 * Return 1 if the transfer size was adjusted down - 0 if no change.
769 */
770static int
771nfs_feedback(int flag, int which, mntinfo_t *mi)
772{
773	int kind;
774	int r = 0;
775
776	mutex_enter(&mi->mi_lock);
777	if (flag == FEEDBACK_REXMIT1) {
778		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
779		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
780			goto done;
781		if (mi->mi_curread > MIN_NFS_TSIZE) {
782			mi->mi_curread /= 2;
783			if (mi->mi_curread < MIN_NFS_TSIZE)
784				mi->mi_curread = MIN_NFS_TSIZE;
785			r = 1;
786		}
787
788		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
789			mi->mi_curwrite /= 2;
790			if (mi->mi_curwrite < MIN_NFS_TSIZE)
791				mi->mi_curwrite = MIN_NFS_TSIZE;
792			r = 1;
793		}
794	} else if (flag == FEEDBACK_OK) {
795		kind = mi->mi_timer_type[which];
796		if (kind == 0 ||
797		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
798			goto done;
799		if (kind == 1) {
800			if (mi->mi_curread >= mi->mi_tsize)
801				goto done;
802			mi->mi_curread +=  MIN_NFS_TSIZE;
803			if (mi->mi_curread > mi->mi_tsize/2)
804				mi->mi_curread = mi->mi_tsize;
805		} else if (kind == 2) {
806			if (mi->mi_curwrite >= mi->mi_stsize)
807				goto done;
808			mi->mi_curwrite += MIN_NFS_TSIZE;
809			if (mi->mi_curwrite > mi->mi_stsize/2)
810				mi->mi_curwrite = mi->mi_stsize;
811		}
812	}
813done:
814	mutex_exit(&mi->mi_lock);
815	return (r);
816}
817
818#ifdef DEBUG
819static int rfs2call_hits = 0;
820static int rfs2call_misses = 0;
821#endif
822
823int
824rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
825    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
826    enum nfsstat *statusp, int flags, failinfo_t *fi)
827{
828	int rpcerror;
829	enum clnt_stat rpc_status;
830
831	ASSERT(statusp != NULL);
832
833	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
834	    cr, douprintf, &rpc_status, flags, fi);
835	if (!rpcerror) {
836		/*
837		 * See crnetadjust() for comments.
838		 */
839		if (*statusp == NFSERR_ACCES &&
840		    (cr = crnetadjust(cr)) != NULL) {
841#ifdef DEBUG
842			rfs2call_hits++;
843#endif
844			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
845			    resp, cr, douprintf, NULL, flags, fi);
846			crfree(cr);
847#ifdef DEBUG
848			if (*statusp == NFSERR_ACCES)
849				rfs2call_misses++;
850#endif
851		}
852	} else if (rpc_status == RPC_PROCUNAVAIL) {
853		*statusp = NFSERR_OPNOTSUPP;
854		rpcerror = 0;
855	}
856
857	return (rpcerror);
858}
859
860#define	NFS3_JUKEBOX_DELAY	10 * hz
861
862static clock_t nfs3_jukebox_delay = 0;
863
864#ifdef DEBUG
865static int rfs3call_hits = 0;
866static int rfs3call_misses = 0;
867#endif
868
869int
870rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
871    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
872    nfsstat3 *statusp, int flags, failinfo_t *fi)
873{
874	int rpcerror;
875	int user_informed;
876
877	user_informed = 0;
878	do {
879		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
880		    cr, douprintf, NULL, flags, fi);
881		if (!rpcerror) {
882			cred_t *crr;
883			if (*statusp == NFS3ERR_JUKEBOX) {
884				if (ttoproc(curthread) == &p0) {
885					rpcerror = EAGAIN;
886					break;
887				}
888				if (!user_informed) {
889					user_informed = 1;
890					uprintf(
891		"file temporarily unavailable on the server, retrying...\n");
892				}
893				delay(nfs3_jukebox_delay);
894			}
895			/*
896			 * See crnetadjust() for comments.
897			 */
898			else if (*statusp == NFS3ERR_ACCES &&
899			    (crr = crnetadjust(cr)) != NULL) {
900#ifdef DEBUG
901				rfs3call_hits++;
902#endif
903				rpcerror = rfscall(mi, which, xdrargs, argsp,
904				    xdrres, resp, crr, douprintf,
905				    NULL, flags, fi);
906
907				crfree(crr);
908#ifdef DEBUG
909				if (*statusp == NFS3ERR_ACCES)
910					rfs3call_misses++;
911#endif
912			}
913		}
914	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
915
916	return (rpcerror);
917}
918
919#define	VALID_FH(fi)	(VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
920#define	INC_READERS(mi)		{ \
921	mi->mi_readers++; \
922}
923#define	DEC_READERS(mi)		{ \
924	mi->mi_readers--; \
925	if (mi->mi_readers == 0) \
926		cv_broadcast(&mi->mi_failover_cv); \
927}
928
929static int
930rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
931    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
932    enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
933{
934	CLIENT *client;
935	struct chtab *ch;
936	cred_t *cr = icr;
937	enum clnt_stat status;
938	struct rpc_err rpcerr, rpcerr_tmp;
939	struct timeval wait;
940	int timeo;		/* in units of hz */
941	int my_rsize, my_wsize;
942	bool_t tryagain;
943	bool_t cred_cloned = FALSE;
944	k_sigset_t smask;
945	servinfo_t *svp;
946	struct nfs_clnt *nfscl;
947	zoneid_t zoneid = getzoneid();
948	char *msg;
949#ifdef DEBUG
950	char *bufp;
951#endif
952
953
954	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
955	    "rfscall_start:which %d mi %p", which, mi);
956
957	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
958	ASSERT(nfscl != NULL);
959
960	nfscl->nfscl_stat.calls.value.ui64++;
961	mi->mi_reqs[which].value.ui64++;
962
963	rpcerr.re_status = RPC_SUCCESS;
964
965	/*
966	 * In case of forced unmount or zone shutdown, return EIO.
967	 */
968
969	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
970		rpcerr.re_status = RPC_FAILED;
971		rpcerr.re_errno = EIO;
972		return (rpcerr.re_errno);
973	}
974
975	/*
976	 * Remember the transfer sizes in case
977	 * nfs_feedback changes them underneath us.
978	 */
979	my_rsize = mi->mi_curread;
980	my_wsize = mi->mi_curwrite;
981
982	/*
983	 * NFS client failover support
984	 *
985	 * If this rnode is not in sync with the current server (VALID_FH),
986	 * we'd like to do a remap to get in sync.  We can be interrupted
987	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
988	 * use the best info we have to try the RPC.  Part of that is
989	 * unconditionally updating the filehandle copy kept for V3.
990	 *
991	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
992	 * rw_enter(); we're trying to keep the current server from being
993	 * changed on us until we're done with the remapping and have a
994	 * matching client handle.  We don't want to sending a filehandle
995	 * to the wrong host.
996	 */
997failoverretry:
998	if (FAILOVER_MOUNT(mi)) {
999		mutex_enter(&mi->mi_lock);
1000		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1001			if (failover_wait(mi)) {
1002				mutex_exit(&mi->mi_lock);
1003				return (EINTR);
1004			}
1005		}
1006		INC_READERS(mi);
1007		mutex_exit(&mi->mi_lock);
1008		if (fi) {
1009			if (!VALID_FH(fi) &&
1010			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1011				int remaperr;
1012
1013				svp = mi->mi_curr_serv;
1014				remaperr = failover_remap(fi);
1015				if (remaperr != 0) {
1016#ifdef DEBUG
1017					if (remaperr != EINTR)
1018						nfs_cmn_err(remaperr, CE_WARN,
1019					    "rfscall couldn't failover: %m");
1020#endif
1021					mutex_enter(&mi->mi_lock);
1022					DEC_READERS(mi);
1023					mutex_exit(&mi->mi_lock);
1024					/*
1025					 * If failover_remap returns ETIMEDOUT
1026					 * and the filesystem is hard mounted
1027					 * we have to retry the call with a new
1028					 * server.
1029					 */
1030					if ((mi->mi_flags & MI_HARD) &&
1031					    IS_RECOVERABLE_ERROR(remaperr)) {
1032						if (svp == mi->mi_curr_serv)
1033							failover_newserver(mi);
1034						rpcerr.re_status = RPC_SUCCESS;
1035						goto failoverretry;
1036					}
1037					rpcerr.re_errno = remaperr;
1038					return (remaperr);
1039				}
1040			}
1041			if (fi->fhp && fi->copyproc)
1042				(*fi->copyproc)(fi->fhp, fi->vp);
1043		}
1044	}
1045
1046	/* For TSOL, use a new cred which has net_mac_aware flag */
1047	if (!cred_cloned && is_system_labeled()) {
1048		cred_cloned = TRUE;
1049		cr = crdup(icr);
1050		(void) setpflags(NET_MAC_AWARE, 1, cr);
1051	}
1052
1053	/*
1054	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1055	 * are guaranteed to reprocess the retry as a new request.
1056	 */
1057	svp = mi->mi_curr_serv;
1058	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1059
1060	if (FAILOVER_MOUNT(mi)) {
1061		mutex_enter(&mi->mi_lock);
1062		DEC_READERS(mi);
1063		mutex_exit(&mi->mi_lock);
1064
1065		if ((rpcerr.re_errno == ETIMEDOUT ||
1066		    rpcerr.re_errno == ECONNRESET) &&
1067		    failover_safe(fi)) {
1068			if (svp == mi->mi_curr_serv)
1069				failover_newserver(mi);
1070			goto failoverretry;
1071		}
1072	}
1073	if (rpcerr.re_errno != 0)
1074		return (rpcerr.re_errno);
1075
1076	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1077	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1078		timeo = (mi->mi_timeo * hz) / 10;
1079	} else {
1080		mutex_enter(&mi->mi_lock);
1081		timeo = CLNT_SETTIMERS(client,
1082		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1083		    &(mi->mi_timers[NFS_CALLTYPES]),
1084		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1085		    (void (*)())NULL, (caddr_t)mi, 0);
1086		mutex_exit(&mi->mi_lock);
1087	}
1088
1089	/*
1090	 * If hard mounted fs, retry call forever unless hard error occurs.
1091	 */
1092	do {
1093		tryagain = FALSE;
1094
1095		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1096			status = RPC_FAILED;
1097			rpcerr.re_status = RPC_FAILED;
1098			rpcerr.re_errno = EIO;
1099			break;
1100		}
1101
1102		TICK_TO_TIMEVAL(timeo, &wait);
1103
1104		/*
1105		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1106		 * and SIGTERM. (Preserving the existing masks).
1107		 * Mask out SIGINT if mount option nointr is specified.
1108		 */
1109		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1110		if (!(mi->mi_flags & MI_INT))
1111			client->cl_nosignal = TRUE;
1112
1113		/*
1114		 * If there is a current signal, then don't bother
1115		 * even trying to send out the request because we
1116		 * won't be able to block waiting for the response.
1117		 * Simply assume RPC_INTR and get on with it.
1118		 */
1119		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1120			status = RPC_INTR;
1121		else {
1122			status = CLNT_CALL(client, which, xdrargs, argsp,
1123			    xdrres, resp, wait);
1124		}
1125
1126		if (!(mi->mi_flags & MI_INT))
1127			client->cl_nosignal = FALSE;
1128		/*
1129		 * restore original signal mask
1130		 */
1131		sigunintr(&smask);
1132
1133		switch (status) {
1134		case RPC_SUCCESS:
1135			if ((mi->mi_flags & MI_DYNAMIC) &&
1136			    mi->mi_timer_type[which] != 0 &&
1137			    (mi->mi_curread != my_rsize ||
1138			    mi->mi_curwrite != my_wsize))
1139				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1140			break;
1141
1142		case RPC_INTR:
1143			/*
1144			 * There is no way to recover from this error,
1145			 * even if mount option nointr is specified.
1146			 * SIGKILL, for example, cannot be blocked.
1147			 */
1148			rpcerr.re_status = RPC_INTR;
1149			rpcerr.re_errno = EINTR;
1150			break;
1151
1152		case RPC_UDERROR:
1153			/*
1154			 * If the NFS server is local (vold) and
1155			 * it goes away then we get RPC_UDERROR.
1156			 * This is a retryable error, so we would
1157			 * loop, so check to see if the specific
1158			 * error was ECONNRESET, indicating that
1159			 * target did not exist at all.  If so,
1160			 * return with RPC_PROGUNAVAIL and
1161			 * ECONNRESET to indicate why.
1162			 */
1163			CLNT_GETERR(client, &rpcerr);
1164			if (rpcerr.re_errno == ECONNRESET) {
1165				rpcerr.re_status = RPC_PROGUNAVAIL;
1166				rpcerr.re_errno = ECONNRESET;
1167				break;
1168			}
1169			/*FALLTHROUGH*/
1170
1171		default:		/* probably RPC_TIMEDOUT */
1172			if (IS_UNRECOVERABLE_RPC(status))
1173				break;
1174
1175			/*
1176			 * increment server not responding count
1177			 */
1178			mutex_enter(&mi->mi_lock);
1179			mi->mi_noresponse++;
1180			mutex_exit(&mi->mi_lock);
1181#ifdef DEBUG
1182			nfscl->nfscl_stat.noresponse.value.ui64++;
1183#endif
1184
1185			if (!(mi->mi_flags & MI_HARD)) {
1186				if (!(mi->mi_flags & MI_SEMISOFT) ||
1187				    (mi->mi_ss_call_type[which] == 0))
1188					break;
1189			}
1190
1191			/*
1192			 * The call is in progress (over COTS).
1193			 * Try the CLNT_CALL again, but don't
1194			 * print a noisy error message.
1195			 */
1196			if (status == RPC_INPROGRESS) {
1197				tryagain = TRUE;
1198				break;
1199			}
1200
1201			if (flags & RFSCALL_SOFT)
1202				break;
1203
1204			/*
1205			 * On zone shutdown, just move on.
1206			 */
1207			if (zone_status_get(curproc->p_zone) >=
1208			    ZONE_IS_SHUTTING_DOWN) {
1209				rpcerr.re_status = RPC_FAILED;
1210				rpcerr.re_errno = EIO;
1211				break;
1212			}
1213
1214			/*
1215			 * NFS client failover support
1216			 *
1217			 * If the current server just failed us, we'll
1218			 * start the process of finding a new server.
1219			 * After that, we can just retry.
1220			 */
1221			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1222				if (svp == mi->mi_curr_serv)
1223					failover_newserver(mi);
1224				clfree_impl(client, ch, nfscl);
1225				goto failoverretry;
1226			}
1227
1228			tryagain = TRUE;
1229			timeo = backoff(timeo);
1230
1231			CLNT_GETERR(client, &rpcerr_tmp);
1232			if ((status == RPC_CANTSEND) &&
1233			    (rpcerr_tmp.re_errno == ENOBUFS))
1234				msg = SRV_QFULL_MSG;
1235			else
1236				msg = SRV_NOTRESP_MSG;
1237
1238			mutex_enter(&mi->mi_lock);
1239			if (!(mi->mi_flags & MI_PRINTED)) {
1240				mi->mi_flags |= MI_PRINTED;
1241				mutex_exit(&mi->mi_lock);
1242#ifdef DEBUG
1243				zprintf(zoneid, msg, mi->mi_vers,
1244				    svp->sv_hostname);
1245#else
1246				zprintf(zoneid, msg, svp->sv_hostname);
1247#endif
1248			} else
1249				mutex_exit(&mi->mi_lock);
1250			if (*douprintf && nfs_has_ctty()) {
1251				*douprintf = 0;
1252				if (!(mi->mi_flags & MI_NOPRINT))
1253#ifdef DEBUG
1254					uprintf(msg, mi->mi_vers,
1255					    svp->sv_hostname);
1256#else
1257					uprintf(msg, svp->sv_hostname);
1258#endif
1259			}
1260
1261			/*
1262			 * If doing dynamic adjustment of transfer
1263			 * size and if it's a read or write call
1264			 * and if the transfer size changed while
1265			 * retransmitting or if the feedback routine
1266			 * changed the transfer size,
1267			 * then exit rfscall so that the transfer
1268			 * size can be adjusted at the vnops level.
1269			 */
1270			if ((mi->mi_flags & MI_DYNAMIC) &&
1271			    mi->mi_timer_type[which] != 0 &&
1272			    (mi->mi_curread != my_rsize ||
1273			    mi->mi_curwrite != my_wsize ||
1274			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1275				/*
1276				 * On read or write calls, return
1277				 * back to the vnode ops level if
1278				 * the transfer size changed.
1279				 */
1280				clfree_impl(client, ch, nfscl);
1281				if (cred_cloned)
1282					crfree(cr);
1283				return (ENFS_TRYAGAIN);
1284			}
1285		}
1286	} while (tryagain);
1287
1288	if (status != RPC_SUCCESS) {
1289		/*
1290		 * Let soft mounts use the timed out message.
1291		 */
1292		if (status == RPC_INPROGRESS)
1293			status = RPC_TIMEDOUT;
1294		nfscl->nfscl_stat.badcalls.value.ui64++;
1295		if (status != RPC_INTR) {
1296			mutex_enter(&mi->mi_lock);
1297			mi->mi_flags |= MI_DOWN;
1298			mutex_exit(&mi->mi_lock);
1299			CLNT_GETERR(client, &rpcerr);
1300#ifdef DEBUG
1301			bufp = clnt_sperror(client, svp->sv_hostname);
1302			zprintf(zoneid, "NFS%d %s failed for %s\n",
1303			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1304			if (nfs_has_ctty()) {
1305				if (!(mi->mi_flags & MI_NOPRINT)) {
1306					uprintf("NFS%d %s failed for %s\n",
1307					    mi->mi_vers, mi->mi_rfsnames[which],
1308					    bufp);
1309				}
1310			}
1311			kmem_free(bufp, MAXPATHLEN);
1312#else
1313			zprintf(zoneid,
1314			    "NFS %s failed for server %s: error %d (%s)\n",
1315			    mi->mi_rfsnames[which], svp->sv_hostname,
1316			    status, clnt_sperrno(status));
1317			if (nfs_has_ctty()) {
1318				if (!(mi->mi_flags & MI_NOPRINT)) {
1319					uprintf(
1320				"NFS %s failed for server %s: error %d (%s)\n",
1321					    mi->mi_rfsnames[which],
1322					    svp->sv_hostname, status,
1323					    clnt_sperrno(status));
1324				}
1325			}
1326#endif
1327			/*
1328			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1329			 * re_errno is set appropriately depending on
1330			 * the authentication error
1331			 */
1332			if (status == RPC_VERSMISMATCH ||
1333			    status == RPC_PROGVERSMISMATCH)
1334				rpcerr.re_errno = EIO;
1335		}
1336	} else {
1337		/*
1338		 * Test the value of mi_down and mi_printed without
1339		 * holding the mi_lock mutex.  If they are both zero,
1340		 * then it is okay to skip the down and printed
1341		 * processing.  This saves on a mutex_enter and
1342		 * mutex_exit pair for a normal, successful RPC.
1343		 * This was just complete overhead.
1344		 */
1345		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1346			mutex_enter(&mi->mi_lock);
1347			mi->mi_flags &= ~MI_DOWN;
1348			if (mi->mi_flags & MI_PRINTED) {
1349				mi->mi_flags &= ~MI_PRINTED;
1350				mutex_exit(&mi->mi_lock);
1351#ifdef DEBUG
1352			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1353				zprintf(zoneid, "NFS%d server %s ok\n",
1354				    mi->mi_vers, svp->sv_hostname);
1355#else
1356			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1357				zprintf(zoneid, "NFS server %s ok\n",
1358				    svp->sv_hostname);
1359#endif
1360			} else
1361				mutex_exit(&mi->mi_lock);
1362		}
1363
1364		if (*douprintf == 0) {
1365			if (!(mi->mi_flags & MI_NOPRINT))
1366#ifdef DEBUG
1367				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1368					uprintf("NFS%d server %s ok\n",
1369					    mi->mi_vers, svp->sv_hostname);
1370#else
1371			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1372				uprintf("NFS server %s ok\n", svp->sv_hostname);
1373#endif
1374			*douprintf = 1;
1375		}
1376	}
1377
1378	clfree_impl(client, ch, nfscl);
1379	if (cred_cloned)
1380		crfree(cr);
1381
1382	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1383
1384	if (rpc_status != NULL)
1385		*rpc_status = rpcerr.re_status;
1386
1387	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1388	    rpcerr.re_errno);
1389
1390	return (rpcerr.re_errno);
1391}
1392
1393#ifdef DEBUG
1394static int acl2call_hits = 0;
1395static int acl2call_misses = 0;
1396#endif
1397
1398int
1399acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1400    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1401    enum nfsstat *statusp, int flags, failinfo_t *fi)
1402{
1403	int rpcerror;
1404
1405	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1406	    cr, douprintf, flags, fi);
1407	if (!rpcerror) {
1408		/*
1409		 * See comments with crnetadjust().
1410		 */
1411		if (*statusp == NFSERR_ACCES &&
1412		    (cr = crnetadjust(cr)) != NULL) {
1413#ifdef DEBUG
1414			acl2call_hits++;
1415#endif
1416			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1417			    resp, cr, douprintf, flags, fi);
1418			crfree(cr);
1419#ifdef DEBUG
1420			if (*statusp == NFSERR_ACCES)
1421				acl2call_misses++;
1422#endif
1423		}
1424	}
1425
1426	return (rpcerror);
1427}
1428
1429#ifdef DEBUG
1430static int acl3call_hits = 0;
1431static int acl3call_misses = 0;
1432#endif
1433
1434int
1435acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1436    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1437    nfsstat3 *statusp, int flags, failinfo_t *fi)
1438{
1439	int rpcerror;
1440	int user_informed;
1441
1442	user_informed = 0;
1443
1444	do {
1445		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1446		    cr, douprintf, flags, fi);
1447		if (!rpcerror) {
1448			cred_t *crr;
1449			if (*statusp == NFS3ERR_JUKEBOX) {
1450				if (!user_informed) {
1451					user_informed = 1;
1452					uprintf(
1453		"file temporarily unavailable on the server, retrying...\n");
1454				}
1455				delay(nfs3_jukebox_delay);
1456			}
1457			/*
1458			 * See crnetadjust() for comments.
1459			 */
1460			else if (*statusp == NFS3ERR_ACCES &&
1461			    (crr = crnetadjust(cr)) != NULL) {
1462#ifdef DEBUG
1463				acl3call_hits++;
1464#endif
1465				rpcerror = aclcall(mi, which, xdrargs, argsp,
1466				    xdrres, resp, crr, douprintf, flags, fi);
1467
1468				crfree(crr);
1469#ifdef DEBUG
1470				if (*statusp == NFS3ERR_ACCES)
1471					acl3call_misses++;
1472#endif
1473			}
1474		}
1475	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1476
1477	return (rpcerror);
1478}
1479
1480static int
1481aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1482    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1483    int flags, failinfo_t *fi)
1484{
1485	CLIENT *client;
1486	struct chtab *ch;
1487	cred_t *cr = icr;
1488	bool_t cred_cloned = FALSE;
1489	enum clnt_stat status;
1490	struct rpc_err rpcerr;
1491	struct timeval wait;
1492	int timeo;		/* in units of hz */
1493#if 0 /* notyet */
1494	int my_rsize, my_wsize;
1495#endif
1496	bool_t tryagain;
1497	k_sigset_t smask;
1498	servinfo_t *svp;
1499	struct nfs_clnt *nfscl;
1500	zoneid_t zoneid = getzoneid();
1501#ifdef DEBUG
1502	char *bufp;
1503#endif
1504
1505#if 0 /* notyet */
1506	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1507	    "rfscall_start:which %d mi %p", which, mi);
1508#endif
1509
1510	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1511	ASSERT(nfscl != NULL);
1512
1513	nfscl->nfscl_stat.calls.value.ui64++;
1514	mi->mi_aclreqs[which].value.ui64++;
1515
1516	rpcerr.re_status = RPC_SUCCESS;
1517
1518	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1519		rpcerr.re_status = RPC_FAILED;
1520		rpcerr.re_errno = EIO;
1521		return (rpcerr.re_errno);
1522	}
1523
1524#if 0 /* notyet */
1525	/*
1526	 * Remember the transfer sizes in case
1527	 * nfs_feedback changes them underneath us.
1528	 */
1529	my_rsize = mi->mi_curread;
1530	my_wsize = mi->mi_curwrite;
1531#endif
1532
1533	/*
1534	 * NFS client failover support
1535	 *
1536	 * If this rnode is not in sync with the current server (VALID_FH),
1537	 * we'd like to do a remap to get in sync.  We can be interrupted
1538	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1539	 * use the best info we have to try the RPC.  Part of that is
1540	 * unconditionally updating the filehandle copy kept for V3.
1541	 *
1542	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1543	 * rw_enter(); we're trying to keep the current server from being
1544	 * changed on us until we're done with the remapping and have a
1545	 * matching client handle.  We don't want to sending a filehandle
1546	 * to the wrong host.
1547	 */
1548failoverretry:
1549	if (FAILOVER_MOUNT(mi)) {
1550		mutex_enter(&mi->mi_lock);
1551		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1552			if (failover_wait(mi)) {
1553				mutex_exit(&mi->mi_lock);
1554				return (EINTR);
1555			}
1556		}
1557		INC_READERS(mi);
1558		mutex_exit(&mi->mi_lock);
1559		if (fi) {
1560			if (!VALID_FH(fi) &&
1561			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1562				int remaperr;
1563
1564				svp = mi->mi_curr_serv;
1565				remaperr = failover_remap(fi);
1566				if (remaperr != 0) {
1567#ifdef DEBUG
1568					if (remaperr != EINTR)
1569						nfs_cmn_err(remaperr, CE_WARN,
1570					    "aclcall couldn't failover: %m");
1571#endif
1572					mutex_enter(&mi->mi_lock);
1573					DEC_READERS(mi);
1574					mutex_exit(&mi->mi_lock);
1575
1576					/*
1577					 * If failover_remap returns ETIMEDOUT
1578					 * and the filesystem is hard mounted
1579					 * we have to retry the call with a new
1580					 * server.
1581					 */
1582					if ((mi->mi_flags & MI_HARD) &&
1583					    IS_RECOVERABLE_ERROR(remaperr)) {
1584						if (svp == mi->mi_curr_serv)
1585							failover_newserver(mi);
1586						rpcerr.re_status = RPC_SUCCESS;
1587						goto failoverretry;
1588					}
1589					return (remaperr);
1590				}
1591			}
1592			if (fi->fhp && fi->copyproc)
1593				(*fi->copyproc)(fi->fhp, fi->vp);
1594		}
1595	}
1596
1597	/* For TSOL, use a new cred which has net_mac_aware flag */
1598	if (!cred_cloned && is_system_labeled()) {
1599		cred_cloned = TRUE;
1600		cr = crdup(icr);
1601		(void) setpflags(NET_MAC_AWARE, 1, cr);
1602	}
1603
1604	/*
1605	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1606	 * are guaranteed to reprocess the retry as a new request.
1607	 */
1608	svp = mi->mi_curr_serv;
1609	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1610	if (FAILOVER_MOUNT(mi)) {
1611		mutex_enter(&mi->mi_lock);
1612		DEC_READERS(mi);
1613		mutex_exit(&mi->mi_lock);
1614
1615		if ((rpcerr.re_errno == ETIMEDOUT ||
1616		    rpcerr.re_errno == ECONNRESET) &&
1617		    failover_safe(fi)) {
1618			if (svp == mi->mi_curr_serv)
1619				failover_newserver(mi);
1620			goto failoverretry;
1621		}
1622	}
1623	if (rpcerr.re_errno != 0) {
1624		if (cred_cloned)
1625			crfree(cr);
1626		return (rpcerr.re_errno);
1627	}
1628
1629	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1630	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1631		timeo = (mi->mi_timeo * hz) / 10;
1632	} else {
1633		mutex_enter(&mi->mi_lock);
1634		timeo = CLNT_SETTIMERS(client,
1635		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1636		    &(mi->mi_timers[NFS_CALLTYPES]),
1637		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1638		    (void (*)()) 0, (caddr_t)mi, 0);
1639		mutex_exit(&mi->mi_lock);
1640	}
1641
1642	/*
1643	 * If hard mounted fs, retry call forever unless hard error occurs.
1644	 */
1645	do {
1646		tryagain = FALSE;
1647
1648		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1649			status = RPC_FAILED;
1650			rpcerr.re_status = RPC_FAILED;
1651			rpcerr.re_errno = EIO;
1652			break;
1653		}
1654
1655		TICK_TO_TIMEVAL(timeo, &wait);
1656
1657		/*
1658		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1659		 * and SIGTERM. (Preserving the existing masks).
1660		 * Mask out SIGINT if mount option nointr is specified.
1661		 */
1662		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1663		if (!(mi->mi_flags & MI_INT))
1664			client->cl_nosignal = TRUE;
1665
1666		/*
1667		 * If there is a current signal, then don't bother
1668		 * even trying to send out the request because we
1669		 * won't be able to block waiting for the response.
1670		 * Simply assume RPC_INTR and get on with it.
1671		 */
1672		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1673			status = RPC_INTR;
1674		else {
1675			status = CLNT_CALL(client, which, xdrargs, argsp,
1676			    xdrres, resp, wait);
1677		}
1678
1679		if (!(mi->mi_flags & MI_INT))
1680			client->cl_nosignal = FALSE;
1681		/*
1682		 * restore original signal mask
1683		 */
1684		sigunintr(&smask);
1685
1686		switch (status) {
1687		case RPC_SUCCESS:
1688#if 0 /* notyet */
1689			if ((mi->mi_flags & MI_DYNAMIC) &&
1690			    mi->mi_timer_type[which] != 0 &&
1691			    (mi->mi_curread != my_rsize ||
1692			    mi->mi_curwrite != my_wsize))
1693				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1694#endif
1695			break;
1696
1697		/*
1698		 * Unfortunately, there are servers in the world which
1699		 * are not coded correctly.  They are not prepared to
1700		 * handle RPC requests to the NFS port which are not
1701		 * NFS requests.  Thus, they may try to process the
1702		 * NFS_ACL request as if it were an NFS request.  This
1703		 * does not work.  Generally, an error will be generated
1704		 * on the client because it will not be able to decode
1705		 * the response from the server.  However, it seems
1706		 * possible that the server may not be able to decode
1707		 * the arguments.  Thus, the criteria for deciding
1708		 * whether the server supports NFS_ACL or not is whether
1709		 * the following RPC errors are returned from CLNT_CALL.
1710		 */
1711		case RPC_CANTDECODERES:
1712		case RPC_PROGUNAVAIL:
1713		case RPC_CANTDECODEARGS:
1714		case RPC_PROGVERSMISMATCH:
1715			mutex_enter(&mi->mi_lock);
1716			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1717			mutex_exit(&mi->mi_lock);
1718			break;
1719
1720		/*
1721		 * If the server supports NFS_ACL but not the new ops
1722		 * for extended attributes, make sure we don't retry.
1723		 */
1724		case RPC_PROCUNAVAIL:
1725			mutex_enter(&mi->mi_lock);
1726			mi->mi_flags &= ~MI_EXTATTR;
1727			mutex_exit(&mi->mi_lock);
1728			break;
1729
1730		case RPC_INTR:
1731			/*
1732			 * There is no way to recover from this error,
1733			 * even if mount option nointr is specified.
1734			 * SIGKILL, for example, cannot be blocked.
1735			 */
1736			rpcerr.re_status = RPC_INTR;
1737			rpcerr.re_errno = EINTR;
1738			break;
1739
1740		case RPC_UDERROR:
1741			/*
1742			 * If the NFS server is local (vold) and
1743			 * it goes away then we get RPC_UDERROR.
1744			 * This is a retryable error, so we would
1745			 * loop, so check to see if the specific
1746			 * error was ECONNRESET, indicating that
1747			 * target did not exist at all.  If so,
1748			 * return with RPC_PROGUNAVAIL and
1749			 * ECONNRESET to indicate why.
1750			 */
1751			CLNT_GETERR(client, &rpcerr);
1752			if (rpcerr.re_errno == ECONNRESET) {
1753				rpcerr.re_status = RPC_PROGUNAVAIL;
1754				rpcerr.re_errno = ECONNRESET;
1755				break;
1756			}
1757			/*FALLTHROUGH*/
1758
1759		default:		/* probably RPC_TIMEDOUT */
1760			if (IS_UNRECOVERABLE_RPC(status))
1761				break;
1762
1763			/*
1764			 * increment server not responding count
1765			 */
1766			mutex_enter(&mi->mi_lock);
1767			mi->mi_noresponse++;
1768			mutex_exit(&mi->mi_lock);
1769#ifdef DEBUG
1770			nfscl->nfscl_stat.noresponse.value.ui64++;
1771#endif
1772
1773			if (!(mi->mi_flags & MI_HARD)) {
1774				if (!(mi->mi_flags & MI_SEMISOFT) ||
1775				    (mi->mi_acl_ss_call_type[which] == 0))
1776					break;
1777			}
1778
1779			/*
1780			 * The call is in progress (over COTS).
1781			 * Try the CLNT_CALL again, but don't
1782			 * print a noisy error message.
1783			 */
1784			if (status == RPC_INPROGRESS) {
1785				tryagain = TRUE;
1786				break;
1787			}
1788
1789			if (flags & RFSCALL_SOFT)
1790				break;
1791
1792			/*
1793			 * On zone shutdown, just move on.
1794			 */
1795			if (zone_status_get(curproc->p_zone) >=
1796			    ZONE_IS_SHUTTING_DOWN) {
1797				rpcerr.re_status = RPC_FAILED;
1798				rpcerr.re_errno = EIO;
1799				break;
1800			}
1801
1802			/*
1803			 * NFS client failover support
1804			 *
1805			 * If the current server just failed us, we'll
1806			 * start the process of finding a new server.
1807			 * After that, we can just retry.
1808			 */
1809			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1810				if (svp == mi->mi_curr_serv)
1811					failover_newserver(mi);
1812				clfree_impl(client, ch, nfscl);
1813				goto failoverretry;
1814			}
1815
1816			tryagain = TRUE;
1817			timeo = backoff(timeo);
1818			mutex_enter(&mi->mi_lock);
1819			if (!(mi->mi_flags & MI_PRINTED)) {
1820				mi->mi_flags |= MI_PRINTED;
1821				mutex_exit(&mi->mi_lock);
1822#ifdef DEBUG
1823				zprintf(zoneid,
1824			"NFS_ACL%d server %s not responding still trying\n",
1825				    mi->mi_vers, svp->sv_hostname);
1826#else
1827				zprintf(zoneid,
1828			    "NFS server %s not responding still trying\n",
1829				    svp->sv_hostname);
1830#endif
1831			} else
1832				mutex_exit(&mi->mi_lock);
1833			if (*douprintf && nfs_has_ctty()) {
1834				*douprintf = 0;
1835				if (!(mi->mi_flags & MI_NOPRINT))
1836#ifdef DEBUG
1837					uprintf(
1838			"NFS_ACL%d server %s not responding still trying\n",
1839					    mi->mi_vers, svp->sv_hostname);
1840#else
1841					uprintf(
1842			    "NFS server %s not responding still trying\n",
1843					    svp->sv_hostname);
1844#endif
1845			}
1846
1847#if 0 /* notyet */
1848			/*
1849			 * If doing dynamic adjustment of transfer
1850			 * size and if it's a read or write call
1851			 * and if the transfer size changed while
1852			 * retransmitting or if the feedback routine
1853			 * changed the transfer size,
1854			 * then exit rfscall so that the transfer
1855			 * size can be adjusted at the vnops level.
1856			 */
1857			if ((mi->mi_flags & MI_DYNAMIC) &&
1858			    mi->mi_acl_timer_type[which] != 0 &&
1859			    (mi->mi_curread != my_rsize ||
1860			    mi->mi_curwrite != my_wsize ||
1861			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1862				/*
1863				 * On read or write calls, return
1864				 * back to the vnode ops level if
1865				 * the transfer size changed.
1866				 */
1867				clfree_impl(client, ch, nfscl);
1868				if (cred_cloned)
1869					crfree(cr);
1870				return (ENFS_TRYAGAIN);
1871			}
1872#endif
1873		}
1874	} while (tryagain);
1875
1876	if (status != RPC_SUCCESS) {
1877		/*
1878		 * Let soft mounts use the timed out message.
1879		 */
1880		if (status == RPC_INPROGRESS)
1881			status = RPC_TIMEDOUT;
1882		nfscl->nfscl_stat.badcalls.value.ui64++;
1883		if (status == RPC_CANTDECODERES ||
1884		    status == RPC_PROGUNAVAIL ||
1885		    status == RPC_PROCUNAVAIL ||
1886		    status == RPC_CANTDECODEARGS ||
1887		    status == RPC_PROGVERSMISMATCH)
1888			CLNT_GETERR(client, &rpcerr);
1889		else if (status != RPC_INTR) {
1890			mutex_enter(&mi->mi_lock);
1891			mi->mi_flags |= MI_DOWN;
1892			mutex_exit(&mi->mi_lock);
1893			CLNT_GETERR(client, &rpcerr);
1894#ifdef DEBUG
1895			bufp = clnt_sperror(client, svp->sv_hostname);
1896			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1897			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1898			if (nfs_has_ctty()) {
1899				if (!(mi->mi_flags & MI_NOPRINT)) {
1900					uprintf("NFS_ACL%d %s failed for %s\n",
1901					    mi->mi_vers, mi->mi_aclnames[which],
1902					    bufp);
1903				}
1904			}
1905			kmem_free(bufp, MAXPATHLEN);
1906#else
1907			zprintf(zoneid,
1908			    "NFS %s failed for server %s: error %d (%s)\n",
1909			    mi->mi_aclnames[which], svp->sv_hostname,
1910			    status, clnt_sperrno(status));
1911			if (nfs_has_ctty()) {
1912				if (!(mi->mi_flags & MI_NOPRINT))
1913					uprintf(
1914				"NFS %s failed for server %s: error %d (%s)\n",
1915					    mi->mi_aclnames[which],
1916					    svp->sv_hostname, status,
1917					    clnt_sperrno(status));
1918			}
1919#endif
1920			/*
1921			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1922			 * re_errno is set appropriately depending on
1923			 * the authentication error
1924			 */
1925			if (status == RPC_VERSMISMATCH ||
1926			    status == RPC_PROGVERSMISMATCH)
1927				rpcerr.re_errno = EIO;
1928		}
1929	} else {
1930		/*
1931		 * Test the value of mi_down and mi_printed without
1932		 * holding the mi_lock mutex.  If they are both zero,
1933		 * then it is okay to skip the down and printed
1934		 * processing.  This saves on a mutex_enter and
1935		 * mutex_exit pair for a normal, successful RPC.
1936		 * This was just complete overhead.
1937		 */
1938		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1939			mutex_enter(&mi->mi_lock);
1940			mi->mi_flags &= ~MI_DOWN;
1941			if (mi->mi_flags & MI_PRINTED) {
1942				mi->mi_flags &= ~MI_PRINTED;
1943				mutex_exit(&mi->mi_lock);
1944#ifdef DEBUG
1945				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1946				    mi->mi_vers, svp->sv_hostname);
1947#else
1948				zprintf(zoneid, "NFS server %s ok\n",
1949				    svp->sv_hostname);
1950#endif
1951			} else
1952				mutex_exit(&mi->mi_lock);
1953		}
1954
1955		if (*douprintf == 0) {
1956			if (!(mi->mi_flags & MI_NOPRINT))
1957#ifdef DEBUG
1958				uprintf("NFS_ACL%d server %s ok\n",
1959				    mi->mi_vers, svp->sv_hostname);
1960#else
1961				uprintf("NFS server %s ok\n", svp->sv_hostname);
1962#endif
1963			*douprintf = 1;
1964		}
1965	}
1966
1967	clfree_impl(client, ch, nfscl);
1968	if (cred_cloned)
1969		crfree(cr);
1970
1971	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1972
1973#if 0 /* notyet */
1974	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1975	    rpcerr.re_errno);
1976#endif
1977
1978	return (rpcerr.re_errno);
1979}
1980
1981int
1982vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1983{
1984	uint_t mask = vap->va_mask;
1985
1986	if (!(mask & AT_MODE))
1987		sa->sa_mode = (uint32_t)-1;
1988	else
1989		sa->sa_mode = vap->va_mode;
1990	if (!(mask & AT_UID))
1991		sa->sa_uid = (uint32_t)-1;
1992	else
1993		sa->sa_uid = (uint32_t)vap->va_uid;
1994	if (!(mask & AT_GID))
1995		sa->sa_gid = (uint32_t)-1;
1996	else
1997		sa->sa_gid = (uint32_t)vap->va_gid;
1998	if (!(mask & AT_SIZE))
1999		sa->sa_size = (uint32_t)-1;
2000	else
2001		sa->sa_size = (uint32_t)vap->va_size;
2002	if (!(mask & AT_ATIME))
2003		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2004	else {
2005		/* check time validity */
2006		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2007			return (EOVERFLOW);
2008		}
2009		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2010		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2011	}
2012	if (!(mask & AT_MTIME))
2013		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2014	else {
2015		/* check time validity */
2016		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2017			return (EOVERFLOW);
2018		}
2019		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2020		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2021	}
2022	return (0);
2023}
2024
2025int
2026vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2027{
2028	uint_t mask = vap->va_mask;
2029
2030	if (!(mask & AT_MODE))
2031		sa->mode.set_it = FALSE;
2032	else {
2033		sa->mode.set_it = TRUE;
2034		sa->mode.mode = (mode3)vap->va_mode;
2035	}
2036	if (!(mask & AT_UID))
2037		sa->uid.set_it = FALSE;
2038	else {
2039		sa->uid.set_it = TRUE;
2040		sa->uid.uid = (uid3)vap->va_uid;
2041	}
2042	if (!(mask & AT_GID))
2043		sa->gid.set_it = FALSE;
2044	else {
2045		sa->gid.set_it = TRUE;
2046		sa->gid.gid = (gid3)vap->va_gid;
2047	}
2048	if (!(mask & AT_SIZE))
2049		sa->size.set_it = FALSE;
2050	else {
2051		sa->size.set_it = TRUE;
2052		sa->size.size = (size3)vap->va_size;
2053	}
2054	if (!(mask & AT_ATIME))
2055		sa->atime.set_it = DONT_CHANGE;
2056	else {
2057		/* check time validity */
2058		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2059			return (EOVERFLOW);
2060		}
2061		sa->atime.set_it = SET_TO_CLIENT_TIME;
2062		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2063		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2064	}
2065	if (!(mask & AT_MTIME))
2066		sa->mtime.set_it = DONT_CHANGE;
2067	else {
2068		/* check time validity */
2069		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2070			return (EOVERFLOW);
2071		}
2072		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2073		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2074		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2075	}
2076	return (0);
2077}
2078
2079void
2080setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2081{
2082
2083	da->da_fhandle = VTOFH(dvp);
2084	da->da_name = nm;
2085	da->da_flags = 0;
2086}
2087
2088void
2089setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2090{
2091
2092	da->dirp = VTOFH3(dvp);
2093	da->name = nm;
2094}
2095
2096int
2097setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2098{
2099	int error;
2100	rnode_t *rp;
2101	struct vattr va;
2102
2103	va.va_mask = AT_MODE | AT_GID;
2104	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2105	if (error)
2106		return (error);
2107
2108	/*
2109	 * To determine the expected group-id of the created file:
2110	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2111	 *	GRPID option, and the directory's set-gid bit is clear,
2112	 *	then use the process's gid.
2113	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2114	 */
2115	rp = VTOR(dvp);
2116	mutex_enter(&rp->r_statelock);
2117	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2118		*gidp = crgetgid(cr);
2119	else
2120		*gidp = va.va_gid;
2121	mutex_exit(&rp->r_statelock);
2122	return (0);
2123}
2124
2125int
2126setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2127{
2128	int error;
2129	struct vattr va;
2130
2131	va.va_mask = AT_MODE;
2132	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2133	if (error)
2134		return (error);
2135
2136	/*
2137	 * Modify the expected mode (om) so that the set-gid bit matches
2138	 * that of the parent directory (dvp).
2139	 */
2140	if (va.va_mode & VSGID)
2141		*omp |= VSGID;
2142	else
2143		*omp &= ~VSGID;
2144	return (0);
2145}
2146
2147void
2148nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2149{
2150
2151	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2152		if (!(vp->v_flag & VSWAPLIKE)) {
2153			mutex_enter(&vp->v_lock);
2154			vp->v_flag |= VSWAPLIKE;
2155			mutex_exit(&vp->v_lock);
2156		}
2157	} else {
2158		if (vp->v_flag & VSWAPLIKE) {
2159			mutex_enter(&vp->v_lock);
2160			vp->v_flag &= ~VSWAPLIKE;
2161			mutex_exit(&vp->v_lock);
2162		}
2163	}
2164}
2165
2166/*
2167 * Free the resources associated with an rnode.
2168 */
2169static void
2170rinactive(rnode_t *rp, cred_t *cr)
2171{
2172	vnode_t *vp;
2173	cred_t *cred;
2174	char *contents;
2175	int size;
2176	vsecattr_t *vsp;
2177	int error;
2178	nfs3_pathconf_info *info;
2179
2180	/*
2181	 * Before freeing anything, wait until all asynchronous
2182	 * activity is done on this rnode.  This will allow all
2183	 * asynchronous read ahead and write behind i/o's to
2184	 * finish.
2185	 */
2186	mutex_enter(&rp->r_statelock);
2187	while (rp->r_count > 0)
2188		cv_wait(&rp->r_cv, &rp->r_statelock);
2189	mutex_exit(&rp->r_statelock);
2190
2191	/*
2192	 * Flush and invalidate all pages associated with the vnode.
2193	 */
2194	vp = RTOV(rp);
2195	if (vn_has_cached_data(vp)) {
2196		ASSERT(vp->v_type != VCHR);
2197		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2198			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2199			if (error && (error == ENOSPC || error == EDQUOT)) {
2200				mutex_enter(&rp->r_statelock);
2201				if (!rp->r_error)
2202					rp->r_error = error;
2203				mutex_exit(&rp->r_statelock);
2204			}
2205		}
2206		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2207	}
2208
2209	/*
2210	 * Free any held credentials and caches which may be associated
2211	 * with this rnode.
2212	 */
2213	mutex_enter(&rp->r_statelock);
2214	cred = rp->r_cred;
2215	rp->r_cred = NULL;
2216	contents = rp->r_symlink.contents;
2217	size = rp->r_symlink.size;
2218	rp->r_symlink.contents = NULL;
2219	vsp = rp->r_secattr;
2220	rp->r_secattr = NULL;
2221	info = rp->r_pathconf;
2222	rp->r_pathconf = NULL;
2223	mutex_exit(&rp->r_statelock);
2224
2225	/*
2226	 * Free the held credential.
2227	 */
2228	if (cred != NULL)
2229		crfree(cred);
2230
2231	/*
2232	 * Free the access cache entries.
2233	 */
2234	(void) nfs_access_purge_rp(rp);
2235
2236	/*
2237	 * Free the readdir cache entries.
2238	 */
2239	if (HAVE_RDDIR_CACHE(rp))
2240		nfs_purge_rddir_cache(vp);
2241
2242	/*
2243	 * Free the symbolic link cache.
2244	 */
2245	if (contents != NULL) {
2246
2247		kmem_free((void *)contents, size);
2248	}
2249
2250	/*
2251	 * Free any cached ACL.
2252	 */
2253	if (vsp != NULL)
2254		nfs_acl_free(vsp);
2255
2256	/*
2257	 * Free any cached pathconf information.
2258	 */
2259	if (info != NULL)
2260		kmem_free(info, sizeof (*info));
2261}
2262
2263/*
2264 * Return a vnode for the given NFS Version 2 file handle.
2265 * If no rnode exists for this fhandle, create one and put it
2266 * into the hash queues.  If the rnode for this fhandle
2267 * already exists, return it.
2268 *
2269 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2270 */
2271vnode_t *
2272makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2273    hrtime_t t, cred_t *cr, char *dnm, char *nm)
2274{
2275	int newnode;
2276	int index;
2277	vnode_t *vp;
2278	nfs_fhandle nfh;
2279	vattr_t va;
2280
2281	nfh.fh_len = NFS_FHSIZE;
2282	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2283
2284	index = rtablehash(&nfh);
2285	rw_enter(&rtable[index].r_lock, RW_READER);
2286
2287	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2288	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2289
2290	if (attr != NULL) {
2291		if (!newnode) {
2292			rw_exit(&rtable[index].r_lock);
2293			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2294		} else {
2295			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2296				vp->v_type = VBAD;
2297			else
2298				vp->v_type = n2v_type(attr);
2299			/*
2300			 * A translation here seems to be necessary
2301			 * because this function can be called
2302			 * with `attr' that has come from the wire,
2303			 * and been operated on by vattr_to_nattr().
2304			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2305			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2306			 * ->makenfsnode().
2307			 */
2308			if ((attr->na_rdev & 0xffff0000) == 0)
2309				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2310			else
2311				vp->v_rdev = expldev(n2v_rdev(attr));
2312			nfs_attrcache(vp, attr, t);
2313			rw_exit(&rtable[index].r_lock);
2314		}
2315	} else {
2316		if (newnode) {
2317			PURGE_ATTRCACHE(vp);
2318		}
2319		rw_exit(&rtable[index].r_lock);
2320	}
2321
2322	return (vp);
2323}
2324
2325/*
2326 * Return a vnode for the given NFS Version 3 file handle.
2327 * If no rnode exists for this fhandle, create one and put it
2328 * into the hash queues.  If the rnode for this fhandle
2329 * already exists, return it.
2330 *
2331 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2332 */
2333vnode_t *
2334makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2335    cred_t *cr, char *dnm, char *nm)
2336{
2337	int newnode;
2338	int index;
2339	vnode_t *vp;
2340
2341	index = rtablehash((nfs_fhandle *)fh);
2342	rw_enter(&rtable[index].r_lock, RW_READER);
2343
2344	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2345	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2346	    dnm, nm);
2347
2348	if (vap == NULL) {
2349		if (newnode) {
2350			PURGE_ATTRCACHE(vp);
2351		}
2352		rw_exit(&rtable[index].r_lock);
2353		return (vp);
2354	}
2355
2356	if (!newnode) {
2357		rw_exit(&rtable[index].r_lock);
2358		nfs_attr_cache(vp, vap, t, cr);
2359	} else {
2360		rnode_t *rp = VTOR(vp);
2361
2362		vp->v_type = vap->va_type;
2363		vp->v_rdev = vap->va_rdev;
2364
2365		mutex_enter(&rp->r_statelock);
2366		if (rp->r_mtime <= t)
2367			nfs_attrcache_va(vp, vap);
2368		mutex_exit(&rp->r_statelock);
2369		rw_exit(&rtable[index].r_lock);
2370	}
2371
2372	return (vp);
2373}
2374
2375vnode_t *
2376makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2377    cred_t *cr, char *dnm, char *nm)
2378{
2379	int newnode;
2380	int index;
2381	vnode_t *vp;
2382	vattr_t va;
2383
2384	index = rtablehash((nfs_fhandle *)fh);
2385	rw_enter(&rtable[index].r_lock, RW_READER);
2386
2387	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2388	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2389	    dnm, nm);
2390
2391	if (attr == NULL) {
2392		if (newnode) {
2393			PURGE_ATTRCACHE(vp);
2394		}
2395		rw_exit(&rtable[index].r_lock);
2396		return (vp);
2397	}
2398
2399	if (!newnode) {
2400		rw_exit(&rtable[index].r_lock);
2401		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2402	} else {
2403		if (attr->type < NF3REG || attr->type > NF3FIFO)
2404			vp->v_type = VBAD;
2405		else
2406			vp->v_type = nf3_to_vt[attr->type];
2407		vp->v_rdev = makedevice(attr->rdev.specdata1,
2408		    attr->rdev.specdata2);
2409		nfs3_attrcache(vp, attr, t);
2410		rw_exit(&rtable[index].r_lock);
2411	}
2412
2413	return (vp);
2414}
2415
2416/*
2417 * Read this comment before making changes to rtablehash()!
2418 * This is a hash function in which seemingly obvious and harmless
2419 * changes can cause escalations costing million dollars!
2420 * Know what you are doing.
2421 *
2422 * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2423 * algorithm is currently detailed here:
2424 *
2425 *   http://burtleburtle.net/bob/hash/doobs.html
2426 *
2427 * Of course, the above link may not be valid by the time you are reading
2428 * this, but suffice it to say that the one-at-a-time algorithm works well in
2429 * almost all cases.  If you are changing the algorithm be sure to verify that
2430 * the hash algorithm still provides even distribution in all cases and with
2431 * any server returning filehandles in whatever order (sequential or random).
2432 */
2433static int
2434rtablehash(nfs_fhandle *fh)
2435{
2436	ulong_t hash, len, i;
2437	char *key;
2438
2439	key = fh->fh_buf;
2440	len = (ulong_t)fh->fh_len;
2441	for (hash = 0, i = 0; i < len; i++) {
2442		hash += key[i];
2443		hash += (hash << 10);
2444		hash ^= (hash >> 6);
2445	}
2446	hash += (hash << 3);
2447	hash ^= (hash >> 11);
2448	hash += (hash << 15);
2449	return (hash & rtablemask);
2450}
2451
2452static vnode_t *
2453make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2454    struct vnodeops *vops,
2455    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2456    int (*compar)(const void *, const void *),
2457    int *newnode, cred_t *cr, char *dnm, char *nm)
2458{
2459	rnode_t *rp;
2460	rnode_t *trp;
2461	vnode_t *vp;
2462	mntinfo_t *mi;
2463
2464	ASSERT(RW_READ_HELD(&rhtp->r_lock));
2465
2466	mi = VFTOMI(vfsp);
2467start:
2468	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2469		vp = RTOV(rp);
2470		nfs_set_vroot(vp);
2471		*newnode = 0;
2472		return (vp);
2473	}
2474	rw_exit(&rhtp->r_lock);
2475
2476	mutex_enter(&rpfreelist_lock);
2477	if (rpfreelist != NULL && rnew >= nrnode) {
2478		rp = rpfreelist;
2479		rp_rmfree(rp);
2480		mutex_exit(&rpfreelist_lock);
2481
2482		vp = RTOV(rp);
2483
2484		if (rp->r_flags & RHASHED) {
2485			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2486			mutex_enter(&vp->v_lock);
2487			if (vp->v_count > 1) {
2488				vp->v_count--;
2489				mutex_exit(&vp->v_lock);
2490				rw_exit(&rp->r_hashq->r_lock);
2491				rw_enter(&rhtp->r_lock, RW_READER);
2492				goto start;
2493			}
2494			mutex_exit(&vp->v_lock);
2495			rp_rmhash_locked(rp);
2496			rw_exit(&rp->r_hashq->r_lock);
2497		}
2498
2499		rinactive(rp, cr);
2500
2501		mutex_enter(&vp->v_lock);
2502		if (vp->v_count > 1) {
2503			vp->v_count--;
2504			mutex_exit(&vp->v_lock);
2505			rw_enter(&rhtp->r_lock, RW_READER);
2506			goto start;
2507		}
2508		mutex_exit(&vp->v_lock);
2509		vn_invalid(vp);
2510		/*
2511		 * destroy old locks before bzero'ing and
2512		 * recreating the locks below.
2513		 */
2514		nfs_rw_destroy(&rp->r_rwlock);
2515		nfs_rw_destroy(&rp->r_lkserlock);
2516		mutex_destroy(&rp->r_statelock);
2517		cv_destroy(&rp->r_cv);
2518		cv_destroy(&rp->r_commit.c_cv);
2519		nfs_free_r_path(rp);
2520		avl_destroy(&rp->r_dir);
2521		/*
2522		 * Make sure that if rnode is recycled then
2523		 * VFS count is decremented properly before
2524		 * reuse.
2525		 */
2526		VFS_RELE(vp->v_vfsp);
2527		vn_reinit(vp);
2528	} else {
2529		vnode_t *new_vp;
2530
2531		mutex_exit(&rpfreelist_lock);
2532
2533		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2534		new_vp = vn_alloc(KM_SLEEP);
2535
2536		atomic_add_long((ulong_t *)&rnew, 1);
2537#ifdef DEBUG
2538		clstat_debug.nrnode.value.ui64++;
2539#endif
2540		vp = new_vp;
2541	}
2542
2543	bzero(rp, sizeof (*rp));
2544	rp->r_vnode = vp;
2545	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2546	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2547	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2548	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2549	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2550	rp->r_fh.fh_len = fh->fh_len;
2551	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2552	rp->r_server = mi->mi_curr_serv;
2553	if (FAILOVER_MOUNT(mi)) {
2554		/*
2555		 * If replicated servers, stash pathnames
2556		 */
2557		if (dnm != NULL && nm != NULL) {
2558			char *s, *p;
2559			uint_t len;
2560
2561			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2562			rp->r_path = kmem_alloc(len, KM_SLEEP);
2563#ifdef DEBUG
2564			clstat_debug.rpath.value.ui64 += len;
2565#endif
2566			s = rp->r_path;
2567			for (p = dnm; *p; p++)
2568				*s++ = *p;
2569			*s++ = '/';
2570			for (p = nm; *p; p++)
2571				*s++ = *p;
2572			*s = '\0';
2573		} else {
2574			/* special case for root */
2575			rp->r_path = kmem_alloc(2, KM_SLEEP);
2576#ifdef DEBUG
2577			clstat_debug.rpath.value.ui64 += 2;
2578#endif
2579			*rp->r_path = '.';
2580			*(rp->r_path + 1) = '\0';
2581		}
2582	}
2583	VFS_HOLD(vfsp);
2584	rp->r_putapage = putapage;
2585	rp->r_hashq = rhtp;
2586	rp->r_flags = RREADDIRPLUS;
2587	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2588	    offsetof(rddir_cache, tree));
2589	vn_setops(vp, vops);
2590	vp->v_data = (caddr_t)rp;
2591	vp->v_vfsp = vfsp;
2592	vp->v_type = VNON;
2593	vp->v_flag |= VMODSORT;
2594	nfs_set_vroot(vp);
2595
2596	/*
2597	 * There is a race condition if someone else
2598	 * alloc's the rnode while no locks are held, so we
2599	 * check again and recover if found.
2600	 */
2601	rw_enter(&rhtp->r_lock, RW_WRITER);
2602	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2603		vp = RTOV(trp);
2604		nfs_set_vroot(vp);
2605		*newnode = 0;
2606		rw_exit(&rhtp->r_lock);
2607		rp_addfree(rp, cr);
2608		rw_enter(&rhtp->r_lock, RW_READER);
2609		return (vp);
2610	}
2611	rp_addhash(rp);
2612	*newnode = 1;
2613	return (vp);
2614}
2615
2616/*
2617 * Callback function to check if the page should be marked as
2618 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2619 */
2620int
2621nfs_setmod_check(page_t *pp)
2622{
2623	if (pp->p_fsdata != C_NOCOMMIT) {
2624		pp->p_fsdata = C_NOCOMMIT;
2625		return (1);
2626	}
2627	return (0);
2628}
2629
2630static void
2631nfs_set_vroot(vnode_t *vp)
2632{
2633	rnode_t *rp;
2634	nfs_fhandle *rootfh;
2635
2636	rp = VTOR(vp);
2637	rootfh = &rp->r_server->sv_fhandle;
2638	if (rootfh->fh_len == rp->r_fh.fh_len &&
2639	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2640		if (!(vp->v_flag & VROOT)) {
2641			mutex_enter(&vp->v_lock);
2642			vp->v_flag |= VROOT;
2643			mutex_exit(&vp->v_lock);
2644		}
2645	}
2646}
2647
2648static void
2649nfs_free_r_path(rnode_t *rp)
2650{
2651	char *path;
2652	size_t len;
2653
2654	path = rp->r_path;
2655	if (path) {
2656		rp->r_path = NULL;
2657		len = strlen(path) + 1;
2658		kmem_free(path, len);
2659#ifdef DEBUG
2660		clstat_debug.rpath.value.ui64 -= len;
2661#endif
2662	}
2663}
2664
2665/*
2666 * Put an rnode on the free list.
2667 *
2668 * Rnodes which were allocated above and beyond the normal limit
2669 * are immediately freed.
2670 */
2671void
2672rp_addfree(rnode_t *rp, cred_t *cr)
2673{
2674	vnode_t *vp;
2675	struct vfs *vfsp;
2676
2677	vp = RTOV(rp);
2678	ASSERT(vp->v_count >= 1);
2679	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2680
2681	/*
2682	 * If we have too many rnodes allocated and there are no
2683	 * references to this rnode, or if the rnode is no longer
2684	 * accessible by it does not reside in the hash queues,
2685	 * or if an i/o error occurred while writing to the file,
2686	 * then just free it instead of putting it on the rnode
2687	 * freelist.
2688	 */
2689	vfsp = vp->v_vfsp;
2690	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2691	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2692		if (rp->r_flags & RHASHED) {
2693			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2694			mutex_enter(&vp->v_lock);
2695			if (vp->v_count > 1) {
2696				vp->v_count--;
2697				mutex_exit(&vp->v_lock);
2698				rw_exit(&rp->r_hashq->r_lock);
2699				return;
2700			}
2701			mutex_exit(&vp->v_lock);
2702			rp_rmhash_locked(rp);
2703			rw_exit(&rp->r_hashq->r_lock);
2704		}
2705
2706		rinactive(rp, cr);
2707
2708		/*
2709		 * Recheck the vnode reference count.  We need to
2710		 * make sure that another reference has not been
2711		 * acquired while we were not holding v_lock.  The
2712		 * rnode is not in the rnode hash queues, so the
2713		 * only way for a reference to have been acquired
2714		 * is for a VOP_PUTPAGE because the rnode was marked
2715		 * with RDIRTY or for a modified page.  This
2716		 * reference may have been acquired before our call
2717		 * to rinactive.  The i/o may have been completed,
2718		 * thus allowing rinactive to complete, but the
2719		 * reference to the vnode may not have been released
2720		 * yet.  In any case, the rnode can not be destroyed
2721		 * until the other references to this vnode have been
2722		 * released.  The other references will take care of
2723		 * either destroying the rnode or placing it on the
2724		 * rnode freelist.  If there are no other references,
2725		 * then the rnode may be safely destroyed.
2726		 */
2727		mutex_enter(&vp->v_lock);
2728		if (vp->v_count > 1) {
2729			vp->v_count--;
2730			mutex_exit(&vp->v_lock);
2731			return;
2732		}
2733		mutex_exit(&vp->v_lock);
2734
2735		destroy_rnode(rp);
2736		return;
2737	}
2738
2739	/*
2740	 * Lock the hash queue and then recheck the reference count
2741	 * to ensure that no other threads have acquired a reference
2742	 * to indicate that the rnode should not be placed on the
2743	 * freelist.  If another reference has been acquired, then
2744	 * just release this one and let the other thread complete
2745	 * the processing of adding this rnode to the freelist.
2746	 */
2747	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2748
2749	mutex_enter(&vp->v_lock);
2750	if (vp->v_count > 1) {
2751		vp->v_count--;
2752		mutex_exit(&vp->v_lock);
2753		rw_exit(&rp->r_hashq->r_lock);
2754		return;
2755	}
2756	mutex_exit(&vp->v_lock);
2757
2758	/*
2759	 * If there is no cached data or metadata for this file, then
2760	 * put the rnode on the front of the freelist so that it will
2761	 * be reused before other rnodes which may have cached data or
2762	 * metadata associated with them.
2763	 */
2764	mutex_enter(&rpfreelist_lock);
2765	if (rpfreelist == NULL) {
2766		rp->r_freef = rp;
2767		rp->r_freeb = rp;
2768		rpfreelist = rp;
2769	} else {
2770		rp->r_freef = rpfreelist;
2771		rp->r_freeb = rpfreelist->r_freeb;
2772		rpfreelist->r_freeb->r_freef = rp;
2773		rpfreelist->r_freeb = rp;
2774		if (!vn_has_cached_data(vp) &&
2775		    !HAVE_RDDIR_CACHE(rp) &&
2776		    rp->r_symlink.contents == NULL &&
2777		    rp->r_secattr == NULL &&
2778		    rp->r_pathconf == NULL)
2779			rpfreelist = rp;
2780	}
2781	mutex_exit(&rpfreelist_lock);
2782
2783	rw_exit(&rp->r_hashq->r_lock);
2784}
2785
2786/*
2787 * Remove an rnode from the free list.
2788 *
2789 * The caller must be holding rpfreelist_lock and the rnode
2790 * must be on the freelist.
2791 */
2792static void
2793rp_rmfree(rnode_t *rp)
2794{
2795
2796	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2797	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2798
2799	if (rp == rpfreelist) {
2800		rpfreelist = rp->r_freef;
2801		if (rp == rpfreelist)
2802			rpfreelist = NULL;
2803	}
2804
2805	rp->r_freeb->r_freef = rp->r_freef;
2806	rp->r_freef->r_freeb = rp->r_freeb;
2807
2808	rp->r_freef = rp->r_freeb = NULL;
2809}
2810
2811/*
2812 * Put a rnode in the hash table.
2813 *
2814 * The caller must be holding the exclusive hash queue lock.
2815 */
2816static void
2817rp_addhash(rnode_t *rp)
2818{
2819
2820	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2821	ASSERT(!(rp->r_flags & RHASHED));
2822
2823	rp->r_hashf = rp->r_hashq->r_hashf;
2824	rp->r_hashq->r_hashf = rp;
2825	rp->r_hashb = (rnode_t *)rp->r_hashq;
2826	rp->r_hashf->r_hashb = rp;
2827
2828	mutex_enter(&rp->r_statelock);
2829	rp->r_flags |= RHASHED;
2830	mutex_exit(&rp->r_statelock);
2831}
2832
2833/*
2834 * Remove a rnode from the hash table.
2835 *
2836 * The caller must be holding the hash queue lock.
2837 */
2838static void
2839rp_rmhash_locked(rnode_t *rp)
2840{
2841
2842	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2843	ASSERT(rp->r_flags & RHASHED);
2844
2845	rp->r_hashb->r_hashf = rp->r_hashf;
2846	rp->r_hashf->r_hashb = rp->r_hashb;
2847
2848	mutex_enter(&rp->r_statelock);
2849	rp->r_flags &= ~RHASHED;
2850	mutex_exit(&rp->r_statelock);
2851}
2852
2853/*
2854 * Remove a rnode from the hash table.
2855 *
2856 * The caller must not be holding the hash queue lock.
2857 */
2858void
2859rp_rmhash(rnode_t *rp)
2860{
2861
2862	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2863	rp_rmhash_locked(rp);
2864	rw_exit(&rp->r_hashq->r_lock);
2865}
2866
2867/*
2868 * Lookup a rnode by fhandle.
2869 *
2870 * The caller must be holding the hash queue lock, either shared or exclusive.
2871 */
2872static rnode_t *
2873rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2874{
2875	rnode_t *rp;
2876	vnode_t *vp;
2877
2878	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2879
2880	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2881		vp = RTOV(rp);
2882		if (vp->v_vfsp == vfsp &&
2883		    rp->r_fh.fh_len == fh->fh_len &&
2884		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2885			/*
2886			 * remove rnode from free list, if necessary.
2887			 */
2888			if (rp->r_freef != NULL) {
2889				mutex_enter(&rpfreelist_lock);
2890				/*
2891				 * If the rnode is on the freelist,
2892				 * then remove it and use that reference
2893				 * as the new reference.  Otherwise,
2894				 * need to increment the reference count.
2895				 */
2896				if (rp->r_freef != NULL) {
2897					rp_rmfree(rp);
2898					mutex_exit(&rpfreelist_lock);
2899				} else {
2900					mutex_exit(&rpfreelist_lock);
2901					VN_HOLD(vp);
2902				}
2903			} else
2904				VN_HOLD(vp);
2905			return (rp);
2906		}
2907	}
2908	return (NULL);
2909}
2910
2911/*
2912 * Return 1 if there is a active vnode belonging to this vfs in the
2913 * rtable cache.
2914 *
2915 * Several of these checks are done without holding the usual
2916 * locks.  This is safe because destroy_rtable(), rp_addfree(),
2917 * etc. will redo the necessary checks before actually destroying
2918 * any rnodes.
2919 */
2920int
2921check_rtable(struct vfs *vfsp)
2922{
2923	int index;
2924	rnode_t *rp;
2925	vnode_t *vp;
2926
2927	for (index = 0; index < rtablesize; index++) {
2928		rw_enter(&rtable[index].r_lock, RW_READER);
2929		for (rp = rtable[index].r_hashf;
2930		    rp != (rnode_t *)(&rtable[index]);
2931		    rp = rp->r_hashf) {
2932			vp = RTOV(rp);
2933			if (vp->v_vfsp == vfsp) {
2934				if (rp->r_freef == NULL ||
2935				    (vn_has_cached_data(vp) &&
2936				    (rp->r_flags & RDIRTY)) ||
2937				    rp->r_count > 0) {
2938					rw_exit(&rtable[index].r_lock);
2939					return (1);
2940				}
2941			}
2942		}
2943		rw_exit(&rtable[index].r_lock);
2944	}
2945	return (0);
2946}
2947
2948/*
2949 * Destroy inactive vnodes from the hash queues which belong to this
2950 * vfs.  It is essential that we destroy all inactive vnodes during a
2951 * forced unmount as well as during a normal unmount.
2952 */
2953void
2954destroy_rtable(struct vfs *vfsp, cred_t *cr)
2955{
2956	int index;
2957	rnode_t *rp;
2958	rnode_t *rlist;
2959	rnode_t *r_hashf;
2960	vnode_t *vp;
2961
2962	rlist = NULL;
2963
2964	for (index = 0; index < rtablesize; index++) {
2965		rw_enter(&rtable[index].r_lock, RW_WRITER);
2966		for (rp = rtable[index].r_hashf;
2967		    rp != (rnode_t *)(&rtable[index]);
2968		    rp = r_hashf) {
2969			/* save the hash pointer before destroying */
2970			r_hashf = rp->r_hashf;
2971			vp = RTOV(rp);
2972			if (vp->v_vfsp == vfsp) {
2973				mutex_enter(&rpfreelist_lock);
2974				if (rp->r_freef != NULL) {
2975					rp_rmfree(rp);
2976					mutex_exit(&rpfreelist_lock);
2977					rp_rmhash_locked(rp);
2978					rp->r_hashf = rlist;
2979					rlist = rp;
2980				} else
2981					mutex_exit(&rpfreelist_lock);
2982			}
2983		}
2984		rw_exit(&rtable[index].r_lock);
2985	}
2986
2987	for (rp = rlist; rp != NULL; rp = rlist) {
2988		rlist = rp->r_hashf;
2989		/*
2990		 * This call to rp_addfree will end up destroying the
2991		 * rnode, but in a safe way with the appropriate set
2992		 * of checks done.
2993		 */
2994		rp_addfree(rp, cr);
2995	}
2996
2997}
2998
2999/*
3000 * This routine destroys all the resources associated with the rnode
3001 * and then the rnode itself.
3002 */
3003static void
3004destroy_rnode(rnode_t *rp)
3005{
3006	vnode_t *vp;
3007	vfs_t *vfsp;
3008
3009	vp = RTOV(rp);
3010	vfsp = vp->v_vfsp;
3011
3012	ASSERT(vp->v_count == 1);
3013	ASSERT(rp->r_count == 0);
3014	ASSERT(rp->r_lmpl == NULL);
3015	ASSERT(rp->r_mapcnt == 0);
3016	ASSERT(!(rp->r_flags & RHASHED));
3017	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3018	atomic_add_long((ulong_t *)&rnew, -1);
3019#ifdef DEBUG
3020	clstat_debug.nrnode.value.ui64--;
3021#endif
3022	nfs_rw_destroy(&rp->r_rwlock);
3023	nfs_rw_destroy(&rp->r_lkserlock);
3024	mutex_destroy(&rp->r_statelock);
3025	cv_destroy(&rp->r_cv);
3026	cv_destroy(&rp->r_commit.c_cv);
3027	if (rp->r_flags & RDELMAPLIST)
3028		list_destroy(&rp->r_indelmap);
3029	nfs_free_r_path(rp);
3030	avl_destroy(&rp->r_dir);
3031	vn_invalid(vp);
3032	vn_free(vp);
3033	kmem_cache_free(rnode_cache, rp);
3034	VFS_RELE(vfsp);
3035}
3036
3037/*
3038 * Flush all vnodes in this (or every) vfs.
3039 * Used by nfs_sync and by nfs_unmount.
3040 */
3041void
3042rflush(struct vfs *vfsp, cred_t *cr)
3043{
3044	int index;
3045	rnode_t *rp;
3046	vnode_t *vp, **vplist;
3047	long num, cnt;
3048
3049	/*
3050	 * Check to see whether there is anything to do.
3051	 */
3052	num = rnew;
3053	if (num == 0)
3054		return;
3055
3056	/*
3057	 * Allocate a slot for all currently active rnodes on the
3058	 * supposition that they all may need flushing.
3059	 */
3060	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3061	cnt = 0;
3062
3063	/*
3064	 * Walk the hash queues looking for rnodes with page
3065	 * lists associated with them.  Make a list of these
3066	 * files.
3067	 */
3068	for (index = 0; index < rtablesize; index++) {
3069		rw_enter(&rtable[index].r_lock, RW_READER);
3070		for (rp = rtable[index].r_hashf;
3071		    rp != (rnode_t *)(&rtable[index]);
3072		    rp = rp->r_hashf) {
3073			vp = RTOV(rp);
3074			/*
3075			 * Don't bother sync'ing a vp if it
3076			 * is part of virtual swap device or
3077			 * if VFS is read-only
3078			 */
3079			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3080				continue;
3081			/*
3082			 * If flushing all mounted file systems or
3083			 * the vnode belongs to this vfs, has pages
3084			 * and is marked as either dirty or mmap'd,
3085			 * hold and add this vnode to the list of
3086			 * vnodes to flush.
3087			 */
3088			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3089			    vn_has_cached_data(vp) &&
3090			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3091				VN_HOLD(vp);
3092				vplist[cnt++] = vp;
3093				if (cnt == num) {
3094					rw_exit(&rtable[index].r_lock);
3095					goto toomany;
3096				}
3097			}
3098		}
3099		rw_exit(&rtable[index].r_lock);
3100	}
3101toomany:
3102
3103	/*
3104	 * Flush and release all of the files on the list.
3105	 */
3106	while (cnt-- > 0) {
3107		vp = vplist[cnt];
3108		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3109		VN_RELE(vp);
3110	}
3111
3112	/*
3113	 * Free the space allocated to hold the list.
3114	 */
3115	kmem_free(vplist, num * sizeof (*vplist));
3116}
3117
3118/*
3119 * This probably needs to be larger than or equal to
3120 * log2(sizeof (struct rnode)) due to the way that rnodes are
3121 * allocated.
3122 */
3123#define	ACACHE_SHIFT_BITS	9
3124
3125static int
3126acachehash(rnode_t *rp, cred_t *cr)
3127{
3128
3129	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3130	    acachemask);
3131}
3132
3133#ifdef DEBUG
3134static long nfs_access_cache_hits = 0;
3135static long nfs_access_cache_misses = 0;
3136#endif
3137
3138nfs_access_type_t
3139nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3140{
3141	vnode_t *vp;
3142	acache_t *ap;
3143	acache_hash_t *hp;
3144	nfs_access_type_t all;
3145
3146	vp = RTOV(rp);
3147	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3148		return (NFS_ACCESS_UNKNOWN);
3149
3150	if (rp->r_acache != NULL) {
3151		hp = &acache[acachehash(rp, cr)];
3152		rw_enter(&hp->lock, RW_READER);
3153		ap = hp->next;
3154		while (ap != (acache_t *)hp) {
3155			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3156				if ((ap->known & acc) == acc) {
3157#ifdef DEBUG
3158					nfs_access_cache_hits++;
3159#endif
3160					if ((ap->allowed & acc) == acc)
3161						all = NFS_ACCESS_ALLOWED;
3162					else
3163						all = NFS_ACCESS_DENIED;
3164				} else {
3165#ifdef DEBUG
3166					nfs_access_cache_misses++;
3167#endif
3168					all = NFS_ACCESS_UNKNOWN;
3169				}
3170				rw_exit(&hp->lock);
3171				return (all);
3172			}
3173			ap = ap->next;
3174		}
3175		rw_exit(&hp->lock);
3176	}
3177
3178#ifdef DEBUG
3179	nfs_access_cache_misses++;
3180#endif
3181	return (NFS_ACCESS_UNKNOWN);
3182}
3183
3184void
3185nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3186{
3187	acache_t *ap;
3188	acache_t *nap;
3189	acache_hash_t *hp;
3190
3191	hp = &acache[acachehash(rp, cr)];
3192
3193	/*
3194	 * Allocate now assuming that mostly an allocation will be
3195	 * required.  This allows the allocation to happen without
3196	 * holding the hash bucket locked.
3197	 */
3198	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3199	if (nap != NULL) {
3200		nap->known = acc;
3201		nap->allowed = resacc;
3202		nap->rnode = rp;
3203		crhold(cr);
3204		nap->cred = cr;
3205		nap->hashq = hp;
3206	}
3207
3208	rw_enter(&hp->lock, RW_WRITER);
3209
3210	if (rp->r_acache != NULL) {
3211		ap = hp->next;
3212		while (ap != (acache_t *)hp) {
3213			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3214				ap->known |= acc;
3215				ap->allowed &= ~acc;
3216				ap->allowed |= resacc;
3217				rw_exit(&hp->lock);
3218				if (nap != NULL) {
3219					crfree(nap->cred);
3220					kmem_cache_free(acache_cache, nap);
3221				}
3222				return;
3223			}
3224			ap = ap->next;
3225		}
3226	}
3227
3228	if (nap != NULL) {
3229#ifdef DEBUG
3230		clstat_debug.access.value.ui64++;
3231#endif
3232		nap->next = hp->next;
3233		hp->next = nap;
3234		nap->next->prev = nap;
3235		nap->prev = (acache_t *)hp;
3236
3237		mutex_enter(&rp->r_statelock);
3238		nap->list = rp->r_acache;
3239		rp->r_acache = nap;
3240		mutex_exit(&rp->r_statelock);
3241	}
3242
3243	rw_exit(&hp->lock);
3244}
3245
3246int
3247nfs_access_purge_rp(rnode_t *rp)
3248{
3249	acache_t *ap;
3250	acache_t *tmpap;
3251	acache_t *rplist;
3252
3253	/*
3254	 * If there aren't any cached entries, then there is nothing
3255	 * to free.
3256	 */
3257	if (rp->r_acache == NULL)
3258		return (0);
3259
3260	mutex_enter(&rp->r_statelock);
3261	rplist = rp->r_acache;
3262	rp->r_acache = NULL;
3263	mutex_exit(&rp->r_statelock);
3264
3265	/*
3266	 * Loop through each entry in the list pointed to in the
3267	 * rnode.  Remove each of these entries from the hash
3268	 * queue that it is on and remove it from the list in
3269	 * the rnode.
3270	 */
3271	for (ap = rplist; ap != NULL; ap = tmpap) {
3272		rw_enter(&ap->hashq->lock, RW_WRITER);
3273		ap->prev->next = ap->next;
3274		ap->next->prev = ap->prev;
3275		rw_exit(&ap->hashq->lock);
3276
3277		tmpap = ap->list;
3278		crfree(ap->cred);
3279		kmem_cache_free(acache_cache, ap);
3280#ifdef DEBUG
3281		clstat_debug.access.value.ui64--;
3282#endif
3283	}
3284
3285	return (1);
3286}
3287
3288static const char prefix[] = ".nfs";
3289
3290static kmutex_t newnum_lock;
3291
3292int
3293newnum(void)
3294{
3295	static uint_t newnum = 0;
3296	uint_t id;
3297
3298	mutex_enter(&newnum_lock);
3299	if (newnum == 0)
3300		newnum = gethrestime_sec() & 0xffff;
3301	id = newnum++;
3302	mutex_exit(&newnum_lock);
3303	return (id);
3304}
3305
3306char *
3307newname(void)
3308{
3309	char *news;
3310	char *s;
3311	const char *p;
3312	uint_t id;
3313
3314	id = newnum();
3315	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3316	s = news;
3317	p = prefix;
3318	while (*p != '\0')
3319		*s++ = *p++;
3320	while (id != 0) {
3321		*s++ = "0123456789ABCDEF"[id & 0x0f];
3322		id >>= 4;
3323	}
3324	*s = '\0';
3325	return (news);
3326}
3327
3328/*
3329 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3330 * framework.
3331 */
3332static int
3333cl_snapshot(kstat_t *ksp, void *buf, int rw)
3334{
3335	ksp->ks_snaptime = gethrtime();
3336	if (rw == KSTAT_WRITE) {
3337		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3338#ifdef DEBUG
3339		/*
3340		 * Currently only the global zone can write to kstats, but we
3341		 * add the check just for paranoia.
3342		 */
3343		if (INGLOBALZONE(curproc))
3344			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3345			    sizeof (clstat_debug));
3346#endif
3347	} else {
3348		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3349#ifdef DEBUG
3350		/*
3351		 * If we're displaying the "global" debug kstat values, we
3352		 * display them as-is to all zones since in fact they apply to
3353		 * the system as a whole.
3354		 */
3355		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3356		    sizeof (clstat_debug));
3357#endif
3358	}
3359	return (0);
3360}
3361
3362static void *
3363clinit_zone(zoneid_t zoneid)
3364{
3365	kstat_t *nfs_client_kstat;
3366	struct nfs_clnt *nfscl;
3367	uint_t ndata;
3368
3369	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3370	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3371	nfscl->nfscl_chtable = NULL;
3372	nfscl->nfscl_zoneid = zoneid;
3373
3374	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3375	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3376#ifdef DEBUG
3377	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3378#endif
3379	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3380	    "misc", KSTAT_TYPE_NAMED, ndata,
3381	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3382		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3383		nfs_client_kstat->ks_snapshot = cl_snapshot;
3384		kstat_install(nfs_client_kstat);
3385	}
3386	mutex_enter(&nfs_clnt_list_lock);
3387	list_insert_head(&nfs_clnt_list, nfscl);
3388	mutex_exit(&nfs_clnt_list_lock);
3389	return (nfscl);
3390}
3391
3392/*ARGSUSED*/
3393static void
3394clfini_zone(zoneid_t zoneid, void *arg)
3395{
3396	struct nfs_clnt *nfscl = arg;
3397	chhead_t *chp, *next;
3398
3399	if (nfscl == NULL)
3400		return;
3401	mutex_enter(&nfs_clnt_list_lock);
3402	list_remove(&nfs_clnt_list, nfscl);
3403	mutex_exit(&nfs_clnt_list_lock);
3404	clreclaim_zone(nfscl, 0);
3405	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3406		ASSERT(chp->ch_list == NULL);
3407		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3408		next = chp->ch_next;
3409		kmem_free(chp, sizeof (*chp));
3410	}
3411	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3412	mutex_destroy(&nfscl->nfscl_chtable_lock);
3413	kmem_free(nfscl, sizeof (*nfscl));
3414}
3415
3416/*
3417 * Called by endpnt_destructor to make sure the client handles are
3418 * cleaned up before the RPC endpoints.  This becomes a no-op if
3419 * clfini_zone (above) is called first.  This function is needed
3420 * (rather than relying on clfini_zone to clean up) because the ZSD
3421 * callbacks have no ordering mechanism, so we have no way to ensure
3422 * that clfini_zone is called before endpnt_destructor.
3423 */
3424void
3425clcleanup_zone(zoneid_t zoneid)
3426{
3427	struct nfs_clnt *nfscl;
3428
3429	mutex_enter(&nfs_clnt_list_lock);
3430	nfscl = list_head(&nfs_clnt_list);
3431	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3432		if (nfscl->nfscl_zoneid == zoneid) {
3433			clreclaim_zone(nfscl, 0);
3434			break;
3435		}
3436	}
3437	mutex_exit(&nfs_clnt_list_lock);
3438}
3439
3440int
3441nfs_subrinit(void)
3442{
3443	int i;
3444	ulong_t nrnode_max;
3445
3446	/*
3447	 * Allocate and initialize the rnode hash queues
3448	 */
3449	if (nrnode <= 0)
3450		nrnode = ncsize;
3451	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3452	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3453		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3454		    "setting nrnode to max value of %ld", nrnode_max);
3455		nrnode = nrnode_max;
3456	}
3457
3458	rtablesize = 1 << highbit(nrnode / hashlen);
3459	rtablemask = rtablesize - 1;
3460	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3461	for (i = 0; i < rtablesize; i++) {
3462		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3463		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3464		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3465	}
3466	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3467	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3468
3469	/*
3470	 * Allocate and initialize the access cache
3471	 */
3472
3473	/*
3474	 * Initial guess is one access cache entry per rnode unless
3475	 * nacache is set to a non-zero value and then it is used to
3476	 * indicate a guess at the number of access cache entries.
3477	 */
3478	if (nacache > 0)
3479		acachesize = 1 << highbit(nacache / hashlen);
3480	else
3481		acachesize = rtablesize;
3482	acachemask = acachesize - 1;
3483	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3484	for (i = 0; i < acachesize; i++) {
3485		acache[i].next = (acache_t *)&acache[i];
3486		acache[i].prev = (acache_t *)&acache[i];
3487		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3488	}
3489	acache_cache = kmem_cache_create("nfs_access_cache",
3490	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3491	/*
3492	 * Allocate and initialize the client handle cache
3493	 */
3494	chtab_cache = kmem_cache_create("client_handle_cache",
3495	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3496	/*
3497	 * Initialize the list of per-zone client handles (and associated data).
3498	 * This needs to be done before we call zone_key_create().
3499	 */
3500	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3501	    offsetof(struct nfs_clnt, nfscl_node));
3502	/*
3503	 * Initialize the zone_key for per-zone client handle lists.
3504	 */
3505	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3506	/*
3507	 * Initialize the various mutexes and reader/writer locks
3508	 */
3509	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3510	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3511	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3512
3513	/*
3514	 * Assign unique major number for all nfs mounts
3515	 */
3516	if ((nfs_major = getudev()) == -1) {
3517		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3518		    "nfs: init: can't get unique device number");
3519		nfs_major = 0;
3520	}
3521	nfs_minor = 0;
3522
3523	if (nfs3_jukebox_delay == 0)
3524		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3525
3526	return (0);
3527}
3528
3529void
3530nfs_subrfini(void)
3531{
3532	int i;
3533
3534	/*
3535	 * Deallocate the rnode hash queues
3536	 */
3537	kmem_cache_destroy(rnode_cache);
3538
3539	for (i = 0; i < rtablesize; i++)
3540		rw_destroy(&rtable[i].r_lock);
3541	kmem_free(rtable, rtablesize * sizeof (*rtable));
3542
3543	/*
3544	 * Deallocated the access cache
3545	 */
3546	kmem_cache_destroy(acache_cache);
3547
3548	for (i = 0; i < acachesize; i++)
3549		rw_destroy(&acache[i].lock);
3550	kmem_free(acache, acachesize * sizeof (*acache));
3551
3552	/*
3553	 * Deallocate the client handle cache
3554	 */
3555	kmem_cache_destroy(chtab_cache);
3556
3557	/*
3558	 * Destroy the various mutexes and reader/writer locks
3559	 */
3560	mutex_destroy(&rpfreelist_lock);
3561	mutex_destroy(&newnum_lock);
3562	mutex_destroy(&nfs_minor_lock);
3563	(void) zone_key_delete(nfsclnt_zone_key);
3564}
3565
3566enum nfsstat
3567puterrno(int error)
3568{
3569
3570	switch (error) {
3571	case EOPNOTSUPP:
3572		return (NFSERR_OPNOTSUPP);
3573	case ENAMETOOLONG:
3574		return (NFSERR_NAMETOOLONG);
3575	case ENOTEMPTY:
3576		return (NFSERR_NOTEMPTY);
3577	case EDQUOT:
3578		return (NFSERR_DQUOT);
3579	case ESTALE:
3580		return (NFSERR_STALE);
3581	case EREMOTE:
3582		return (NFSERR_REMOTE);
3583	case ENOSYS:
3584		return (NFSERR_OPNOTSUPP);
3585	case EOVERFLOW:
3586		return (NFSERR_INVAL);
3587	default:
3588		return ((enum nfsstat)error);
3589	}
3590	/* NOTREACHED */
3591}
3592
3593int
3594geterrno(enum nfsstat status)
3595{
3596
3597	switch (status) {
3598	case NFSERR_OPNOTSUPP:
3599		return (EOPNOTSUPP);
3600	case NFSERR_NAMETOOLONG:
3601		return (ENAMETOOLONG);
3602	case NFSERR_NOTEMPTY:
3603		return (ENOTEMPTY);
3604	case NFSERR_DQUOT:
3605		return (EDQUOT);
3606	case NFSERR_STALE:
3607		return (ESTALE);
3608	case NFSERR_REMOTE:
3609		return (EREMOTE);
3610	case NFSERR_WFLUSH:
3611		return (EIO);
3612	default:
3613		return ((int)status);
3614	}
3615	/* NOTREACHED */
3616}
3617
3618enum nfsstat3
3619puterrno3(int error)
3620{
3621
3622#ifdef DEBUG
3623	switch (error) {
3624	case 0:
3625		return (NFS3_OK);
3626	case EPERM:
3627		return (NFS3ERR_PERM);
3628	case ENOENT:
3629		return (NFS3ERR_NOENT);
3630	case EIO:
3631		return (NFS3ERR_IO);
3632	case ENXIO:
3633		return (NFS3ERR_NXIO);
3634	case EACCES:
3635		return (NFS3ERR_ACCES);
3636	case EEXIST:
3637		return (NFS3ERR_EXIST);
3638	case EXDEV:
3639		return (NFS3ERR_XDEV);
3640	case ENODEV:
3641		return (NFS3ERR_NODEV);
3642	case ENOTDIR:
3643		return (NFS3ERR_NOTDIR);
3644	case EISDIR:
3645		return (NFS3ERR_ISDIR);
3646	case EINVAL:
3647		return (NFS3ERR_INVAL);
3648	case EFBIG:
3649		return (NFS3ERR_FBIG);
3650	case ENOSPC:
3651		return (NFS3ERR_NOSPC);
3652	case EROFS:
3653		return (NFS3ERR_ROFS);
3654	case EMLINK:
3655		return (NFS3ERR_MLINK);
3656	case ENAMETOOLONG:
3657		return (NFS3ERR_NAMETOOLONG);
3658	case ENOTEMPTY:
3659		return (NFS3ERR_NOTEMPTY);
3660	case EDQUOT:
3661		return (NFS3ERR_DQUOT);
3662	case ESTALE:
3663		return (NFS3ERR_STALE);
3664	case EREMOTE:
3665		return (NFS3ERR_REMOTE);
3666	case ENOSYS:
3667	case EOPNOTSUPP:
3668		return (NFS3ERR_NOTSUPP);
3669	case EOVERFLOW:
3670		return (NFS3ERR_INVAL);
3671	default:
3672		zcmn_err(getzoneid(), CE_WARN,
3673		    "puterrno3: got error %d", error);
3674		return ((enum nfsstat3)error);
3675	}
3676#else
3677	switch (error) {
3678	case ENAMETOOLONG:
3679		return (NFS3ERR_NAMETOOLONG);
3680	case ENOTEMPTY:
3681		return (NFS3ERR_NOTEMPTY);
3682	case EDQUOT:
3683		return (NFS3ERR_DQUOT);
3684	case ESTALE:
3685		return (NFS3ERR_STALE);
3686	case ENOSYS:
3687	case EOPNOTSUPP:
3688		return (NFS3ERR_NOTSUPP);
3689	case EREMOTE:
3690		return (NFS3ERR_REMOTE);
3691	case EOVERFLOW:
3692		return (NFS3ERR_INVAL);
3693	default:
3694		return ((enum nfsstat3)error);
3695	}
3696#endif
3697}
3698
3699int
3700geterrno3(enum nfsstat3 status)
3701{
3702
3703#ifdef DEBUG
3704	switch (status) {
3705	case NFS3_OK:
3706		return (0);
3707	case NFS3ERR_PERM:
3708		return (EPERM);
3709	case NFS3ERR_NOENT:
3710		return (ENOENT);
3711	case NFS3ERR_IO:
3712		return (EIO);
3713	case NFS3ERR_NXIO:
3714		return (ENXIO);
3715	case NFS3ERR_ACCES:
3716		return (EACCES);
3717	case NFS3ERR_EXIST:
3718		return (EEXIST);
3719	case NFS3ERR_XDEV:
3720		return (EXDEV);
3721	case NFS3ERR_NODEV:
3722		return (ENODEV);
3723	case NFS3ERR_NOTDIR:
3724		return (ENOTDIR);
3725	case NFS3ERR_ISDIR:
3726		return (EISDIR);
3727	case NFS3ERR_INVAL:
3728		return (EINVAL);
3729	case NFS3ERR_FBIG:
3730		return (EFBIG);
3731	case NFS3ERR_NOSPC:
3732		return (ENOSPC);
3733	case NFS3ERR_ROFS:
3734		return (EROFS);
3735	case NFS3ERR_MLINK:
3736		return (EMLINK);
3737	case NFS3ERR_NAMETOOLONG:
3738		return (ENAMETOOLONG);
3739	case NFS3ERR_NOTEMPTY:
3740		return (ENOTEMPTY);
3741	case NFS3ERR_DQUOT:
3742		return (EDQUOT);
3743	case NFS3ERR_STALE:
3744		return (ESTALE);
3745	case NFS3ERR_REMOTE:
3746		return (EREMOTE);
3747	case NFS3ERR_BADHANDLE:
3748		return (ESTALE);
3749	case NFS3ERR_NOT_SYNC:
3750		return (EINVAL);
3751	case NFS3ERR_BAD_COOKIE:
3752		return (ENOENT);
3753	case NFS3ERR_NOTSUPP:
3754		return (EOPNOTSUPP);
3755	case NFS3ERR_TOOSMALL:
3756		return (EINVAL);
3757	case NFS3ERR_SERVERFAULT:
3758		return (EIO);
3759	case NFS3ERR_BADTYPE:
3760		return (EINVAL);
3761	case NFS3ERR_JUKEBOX:
3762		return (ENXIO);
3763	default:
3764		zcmn_err(getzoneid(), CE_WARN,
3765		    "geterrno3: got status %d", status);
3766		return ((int)status);
3767	}
3768#else
3769	switch (status) {
3770	case NFS3ERR_NAMETOOLONG:
3771		return (ENAMETOOLONG);
3772	case NFS3ERR_NOTEMPTY:
3773		return (ENOTEMPTY);
3774	case NFS3ERR_DQUOT:
3775		return (EDQUOT);
3776	case NFS3ERR_STALE:
3777	case NFS3ERR_BADHANDLE:
3778		return (ESTALE);
3779	case NFS3ERR_NOTSUPP:
3780		return (EOPNOTSUPP);
3781	case NFS3ERR_REMOTE:
3782		return (EREMOTE);
3783	case NFS3ERR_NOT_SYNC:
3784	case NFS3ERR_TOOSMALL:
3785	case NFS3ERR_BADTYPE:
3786		return (EINVAL);
3787	case NFS3ERR_BAD_COOKIE:
3788		return (ENOENT);
3789	case NFS3ERR_SERVERFAULT:
3790		return (EIO);
3791	case NFS3ERR_JUKEBOX:
3792		return (ENXIO);
3793	default:
3794		return ((int)status);
3795	}
3796#endif
3797}
3798
3799rddir_cache *
3800rddir_cache_alloc(int flags)
3801{
3802	rddir_cache *rc;
3803
3804	rc = kmem_alloc(sizeof (*rc), flags);
3805	if (rc != NULL) {
3806		rc->entries = NULL;
3807		rc->flags = RDDIR;
3808		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3809		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3810		rc->count = 1;
3811#ifdef DEBUG
3812		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3813#endif
3814	}
3815	return (rc);
3816}
3817
3818static void
3819rddir_cache_free(rddir_cache *rc)
3820{
3821
3822#ifdef DEBUG
3823	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3824#endif
3825	if (rc->entries != NULL) {
3826#ifdef DEBUG
3827		rddir_cache_buf_free(rc->entries, rc->buflen);
3828#else
3829		kmem_free(rc->entries, rc->buflen);
3830#endif
3831	}
3832	cv_destroy(&rc->cv);
3833	mutex_destroy(&rc->lock);
3834	kmem_free(rc, sizeof (*rc));
3835}
3836
3837void
3838rddir_cache_hold(rddir_cache *rc)
3839{
3840
3841	mutex_enter(&rc->lock);
3842	rc->count++;
3843	mutex_exit(&rc->lock);
3844}
3845
3846void
3847rddir_cache_rele(rddir_cache *rc)
3848{
3849
3850	mutex_enter(&rc->lock);
3851	ASSERT(rc->count > 0);
3852	if (--rc->count == 0) {
3853		mutex_exit(&rc->lock);
3854		rddir_cache_free(rc);
3855	} else
3856		mutex_exit(&rc->lock);
3857}
3858
3859#ifdef DEBUG
3860char *
3861rddir_cache_buf_alloc(size_t size, int flags)
3862{
3863	char *rc;
3864
3865	rc = kmem_alloc(size, flags);
3866	if (rc != NULL)
3867		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3868	return (rc);
3869}
3870
3871void
3872rddir_cache_buf_free(void *addr, size_t size)
3873{
3874
3875	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3876	kmem_free(addr, size);
3877}
3878#endif
3879
3880static int
3881nfs_free_data_reclaim(rnode_t *rp)
3882{
3883	char *contents;
3884	int size;
3885	vsecattr_t *vsp;
3886	nfs3_pathconf_info *info;
3887	int freed;
3888	cred_t *cred;
3889
3890	/*
3891	 * Free any held credentials and caches which
3892	 * may be associated with this rnode.
3893	 */
3894	mutex_enter(&rp->r_statelock);
3895	cred = rp->r_cred;
3896	rp->r_cred = NULL;
3897	contents = rp->r_symlink.contents;
3898	size = rp->r_symlink.size;
3899	rp->r_symlink.contents = NULL;
3900	vsp = rp->r_secattr;
3901	rp->r_secattr = NULL;
3902	info = rp->r_pathconf;
3903	rp->r_pathconf = NULL;
3904	mutex_exit(&rp->r_statelock);
3905
3906	if (cred != NULL)
3907		crfree(cred);
3908
3909	/*
3910	 * Free the access cache entries.
3911	 */
3912	freed = nfs_access_purge_rp(rp);
3913
3914	if (!HAVE_RDDIR_CACHE(rp) &&
3915	    contents == NULL &&
3916	    vsp == NULL &&
3917	    info == NULL)
3918		return (freed);
3919
3920	/*
3921	 * Free the readdir cache entries
3922	 */
3923	if (HAVE_RDDIR_CACHE(rp))
3924		nfs_purge_rddir_cache(RTOV(rp));
3925
3926	/*
3927	 * Free the symbolic link cache.
3928	 */
3929	if (contents != NULL) {
3930
3931		kmem_free((void *)contents, size);
3932	}
3933
3934	/*
3935	 * Free any cached ACL.
3936	 */
3937	if (vsp != NULL)
3938		nfs_acl_free(vsp);
3939
3940	/*
3941	 * Free any cached pathconf information.
3942	 */
3943	if (info != NULL)
3944		kmem_free(info, sizeof (*info));
3945
3946	return (1);
3947}
3948
3949static int
3950nfs_active_data_reclaim(rnode_t *rp)
3951{
3952	char *contents;
3953	int size;
3954	vsecattr_t *vsp;
3955	nfs3_pathconf_info *info;
3956	int freed;
3957
3958	/*
3959	 * Free any held credentials and caches which
3960	 * may be associated with this rnode.
3961	 */
3962	if (!mutex_tryenter(&rp->r_statelock))
3963		return (0);
3964	contents = rp->r_symlink.contents;
3965	size = rp->r_symlink.size;
3966	rp->r_symlink.contents = NULL;
3967	vsp = rp->r_secattr;
3968	rp->r_secattr = NULL;
3969	info = rp->r_pathconf;
3970	rp->r_pathconf = NULL;
3971	mutex_exit(&rp->r_statelock);
3972
3973	/*
3974	 * Free the access cache entries.
3975	 */
3976	freed = nfs_access_purge_rp(rp);
3977
3978	if (!HAVE_RDDIR_CACHE(rp) &&
3979	    contents == NULL &&
3980	    vsp == NULL &&
3981	    info == NULL)
3982		return (freed);
3983
3984	/*
3985	 * Free the readdir cache entries
3986	 */
3987	if (HAVE_RDDIR_CACHE(rp))
3988		nfs_purge_rddir_cache(RTOV(rp));
3989
3990	/*
3991	 * Free the symbolic link cache.
3992	 */
3993	if (contents != NULL) {
3994
3995		kmem_free((void *)contents, size);
3996	}
3997
3998	/*
3999	 * Free any cached ACL.
4000	 */
4001	if (vsp != NULL)
4002		nfs_acl_free(vsp);
4003
4004	/*
4005	 * Free any cached pathconf information.
4006	 */
4007	if (info != NULL)
4008		kmem_free(info, sizeof (*info));
4009
4010	return (1);
4011}
4012
4013static int
4014nfs_free_reclaim(void)
4015{
4016	int freed;
4017	rnode_t *rp;
4018
4019#ifdef DEBUG
4020	clstat_debug.f_reclaim.value.ui64++;
4021#endif
4022	freed = 0;
4023	mutex_enter(&rpfreelist_lock);
4024	rp = rpfreelist;
4025	if (rp != NULL) {
4026		do {
4027			if (nfs_free_data_reclaim(rp))
4028				freed = 1;
4029		} while ((rp = rp->r_freef) != rpfreelist);
4030	}
4031	mutex_exit(&rpfreelist_lock);
4032	return (freed);
4033}
4034
4035static int
4036nfs_active_reclaim(void)
4037{
4038	int freed;
4039	int index;
4040	rnode_t *rp;
4041
4042#ifdef DEBUG
4043	clstat_debug.a_reclaim.value.ui64++;
4044#endif
4045	freed = 0;
4046	for (index = 0; index < rtablesize; index++) {
4047		rw_enter(&rtable[index].r_lock, RW_READER);
4048		for (rp = rtable[index].r_hashf;
4049		    rp != (rnode_t *)(&rtable[index]);
4050		    rp = rp->r_hashf) {
4051			if (nfs_active_data_reclaim(rp))
4052				freed = 1;
4053		}
4054		rw_exit(&rtable[index].r_lock);
4055	}
4056	return (freed);
4057}
4058
4059static int
4060nfs_rnode_reclaim(void)
4061{
4062	int freed;
4063	rnode_t *rp;
4064	vnode_t *vp;
4065
4066#ifdef DEBUG
4067	clstat_debug.r_reclaim.value.ui64++;
4068#endif
4069	freed = 0;
4070	mutex_enter(&rpfreelist_lock);
4071	while ((rp = rpfreelist) != NULL) {
4072		rp_rmfree(rp);
4073		mutex_exit(&rpfreelist_lock);
4074		if (rp->r_flags & RHASHED) {
4075			vp = RTOV(rp);
4076			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4077			mutex_enter(&vp->v_lock);
4078			if (vp->v_count > 1) {
4079				vp->v_count--;
4080				mutex_exit(&vp->v_lock);
4081				rw_exit(&rp->r_hashq->r_lock);
4082				mutex_enter(&rpfreelist_lock);
4083				continue;
4084			}
4085			mutex_exit(&vp->v_lock);
4086			rp_rmhash_locked(rp);
4087			rw_exit(&rp->r_hashq->r_lock);
4088		}
4089		/*
4090		 * This call to rp_addfree will end up destroying the
4091		 * rnode, but in a safe way with the appropriate set
4092		 * of checks done.
4093		 */
4094		rp_addfree(rp, CRED());
4095		mutex_enter(&rpfreelist_lock);
4096	}
4097	mutex_exit(&rpfreelist_lock);
4098	return (freed);
4099}
4100
4101/*ARGSUSED*/
4102static void
4103nfs_reclaim(void *cdrarg)
4104{
4105
4106#ifdef DEBUG
4107	clstat_debug.reclaim.value.ui64++;
4108#endif
4109	if (nfs_free_reclaim())
4110		return;
4111
4112	if (nfs_active_reclaim())
4113		return;
4114
4115	(void) nfs_rnode_reclaim();
4116}
4117
4118/*
4119 * NFS client failover support
4120 *
4121 * Routines to copy filehandles
4122 */
4123void
4124nfscopyfh(caddr_t fhp, vnode_t *vp)
4125{
4126	fhandle_t *dest = (fhandle_t *)fhp;
4127
4128	if (dest != NULL)
4129		*dest = *VTOFH(vp);
4130}
4131
4132void
4133nfs3copyfh(caddr_t fhp, vnode_t *vp)
4134{
4135	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4136
4137	if (dest != NULL)
4138		*dest = *VTOFH3(vp);
4139}
4140
4141/*
4142 * NFS client failover support
4143 *
4144 * failover_safe() will test various conditions to ensure that
4145 * failover is permitted for this vnode.  It will be denied
4146 * if:
4147 *	1) the operation in progress does not support failover (NULL fi)
4148 *	2) there are no available replicas (NULL mi_servers->sv_next)
4149 *	3) any locks are outstanding on this file
4150 */
4151static int
4152failover_safe(failinfo_t *fi)
4153{
4154
4155	/*
4156	 * Does this op permit failover?
4157	 */
4158	if (fi == NULL || fi->vp == NULL)
4159		return (0);
4160
4161	/*
4162	 * Are there any alternates to failover to?
4163	 */
4164	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4165		return (0);
4166
4167	/*
4168	 * Disable check; we've forced local locking
4169	 *
4170	 * if (flk_has_remote_locks(fi->vp))
4171	 *	return (0);
4172	 */
4173
4174	/*
4175	 * If we have no partial path, we can't do anything
4176	 */
4177	if (VTOR(fi->vp)->r_path == NULL)
4178		return (0);
4179
4180	return (1);
4181}
4182
4183#include <sys/thread.h>
4184
4185/*
4186 * NFS client failover support
4187 *
4188 * failover_newserver() will start a search for a new server,
4189 * preferably by starting an async thread to do the work.  If
4190 * someone is already doing this (recognizable by MI_BINDINPROG
4191 * being set), it will simply return and the calling thread
4192 * will queue on the mi_failover_cv condition variable.
4193 */
4194static void
4195failover_newserver(mntinfo_t *mi)
4196{
4197	/*
4198	 * Check if someone else is doing this already
4199	 */
4200	mutex_enter(&mi->mi_lock);
4201	if (mi->mi_flags & MI_BINDINPROG) {
4202		mutex_exit(&mi->mi_lock);
4203		return;
4204	}
4205	mi->mi_flags |= MI_BINDINPROG;
4206
4207	/*
4208	 * Need to hold the vfs struct so that it can't be released
4209	 * while the failover thread is selecting a new server.
4210	 */
4211	VFS_HOLD(mi->mi_vfsp);
4212
4213	/*
4214	 * Start a thread to do the real searching.
4215	 */
4216	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4217
4218	mutex_exit(&mi->mi_lock);
4219}
4220
4221/*
4222 * NFS client failover support
4223 *
4224 * failover_thread() will find a new server to replace the one
4225 * currently in use, wake up other threads waiting on this mount
4226 * point, and die.  It will start at the head of the server list
4227 * and poll servers until it finds one with an NFS server which is
4228 * registered and responds to a NULL procedure ping.
4229 *
4230 * XXX failover_thread is unsafe within the scope of the
4231 * present model defined for cpr to suspend the system.
4232 * Specifically, over-the-wire calls made by the thread
4233 * are unsafe. The thread needs to be reevaluated in case of
4234 * future updates to the cpr suspend model.
4235 */
4236static void
4237failover_thread(mntinfo_t *mi)
4238{
4239	servinfo_t *svp = NULL;
4240	CLIENT *cl;
4241	enum clnt_stat status;
4242	struct timeval tv;
4243	int error;
4244	int oncethru = 0;
4245	callb_cpr_t cprinfo;
4246	rnode_t *rp;
4247	int index;
4248	char *srvnames;
4249	size_t srvnames_len;
4250	struct nfs_clnt *nfscl = NULL;
4251	zoneid_t zoneid = getzoneid();
4252
4253#ifdef DEBUG
4254	/*
4255	 * This is currently only needed to access counters which exist on
4256	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4257	 * on non-DEBUG kernels.
4258	 */
4259	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4260	ASSERT(nfscl != NULL);
4261#endif
4262
4263	/*
4264	 * Its safe to piggyback on the mi_lock since failover_newserver()
4265	 * code guarantees that there will be only one failover thread
4266	 * per mountinfo at any instance.
4267	 */
4268	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4269	    "failover_thread");
4270
4271	mutex_enter(&mi->mi_lock);
4272	while (mi->mi_readers) {
4273		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4274		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4275		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4276	}
4277	mutex_exit(&mi->mi_lock);
4278
4279	tv.tv_sec = 2;
4280	tv.tv_usec = 0;
4281
4282	/*
4283	 * Ping the null NFS procedure of every server in
4284	 * the list until one responds.  We always start
4285	 * at the head of the list and always skip the one
4286	 * that is current, since it's caused us a problem.
4287	 */
4288	while (svp == NULL) {
4289		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4290			if (!oncethru && svp == mi->mi_curr_serv)
4291				continue;
4292
4293			/*
4294			 * If the file system was forcibly umounted
4295			 * while trying to do a failover, then just
4296			 * give up on the failover.  It won't matter
4297			 * what the server is.
4298			 */
4299			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4300				svp = NULL;
4301				goto done;
4302			}
4303
4304			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4305			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4306			if (error)
4307				continue;
4308
4309			if (!(mi->mi_flags & MI_INT))
4310				cl->cl_nosignal = TRUE;
4311			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4312			    xdr_void, NULL, tv);
4313			if (!(mi->mi_flags & MI_INT))
4314				cl->cl_nosignal = FALSE;
4315			AUTH_DESTROY(cl->cl_auth);
4316			CLNT_DESTROY(cl);
4317			if (status == RPC_SUCCESS) {
4318				if (svp == mi->mi_curr_serv) {
4319#ifdef DEBUG
4320					zcmn_err(zoneid, CE_NOTE,
4321			"NFS%d: failing over: selecting original server %s",
4322					    mi->mi_vers, svp->sv_hostname);
4323#else
4324					zcmn_err(zoneid, CE_NOTE,
4325			"NFS: failing over: selecting original server %s",
4326					    svp->sv_hostname);
4327#endif
4328				} else {
4329#ifdef DEBUG
4330					zcmn_err(zoneid, CE_NOTE,
4331				    "NFS%d: failing over from %s to %s",
4332					    mi->mi_vers,
4333					    mi->mi_curr_serv->sv_hostname,
4334					    svp->sv_hostname);
4335#else
4336					zcmn_err(zoneid, CE_NOTE,
4337				    "NFS: failing over from %s to %s",
4338					    mi->mi_curr_serv->sv_hostname,
4339					    svp->sv_hostname);
4340#endif
4341				}
4342				break;
4343			}
4344		}
4345
4346		if (svp == NULL) {
4347			if (!oncethru) {
4348				srvnames = nfs_getsrvnames(mi, &srvnames_len);
4349#ifdef DEBUG
4350				zprintf(zoneid,
4351				    "NFS%d servers %s not responding "
4352				    "still trying\n", mi->mi_vers, srvnames);
4353#else
4354				zprintf(zoneid, "NFS servers %s not responding "
4355				    "still trying\n", srvnames);
4356#endif
4357				oncethru = 1;
4358			}
4359			mutex_enter(&mi->mi_lock);
4360			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4361			mutex_exit(&mi->mi_lock);
4362			delay(hz);
4363			mutex_enter(&mi->mi_lock);
4364			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4365			mutex_exit(&mi->mi_lock);
4366		}
4367	}
4368
4369	if (oncethru) {
4370#ifdef DEBUG
4371		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4372#else
4373		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4374#endif
4375	}
4376
4377	if (svp != mi->mi_curr_serv) {
4378		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4379		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4380		rw_enter(&rtable[index].r_lock, RW_WRITER);
4381		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4382		    mi->mi_vfsp);
4383		if (rp != NULL) {
4384			if (rp->r_flags & RHASHED)
4385				rp_rmhash_locked(rp);
4386			rw_exit(&rtable[index].r_lock);
4387			rp->r_server = svp;
4388			rp->r_fh = svp->sv_fhandle;
4389			(void) nfs_free_data_reclaim(rp);
4390			index = rtablehash(&rp->r_fh);
4391			rp->r_hashq = &rtable[index];
4392			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4393			vn_exists(RTOV(rp));
4394			rp_addhash(rp);
4395			rw_exit(&rp->r_hashq->r_lock);
4396			VN_RELE(RTOV(rp));
4397		} else
4398			rw_exit(&rtable[index].r_lock);
4399	}
4400
4401done:
4402	if (oncethru)
4403		kmem_free(srvnames, srvnames_len);
4404	mutex_enter(&mi->mi_lock);
4405	mi->mi_flags &= ~MI_BINDINPROG;
4406	if (svp != NULL) {
4407		mi->mi_curr_serv = svp;
4408		mi->mi_failover++;
4409#ifdef DEBUG
4410	nfscl->nfscl_stat.failover.value.ui64++;
4411#endif
4412	}
4413	cv_broadcast(&mi->mi_failover_cv);
4414	CALLB_CPR_EXIT(&cprinfo);
4415	VFS_RELE(mi->mi_vfsp);
4416	zthread_exit();
4417	/* NOTREACHED */
4418}
4419
4420/*
4421 * NFS client failover support
4422 *
4423 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4424 * is cleared, meaning that failover is complete.  Called with
4425 * mi_lock mutex held.
4426 */
4427static int
4428failover_wait(mntinfo_t *mi)
4429{
4430	k_sigset_t smask;
4431
4432	/*
4433	 * If someone else is hunting for a living server,
4434	 * sleep until it's done.  After our sleep, we may
4435	 * be bound to the right server and get off cheaply.
4436	 */
4437	while (mi->mi_flags & MI_BINDINPROG) {
4438		/*
4439		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4440		 * and SIGTERM. (Preserving the existing masks).
4441		 * Mask out SIGINT if mount option nointr is specified.
4442		 */
4443		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4444		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4445			/*
4446			 * restore original signal mask
4447			 */
4448			sigunintr(&smask);
4449			return (EINTR);
4450		}
4451		/*
4452		 * restore original signal mask
4453		 */
4454		sigunintr(&smask);
4455	}
4456	return (0);
4457}
4458
4459/*
4460 * NFS client failover support
4461 *
4462 * failover_remap() will do a partial pathname lookup and find the
4463 * desired vnode on the current server.  The interim vnode will be
4464 * discarded after we pilfer the new filehandle.
4465 *
4466 * Side effects:
4467 * - This routine will also update the filehandle in the args structure
4468 *    pointed to by the fi->fhp pointer if it is non-NULL.
4469 */
4470
4471static int
4472failover_remap(failinfo_t *fi)
4473{
4474	vnode_t *vp, *nvp, *rootvp;
4475	rnode_t *rp, *nrp;
4476	mntinfo_t *mi;
4477	int error;
4478#ifdef DEBUG
4479	struct nfs_clnt *nfscl;
4480
4481	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4482	ASSERT(nfscl != NULL);
4483#endif
4484	/*
4485	 * Sanity check
4486	 */
4487	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4488		return (EINVAL);
4489	vp = fi->vp;
4490	rp = VTOR(vp);
4491	mi = VTOMI(vp);
4492
4493	if (!(vp->v_flag & VROOT)) {
4494		/*
4495		 * Given the root fh, use the path stored in
4496		 * the rnode to find the fh for the new server.
4497		 */
4498		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4499		if (error)
4500			return (error);
4501
4502		error = failover_lookup(rp->r_path, rootvp,
4503		    fi->lookupproc, fi->xattrdirproc, &nvp);
4504
4505		VN_RELE(rootvp);
4506
4507		if (error)
4508			return (error);
4509
4510		/*
4511		 * If we found the same rnode, we're done now
4512		 */
4513		if (nvp == vp) {
4514			/*
4515			 * Failed and the new server may physically be same
4516			 * OR may share a same disk subsystem. In this case
4517			 * file handle for a particular file path is not going
4518			 * to change, given the same filehandle lookup will
4519			 * always locate the same rnode as the existing one.
4520			 * All we might need to do is to update the r_server
4521			 * with the current servinfo.
4522			 */
4523			if (!VALID_FH(fi)) {
4524				rp->r_server = mi->mi_curr_serv;
4525			}
4526			VN_RELE(nvp);
4527			return (0);
4528		}
4529
4530		/*
4531		 * Try to make it so that no one else will find this
4532		 * vnode because it is just a temporary to hold the
4533		 * new file handle until that file handle can be
4534		 * copied to the original vnode/rnode.
4535		 */
4536		nrp = VTOR(nvp);
4537		mutex_enter(&mi->mi_remap_lock);
4538		/*
4539		 * Some other thread could have raced in here and could
4540		 * have done the remap for this particular rnode before
4541		 * this thread here. Check for rp->r_server and
4542		 * mi->mi_curr_serv and return if they are same.
4543		 */
4544		if (VALID_FH(fi)) {
4545			mutex_exit(&mi->mi_remap_lock);
4546			VN_RELE(nvp);
4547			return (0);
4548		}
4549
4550		if (nrp->r_flags & RHASHED)
4551			rp_rmhash(nrp);
4552
4553		/*
4554		 * As a heuristic check on the validity of the new
4555		 * file, check that the size and type match against
4556		 * that we remember from the old version.
4557		 */
4558		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4559			mutex_exit(&mi->mi_remap_lock);
4560			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4561			    "NFS replicas %s and %s: file %s not same.",
4562			    rp->r_server->sv_hostname,
4563			    nrp->r_server->sv_hostname, rp->r_path);
4564			VN_RELE(nvp);
4565			return (EINVAL);
4566		}
4567
4568		/*
4569		 * snarf the filehandle from the new rnode
4570		 * then release it, again while updating the
4571		 * hash queues for the rnode.
4572		 */
4573		if (rp->r_flags & RHASHED)
4574			rp_rmhash(rp);
4575		rp->r_server = mi->mi_curr_serv;
4576		rp->r_fh = nrp->r_fh;
4577		rp->r_hashq = nrp->r_hashq;
4578		/*
4579		 * Copy the attributes from the new rnode to the old
4580		 * rnode.  This will help to reduce unnecessary page
4581		 * cache flushes.
4582		 */
4583		rp->r_attr = nrp->r_attr;
4584		rp->r_attrtime = nrp->r_attrtime;
4585		rp->r_mtime = nrp->r_mtime;
4586		(void) nfs_free_data_reclaim(rp);
4587		nfs_setswaplike(vp, &rp->r_attr);
4588		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4589		rp_addhash(rp);
4590		rw_exit(&rp->r_hashq->r_lock);
4591		mutex_exit(&mi->mi_remap_lock);
4592		VN_RELE(nvp);
4593	}
4594
4595	/*
4596	 * Update successful failover remap count
4597	 */
4598	mutex_enter(&mi->mi_lock);
4599	mi->mi_remap++;
4600	mutex_exit(&mi->mi_lock);
4601#ifdef DEBUG
4602	nfscl->nfscl_stat.remap.value.ui64++;
4603#endif
4604
4605	/*
4606	 * If we have a copied filehandle to update, do it now.
4607	 */
4608	if (fi->fhp != NULL && fi->copyproc != NULL)
4609		(*fi->copyproc)(fi->fhp, vp);
4610
4611	return (0);
4612}
4613
4614/*
4615 * NFS client failover support
4616 *
4617 * We want a simple pathname lookup routine to parse the pieces
4618 * of path in rp->r_path.  We know that the path was a created
4619 * as rnodes were made, so we know we have only to deal with
4620 * paths that look like:
4621 *	dir1/dir2/dir3/file
4622 * Any evidence of anything like .., symlinks, and ENOTDIR
4623 * are hard errors, because they mean something in this filesystem
4624 * is different from the one we came from, or has changed under
4625 * us in some way.  If this is true, we want the failure.
4626 *
4627 * Extended attributes: if the filesystem is mounted with extended
4628 * attributes enabled (-o xattr), the attribute directory will be
4629 * represented in the r_path as the magic name XATTR_RPATH. So if
4630 * we see that name in the pathname, is must be because this node
4631 * is an extended attribute.  Therefore, look it up that way.
4632 */
4633static int
4634failover_lookup(char *path, vnode_t *root,
4635    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4636	vnode_t *, cred_t *, int),
4637    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4638    vnode_t **new)
4639{
4640	vnode_t *dvp, *nvp;
4641	int error = EINVAL;
4642	char *s, *p, *tmppath;
4643	size_t len;
4644	mntinfo_t *mi;
4645	bool_t xattr;
4646
4647	/* Make local copy of path */
4648	len = strlen(path) + 1;
4649	tmppath = kmem_alloc(len, KM_SLEEP);
4650	(void) strcpy(tmppath, path);
4651	s = tmppath;
4652
4653	dvp = root;
4654	VN_HOLD(dvp);
4655	mi = VTOMI(root);
4656	xattr = mi->mi_flags & MI_EXTATTR;
4657
4658	do {
4659		p = strchr(s, '/');
4660		if (p != NULL)
4661			*p = '\0';
4662		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4663			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4664			    RFSCALL_SOFT);
4665		} else {
4666			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4667			    CRED(), RFSCALL_SOFT);
4668		}
4669		if (p != NULL)
4670			*p++ = '/';
4671		if (error) {
4672			VN_RELE(dvp);
4673			kmem_free(tmppath, len);
4674			return (error);
4675		}
4676		s = p;
4677		VN_RELE(dvp);
4678		dvp = nvp;
4679	} while (p != NULL);
4680
4681	if (nvp != NULL && new != NULL)
4682		*new = nvp;
4683	kmem_free(tmppath, len);
4684	return (0);
4685}
4686
4687/*
4688 * NFS client failover support
4689 *
4690 * sv_free() frees the malloc'd portion of a "servinfo_t".
4691 */
4692void
4693sv_free(servinfo_t *svp)
4694{
4695	servinfo_t *next;
4696	struct knetconfig *knconf;
4697
4698	while (svp != NULL) {
4699		next = svp->sv_next;
4700		if (svp->sv_secdata)
4701			sec_clnt_freeinfo(svp->sv_secdata);
4702		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4703			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4704		knconf = svp->sv_knconf;
4705		if (knconf != NULL) {
4706			if (knconf->knc_protofmly != NULL)
4707				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4708			if (knconf->knc_proto != NULL)
4709				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4710			kmem_free(knconf, sizeof (*knconf));
4711		}
4712		knconf = svp->sv_origknconf;
4713		if (knconf != NULL) {
4714			if (knconf->knc_protofmly != NULL)
4715				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4716			if (knconf->knc_proto != NULL)
4717				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4718			kmem_free(knconf, sizeof (*knconf));
4719		}
4720		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4721			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4722		mutex_destroy(&svp->sv_lock);
4723		kmem_free(svp, sizeof (*svp));
4724		svp = next;
4725	}
4726}
4727
4728/*
4729 * Only can return non-zero if intr != 0.
4730 */
4731int
4732nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4733{
4734
4735	mutex_enter(&l->lock);
4736
4737	/*
4738	 * If this is a nested enter, then allow it.  There
4739	 * must be as many exits as enters through.
4740	 */
4741	if (l->owner == curthread) {
4742		/* lock is held for writing by current thread */
4743		ASSERT(rw == RW_READER || rw == RW_WRITER);
4744		l->count--;
4745	} else if (rw == RW_READER) {
4746		/*
4747		 * While there is a writer active or writers waiting,
4748		 * then wait for them to finish up and move on.  Then,
4749		 * increment the count to indicate that a reader is
4750		 * active.
4751		 */
4752		while (l->count < 0 || l->waiters > 0) {
4753			if (intr) {
4754				klwp_t *lwp = ttolwp(curthread);
4755
4756				if (lwp != NULL)
4757					lwp->lwp_nostop++;
4758				if (!cv_wait_sig(&l->cv, &l->lock)) {
4759					if (lwp != NULL)
4760						lwp->lwp_nostop--;
4761					mutex_exit(&l->lock);
4762					return (EINTR);
4763				}
4764				if (lwp != NULL)
4765					lwp->lwp_nostop--;
4766			} else
4767				cv_wait(&l->cv, &l->lock);
4768		}
4769		ASSERT(l->count < INT_MAX);
4770#ifdef	DEBUG
4771		if ((l->count % 10000) == 9999)
4772			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4773			    "rwlock @ %p\n", l->count, (void *)&l);
4774#endif
4775		l->count++;
4776	} else {
4777		ASSERT(rw == RW_WRITER);
4778		/*
4779		 * While there are readers active or a writer
4780		 * active, then wait for all of the readers
4781		 * to finish or for the writer to finish.
4782		 * Then, set the owner field to curthread and
4783		 * decrement count to indicate that a writer
4784		 * is active.
4785		 */
4786		while (l->count > 0 || l->owner != NULL) {
4787			l->waiters++;
4788			if (intr) {
4789				klwp_t *lwp = ttolwp(curthread);
4790
4791				if (lwp != NULL)
4792					lwp->lwp_nostop++;
4793				if (!cv_wait_sig(&l->cv, &l->lock)) {
4794					if (lwp != NULL)
4795						lwp->lwp_nostop--;
4796					l->waiters--;
4797					cv_broadcast(&l->cv);
4798					mutex_exit(&l->lock);
4799					return (EINTR);
4800				}
4801				if (lwp != NULL)
4802					lwp->lwp_nostop--;
4803			} else
4804				cv_wait(&l->cv, &l->lock);
4805			l->waiters--;
4806		}
4807		l->owner = curthread;
4808		l->count--;
4809	}
4810
4811	mutex_exit(&l->lock);
4812
4813	return (0);
4814}
4815
4816/*
4817 * If the lock is available, obtain it and return non-zero.  If there is
4818 * already a conflicting lock, return 0 immediately.
4819 */
4820
4821int
4822nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4823{
4824	mutex_enter(&l->lock);
4825
4826	/*
4827	 * If this is a nested enter, then allow it.  There
4828	 * must be as many exits as enters through.
4829	 */
4830	if (l->owner == curthread) {
4831		/* lock is held for writing by current thread */
4832		ASSERT(rw == RW_READER || rw == RW_WRITER);
4833		l->count--;
4834	} else if (rw == RW_READER) {
4835		/*
4836		 * If there is a writer active or writers waiting, deny the
4837		 * lock.  Otherwise, bump the count of readers.
4838		 */
4839		if (l->count < 0 || l->waiters > 0) {
4840			mutex_exit(&l->lock);
4841			return (0);
4842		}
4843		l->count++;
4844	} else {
4845		ASSERT(rw == RW_WRITER);
4846		/*
4847		 * If there are readers active or a writer active, deny the
4848		 * lock.  Otherwise, set the owner field to curthread and
4849		 * decrement count to indicate that a writer is active.
4850		 */
4851		if (l->count > 0 || l->owner != NULL) {
4852			mutex_exit(&l->lock);
4853			return (0);
4854		}
4855		l->owner = curthread;
4856		l->count--;
4857	}
4858
4859	mutex_exit(&l->lock);
4860
4861	return (1);
4862}
4863
4864void
4865nfs_rw_exit(nfs_rwlock_t *l)
4866{
4867
4868	mutex_enter(&l->lock);
4869	/*
4870	 * If this is releasing a writer lock, then increment count to
4871	 * indicate that there is one less writer active.  If this was
4872	 * the last of possibly nested writer locks, then clear the owner
4873	 * field as well to indicate that there is no writer active
4874	 * and wakeup any possible waiting writers or readers.
4875	 *
4876	 * If releasing a reader lock, then just decrement count to
4877	 * indicate that there is one less reader active.  If this was
4878	 * the last active reader and there are writer(s) waiting,
4879	 * then wake up the first.
4880	 */
4881	if (l->owner != NULL) {
4882		ASSERT(l->owner == curthread);
4883		l->count++;
4884		if (l->count == 0) {
4885			l->owner = NULL;
4886			cv_broadcast(&l->cv);
4887		}
4888	} else {
4889		ASSERT(l->count > 0);
4890		l->count--;
4891		if (l->count == 0 && l->waiters > 0)
4892			cv_broadcast(&l->cv);
4893	}
4894	mutex_exit(&l->lock);
4895}
4896
4897int
4898nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4899{
4900
4901	if (rw == RW_READER)
4902		return (l->count > 0);
4903	ASSERT(rw == RW_WRITER);
4904	return (l->count < 0);
4905}
4906
4907/* ARGSUSED */
4908void
4909nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4910{
4911
4912	l->count = 0;
4913	l->waiters = 0;
4914	l->owner = NULL;
4915	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4916	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4917}
4918
4919void
4920nfs_rw_destroy(nfs_rwlock_t *l)
4921{
4922
4923	mutex_destroy(&l->lock);
4924	cv_destroy(&l->cv);
4925}
4926
4927int
4928nfs3_rddir_compar(const void *x, const void *y)
4929{
4930	rddir_cache *a = (rddir_cache *)x;
4931	rddir_cache *b = (rddir_cache *)y;
4932
4933	if (a->nfs3_cookie == b->nfs3_cookie) {
4934		if (a->buflen == b->buflen)
4935			return (0);
4936		if (a->buflen < b->buflen)
4937			return (-1);
4938		return (1);
4939	}
4940
4941	if (a->nfs3_cookie < b->nfs3_cookie)
4942		return (-1);
4943
4944	return (1);
4945}
4946
4947int
4948nfs_rddir_compar(const void *x, const void *y)
4949{
4950	rddir_cache *a = (rddir_cache *)x;
4951	rddir_cache *b = (rddir_cache *)y;
4952
4953	if (a->nfs_cookie == b->nfs_cookie) {
4954		if (a->buflen == b->buflen)
4955			return (0);
4956		if (a->buflen < b->buflen)
4957			return (-1);
4958		return (1);
4959	}
4960
4961	if (a->nfs_cookie < b->nfs_cookie)
4962		return (-1);
4963
4964	return (1);
4965}
4966
4967static char *
4968nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4969{
4970	servinfo_t *s;
4971	char *srvnames;
4972	char *namep;
4973	size_t length;
4974
4975	/*
4976	 * Calculate the length of the string required to hold all
4977	 * of the server names plus either a comma or a null
4978	 * character following each individual one.
4979	 */
4980	length = 0;
4981	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4982		length += s->sv_hostnamelen;
4983
4984	srvnames = kmem_alloc(length, KM_SLEEP);
4985
4986	namep = srvnames;
4987	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4988		(void) strcpy(namep, s->sv_hostname);
4989		namep += s->sv_hostnamelen - 1;
4990		*namep++ = ',';
4991	}
4992	*--namep = '\0';
4993
4994	*len = length;
4995
4996	return (srvnames);
4997}
4998
4999/*
5000 * These two functions are temporary and designed for the upgrade-workaround
5001 * only.  They cannot be used for general zone-crossing NFS client support, and
5002 * will be removed shortly.
5003 *
5004 * When the workaround is enabled, all NFS traffic is forced into the global
5005 * zone.  These functions are called when the code needs to refer to the state
5006 * of the underlying network connection.  They're not called when the function
5007 * needs to refer to the state of the process that invoked the system call.
5008 * (E.g., when checking whether the zone is shutting down during the mount()
5009 * call.)
5010 */
5011
5012struct zone *
5013nfs_zone(void)
5014{
5015	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5016}
5017
5018zoneid_t
5019nfs_zoneid(void)
5020{
5021	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5022}
5023
5024/*
5025 * nfs_mount_label_policy:
5026 *	Determine whether the mount is allowed according to MAC check,
5027 *	by comparing (where appropriate) label of the remote server
5028 *	against the label of the zone being mounted into.
5029 *
5030 *	Returns:
5031 *		 0 :	access allowed
5032 *		-1 :	read-only access allowed (i.e., read-down)
5033 *		>0 :	error code, such as EACCES
5034 */
5035int
5036nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5037    struct knetconfig *knconf, cred_t *cr)
5038{
5039	int		addr_type;
5040	void		*ipaddr;
5041	bslabel_t	*server_sl, *mntlabel;
5042	zone_t		*mntzone = NULL;
5043	ts_label_t	*zlabel;
5044	tsol_tpc_t	*tp;
5045	ts_label_t	*tsl = NULL;
5046	int		retv;
5047
5048	/*
5049	 * Get the zone's label.  Each zone on a labeled system has a label.
5050	 */
5051	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5052	zlabel = mntzone->zone_slabel;
5053	ASSERT(zlabel != NULL);
5054	label_hold(zlabel);
5055
5056	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5057		addr_type = IPV4_VERSION;
5058		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5059	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5060		addr_type = IPV6_VERSION;
5061		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5062	} else {
5063		retv = 0;
5064		goto out;
5065	}
5066
5067	retv = EACCES;				/* assume the worst */
5068
5069	/*
5070	 * Next, get the assigned label of the remote server.
5071	 */
5072	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5073	if (tp == NULL)
5074		goto out;			/* error getting host entry */
5075
5076	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5077		goto rel_tpc;			/* invalid domain */
5078	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5079	    (tp->tpc_tp.host_type != UNLABELED))
5080		goto rel_tpc;			/* invalid hosttype */
5081
5082	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5083		tsl = getflabel_cipso(vfsp);
5084		if (tsl == NULL)
5085			goto rel_tpc;		/* error getting server lbl */
5086
5087		server_sl = label2bslabel(tsl);
5088	} else {	/* UNLABELED */
5089		server_sl = &tp->tpc_tp.tp_def_label;
5090	}
5091
5092	mntlabel = label2bslabel(zlabel);
5093
5094	/*
5095	 * Now compare labels to complete the MAC check.  If the labels
5096	 * are equal or if the requestor is in the global zone and has
5097	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5098	 * mounts into the global zone itself; restrict these to
5099	 * read-only.)
5100	 *
5101	 * If the requestor is in some other zone, but his label
5102	 * dominates the server, then allow read-down.
5103	 *
5104	 * Otherwise, access is denied.
5105	 */
5106	if (blequal(mntlabel, server_sl) ||
5107	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5108	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5109		if ((mntzone == global_zone) ||
5110		    !blequal(mntlabel, server_sl))
5111			retv = -1;		/* read-only */
5112		else
5113			retv = 0;		/* access OK */
5114	} else if (bldominates(mntlabel, server_sl)) {
5115		retv = -1;			/* read-only */
5116	} else {
5117		retv = EACCES;
5118	}
5119
5120	if (tsl != NULL)
5121		label_rele(tsl);
5122
5123rel_tpc:
5124	TPC_RELE(tp);
5125out:
5126	if (mntzone)
5127		zone_rele(mntzone);
5128	label_rele(zlabel);
5129	return (retv);
5130}
5131
5132boolean_t
5133nfs_has_ctty(void)
5134{
5135	boolean_t rv;
5136	mutex_enter(&curproc->p_splock);
5137	rv = (curproc->p_sessp->s_vp != NULL);
5138	mutex_exit(&curproc->p_splock);
5139	return (rv);
5140}
5141
5142/*
5143 * See if xattr directory to see if it has any generic user attributes
5144 */
5145int
5146do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5147{
5148	struct uio uio;
5149	struct iovec iov;
5150	char *dbuf;
5151	struct dirent64 *dp;
5152	size_t dlen = 8 * 1024;
5153	size_t dbuflen;
5154	int eof = 0;
5155	int error;
5156
5157	*valp = 0;
5158	dbuf = kmem_alloc(dlen, KM_SLEEP);
5159	uio.uio_iov = &iov;
5160	uio.uio_iovcnt = 1;
5161	uio.uio_segflg = UIO_SYSSPACE;
5162	uio.uio_fmode = 0;
5163	uio.uio_extflg = UIO_COPY_CACHED;
5164	uio.uio_loffset = 0;
5165	uio.uio_resid = dlen;
5166	iov.iov_base = dbuf;
5167	iov.iov_len = dlen;
5168	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5169	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5170	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5171
5172	dbuflen = dlen - uio.uio_resid;
5173
5174	if (error || dbuflen == 0) {
5175		kmem_free(dbuf, dlen);
5176		return (error);
5177	}
5178
5179	dp = (dirent64_t *)dbuf;
5180
5181	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5182		if (strcmp(dp->d_name, ".") == 0 ||
5183		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5184		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5185		    VIEW_READONLY) == 0) {
5186			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5187			continue;
5188		}
5189
5190		*valp = 1;
5191		break;
5192	}
5193	kmem_free(dbuf, dlen);
5194	return (0);
5195}
5196