nfs_subr.c revision 2712:f74a135872bc
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 *
25 *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
26 *	All rights reserved.
27 */
28
29#pragma ident	"%Z%%M%	%I%	%E% SMI"
30
31#include <sys/param.h>
32#include <sys/types.h>
33#include <sys/systm.h>
34#include <sys/cred_impl.h>
35#include <sys/proc.h>
36#include <sys/user.h>
37#include <sys/time.h>
38#include <sys/buf.h>
39#include <sys/vfs.h>
40#include <sys/vnode.h>
41#include <sys/socket.h>
42#include <sys/uio.h>
43#include <sys/tiuser.h>
44#include <sys/swap.h>
45#include <sys/errno.h>
46#include <sys/debug.h>
47#include <sys/kmem.h>
48#include <sys/kstat.h>
49#include <sys/cmn_err.h>
50#include <sys/vtrace.h>
51#include <sys/session.h>
52#include <sys/dnlc.h>
53#include <sys/bitmap.h>
54#include <sys/acl.h>
55#include <sys/ddi.h>
56#include <sys/pathname.h>
57#include <sys/flock.h>
58#include <sys/dirent.h>
59#include <sys/flock.h>
60#include <sys/callb.h>
61#include <sys/atomic.h>
62#include <sys/list.h>
63#include <sys/tsol/tnet.h>
64#include <sys/priv.h>
65
66#include <inet/ip6.h>
67
68#include <rpc/types.h>
69#include <rpc/xdr.h>
70#include <rpc/auth.h>
71#include <rpc/clnt.h>
72
73#include <nfs/nfs.h>
74#include <nfs/nfs4.h>
75#include <nfs/nfs_clnt.h>
76#include <nfs/rnode.h>
77#include <nfs/nfs_acl.h>
78
79/*
80 * The hash queues for the access to active and cached rnodes
81 * are organized as doubly linked lists.  A reader/writer lock
82 * for each hash bucket is used to control access and to synchronize
83 * lookups, additions, and deletions from the hash queue.
84 *
85 * The rnode freelist is organized as a doubly linked list with
86 * a head pointer.  Additions and deletions are synchronized via
87 * a single mutex.
88 *
89 * In order to add an rnode to the free list, it must be hashed into
90 * a hash queue and the exclusive lock to the hash queue be held.
91 * If an rnode is not hashed into a hash queue, then it is destroyed
92 * because it contains no valuable information about the file that
93 * can be reused.  The exclusive lock to the hash queue must be
94 * held in order to prevent a lookup in the hash queue from finding
95 * the rnode and using it and assuming that the rnode is not on the
96 * freelist.  The lookup in the hash queue will have the hash queue
97 * locked, either exclusive or shared.
98 *
99 * The vnode reference count for each rnode is not allowed to drop
100 * below 1.  This prevents external entities, such as the VM
101 * subsystem, from acquiring references to vnodes already on the
102 * freelist and then trying to place them back on the freelist
103 * when their reference is released.  This means that when an
104 * rnode is looked up in the hash queues, either the rnode
105 * is removed from the freelist and that reference is transferred to
106 * the new reference, or the vnode reference count must be incremented
107 * accordingly.  The mutex for the freelist must be held in order to
108 * accurately test to see if the rnode is on the freelist or not.
109 * The hash queue lock might be held shared and it is possible that
110 * two different threads may race to remove the rnode from the
111 * freelist.  This race can be resolved by holding the mutex for the
112 * freelist.  Please note that the mutex for the freelist does not
113 * need to be held if the rnode is not on the freelist.  It cannot be
114 * placed on the freelist due to the requirement that the thread
115 * putting the rnode on the freelist must hold the exclusive lock
116 * to the hash queue and the thread doing the lookup in the hash
117 * queue is holding either a shared or exclusive lock to the hash
118 * queue.
119 *
120 * The lock ordering is:
121 *
122 *	hash bucket lock -> vnode lock
123 *	hash bucket lock -> freelist lock
124 */
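/*
 * A minimal sketch of the ordering above (illustrative only, not lifted
 * from any one caller; "bucket" stands for the rnode's hash bucket and
 * the field names are assumed): a thread that must both unhash an rnode
 * and pull it off the freelist takes the hash bucket lock first, then
 * the freelist mutex, and releases them in the reverse order:
 *
 *	rw_enter(&bucket->r_lock, RW_WRITER);
 *	mutex_enter(&rpfreelist_lock);
 *	...
 *	mutex_exit(&rpfreelist_lock);
 *	rw_exit(&bucket->r_lock);
 */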
125static rhashq_t *rtable;
126
127static kmutex_t rpfreelist_lock;
128static rnode_t *rpfreelist = NULL;
129static long rnew = 0;
130long nrnode = 0;
131
132static int rtablesize;
133static int rtablemask;
134
135static int hashlen = 4;
136
137static struct kmem_cache *rnode_cache;
138
139/*
140 * Mutex to protect the following variables:
141 *	nfs_major
142 *	nfs_minor
143 */
144kmutex_t nfs_minor_lock;
145int nfs_major;
146int nfs_minor;
147
148/* Do we allow preepoch (negative) time values otw? */
149bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
150
151/*
152 * Access cache
153 */
154static acache_hash_t *acache;
155static long nacache;	/* used strictly to size the number of hash queues */
156
157static int acachesize;
158static int acachemask;
159static struct kmem_cache *acache_cache;
160
161/*
162 * Client side utilities
163 */
164
165/*
166 * client side statistics
167 */
168static const struct clstat clstat_tmpl = {
169	{ "calls",	KSTAT_DATA_UINT64 },
170	{ "badcalls",	KSTAT_DATA_UINT64 },
171	{ "clgets",	KSTAT_DATA_UINT64 },
172	{ "cltoomany",	KSTAT_DATA_UINT64 },
173#ifdef DEBUG
174	{ "clalloc",	KSTAT_DATA_UINT64 },
175	{ "noresponse",	KSTAT_DATA_UINT64 },
176	{ "failover",	KSTAT_DATA_UINT64 },
177	{ "remap",	KSTAT_DATA_UINT64 },
178#endif
179};
180
181/*
182 * The following are statistics that describe behavior of the system as a whole
183 * and don't correspond to any one particular zone.
184 */
185#ifdef DEBUG
186static struct clstat_debug {
187	kstat_named_t	nrnode;			/* number of allocated rnodes */
188	kstat_named_t	access;			/* size of access cache */
189	kstat_named_t	dirent;			/* size of readdir cache */
190	kstat_named_t	dirents;		/* size of readdir buf cache */
191	kstat_named_t	reclaim;		/* number of reclaims */
192	kstat_named_t	clreclaim;		/* number of cl reclaims */
193	kstat_named_t	f_reclaim;		/* number of free reclaims */
194	kstat_named_t	a_reclaim;		/* number of active reclaims */
195	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
196	kstat_named_t	rpath;			/* bytes used to store rpaths */
197} clstat_debug = {
198	{ "nrnode",	KSTAT_DATA_UINT64 },
199	{ "access",	KSTAT_DATA_UINT64 },
200	{ "dirent",	KSTAT_DATA_UINT64 },
201	{ "dirents",	KSTAT_DATA_UINT64 },
202	{ "reclaim",	KSTAT_DATA_UINT64 },
203	{ "clreclaim",	KSTAT_DATA_UINT64 },
204	{ "f_reclaim",	KSTAT_DATA_UINT64 },
205	{ "a_reclaim",	KSTAT_DATA_UINT64 },
206	{ "r_reclaim",	KSTAT_DATA_UINT64 },
207	{ "r_path",	KSTAT_DATA_UINT64 },
208};
209#endif	/* DEBUG */
210
211/*
212 * We keep a global list of per-zone client data, so we can clean up all zones
213 * if we get low on memory.
214 */
215static list_t nfs_clnt_list;
216static kmutex_t nfs_clnt_list_lock;
217static zone_key_t nfsclnt_zone_key;
218
219static struct kmem_cache *chtab_cache;
220
221/*
222 * Some servers do not properly update the attributes of the
223 * directory when changes are made.  To allow interoperability
224 * with these broken servers, the nfs_disable_rddir_cache
225 * parameter must be set in /etc/system
226 */
227int nfs_disable_rddir_cache = 0;
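/*
 * For example, the tunable can be enabled with an /etc/system entry of
 * the usual module:variable form shown below (illustrative; /etc/system
 * changes take effect after a reboot):
 *
 *	set nfs:nfs_disable_rddir_cache = 1
 */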
228
229int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
230		    struct chtab **);
231void		clfree(CLIENT *, struct chtab *);
232static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
233		    struct chtab **, struct nfs_clnt *);
234static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
235		    struct chtab **, struct nfs_clnt *);
236static void	clreclaim(void *);
237static int	nfs_feedback(int, int, mntinfo_t *);
238static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
239		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
240		    failinfo_t *);
241static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
242		    caddr_t, cred_t *, int *, int, failinfo_t *);
243static void	rinactive(rnode_t *, cred_t *);
244static int	rtablehash(nfs_fhandle *);
245static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
246		    struct vnodeops *,
247		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
248			cred_t *),
249		    int (*)(const void *, const void *), int *, cred_t *,
250		    char *, char *);
251static void	rp_rmfree(rnode_t *);
252static void	rp_addhash(rnode_t *);
253static void	rp_rmhash_locked(rnode_t *);
254static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
255static void	destroy_rnode(rnode_t *);
256static void	rddir_cache_free(rddir_cache *);
257static int	nfs_free_data_reclaim(rnode_t *);
258static int	nfs_active_data_reclaim(rnode_t *);
259static int	nfs_free_reclaim(void);
260static int	nfs_active_reclaim(void);
261static int	nfs_rnode_reclaim(void);
262static void	nfs_reclaim(void *);
263static int	failover_safe(failinfo_t *);
264static void	failover_newserver(mntinfo_t *mi);
265static void	failover_thread(mntinfo_t *mi);
266static int	failover_wait(mntinfo_t *);
267static int	failover_remap(failinfo_t *);
268static int	failover_lookup(char *, vnode_t *,
269		    int (*)(vnode_t *, char *, vnode_t **,
270			struct pathname *, int, vnode_t *, cred_t *, int),
271		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
272		    vnode_t **);
273static void	nfs_free_r_path(rnode_t *);
274static void	nfs_set_vroot(vnode_t *);
275static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
276
277/*
278 * from rpcsec module (common/rpcsec)
279 */
280extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
281extern void sec_clnt_freeh(AUTH *);
282extern void sec_clnt_freeinfo(struct sec_data *);
283
284/*
285 * used in mount policy
286 */
287extern ts_label_t *getflabel_cipso(vfs_t *);
288
289/*
290 * EIO or EINTR are not recoverable errors.
291 */
292#define	IS_RECOVERABLE_ERROR(error)	!((error == EINTR) || (error == EIO))
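/*
 * For example, ETIMEDOUT from failover_remap() counts as recoverable
 * (on a hard mount the call is retried against a new server), while
 * EINTR and EIO terminate the call.
 */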
293
294/*
295 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
296 */
297static int
298clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
299    struct chtab **chp, struct nfs_clnt *nfscl)
300{
301	struct chhead *ch, *newch;
302	struct chhead **plistp;
303	struct chtab *cp;
304	int error;
305	k_sigset_t smask;
306
307	if (newcl == NULL || chp == NULL || ci == NULL)
308		return (EINVAL);
309
310	*newcl = NULL;
311	*chp = NULL;
312
313	/*
314	 * Find an unused handle or create one
315	 */
316	newch = NULL;
317	nfscl->nfscl_stat.clgets.value.ui64++;
318top:
319	/*
320	 * Find the correct entry in the cache to check for free
321	 * client handles.  The search is based on the RPC program
322	 * number, program version number, dev_t for the transport
323	 * device, and the protocol family.
324	 */
325	mutex_enter(&nfscl->nfscl_chtable_lock);
326	plistp = &nfscl->nfscl_chtable;
327	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
328		if (ch->ch_prog == ci->cl_prog &&
329		    ch->ch_vers == ci->cl_vers &&
330		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
331		    (strcmp(ch->ch_protofmly,
332			svp->sv_knconf->knc_protofmly) == 0))
333			break;
334		plistp = &ch->ch_next;
335	}
336
337	/*
338	 * If we didn't find a cache entry for this quadruple, then
339	 * create one.  If we don't have one already preallocated,
340	 * then drop the cache lock, create one, and then start over.
341	 * If we did have a preallocated entry, then just add it to
342	 * the front of the list.
343	 */
344	if (ch == NULL) {
345		if (newch == NULL) {
346			mutex_exit(&nfscl->nfscl_chtable_lock);
347			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
348			newch->ch_timesused = 0;
349			newch->ch_prog = ci->cl_prog;
350			newch->ch_vers = ci->cl_vers;
351			newch->ch_dev = svp->sv_knconf->knc_rdev;
352			newch->ch_protofmly = kmem_alloc(
353			    strlen(svp->sv_knconf->knc_protofmly) + 1,
354			    KM_SLEEP);
355			(void) strcpy(newch->ch_protofmly,
356			    svp->sv_knconf->knc_protofmly);
357			newch->ch_list = NULL;
358			goto top;
359		}
360		ch = newch;
361		newch = NULL;
362		ch->ch_next = nfscl->nfscl_chtable;
363		nfscl->nfscl_chtable = ch;
364	/*
365	 * We found a cache entry, but if it isn't on the front of the
366	 * list, then move it to the front of the list to try to take
367	 * advantage of locality of operations.
368	 */
369	} else if (ch != nfscl->nfscl_chtable) {
370		*plistp = ch->ch_next;
371		ch->ch_next = nfscl->nfscl_chtable;
372		nfscl->nfscl_chtable = ch;
373	}
374
375	/*
376	 * If there was a free client handle cached, then remove it
377	 * from the list, init it, and use it.
378	 */
379	if (ch->ch_list != NULL) {
380		cp = ch->ch_list;
381		ch->ch_list = cp->ch_list;
382		mutex_exit(&nfscl->nfscl_chtable_lock);
383		if (newch != NULL) {
384			kmem_free(newch->ch_protofmly,
385			    strlen(newch->ch_protofmly) + 1);
386			kmem_free(newch, sizeof (*newch));
387		}
388		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
389		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
390		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
391		    &cp->ch_client->cl_auth);
392		if (error || cp->ch_client->cl_auth == NULL) {
393			CLNT_DESTROY(cp->ch_client);
394			kmem_cache_free(chtab_cache, cp);
395			return ((error != 0) ? error : EINTR);
396		}
397		ch->ch_timesused++;
398		*newcl = cp->ch_client;
399		*chp = cp;
400		return (0);
401	}
402
403	/*
404	 * There weren't any free client handles which fit, so allocate
405	 * a new one and use that.
406	 */
407#ifdef DEBUG
408	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
409#endif
410	mutex_exit(&nfscl->nfscl_chtable_lock);
411
412	nfscl->nfscl_stat.cltoomany.value.ui64++;
413	if (newch != NULL) {
414		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
415		kmem_free(newch, sizeof (*newch));
416	}
417
418	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
419	cp->ch_head = ch;
420
421	sigintr(&smask, (int)ci->cl_flags & MI_INT);
422	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
423	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
424	sigunintr(&smask);
425
426	if (error != 0) {
427		kmem_cache_free(chtab_cache, cp);
428#ifdef DEBUG
429		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
430#endif
431		/*
432		 * Warning is unnecessary if error is EINTR.
433		 */
434		if (error != EINTR) {
435			nfs_cmn_err(error, CE_WARN,
436			    "clget: couldn't create handle: %m\n");
437		}
438		return (error);
439	}
440	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
441	auth_destroy(cp->ch_client->cl_auth);
442	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
443	    &cp->ch_client->cl_auth);
444	if (error || cp->ch_client->cl_auth == NULL) {
445		CLNT_DESTROY(cp->ch_client);
446		kmem_cache_free(chtab_cache, cp);
447#ifdef DEBUG
448		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
449#endif
450		return ((error != 0) ? error : EINTR);
451	}
452	ch->ch_timesused++;
453	*newcl = cp->ch_client;
454	ASSERT(cp->ch_client->cl_nosignal == FALSE);
455	*chp = cp;
456	return (0);
457}
458
459int
460clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
461    struct chtab **chp)
462{
463	struct nfs_clnt *nfscl;
464
465	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
466	ASSERT(nfscl != NULL);
467
468	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
469}
470
471static int
472acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
473    struct chtab **chp, struct nfs_clnt *nfscl)
474{
475	clinfo_t ci;
476	int error;
477
478	/*
479	 * Set read buffer size to rsize
480	 * and add room for RPC headers.
481	 */
482	ci.cl_readsize = mi->mi_tsize;
483	if (ci.cl_readsize != 0)
484		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
485
486	/*
487	 * If this is a soft mount and the server is down, just try once;
488	 * that is, do not retransmit.
489	 */
490	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
491		ci.cl_retrans = 0;
492	else
493		ci.cl_retrans = mi->mi_retrans;
494
495	ci.cl_prog = NFS_ACL_PROGRAM;
496	ci.cl_vers = mi->mi_vers;
497	ci.cl_flags = mi->mi_flags;
498
499	/*
500	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
501	 * security flavor, the client tries to establish a security context
502	 * by contacting the server. If the connection is timed out or reset,
503	 * e.g. server reboot, we will try again.
504	 */
505	do {
506		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
507
508		if (error == 0)
509			break;
510
511		/*
512		 * For forced unmount or zone shutdown, bail out, no retry.
513		 */
514		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
515			error = EIO;
516			break;
517		}
518
519		/* do not retry for softmount */
520		if (!(mi->mi_flags & MI_HARD))
521			break;
522
523		/* let the caller deal with the failover case */
524		if (FAILOVER_MOUNT(mi))
525			break;
526
527	} while (error == ETIMEDOUT || error == ECONNRESET);
528
529	return (error);
530}
531
532static int
533nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
534    struct chtab **chp, struct nfs_clnt *nfscl)
535{
536	clinfo_t ci;
537	int error;
538
539	/*
540	 * Set read buffer size to rsize
541	 * and add room for RPC headers.
542	 */
543	ci.cl_readsize = mi->mi_tsize;
544	if (ci.cl_readsize != 0)
545		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
546
547	/*
548	 * If this is a soft mount and the server is down, just try once;
549	 * that is, do not retransmit.
550	 */
551	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
552		ci.cl_retrans = 0;
553	else
554		ci.cl_retrans = mi->mi_retrans;
555
556	ci.cl_prog = mi->mi_prog;
557	ci.cl_vers = mi->mi_vers;
558	ci.cl_flags = mi->mi_flags;
559
560	/*
561	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
562	 * security flavor, the client tries to establish a security context
563	 * by contacting the server. If the connection is timed out or reset,
564	 * e.g. server reboot, we will try again.
565	 */
566	do {
567		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
568
569		if (error == 0)
570			break;
571
572		/*
573		 * For forced unmount or zone shutdown, bail out, no retry.
574		 */
575		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
576			error = EIO;
577			break;
578		}
579
580		/* do not retry for softmount */
581		if (!(mi->mi_flags & MI_HARD))
582			break;
583
584		/* let the caller deal with the failover case */
585		if (FAILOVER_MOUNT(mi))
586			break;
587
588	} while (error == ETIMEDOUT || error == ECONNRESET);
589
590	return (error);
591}
592
593static void
594clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
595{
596	if (cl->cl_auth != NULL) {
597		sec_clnt_freeh(cl->cl_auth);
598		cl->cl_auth = NULL;
599	}
600
601	/*
602	 * Timestamp this cache entry so that we know when it was last
603	 * used.
604	 */
605	cp->ch_freed = gethrestime_sec();
606
607	/*
608	 * Add the free client handle to the front of the list.
609	 * This way, the list will be sorted in youngest to oldest
610	 * order.
611	 */
612	mutex_enter(&nfscl->nfscl_chtable_lock);
613	cp->ch_list = cp->ch_head->ch_list;
614	cp->ch_head->ch_list = cp;
615	mutex_exit(&nfscl->nfscl_chtable_lock);
616}
617
618void
619clfree(CLIENT *cl, struct chtab *cp)
620{
621	struct nfs_clnt *nfscl;
622
623	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
624	ASSERT(nfscl != NULL);
625
626	clfree_impl(cl, cp, nfscl);
627}
628
629#define	CL_HOLDTIME	60	/* time to hold client handles */
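/*
 * CL_HOLDTIME is in seconds (ch_freed is stamped with
 * gethrestime_sec() in clfree_impl()); when clreclaim() runs, handles
 * idle for longer than this are reclaimed by clreclaim_zone().
 */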
630
631static void
632clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
633{
634	struct chhead *ch;
635	struct chtab *cp;	/* list of objects that can be reclaimed */
636	struct chtab *cpe;
637	struct chtab *cpl;
638	struct chtab **cpp;
639#ifdef DEBUG
640	int n = 0;
641#endif
642
643	/*
644	 * Need to reclaim some memory, so step through the cache
645	 * looking through the lists for entries which can be freed.
646	 */
647	cp = NULL;
648
649	mutex_enter(&nfscl->nfscl_chtable_lock);
650
651	/*
652	 * Here we step through each non-NULL quadruple and start to
653	 * construct the reclaim list pointed to by cp.  Note that
654	 * cp will contain all eligible chtab entries.  When this traversal
655	 * completes, chtab entries from the last quadruple will be at the
656	 * front of cp and entries from previously inspected quadruples have
657	 * been appended to the rear of cp.
658	 */
659	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
660		if (ch->ch_list == NULL)
661			continue;
662		/*
663		 * Search each list for entries older than
664		 * cl_holdtime seconds.  The lists are maintained
665		 * in youngest to oldest order so that when the
666		 * first entry is found which is old enough, then
667		 * all of the rest of the entries on the list will
668		 * be old enough as well.
669		 */
670		cpl = ch->ch_list;
671		cpp = &ch->ch_list;
672		while (cpl != NULL &&
673			cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
674			cpp = &cpl->ch_list;
675			cpl = cpl->ch_list;
676		}
677		if (cpl != NULL) {
678			*cpp = NULL;
679			if (cp != NULL) {
680				cpe = cpl;
681				while (cpe->ch_list != NULL)
682					cpe = cpe->ch_list;
683				cpe->ch_list = cp;
684			}
685			cp = cpl;
686		}
687	}
688
689	mutex_exit(&nfscl->nfscl_chtable_lock);
690
691	/*
692	 * If cp is empty, then there is nothing to reclaim here.
693	 */
694	if (cp == NULL)
695		return;
696
697	/*
698	 * Step through the list of entries to free, destroying each client
699	 * handle and kmem_free'ing the memory for each entry.
700	 */
701	while (cp != NULL) {
702#ifdef DEBUG
703		n++;
704#endif
705		CLNT_DESTROY(cp->ch_client);
706		cpl = cp->ch_list;
707		kmem_cache_free(chtab_cache, cp);
708		cp = cpl;
709	}
710
711#ifdef DEBUG
712	/*
713	 * Update clalloc so that nfsstat shows the current number
714	 * of allocated client handles.
715	 */
716	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
717#endif
718}
719
720/* ARGSUSED */
721static void
722clreclaim(void *all)
723{
724	struct nfs_clnt *nfscl;
725
726#ifdef DEBUG
727	clstat_debug.clreclaim.value.ui64++;
728#endif
729	/*
730	 * The system is low on memory; go through and try to reclaim some from
731	 * every zone on the system.
732	 */
733	mutex_enter(&nfs_clnt_list_lock);
734	nfscl = list_head(&nfs_clnt_list);
735	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
736		clreclaim_zone(nfscl, CL_HOLDTIME);
737	mutex_exit(&nfs_clnt_list_lock);
738}
739
740/*
741 * Minimum time-out values indexed by call type
742 * These units are in "eighths" of a second to avoid multiplies
743 */
744static unsigned int minimum_timeo[] = {
745	6, 7, 10
746};
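/*
 * That is, the minimum timeouts work out to 6/8, 7/8 and 10/8 of a
 * second (0.75 s, 0.875 s and 1.25 s) for the three call types.
 */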
747
748/*
749 * Back off for retransmission timeout; MAXTIMO is in clock ticks (20 sec)
750 */
751#define	MAXTIMO	(20*hz)
752#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
753#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
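/*
 * For example, a timeout that starts at 2 ticks backs off to 4, 8, 16,
 * ... ticks on successive retransmissions until it is capped at MAXTIMO.
 */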
754
755#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
756#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
757#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
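/*
 * For example, with hz = 100 these work out to REDUCE_NFS_TIME = 50
 * ticks (0.5 s of current retransmit timeout) and INCREASE_NFS_TIME =
 * 264, i.e. a smoothed RTT of about 0.33 s once the scale factor of 8
 * is removed.
 */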
758
759/*
760 * Function called when rfscall notices that we have been
761 * re-transmitting, or when we get a response without retransmissions.
762 * Return 1 if the transfer size was adjusted down - 0 if no change.
763 */
764static int
765nfs_feedback(int flag, int which, mntinfo_t *mi)
766{
767	int kind;
768	int r = 0;
769
770	mutex_enter(&mi->mi_lock);
771	if (flag == FEEDBACK_REXMIT1) {
772		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
773		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
774			goto done;
775		if (mi->mi_curread > MIN_NFS_TSIZE) {
776			mi->mi_curread /= 2;
777			if (mi->mi_curread < MIN_NFS_TSIZE)
778				mi->mi_curread = MIN_NFS_TSIZE;
779			r = 1;
780		}
781
782		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
783			mi->mi_curwrite /= 2;
784			if (mi->mi_curwrite < MIN_NFS_TSIZE)
785				mi->mi_curwrite = MIN_NFS_TSIZE;
786			r = 1;
787		}
788	} else if (flag == FEEDBACK_OK) {
789		kind = mi->mi_timer_type[which];
790		if (kind == 0 ||
791		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
792			goto done;
793		if (kind == 1) {
794			if (mi->mi_curread >= mi->mi_tsize)
795				goto done;
796			mi->mi_curread +=  MIN_NFS_TSIZE;
797			if (mi->mi_curread > mi->mi_tsize/2)
798				mi->mi_curread = mi->mi_tsize;
799		} else if (kind == 2) {
800			if (mi->mi_curwrite >= mi->mi_stsize)
801				goto done;
802			mi->mi_curwrite += MIN_NFS_TSIZE;
803			if (mi->mi_curwrite > mi->mi_stsize/2)
804				mi->mi_curwrite = mi->mi_stsize;
805		}
806	}
807done:
808	mutex_exit(&mi->mi_lock);
809	return (r);
810}
811
812#ifdef DEBUG
813static int rfs2call_hits = 0;
814static int rfs2call_misses = 0;
815#endif
816
817int
818rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
819    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
820    enum nfsstat *statusp, int flags, failinfo_t *fi)
821{
822	int rpcerror;
823	enum clnt_stat rpc_status;
824
825	ASSERT(statusp != NULL);
826
827	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
828	    cr, douprintf, &rpc_status, flags, fi);
829	if (!rpcerror) {
830		/*
831		 * See crnetadjust() for comments.
832		 */
833		if (*statusp == NFSERR_ACCES &&
834		    (cr = crnetadjust(cr)) != NULL) {
835#ifdef DEBUG
836			rfs2call_hits++;
837#endif
838			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
839			    resp, cr, douprintf, NULL, flags, fi);
840			crfree(cr);
841#ifdef DEBUG
842			if (*statusp == NFSERR_ACCES)
843				rfs2call_misses++;
844#endif
845		}
846	} else if (rpc_status == RPC_PROCUNAVAIL) {
847		*statusp = NFSERR_OPNOTSUPP;
848		rpcerror = 0;
849	}
850
851	return (rpcerror);
852}
853
854#define	NFS3_JUKEBOX_DELAY	10 * hz
855
856static clock_t nfs3_jukebox_delay = 0;
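/*
 * NFS3_JUKEBOX_DELAY works out to 10 seconds expressed in clock ticks.
 * The delay actually passed to delay() in rfs3call() and acl3call() is
 * nfs3_jukebox_delay, which starts out at 0 here and is expected to be
 * set up elsewhere in the client.
 */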
857
858#ifdef DEBUG
859static int rfs3call_hits = 0;
860static int rfs3call_misses = 0;
861#endif
862
863int
864rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
865    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
866    nfsstat3 *statusp, int flags, failinfo_t *fi)
867{
868	int rpcerror;
869	int user_informed;
870
871	user_informed = 0;
872	do {
873		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
874		    cr, douprintf, NULL, flags, fi);
875		if (!rpcerror) {
876			cred_t *crr;
877			if (*statusp == NFS3ERR_JUKEBOX) {
878				if (ttoproc(curthread) == &p0) {
879					rpcerror = EAGAIN;
880					break;
881				}
882				if (!user_informed) {
883					user_informed = 1;
884					uprintf(
885		"file temporarily unavailable on the server, retrying...\n");
886				}
887				delay(nfs3_jukebox_delay);
888			}
889			/*
890			 * See crnetadjust() for comments.
891			 */
892			else if (*statusp == NFS3ERR_ACCES &&
893			    (crr = crnetadjust(cr)) != NULL) {
894#ifdef DEBUG
895				rfs3call_hits++;
896#endif
897				rpcerror = rfscall(mi, which, xdrargs, argsp,
898				    xdrres, resp, crr, douprintf,
899				    NULL, flags, fi);
900
901				crfree(crr);
902#ifdef DEBUG
903				if (*statusp == NFS3ERR_ACCES)
904					rfs3call_misses++;
905#endif
906			}
907		}
908	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
909
910	return (rpcerror);
911}
912
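/*
 * VALID_FH checks that the rnode is still bound to the mount's current
 * server.  INC_READERS/DEC_READERS implement the "poor man's
 * interruptible rw_enter()" described in rfscall() below; both are
 * invoked with mi->mi_lock held.
 */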
913#define	VALID_FH(fi)	(VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
914#define	INC_READERS(mi)		{ \
915	mi->mi_readers++; \
916}
917#define	DEC_READERS(mi)		{ \
918	mi->mi_readers--; \
919	if (mi->mi_readers == 0) \
920		cv_broadcast(&mi->mi_failover_cv); \
921}
922
923static int
924rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
925    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
926    enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
927{
928	CLIENT *client;
929	struct chtab *ch;
930	cred_t *cr = icr;
931	enum clnt_stat status;
932	struct rpc_err rpcerr;
933	struct timeval wait;
934	int timeo;		/* in units of hz */
935	int my_rsize, my_wsize;
936	bool_t tryagain;
937	bool_t cred_cloned = FALSE;
938	k_sigset_t smask;
939	servinfo_t *svp;
940	struct nfs_clnt *nfscl;
941	zoneid_t zoneid = getzoneid();
942#ifdef DEBUG
943	char *bufp;
944#endif
945
946
947	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
948		"rfscall_start:which %d mi %p", which, mi);
949
950	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
951	ASSERT(nfscl != NULL);
952
953	nfscl->nfscl_stat.calls.value.ui64++;
954	mi->mi_reqs[which].value.ui64++;
955
956	rpcerr.re_status = RPC_SUCCESS;
957
958	/*
959	 * In case of forced unmount or zone shutdown, return EIO.
960	 */
961
962	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
963		rpcerr.re_status = RPC_FAILED;
964		rpcerr.re_errno = EIO;
965		return (rpcerr.re_errno);
966	}
967
968	/*
969	 * Remember the transfer sizes in case
970	 * nfs_feedback changes them underneath us.
971	 */
972	my_rsize = mi->mi_curread;
973	my_wsize = mi->mi_curwrite;
974
975	/*
976	 * NFS client failover support
977	 *
978	 * If this rnode is not in sync with the current server (VALID_FH),
979	 * we'd like to do a remap to get in sync.  We can be interrupted
980	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
981	 * use the best info we have to try the RPC.  Part of that is
982	 * unconditionally updating the filehandle copy kept for V3.
983	 *
984	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
985	 * rw_enter(); we're trying to keep the current server from being
986	 * changed on us until we're done with the remapping and have a
987	 * matching client handle.  We don't want to send a filehandle
988	 * to the wrong host.
989	 */
990failoverretry:
991	if (FAILOVER_MOUNT(mi)) {
992		mutex_enter(&mi->mi_lock);
993		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
994			if (failover_wait(mi)) {
995				mutex_exit(&mi->mi_lock);
996				return (EINTR);
997			}
998		}
999		INC_READERS(mi);
1000		mutex_exit(&mi->mi_lock);
1001		if (fi) {
1002			if (!VALID_FH(fi) &&
1003			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1004				int remaperr;
1005
1006				svp = mi->mi_curr_serv;
1007				remaperr = failover_remap(fi);
1008				if (remaperr != 0) {
1009#ifdef DEBUG
1010					if (remaperr != EINTR)
1011						nfs_cmn_err(remaperr, CE_WARN,
1012					    "rfscall couldn't failover: %m");
1013#endif
1014					mutex_enter(&mi->mi_lock);
1015					DEC_READERS(mi);
1016					mutex_exit(&mi->mi_lock);
1017					/*
1018					 * If failover_remap returns ETIMEDOUT
1019					 * and the filesystem is hard mounted
1020					 * we have to retry the call with a new
1021					 * server.
1022					 */
1023					if ((mi->mi_flags & MI_HARD) &&
1024					    IS_RECOVERABLE_ERROR(remaperr)) {
1025						if (svp == mi->mi_curr_serv)
1026							failover_newserver(mi);
1027						rpcerr.re_status = RPC_SUCCESS;
1028						goto failoverretry;
1029					}
1030					rpcerr.re_errno = remaperr;
1031					return (remaperr);
1032				}
1033			}
1034			if (fi->fhp && fi->copyproc)
1035				(*fi->copyproc)(fi->fhp, fi->vp);
1036		}
1037	}
1038
1039	/* For TSOL, use a new cred which has net_mac_aware flag */
1040	if (!cred_cloned && is_system_labeled()) {
1041		cred_cloned = TRUE;
1042		cr = crdup(icr);
1043		(void) setpflags(NET_MAC_AWARE, 1, cr);
1044	}
1045
1046	/*
1047	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1048	 * are guaranteed to reprocess the retry as a new request.
1049	 */
1050	svp = mi->mi_curr_serv;
1051	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1052
1053	if (FAILOVER_MOUNT(mi)) {
1054		mutex_enter(&mi->mi_lock);
1055		DEC_READERS(mi);
1056		mutex_exit(&mi->mi_lock);
1057
1058		if ((rpcerr.re_errno == ETIMEDOUT ||
1059				rpcerr.re_errno == ECONNRESET) &&
1060				failover_safe(fi)) {
1061			if (svp == mi->mi_curr_serv)
1062				failover_newserver(mi);
1063			goto failoverretry;
1064		}
1065	}
1066	if (rpcerr.re_errno != 0)
1067		return (rpcerr.re_errno);
1068
1069	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1070	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1071		timeo = (mi->mi_timeo * hz) / 10;
1072	} else {
1073		mutex_enter(&mi->mi_lock);
1074		timeo = CLNT_SETTIMERS(client,
1075		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1076		    &(mi->mi_timers[NFS_CALLTYPES]),
1077		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1078		    (void (*)())NULL, (caddr_t)mi, 0);
1079		mutex_exit(&mi->mi_lock);
1080	}
1081
1082	/*
1083	 * If hard mounted fs, retry call forever unless hard error occurs.
1084	 */
1085	do {
1086		tryagain = FALSE;
1087
1088		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1089			status = RPC_FAILED;
1090			rpcerr.re_status = RPC_FAILED;
1091			rpcerr.re_errno = EIO;
1092			break;
1093		}
1094
1095		TICK_TO_TIMEVAL(timeo, &wait);
1096
1097		/*
1098		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1099		 * and SIGTERM. (Preserving the existing masks).
1100		 * Mask out SIGINT if mount option nointr is specified.
1101		 */
1102		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1103		if (!(mi->mi_flags & MI_INT))
1104			client->cl_nosignal = TRUE;
1105
1106		/*
1107		 * If there is a current signal, then don't bother
1108		 * even trying to send out the request because we
1109		 * won't be able to block waiting for the response.
1110		 * Simply assume RPC_INTR and get on with it.
1111		 */
1112		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1113			status = RPC_INTR;
1114		else {
1115			status = CLNT_CALL(client, which, xdrargs, argsp,
1116			    xdrres, resp, wait);
1117		}
1118
1119		if (!(mi->mi_flags & MI_INT))
1120			client->cl_nosignal = FALSE;
1121		/*
1122		 * restore original signal mask
1123		 */
1124		sigunintr(&smask);
1125
1126		switch (status) {
1127		case RPC_SUCCESS:
1128			if ((mi->mi_flags & MI_DYNAMIC) &&
1129			    mi->mi_timer_type[which] != 0 &&
1130			    (mi->mi_curread != my_rsize ||
1131			    mi->mi_curwrite != my_wsize))
1132				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1133			break;
1134
1135		case RPC_INTR:
1136			/*
1137			 * There is no way to recover from this error,
1138			 * even if mount option nointr is specified.
1139			 * SIGKILL, for example, cannot be blocked.
1140			 */
1141			rpcerr.re_status = RPC_INTR;
1142			rpcerr.re_errno = EINTR;
1143			break;
1144
1145		case RPC_UDERROR:
1146			/*
1147			 * If the NFS server is local (vold) and
1148			 * it goes away then we get RPC_UDERROR.
1149			 * This is a retryable error, so we would
1150			 * loop, so check to see if the specific
1151			 * error was ECONNRESET, indicating that
1152			 * target did not exist at all.  If so,
1153			 * return with RPC_PROGUNAVAIL and
1154			 * ECONNRESET to indicate why.
1155			 */
1156			CLNT_GETERR(client, &rpcerr);
1157			if (rpcerr.re_errno == ECONNRESET) {
1158				rpcerr.re_status = RPC_PROGUNAVAIL;
1159				rpcerr.re_errno = ECONNRESET;
1160				break;
1161			}
1162			/*FALLTHROUGH*/
1163
1164		default:		/* probably RPC_TIMEDOUT */
1165			if (IS_UNRECOVERABLE_RPC(status))
1166				break;
1167
1168			/*
1169			 * increment server not responding count
1170			 */
1171			mutex_enter(&mi->mi_lock);
1172			mi->mi_noresponse++;
1173			mutex_exit(&mi->mi_lock);
1174#ifdef DEBUG
1175			nfscl->nfscl_stat.noresponse.value.ui64++;
1176#endif
1177
1178			if (!(mi->mi_flags & MI_HARD)) {
1179				if (!(mi->mi_flags & MI_SEMISOFT) ||
1180				    (mi->mi_ss_call_type[which] == 0))
1181					break;
1182			}
1183
1184			/*
1185			 * The call is in progress (over COTS).
1186			 * Try the CLNT_CALL again, but don't
1187			 * print a noisy error message.
1188			 */
1189			if (status == RPC_INPROGRESS) {
1190				tryagain = TRUE;
1191				break;
1192			}
1193
1194			if (flags & RFSCALL_SOFT)
1195				break;
1196
1197			/*
1198			 * On zone shutdown, just move on.
1199			 */
1200			if (zone_status_get(curproc->p_zone) >=
1201			    ZONE_IS_SHUTTING_DOWN) {
1202				rpcerr.re_status = RPC_FAILED;
1203				rpcerr.re_errno = EIO;
1204				break;
1205			}
1206
1207			/*
1208			 * NFS client failover support
1209			 *
1210			 * If the current server just failed us, we'll
1211			 * start the process of finding a new server.
1212			 * After that, we can just retry.
1213			 */
1214			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1215				if (svp == mi->mi_curr_serv)
1216					failover_newserver(mi);
1217				clfree_impl(client, ch, nfscl);
1218				goto failoverretry;
1219			}
1220
1221			tryagain = TRUE;
1222			timeo = backoff(timeo);
1223			mutex_enter(&mi->mi_lock);
1224			if (!(mi->mi_flags & MI_PRINTED)) {
1225				mi->mi_flags |= MI_PRINTED;
1226				mutex_exit(&mi->mi_lock);
1227#ifdef DEBUG
1228				zprintf(zoneid,
1229			"NFS%d server %s not responding still trying\n",
1230				    mi->mi_vers, svp->sv_hostname);
1231#else
1232				zprintf(zoneid,
1233			"NFS server %s not responding still trying\n",
1234				    svp->sv_hostname);
1235#endif
1236			} else
1237				mutex_exit(&mi->mi_lock);
1238			if (*douprintf && nfs_has_ctty()) {
1239				*douprintf = 0;
1240				if (!(mi->mi_flags & MI_NOPRINT))
1241#ifdef DEBUG
1242					uprintf(
1243			    "NFS%d server %s not responding still trying\n",
1244					    mi->mi_vers, svp->sv_hostname);
1245#else
1246					uprintf(
1247			    "NFS server %s not responding still trying\n",
1248					    svp->sv_hostname);
1249#endif
1250			}
1251
1252			/*
1253			 * If doing dynamic adjustment of transfer
1254			 * size and if it's a read or write call
1255			 * and if the transfer size changed while
1256			 * retransmitting or if the feedback routine
1257			 * changed the transfer size,
1258			 * then exit rfscall so that the transfer
1259			 * size can be adjusted at the vnops level.
1260			 */
1261			if ((mi->mi_flags & MI_DYNAMIC) &&
1262			    mi->mi_timer_type[which] != 0 &&
1263			    (mi->mi_curread != my_rsize ||
1264			    mi->mi_curwrite != my_wsize ||
1265			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1266				/*
1267				 * On read or write calls, return
1268				 * back to the vnode ops level if
1269				 * the transfer size changed.
1270				 */
1271				clfree_impl(client, ch, nfscl);
1272				if (cred_cloned)
1273					crfree(cr);
1274				return (ENFS_TRYAGAIN);
1275			}
1276		}
1277	} while (tryagain);
1278
1279	if (status != RPC_SUCCESS) {
1280		/*
1281		 * Let soft mounts use the timed out message.
1282		 */
1283		if (status == RPC_INPROGRESS)
1284			status = RPC_TIMEDOUT;
1285		nfscl->nfscl_stat.badcalls.value.ui64++;
1286		if (status != RPC_INTR) {
1287			mutex_enter(&mi->mi_lock);
1288			mi->mi_flags |= MI_DOWN;
1289			mutex_exit(&mi->mi_lock);
1290			CLNT_GETERR(client, &rpcerr);
1291#ifdef DEBUG
1292			bufp = clnt_sperror(client, svp->sv_hostname);
1293			zprintf(zoneid, "NFS%d %s failed for %s\n",
1294			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1295			if (nfs_has_ctty()) {
1296				if (!(mi->mi_flags & MI_NOPRINT)) {
1297					uprintf("NFS%d %s failed for %s\n",
1298					    mi->mi_vers, mi->mi_rfsnames[which],
1299					    bufp);
1300				}
1301			}
1302			kmem_free(bufp, MAXPATHLEN);
1303#else
1304			zprintf(zoneid,
1305			    "NFS %s failed for server %s: error %d (%s)\n",
1306			    mi->mi_rfsnames[which], svp->sv_hostname,
1307			    status, clnt_sperrno(status));
1308			if (nfs_has_ctty()) {
1309				if (!(mi->mi_flags & MI_NOPRINT)) {
1310					uprintf(
1311				"NFS %s failed for server %s: error %d (%s)\n",
1312					    mi->mi_rfsnames[which],
1313					    svp->sv_hostname, status,
1314					    clnt_sperrno(status));
1315				}
1316			}
1317#endif
1318			/*
1319			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1320			 * re_errno is set appropriately depending on
1321			 * the authentication error
1322			 */
1323			if (status == RPC_VERSMISMATCH ||
1324			    status == RPC_PROGVERSMISMATCH)
1325				rpcerr.re_errno = EIO;
1326		}
1327	} else {
1328		/*
1329		 * Test the value of mi_down and mi_printed without
1330		 * holding the mi_lock mutex.  If they are both zero,
1331		 * then it is okay to skip the down and printed
1332		 * processing.  This saves on a mutex_enter and
1333		 * mutex_exit pair for a normal, successful RPC.
1334		 * This was just complete overhead.
1335		 */
1336		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1337			mutex_enter(&mi->mi_lock);
1338			mi->mi_flags &= ~MI_DOWN;
1339			if (mi->mi_flags & MI_PRINTED) {
1340				mi->mi_flags &= ~MI_PRINTED;
1341				mutex_exit(&mi->mi_lock);
1342#ifdef DEBUG
1343			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1344				zprintf(zoneid, "NFS%d server %s ok\n",
1345				    mi->mi_vers, svp->sv_hostname);
1346#else
1347			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1348				zprintf(zoneid, "NFS server %s ok\n",
1349				    svp->sv_hostname);
1350#endif
1351			} else
1352				mutex_exit(&mi->mi_lock);
1353		}
1354
1355		if (*douprintf == 0) {
1356			if (!(mi->mi_flags & MI_NOPRINT))
1357#ifdef DEBUG
1358				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1359					uprintf("NFS%d server %s ok\n",
1360					    mi->mi_vers, svp->sv_hostname);
1361#else
1362			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1363				uprintf("NFS server %s ok\n", svp->sv_hostname);
1364#endif
1365			*douprintf = 1;
1366		}
1367	}
1368
1369	clfree_impl(client, ch, nfscl);
1370	if (cred_cloned)
1371		crfree(cr);
1372
1373	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1374
1375	if (rpc_status != NULL)
1376		*rpc_status = rpcerr.re_status;
1377
1378	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1379	    rpcerr.re_errno);
1380
1381	return (rpcerr.re_errno);
1382}
1383
1384#ifdef DEBUG
1385static int acl2call_hits = 0;
1386static int acl2call_misses = 0;
1387#endif
1388
1389int
1390acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1391    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1392    enum nfsstat *statusp, int flags, failinfo_t *fi)
1393{
1394	int rpcerror;
1395
1396	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1397	    cr, douprintf, flags, fi);
1398	if (!rpcerror) {
1399		/*
1400		 * See comments with crnetadjust().
1401		 */
1402		if (*statusp == NFSERR_ACCES &&
1403		    (cr = crnetadjust(cr)) != NULL) {
1404#ifdef DEBUG
1405			acl2call_hits++;
1406#endif
1407			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1408			    resp, cr, douprintf, flags, fi);
1409			crfree(cr);
1410#ifdef DEBUG
1411			if (*statusp == NFSERR_ACCES)
1412				acl2call_misses++;
1413#endif
1414		}
1415	}
1416
1417	return (rpcerror);
1418}
1419
1420#ifdef DEBUG
1421static int acl3call_hits = 0;
1422static int acl3call_misses = 0;
1423#endif
1424
1425int
1426acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1427    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1428    nfsstat3 *statusp, int flags, failinfo_t *fi)
1429{
1430	int rpcerror;
1431	int user_informed;
1432
1433	user_informed = 0;
1434
1435	do {
1436		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1437		    cr, douprintf, flags, fi);
1438		if (!rpcerror) {
1439			cred_t *crr;
1440			if (*statusp == NFS3ERR_JUKEBOX) {
1441				if (!user_informed) {
1442					user_informed = 1;
1443					uprintf(
1444		"file temporarily unavailable on the server, retrying...\n");
1445				}
1446				delay(nfs3_jukebox_delay);
1447			}
1448			/*
1449			 * See crnetadjust() for comments.
1450			 */
1451			else if (*statusp == NFS3ERR_ACCES &&
1452			    (crr = crnetadjust(cr)) != NULL) {
1453#ifdef DEBUG
1454				acl3call_hits++;
1455#endif
1456				rpcerror = aclcall(mi, which, xdrargs, argsp,
1457				    xdrres, resp, crr, douprintf, flags, fi);
1458
1459				crfree(crr);
1460#ifdef DEBUG
1461				if (*statusp == NFS3ERR_ACCES)
1462					acl3call_misses++;
1463#endif
1464			}
1465		}
1466	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1467
1468	return (rpcerror);
1469}
1470
1471static int
1472aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1473    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1474    int flags, failinfo_t *fi)
1475{
1476	CLIENT *client;
1477	struct chtab *ch;
1478	cred_t *cr = icr;
1479	bool_t cred_cloned = FALSE;
1480	enum clnt_stat status;
1481	struct rpc_err rpcerr;
1482	struct timeval wait;
1483	int timeo;		/* in units of hz */
1484#if 0 /* notyet */
1485	int my_rsize, my_wsize;
1486#endif
1487	bool_t tryagain;
1488	k_sigset_t smask;
1489	servinfo_t *svp;
1490	struct nfs_clnt *nfscl;
1491	zoneid_t zoneid = getzoneid();
1492#ifdef DEBUG
1493	char *bufp;
1494#endif
1495
1496#if 0 /* notyet */
1497	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1498		"rfscall_start:which %d mi %p", which, mi);
1499#endif
1500
1501	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1502	ASSERT(nfscl != NULL);
1503
1504	nfscl->nfscl_stat.calls.value.ui64++;
1505	mi->mi_aclreqs[which].value.ui64++;
1506
1507	rpcerr.re_status = RPC_SUCCESS;
1508
1509	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1510		rpcerr.re_status = RPC_FAILED;
1511		rpcerr.re_errno = EIO;
1512		return (rpcerr.re_errno);
1513	}
1514
1515#if 0 /* notyet */
1516	/*
1517	 * Remember the transfer sizes in case
1518	 * nfs_feedback changes them underneath us.
1519	 */
1520	my_rsize = mi->mi_curread;
1521	my_wsize = mi->mi_curwrite;
1522#endif
1523
1524	/*
1525	 * NFS client failover support
1526	 *
1527	 * If this rnode is not in sync with the current server (VALID_FH),
1528	 * we'd like to do a remap to get in sync.  We can be interrupted
1529	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1530	 * use the best info we have to try the RPC.  Part of that is
1531	 * unconditionally updating the filehandle copy kept for V3.
1532	 *
1533	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1534	 * rw_enter(); we're trying to keep the current server from being
1535	 * changed on us until we're done with the remapping and have a
1536	 * matching client handle.  We don't want to send a filehandle
1537	 * to the wrong host.
1538	 */
1539failoverretry:
1540	if (FAILOVER_MOUNT(mi)) {
1541		mutex_enter(&mi->mi_lock);
1542		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1543			if (failover_wait(mi)) {
1544				mutex_exit(&mi->mi_lock);
1545				return (EINTR);
1546			}
1547		}
1548		INC_READERS(mi);
1549		mutex_exit(&mi->mi_lock);
1550		if (fi) {
1551			if (!VALID_FH(fi) &&
1552			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1553				int remaperr;
1554
1555				svp = mi->mi_curr_serv;
1556				remaperr = failover_remap(fi);
1557				if (remaperr != 0) {
1558#ifdef DEBUG
1559					if (remaperr != EINTR)
1560						nfs_cmn_err(remaperr, CE_WARN,
1561					    "aclcall couldn't failover: %m");
1562#endif
1563					mutex_enter(&mi->mi_lock);
1564					DEC_READERS(mi);
1565					mutex_exit(&mi->mi_lock);
1566
1567					/*
1568					 * If failover_remap returns ETIMEDOUT
1569					 * and the filesystem is hard mounted
1570					 * we have to retry the call with a new
1571					 * server.
1572					 */
1573					if ((mi->mi_flags & MI_HARD) &&
1574					    IS_RECOVERABLE_ERROR(remaperr)) {
1575						if (svp == mi->mi_curr_serv)
1576							failover_newserver(mi);
1577						rpcerr.re_status = RPC_SUCCESS;
1578						goto failoverretry;
1579					}
1580					return (remaperr);
1581				}
1582			}
1583			if (fi->fhp && fi->copyproc)
1584				(*fi->copyproc)(fi->fhp, fi->vp);
1585		}
1586	}
1587
1588	/* For TSOL, use a new cred which has net_mac_aware flag */
1589	if (!cred_cloned && is_system_labeled()) {
1590		cred_cloned = TRUE;
1591		cr = crdup(icr);
1592		(void) setpflags(NET_MAC_AWARE, 1, cr);
1593	}
1594
1595	/*
1596	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1597	 * are guaranteed to reprocess the retry as a new request.
1598	 */
1599	svp = mi->mi_curr_serv;
1600	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1601	if (FAILOVER_MOUNT(mi)) {
1602		mutex_enter(&mi->mi_lock);
1603		DEC_READERS(mi);
1604		mutex_exit(&mi->mi_lock);
1605
1606		if ((rpcerr.re_errno == ETIMEDOUT ||
1607				rpcerr.re_errno == ECONNRESET) &&
1608				failover_safe(fi)) {
1609			if (svp == mi->mi_curr_serv)
1610				failover_newserver(mi);
1611			goto failoverretry;
1612		}
1613	}
1614	if (rpcerr.re_errno != 0) {
1615		if (cred_cloned)
1616			crfree(cr);
1617		return (rpcerr.re_errno);
1618	}
1619
1620	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1621	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1622		timeo = (mi->mi_timeo * hz) / 10;
1623	} else {
1624		mutex_enter(&mi->mi_lock);
1625		timeo = CLNT_SETTIMERS(client,
1626		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1627		    &(mi->mi_timers[NFS_CALLTYPES]),
1628		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1629		    (void (*)()) 0, (caddr_t)mi, 0);
1630		mutex_exit(&mi->mi_lock);
1631	}
1632
1633	/*
1634	 * If hard mounted fs, retry call forever unless hard error occurs.
1635	 */
1636	do {
1637		tryagain = FALSE;
1638
1639		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1640			status = RPC_FAILED;
1641			rpcerr.re_status = RPC_FAILED;
1642			rpcerr.re_errno = EIO;
1643			break;
1644		}
1645
1646		TICK_TO_TIMEVAL(timeo, &wait);
1647
1648		/*
1649		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1650		 * and SIGTERM. (Preserving the existing masks).
1651		 * Mask out SIGINT if mount option nointr is specified.
1652		 */
1653		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1654		if (!(mi->mi_flags & MI_INT))
1655			client->cl_nosignal = TRUE;
1656
1657		/*
1658		 * If there is a current signal, then don't bother
1659		 * even trying to send out the request because we
1660		 * won't be able to block waiting for the response.
1661		 * Simply assume RPC_INTR and get on with it.
1662		 */
1663		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1664			status = RPC_INTR;
1665		else {
1666			status = CLNT_CALL(client, which, xdrargs, argsp,
1667			    xdrres, resp, wait);
1668		}
1669
1670		if (!(mi->mi_flags & MI_INT))
1671			client->cl_nosignal = FALSE;
1672		/*
1673		 * restore original signal mask
1674		 */
1675		sigunintr(&smask);
1676
1677		switch (status) {
1678		case RPC_SUCCESS:
1679#if 0 /* notyet */
1680			if ((mi->mi_flags & MI_DYNAMIC) &&
1681			    mi->mi_timer_type[which] != 0 &&
1682			    (mi->mi_curread != my_rsize ||
1683			    mi->mi_curwrite != my_wsize))
1684				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1685#endif
1686			break;
1687
1688		/*
1689		 * Unfortunately, there are servers in the world which
1690		 * are not coded correctly.  They are not prepared to
1691		 * handle RPC requests to the NFS port which are not
1692		 * NFS requests.  Thus, they may try to process the
1693		 * NFS_ACL request as if it were an NFS request.  This
1694		 * does not work.  Generally, an error will be generated
1695		 * on the client because it will not be able to decode
1696		 * the response from the server.  However, it seems
1697		 * possible that the server may not be able to decode
1698		 * the arguments.  Thus, the criteria for deciding
1699		 * whether the server supports NFS_ACL or not is whether
1700		 * the following RPC errors are returned from CLNT_CALL.
1701		 */
1702		case RPC_CANTDECODERES:
1703		case RPC_PROGUNAVAIL:
1704		case RPC_CANTDECODEARGS:
1705		case RPC_PROGVERSMISMATCH:
1706			mutex_enter(&mi->mi_lock);
1707			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1708			mutex_exit(&mi->mi_lock);
1709			break;
1710
1711		/*
1712		 * If the server supports NFS_ACL but not the new ops
1713		 * for extended attributes, make sure we don't retry.
1714		 */
1715		case RPC_PROCUNAVAIL:
1716			mutex_enter(&mi->mi_lock);
1717			mi->mi_flags &= ~MI_EXTATTR;
1718			mutex_exit(&mi->mi_lock);
1719			break;
1720
1721		case RPC_INTR:
1722			/*
1723			 * There is no way to recover from this error,
1724			 * even if mount option nointr is specified.
1725			 * SIGKILL, for example, cannot be blocked.
1726			 */
1727			rpcerr.re_status = RPC_INTR;
1728			rpcerr.re_errno = EINTR;
1729			break;
1730
1731		case RPC_UDERROR:
1732			/*
1733			 * If the NFS server is local (vold) and
1734			 * it goes away then we get RPC_UDERROR.
1735			 * This is a retryable error, so we would
1736			 * loop, so check to see if the specific
1737			 * error was ECONNRESET, indicating that
1738			 * target did not exist at all.  If so,
1739			 * return with RPC_PROGUNAVAIL and
1740			 * ECONNRESET to indicate why.
1741			 */
1742			CLNT_GETERR(client, &rpcerr);
1743			if (rpcerr.re_errno == ECONNRESET) {
1744				rpcerr.re_status = RPC_PROGUNAVAIL;
1745				rpcerr.re_errno = ECONNRESET;
1746				break;
1747			}
1748			/*FALLTHROUGH*/
1749
1750		default:		/* probably RPC_TIMEDOUT */
1751			if (IS_UNRECOVERABLE_RPC(status))
1752				break;
1753
1754			/*
1755			 * increment server not responding count
1756			 */
1757			mutex_enter(&mi->mi_lock);
1758			mi->mi_noresponse++;
1759			mutex_exit(&mi->mi_lock);
1760#ifdef DEBUG
1761			nfscl->nfscl_stat.noresponse.value.ui64++;
1762#endif
1763
1764			if (!(mi->mi_flags & MI_HARD)) {
1765				if (!(mi->mi_flags & MI_SEMISOFT) ||
1766				    (mi->mi_acl_ss_call_type[which] == 0))
1767					break;
1768			}
1769
1770			/*
1771			 * The call is in progress (over COTS).
1772			 * Try the CLNT_CALL again, but don't
1773			 * print a noisy error message.
1774			 */
1775			if (status == RPC_INPROGRESS) {
1776				tryagain = TRUE;
1777				break;
1778			}
1779
1780			if (flags & RFSCALL_SOFT)
1781				break;
1782
1783			/*
1784			 * On zone shutdown, just move on.
1785			 */
1786			if (zone_status_get(curproc->p_zone) >=
1787			    ZONE_IS_SHUTTING_DOWN) {
1788				rpcerr.re_status = RPC_FAILED;
1789				rpcerr.re_errno = EIO;
1790				break;
1791			}
1792
1793			/*
1794			 * NFS client failover support
1795			 *
1796			 * If the current server just failed us, we'll
1797			 * start the process of finding a new server.
1798			 * After that, we can just retry.
1799			 */
1800			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1801				if (svp == mi->mi_curr_serv)
1802					failover_newserver(mi);
1803				clfree_impl(client, ch, nfscl);
1804				goto failoverretry;
1805			}
1806
1807			tryagain = TRUE;
1808			timeo = backoff(timeo);
1809			mutex_enter(&mi->mi_lock);
1810			if (!(mi->mi_flags & MI_PRINTED)) {
1811				mi->mi_flags |= MI_PRINTED;
1812				mutex_exit(&mi->mi_lock);
1813#ifdef DEBUG
1814				zprintf(zoneid,
1815			"NFS_ACL%d server %s not responding still trying\n",
1816				    mi->mi_vers, svp->sv_hostname);
1817#else
1818				zprintf(zoneid,
1819			    "NFS server %s not responding still trying\n",
1820				    svp->sv_hostname);
1821#endif
1822			} else
1823				mutex_exit(&mi->mi_lock);
1824			if (*douprintf && nfs_has_ctty()) {
1825				*douprintf = 0;
1826				if (!(mi->mi_flags & MI_NOPRINT))
1827#ifdef DEBUG
1828					uprintf(
1829			"NFS_ACL%d server %s not responding still trying\n",
1830					    mi->mi_vers, svp->sv_hostname);
1831#else
1832					uprintf(
1833			    "NFS server %s not responding still trying\n",
1834					    svp->sv_hostname);
1835#endif
1836			}
1837
1838#if 0 /* notyet */
1839			/*
1840			 * If doing dynamic adjustment of transfer
1841			 * size and if it's a read or write call
1842			 * and if the transfer size changed while
1843			 * retransmitting or if the feedback routine
1844			 * changed the transfer size,
1845			 * then exit rfscall so that the transfer
1846			 * size can be adjusted at the vnops level.
1847			 */
1848			if ((mi->mi_flags & MI_DYNAMIC) &&
1849			    mi->mi_acl_timer_type[which] != 0 &&
1850			    (mi->mi_curread != my_rsize ||
1851			    mi->mi_curwrite != my_wsize ||
1852			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1853				/*
1854				 * On read or write calls, return
1855				 * back to the vnode ops level if
1856				 * the transfer size changed.
1857				 */
1858				clfree_impl(client, ch, nfscl);
1859				if (cred_cloned)
1860					crfree(cr);
1861				return (ENFS_TRYAGAIN);
1862			}
1863#endif
1864		}
1865	} while (tryagain);
1866
1867	if (status != RPC_SUCCESS) {
1868		/*
1869		 * Let soft mounts use the timed out message.
1870		 */
1871		if (status == RPC_INPROGRESS)
1872			status = RPC_TIMEDOUT;
1873		nfscl->nfscl_stat.badcalls.value.ui64++;
1874		if (status == RPC_CANTDECODERES ||
1875		    status == RPC_PROGUNAVAIL ||
1876		    status == RPC_PROCUNAVAIL ||
1877		    status == RPC_CANTDECODEARGS ||
1878		    status == RPC_PROGVERSMISMATCH)
1879			CLNT_GETERR(client, &rpcerr);
1880		else if (status != RPC_INTR) {
1881			mutex_enter(&mi->mi_lock);
1882			mi->mi_flags |= MI_DOWN;
1883			mutex_exit(&mi->mi_lock);
1884			CLNT_GETERR(client, &rpcerr);
1885#ifdef DEBUG
1886			bufp = clnt_sperror(client, svp->sv_hostname);
1887			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1888			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1889			if (nfs_has_ctty()) {
1890				if (!(mi->mi_flags & MI_NOPRINT)) {
1891					uprintf("NFS_ACL%d %s failed for %s\n",
1892					    mi->mi_vers, mi->mi_aclnames[which],
1893					    bufp);
1894				}
1895			}
1896			kmem_free(bufp, MAXPATHLEN);
1897#else
1898			zprintf(zoneid,
1899			    "NFS %s failed for server %s: error %d (%s)\n",
1900			    mi->mi_aclnames[which], svp->sv_hostname,
1901			    status, clnt_sperrno(status));
1902			if (nfs_has_ctty()) {
1903				if (!(mi->mi_flags & MI_NOPRINT))
1904					uprintf(
1905				"NFS %s failed for server %s: error %d (%s)\n",
1906					    mi->mi_aclnames[which],
1907					    svp->sv_hostname, status,
1908					    clnt_sperrno(status));
1909			}
1910#endif
1911			/*
1912			 * When CLNT_CALL() fails with RPC_AUTHERROR,
1913			 * re_errno is set appropriately depending on
1914			 * the authentication error
1915			 */
1916			if (status == RPC_VERSMISMATCH ||
1917			    status == RPC_PROGVERSMISMATCH)
1918				rpcerr.re_errno = EIO;
1919		}
1920	} else {
1921		/*
1922		 * Test the value of mi_down and mi_printed without
1923		 * holding the mi_lock mutex.  If they are both zero,
1924		 * then it is okay to skip the down and printed
1925		 * processing.  This saves on a mutex_enter and
1926		 * mutex_exit pair for a normal, successful RPC.
1927		 * That pair was pure overhead in the common case.
1928		 */
1929		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1930			mutex_enter(&mi->mi_lock);
1931			mi->mi_flags &= ~MI_DOWN;
1932			if (mi->mi_flags & MI_PRINTED) {
1933				mi->mi_flags &= ~MI_PRINTED;
1934				mutex_exit(&mi->mi_lock);
1935#ifdef DEBUG
1936				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1937				    mi->mi_vers, svp->sv_hostname);
1938#else
1939				zprintf(zoneid, "NFS server %s ok\n",
1940				    svp->sv_hostname);
1941#endif
1942			} else
1943				mutex_exit(&mi->mi_lock);
1944		}
1945
1946		if (*douprintf == 0) {
1947			if (!(mi->mi_flags & MI_NOPRINT))
1948#ifdef DEBUG
1949				uprintf("NFS_ACL%d server %s ok\n",
1950				    mi->mi_vers, svp->sv_hostname);
1951#else
1952				uprintf("NFS server %s ok\n", svp->sv_hostname);
1953#endif
1954			*douprintf = 1;
1955		}
1956	}
1957
1958	clfree_impl(client, ch, nfscl);
1959	if (cred_cloned)
1960		crfree(cr);
1961
1962	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1963
1964#if 0 /* notyet */
1965	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1966	    rpcerr.re_errno);
1967#endif
1968
1969	return (rpcerr.re_errno);
1970}
1971
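/*
 * Convert a vattr into an NFS Version 2 sattr.  Attributes that are
 * not selected in va_mask are filled with (uint32_t)-1 or (int32_t)-1,
 * which the Version 2 protocol treats as "do not change this field".
 */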
1972int
1973vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1974{
1975	uint_t mask = vap->va_mask;
1976
1977	if (!(mask & AT_MODE))
1978		sa->sa_mode = (uint32_t)-1;
1979	else
1980		sa->sa_mode = vap->va_mode;
1981	if (!(mask & AT_UID))
1982		sa->sa_uid = (uint32_t)-1;
1983	else
1984		sa->sa_uid = (uint32_t)vap->va_uid;
1985	if (!(mask & AT_GID))
1986		sa->sa_gid = (uint32_t)-1;
1987	else
1988		sa->sa_gid = (uint32_t)vap->va_gid;
1989	if (!(mask & AT_SIZE))
1990		sa->sa_size = (uint32_t)-1;
1991	else
1992		sa->sa_size = (uint32_t)vap->va_size;
1993	if (!(mask & AT_ATIME))
1994		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
1995	else {
1996		/* check time validity */
1997		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1998			return (EOVERFLOW);
1999		}
2000		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2001		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2002	}
2003	if (!(mask & AT_MTIME))
2004		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2005	else {
2006		/* check time validity */
2007		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2008			return (EOVERFLOW);
2009		}
2010		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2011		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2012	}
2013	return (0);
2014}
2015
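/*
 * Convert a vattr into an NFS Version 3 sattr3.  Unlike the Version 2
 * sattr above, which uses -1 as a "don't change" sentinel, the sattr3
 * carries an explicit set_it discriminator for each attribute.
 */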
2016int
2017vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2018{
2019	uint_t mask = vap->va_mask;
2020
2021	if (!(mask & AT_MODE))
2022		sa->mode.set_it = FALSE;
2023	else {
2024		sa->mode.set_it = TRUE;
2025		sa->mode.mode = (mode3)vap->va_mode;
2026	}
2027	if (!(mask & AT_UID))
2028		sa->uid.set_it = FALSE;
2029	else {
2030		sa->uid.set_it = TRUE;
2031		sa->uid.uid = (uid3)vap->va_uid;
2032	}
2033	if (!(mask & AT_GID))
2034		sa->gid.set_it = FALSE;
2035	else {
2036		sa->gid.set_it = TRUE;
2037		sa->gid.gid = (gid3)vap->va_gid;
2038	}
2039	if (!(mask & AT_SIZE))
2040		sa->size.set_it = FALSE;
2041	else {
2042		sa->size.set_it = TRUE;
2043		sa->size.size = (size3)vap->va_size;
2044	}
2045	if (!(mask & AT_ATIME))
2046		sa->atime.set_it = DONT_CHANGE;
2047	else {
2048		/* check time validity */
2049		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2050			return (EOVERFLOW);
2051		}
2052		sa->atime.set_it = SET_TO_CLIENT_TIME;
2053		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2054		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2055	}
2056	if (!(mask & AT_MTIME))
2057		sa->mtime.set_it = DONT_CHANGE;
2058	else {
2059		/* check time validity */
2060		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2061			return (EOVERFLOW);
2062		}
2063		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2064		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2065		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2066	}
2067	return (0);
2068}
2069
2070void
2071setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2072{
2073
2074	da->da_fhandle = VTOFH(dvp);
2075	da->da_name = nm;
2076	da->da_flags = 0;
2077}
2078
2079void
2080setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2081{
2082
2083	da->dirp = VTOFH3(dvp);
2084	da->name = nm;
2085}
2086
2087int
2088setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2089{
2090	int error;
2091	rnode_t *rp;
2092	struct vattr va;
2093
2094	va.va_mask = AT_MODE | AT_GID;
2095	error = VOP_GETATTR(dvp, &va, 0, cr);
2096	if (error)
2097		return (error);
2098
2099	/*
2100	 * To determine the expected group-id of the created file:
2101	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2102	 *	GRPID option, and the directory's set-gid bit is clear,
2103	 *	then use the process's gid.
2104	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2105	 */
2106	rp = VTOR(dvp);
2107	mutex_enter(&rp->r_statelock);
2108	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2109		*gidp = crgetgid(cr);
2110	else
2111		*gidp = va.va_gid;
2112	mutex_exit(&rp->r_statelock);
2113	return (0);
2114}
2115
2116int
2117setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2118{
2119	int error;
2120	struct vattr va;
2121
2122	va.va_mask = AT_MODE;
2123	error = VOP_GETATTR(dvp, &va, 0, cr);
2124	if (error)
2125		return (error);
2126
2127	/*
2128	 * Modify the expected mode (om) so that the set-gid bit matches
2129	 * that of the parent directory (dvp).
2130	 */
2131	if (va.va_mode & VSGID)
2132		*omp |= VSGID;
2133	else
2134		*omp &= ~VSGID;
2135	return (0);
2136}
2137
2138void
2139nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2140{
2141
2142	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2143		if (!(vp->v_flag & VSWAPLIKE)) {
2144			mutex_enter(&vp->v_lock);
2145			vp->v_flag |= VSWAPLIKE;
2146			mutex_exit(&vp->v_lock);
2147		}
2148	} else {
2149		if (vp->v_flag & VSWAPLIKE) {
2150			mutex_enter(&vp->v_lock);
2151			vp->v_flag &= ~VSWAPLIKE;
2152			mutex_exit(&vp->v_lock);
2153		}
2154	}
2155}
2156
2157/*
2158 * Free the resources associated with an rnode.
2159 */
2160static void
2161rinactive(rnode_t *rp, cred_t *cr)
2162{
2163	vnode_t *vp;
2164	cred_t *cred;
2165	char *contents;
2166	int size;
2167	vsecattr_t *vsp;
2168	int error;
2169	nfs3_pathconf_info *info;
2170
2171	/*
2172	 * Before freeing anything, wait until all asynchronous
2173	 * activity is done on this rnode.  This will allow all
2174	 * asynchronous read ahead and write behind i/o's to
2175	 * finish.
2176	 */
2177	mutex_enter(&rp->r_statelock);
2178	while (rp->r_count > 0)
2179		cv_wait(&rp->r_cv, &rp->r_statelock);
2180	mutex_exit(&rp->r_statelock);
2181
2182	/*
2183	 * Flush and invalidate all pages associated with the vnode.
2184	 */
2185	vp = RTOV(rp);
2186	if (vn_has_cached_data(vp)) {
2187		ASSERT(vp->v_type != VCHR);
2188		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2189			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
2190			if (error && (error == ENOSPC || error == EDQUOT)) {
2191				mutex_enter(&rp->r_statelock);
2192				if (!rp->r_error)
2193					rp->r_error = error;
2194				mutex_exit(&rp->r_statelock);
2195			}
2196		}
2197		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2198	}
2199
2200	/*
2201	 * Free any held credentials and caches which may be associated
2202	 * with this rnode.
2203	 */
2204	mutex_enter(&rp->r_statelock);
2205	cred = rp->r_cred;
2206	rp->r_cred = NULL;
2207	contents = rp->r_symlink.contents;
2208	size = rp->r_symlink.size;
2209	rp->r_symlink.contents = NULL;
2210	vsp = rp->r_secattr;
2211	rp->r_secattr = NULL;
2212	info = rp->r_pathconf;
2213	rp->r_pathconf = NULL;
2214	mutex_exit(&rp->r_statelock);
2215
2216	/*
2217	 * Free the held credential.
2218	 */
2219	if (cred != NULL)
2220		crfree(cred);
2221
2222	/*
2223	 * Free the access cache entries.
2224	 */
2225	(void) nfs_access_purge_rp(rp);
2226
2227	/*
2228	 * Free the readdir cache entries.
2229	 */
2230	if (HAVE_RDDIR_CACHE(rp))
2231		nfs_purge_rddir_cache(vp);
2232
2233	/*
2234	 * Free the symbolic link cache.
2235	 */
2236	if (contents != NULL) {
2237
2238		kmem_free((void *)contents, size);
2239	}
2240
2241	/*
2242	 * Free any cached ACL.
2243	 */
2244	if (vsp != NULL)
2245		nfs_acl_free(vsp);
2246
2247	/*
2248	 * Free any cached pathconf information.
2249	 */
2250	if (info != NULL)
2251		kmem_free(info, sizeof (*info));
2252}
2253
2254/*
2255 * Return a vnode for the given NFS Version 2 file handle.
2256 * If no rnode exists for this fhandle, create one and put it
2257 * into the hash queues.  If the rnode for this fhandle
2258 * already exists, return it.
2259 *
2260 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2261 */
2262vnode_t *
2263makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2264    hrtime_t t, cred_t *cr, char *dnm, char *nm)
2265{
2266	int newnode;
2267	int index;
2268	vnode_t *vp;
2269	nfs_fhandle nfh;
2270	vattr_t va;
2271
2272	nfh.fh_len = NFS_FHSIZE;
2273	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2274
2275	index = rtablehash(&nfh);
2276	rw_enter(&rtable[index].r_lock, RW_READER);
2277
2278	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2279	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2280
2281	if (attr != NULL) {
2282		if (!newnode) {
2283			rw_exit(&rtable[index].r_lock);
2284			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2285		} else {
2286			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2287				vp->v_type = VBAD;
2288			else
2289				vp->v_type = n2v_type(attr);
2290			/*
2291			 * A translation here seems to be necessary
2292			 * because this function can be called
2293			 * with `attr' that has come from the wire,
2294			 * and been operated on by vattr_to_nattr().
2295			 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2296			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2297			 * ->makenfsnode().
2298			 */
2299			if ((attr->na_rdev & 0xffff0000) == 0)
2300				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2301			else
2302				vp->v_rdev = expldev(n2v_rdev(attr));
2303			nfs_attrcache(vp, attr, t);
2304			rw_exit(&rtable[index].r_lock);
2305		}
2306	} else {
2307		if (newnode) {
2308			PURGE_ATTRCACHE(vp);
2309		}
2310		rw_exit(&rtable[index].r_lock);
2311	}
2312
2313	return (vp);
2314}
2315
2316/*
2317 * Return a vnode for the given NFS Version 3 file handle.
2318 * If no rnode exists for this fhandle, create one and put it
2319 * into the hash queues.  If the rnode for this fhandle
2320 * already exists, return it.
2321 *
2322 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2323 */
2324vnode_t *
2325makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2326    cred_t *cr, char *dnm, char *nm)
2327{
2328	int newnode;
2329	int index;
2330	vnode_t *vp;
2331
2332	index = rtablehash((nfs_fhandle *)fh);
2333	rw_enter(&rtable[index].r_lock, RW_READER);
2334
2335	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2336	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2337	    dnm, nm);
2338
2339	if (vap == NULL) {
2340		if (newnode) {
2341			PURGE_ATTRCACHE(vp);
2342		}
2343		rw_exit(&rtable[index].r_lock);
2344		return (vp);
2345	}
2346
2347	if (!newnode) {
2348		rw_exit(&rtable[index].r_lock);
2349		nfs_attr_cache(vp, vap, t, cr);
2350	} else {
2351		rnode_t *rp = VTOR(vp);
2352
2353		vp->v_type = vap->va_type;
2354		vp->v_rdev = vap->va_rdev;
2355
2356		mutex_enter(&rp->r_statelock);
2357		if (rp->r_mtime <= t)
2358			nfs_attrcache_va(vp, vap);
2359		mutex_exit(&rp->r_statelock);
2360		rw_exit(&rtable[index].r_lock);
2361	}
2362
2363	return (vp);
2364}
2365
2366vnode_t *
2367makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2368    cred_t *cr, char *dnm, char *nm)
2369{
2370	int newnode;
2371	int index;
2372	vnode_t *vp;
2373	vattr_t va;
2374
2375	index = rtablehash((nfs_fhandle *)fh);
2376	rw_enter(&rtable[index].r_lock, RW_READER);
2377
2378	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2379	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2380	    dnm, nm);
2381
2382	if (attr == NULL) {
2383		if (newnode) {
2384			PURGE_ATTRCACHE(vp);
2385		}
2386		rw_exit(&rtable[index].r_lock);
2387		return (vp);
2388	}
2389
2390	if (!newnode) {
2391		rw_exit(&rtable[index].r_lock);
2392		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2393	} else {
2394		if (attr->type < NF3REG || attr->type > NF3FIFO)
2395			vp->v_type = VBAD;
2396		else
2397			vp->v_type = nf3_to_vt[attr->type];
2398		vp->v_rdev = makedevice(attr->rdev.specdata1,
2399		    attr->rdev.specdata2);
2400		nfs3_attrcache(vp, attr, t);
2401		rw_exit(&rtable[index].r_lock);
2402	}
2403
2404	return (vp);
2405}
2406
2407/*
2408 * Read this comment before making changes to rtablehash()!
2409 * This is a hash function in which seemingly obvious and harmless
2410 * changes can cause escalations costing millions of dollars!
2411 * Know what you are doing.
2412 *
2413 * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2414 * algorithm is currently detailed here:
2415 *
2416 *   http://burtleburtle.net/bob/hash/doobs.html
2417 *
2418 * Of course, the above link may not be valid by the time you are reading
2419 * this, but suffice it to say that the one-at-a-time algorithm works well in
2420 * almost all cases.  If you are changing the algorithm be sure to verify that
2421 * almost all cases.  If you are changing the algorithm, be sure to verify
2422 * that the hash still provides an even distribution in all cases, regardless
2423 * of the order (sequential or random) in which a server returns filehandles.
2424static int
2425rtablehash(nfs_fhandle *fh)
2426{
2427	ulong_t hash, len, i;
2428	char *key;
2429
2430	key = fh->fh_buf;
2431	len = (ulong_t)fh->fh_len;
2432	for (hash = 0, i = 0; i < len; i++) {
2433		hash += key[i];
2434		hash += (hash << 10);
2435		hash ^= (hash >> 6);
2436	}
2437	hash += (hash << 3);
2438	hash ^= (hash >> 11);
2439	hash += (hash << 15);
2440	return (hash & rtablemask);
2441}
2442
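/*
 * The following is a minimal, user-level sketch of the kind of
 * verification suggested above: hash a batch of synthetic filehandles
 * with the same one-at-a-time algorithm and print the per-bucket
 * counts so the distribution can be eyeballed.  The bucket count and
 * the way the filehandles are generated are assumptions made for the
 * sketch only; it is not part of this module and is never compiled.
 */
#if 0 /* illustrative sketch, not compiled */
#include <stdio.h>
#include <string.h>

#define	NBUCKETS	256	/* example power-of-two table size */

static unsigned long
oaat_hash(const char *key, unsigned long len)
{
	unsigned long hash, i;

	for (hash = 0, i = 0; i < len; i++) {
		hash += (unsigned char)key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & (NBUCKETS - 1));
}

int
main(void)
{
	int counts[NBUCKETS] = { 0 };
	char fh[32];
	int i, b;

	/* hash sequentially numbered synthetic "filehandles" */
	for (i = 0; i < 100000; i++) {
		(void) memset(fh, 0, sizeof (fh));
		(void) snprintf(fh, sizeof (fh), "fh-%d", i);
		counts[oaat_hash(fh, sizeof (fh))]++;
	}
	for (b = 0; b < NBUCKETS; b++)
		(void) printf("bucket %3d: %d\n", b, counts[b]);
	return (0);
}
#endif
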
2443static vnode_t *
2444make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2445    struct vnodeops *vops,
2446    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2447    int (*compar)(const void *, const void *),
2448    int *newnode, cred_t *cr, char *dnm, char *nm)
2449{
2450	rnode_t *rp;
2451	rnode_t *trp;
2452	vnode_t *vp;
2453	mntinfo_t *mi;
2454
2455	ASSERT(RW_READ_HELD(&rhtp->r_lock));
2456
2457	mi = VFTOMI(vfsp);
2458start:
2459	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2460		vp = RTOV(rp);
2461		nfs_set_vroot(vp);
2462		*newnode = 0;
2463		return (vp);
2464	}
2465	rw_exit(&rhtp->r_lock);
2466
2467	mutex_enter(&rpfreelist_lock);
2468	if (rpfreelist != NULL && rnew >= nrnode) {
2469		rp = rpfreelist;
2470		rp_rmfree(rp);
2471		mutex_exit(&rpfreelist_lock);
2472
2473		vp = RTOV(rp);
2474
2475		if (rp->r_flags & RHASHED) {
2476			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2477			mutex_enter(&vp->v_lock);
2478			if (vp->v_count > 1) {
2479				vp->v_count--;
2480				mutex_exit(&vp->v_lock);
2481				rw_exit(&rp->r_hashq->r_lock);
2482				rw_enter(&rhtp->r_lock, RW_READER);
2483				goto start;
2484			}
2485			mutex_exit(&vp->v_lock);
2486			rp_rmhash_locked(rp);
2487			rw_exit(&rp->r_hashq->r_lock);
2488		}
2489
2490		rinactive(rp, cr);
2491
2492		mutex_enter(&vp->v_lock);
2493		if (vp->v_count > 1) {
2494			vp->v_count--;
2495			mutex_exit(&vp->v_lock);
2496			rw_enter(&rhtp->r_lock, RW_READER);
2497			goto start;
2498		}
2499		mutex_exit(&vp->v_lock);
2500		vn_invalid(vp);
2501		/*
2502		 * destroy old locks before bzero'ing and
2503		 * recreating the locks below.
2504		 */
2505		nfs_rw_destroy(&rp->r_rwlock);
2506		nfs_rw_destroy(&rp->r_lkserlock);
2507		mutex_destroy(&rp->r_statelock);
2508		cv_destroy(&rp->r_cv);
2509		cv_destroy(&rp->r_commit.c_cv);
2510		nfs_free_r_path(rp);
2511		avl_destroy(&rp->r_dir);
2512		/*
2513		 * Make sure that if the rnode is recycled, the
2514		 * VFS reference count is decremented properly
2515		 * before reuse.
2516		 */
2517		VFS_RELE(vp->v_vfsp);
2518		vn_reinit(vp);
2519	} else {
2520		vnode_t *new_vp;
2521
2522		mutex_exit(&rpfreelist_lock);
2523
2524		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2525		new_vp = vn_alloc(KM_SLEEP);
2526
2527		atomic_add_long((ulong_t *)&rnew, 1);
2528#ifdef DEBUG
2529		clstat_debug.nrnode.value.ui64++;
2530#endif
2531		vp = new_vp;
2532	}
2533
2534	bzero(rp, sizeof (*rp));
2535	rp->r_vnode = vp;
2536	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2537	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2538	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2539	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2540	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2541	rp->r_fh.fh_len = fh->fh_len;
2542	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2543	rp->r_server = mi->mi_curr_serv;
2544	if (FAILOVER_MOUNT(mi)) {
2545		/*
2546		 * If there are replicated servers, stash the pathnames
2547		 */
2548		if (dnm != NULL && nm != NULL) {
2549			char *s, *p;
2550			uint_t len;
2551
2552			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2553			rp->r_path = kmem_alloc(len, KM_SLEEP);
2554#ifdef DEBUG
2555			clstat_debug.rpath.value.ui64 += len;
2556#endif
2557			s = rp->r_path;
2558			for (p = dnm; *p; p++)
2559				*s++ = *p;
2560			*s++ = '/';
2561			for (p = nm; *p; p++)
2562				*s++ = *p;
2563			*s = '\0';
2564		} else {
2565			/* special case for root */
2566			rp->r_path = kmem_alloc(2, KM_SLEEP);
2567#ifdef DEBUG
2568			clstat_debug.rpath.value.ui64 += 2;
2569#endif
2570			*rp->r_path = '.';
2571			*(rp->r_path + 1) = '\0';
2572		}
2573	}
2574	VFS_HOLD(vfsp);
2575	rp->r_putapage = putapage;
2576	rp->r_hashq = rhtp;
2577	rp->r_flags = RREADDIRPLUS;
2578	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2579	    offsetof(rddir_cache, tree));
2580	vn_setops(vp, vops);
2581	vp->v_data = (caddr_t)rp;
2582	vp->v_vfsp = vfsp;
2583	vp->v_type = VNON;
2584	nfs_set_vroot(vp);
2585
2586	/*
2587	 * There is a race condition if someone else
2588	 * allocates the rnode while no locks are held, so we
2589	 * check again and, if one is found, use it instead.
2590	 */
2591	rw_enter(&rhtp->r_lock, RW_WRITER);
2592	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2593		vp = RTOV(trp);
2594		nfs_set_vroot(vp);
2595		*newnode = 0;
2596		rw_exit(&rhtp->r_lock);
2597		rp_addfree(rp, cr);
2598		rw_enter(&rhtp->r_lock, RW_READER);
2599		return (vp);
2600	}
2601	rp_addhash(rp);
2602	*newnode = 1;
2603	return (vp);
2604}
2605
2606static void
2607nfs_set_vroot(vnode_t *vp)
2608{
2609	rnode_t *rp;
2610	nfs_fhandle *rootfh;
2611
2612	rp = VTOR(vp);
2613	rootfh = &rp->r_server->sv_fhandle;
2614	if (rootfh->fh_len == rp->r_fh.fh_len &&
2615	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2616		if (!(vp->v_flag & VROOT)) {
2617			mutex_enter(&vp->v_lock);
2618			vp->v_flag |= VROOT;
2619			mutex_exit(&vp->v_lock);
2620		}
2621	}
2622}
2623
2624static void
2625nfs_free_r_path(rnode_t *rp)
2626{
2627	char *path;
2628	size_t len;
2629
2630	path = rp->r_path;
2631	if (path) {
2632		rp->r_path = NULL;
2633		len = strlen(path) + 1;
2634		kmem_free(path, len);
2635#ifdef DEBUG
2636		clstat_debug.rpath.value.ui64 -= len;
2637#endif
2638	}
2639}
2640
2641/*
2642 * Put an rnode on the free list.
2643 *
2644 * Rnodes which were allocated above and beyond the normal limit
2645 * are immediately freed.
2646 */
2647void
2648rp_addfree(rnode_t *rp, cred_t *cr)
2649{
2650	vnode_t *vp;
2651	struct vfs *vfsp;
2652
2653	vp = RTOV(rp);
2654	ASSERT(vp->v_count >= 1);
2655	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2656
2657	/*
2658	 * If we have too many rnodes allocated and there are no
2659	 * references to this rnode, or if the rnode is no longer
2660	 * accessible because it does not reside in the hash queues,
2661	 * or if an i/o error occurred while writing to the file,
2662	 * then just free it instead of putting it on the rnode
2663	 * freelist.
2664	 */
2665	vfsp = vp->v_vfsp;
2666	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2667	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2668		if (rp->r_flags & RHASHED) {
2669			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2670			mutex_enter(&vp->v_lock);
2671			if (vp->v_count > 1) {
2672				vp->v_count--;
2673				mutex_exit(&vp->v_lock);
2674				rw_exit(&rp->r_hashq->r_lock);
2675				return;
2676			}
2677			mutex_exit(&vp->v_lock);
2678			rp_rmhash_locked(rp);
2679			rw_exit(&rp->r_hashq->r_lock);
2680		}
2681
2682		rinactive(rp, cr);
2683
2684		/*
2685		 * Recheck the vnode reference count.  We need to
2686		 * make sure that another reference has not been
2687		 * acquired while we were not holding v_lock.  The
2688		 * rnode is not in the rnode hash queues, so the
2689		 * only way for a reference to have been acquired
2690		 * is for a VOP_PUTPAGE because the rnode was marked
2691		 * with RDIRTY or for a modified page.  This
2692		 * reference may have been acquired before our call
2693		 * to rinactive.  The i/o may have been completed,
2694		 * thus allowing rinactive to complete, but the
2695		 * reference to the vnode may not have been released
2696		 * yet.  In any case, the rnode can not be destroyed
2697		 * until the other references to this vnode have been
2698		 * released.  The other references will take care of
2699		 * either destroying the rnode or placing it on the
2700		 * rnode freelist.  If there are no other references,
2701		 * then the rnode may be safely destroyed.
2702		 */
2703		mutex_enter(&vp->v_lock);
2704		if (vp->v_count > 1) {
2705			vp->v_count--;
2706			mutex_exit(&vp->v_lock);
2707			return;
2708		}
2709		mutex_exit(&vp->v_lock);
2710
2711		destroy_rnode(rp);
2712		return;
2713	}
2714
2715	/*
2716	 * Lock the hash queue and then recheck the reference count
2717	 * to ensure that no other threads have acquired a reference
2718	 * to indicate that the rnode should not be placed on the
2719	 * freelist.  If another reference has been acquired, then
2720	 * just release this one and let the other thread complete
2721	 * the processing of adding this rnode to the freelist.
2722	 */
2723	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2724
2725	mutex_enter(&vp->v_lock);
2726	if (vp->v_count > 1) {
2727		vp->v_count--;
2728		mutex_exit(&vp->v_lock);
2729		rw_exit(&rp->r_hashq->r_lock);
2730		return;
2731	}
2732	mutex_exit(&vp->v_lock);
2733
2734	/*
2735	 * If there is no cached data or metadata for this file, then
2736	 * put the rnode on the front of the freelist so that it will
2737	 * be reused before other rnodes which may have cached data or
2738	 * metadata associated with them.
2739	 */
2740	mutex_enter(&rpfreelist_lock);
2741	if (rpfreelist == NULL) {
2742		rp->r_freef = rp;
2743		rp->r_freeb = rp;
2744		rpfreelist = rp;
2745	} else {
2746		rp->r_freef = rpfreelist;
2747		rp->r_freeb = rpfreelist->r_freeb;
2748		rpfreelist->r_freeb->r_freef = rp;
2749		rpfreelist->r_freeb = rp;
2750		if (!vn_has_cached_data(vp) &&
2751		    !HAVE_RDDIR_CACHE(rp) &&
2752		    rp->r_symlink.contents == NULL &&
2753		    rp->r_secattr == NULL &&
2754		    rp->r_pathconf == NULL)
2755			rpfreelist = rp;
2756	}
2757	mutex_exit(&rpfreelist_lock);
2758
2759	rw_exit(&rp->r_hashq->r_lock);
2760}
2761
2762/*
2763 * Remove an rnode from the free list.
2764 *
2765 * The caller must be holding rpfreelist_lock and the rnode
2766 * must be on the freelist.
2767 */
2768static void
2769rp_rmfree(rnode_t *rp)
2770{
2771
2772	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2773	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2774
2775	if (rp == rpfreelist) {
2776		rpfreelist = rp->r_freef;
2777		if (rp == rpfreelist)
2778			rpfreelist = NULL;
2779	}
2780
2781	rp->r_freeb->r_freef = rp->r_freef;
2782	rp->r_freef->r_freeb = rp->r_freeb;
2783
2784	rp->r_freef = rp->r_freeb = NULL;
2785}
2786
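/*
 * Note on the freelist representation used by rp_addfree() and
 * rp_rmfree() above: the freelist is circular and doubly linked
 * through r_freef/r_freeb, so a single-element list has
 * rp->r_freef == rp == rp->r_freeb.  This is the case rp_rmfree()
 * handles when rpfreelist still points at the rnode being removed
 * after following r_freef.
 */
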
2787/*
2788 * Put an rnode in the hash table.
2789 *
2790 * The caller must be holding the exclusive hash queue lock.
2791 */
2792static void
2793rp_addhash(rnode_t *rp)
2794{
2795
2796	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2797	ASSERT(!(rp->r_flags & RHASHED));
2798
2799	rp->r_hashf = rp->r_hashq->r_hashf;
2800	rp->r_hashq->r_hashf = rp;
2801	rp->r_hashb = (rnode_t *)rp->r_hashq;
2802	rp->r_hashf->r_hashb = rp;
2803
2804	mutex_enter(&rp->r_statelock);
2805	rp->r_flags |= RHASHED;
2806	mutex_exit(&rp->r_statelock);
2807}
2808
2809/*
2810 * Remove an rnode from the hash table.
2811 *
2812 * The caller must be holding the hash queue lock.
2813 */
2814static void
2815rp_rmhash_locked(rnode_t *rp)
2816{
2817
2818	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2819	ASSERT(rp->r_flags & RHASHED);
2820
2821	rp->r_hashb->r_hashf = rp->r_hashf;
2822	rp->r_hashf->r_hashb = rp->r_hashb;
2823
2824	mutex_enter(&rp->r_statelock);
2825	rp->r_flags &= ~RHASHED;
2826	mutex_exit(&rp->r_statelock);
2827}
2828
2829/*
2830 * Remove an rnode from the hash table.
2831 *
2832 * The caller must not be holding the hash queue lock.
2833 */
2834void
2835rp_rmhash(rnode_t *rp)
2836{
2837
2838	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2839	rp_rmhash_locked(rp);
2840	rw_exit(&rp->r_hashq->r_lock);
2841}
2842
2843/*
2844 * Look up an rnode by fhandle.
2845 *
2846 * The caller must be holding the hash queue lock, either shared or exclusive.
2847 */
2848static rnode_t *
2849rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2850{
2851	rnode_t *rp;
2852	vnode_t *vp;
2853
2854	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2855
2856	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2857		vp = RTOV(rp);
2858		if (vp->v_vfsp == vfsp &&
2859		    rp->r_fh.fh_len == fh->fh_len &&
2860		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2861			/*
2862			 * remove rnode from free list, if necessary.
2863			 */
2864			if (rp->r_freef != NULL) {
2865				mutex_enter(&rpfreelist_lock);
2866				/*
2867				 * If the rnode is on the freelist,
2868				 * then remove it and use that reference
2869				 * as the new reference.  Otherwise,
2870				 * need to increment the reference count.
2871				 */
2872				if (rp->r_freef != NULL) {
2873					rp_rmfree(rp);
2874					mutex_exit(&rpfreelist_lock);
2875				} else {
2876					mutex_exit(&rpfreelist_lock);
2877					VN_HOLD(vp);
2878				}
2879			} else
2880				VN_HOLD(vp);
2881			return (rp);
2882		}
2883	}
2884	return (NULL);
2885}
2886
2887/*
2888 * Return 1 if there is an active vnode belonging to this vfs in the
2889 * rtable cache.
2890 *
2891 * Several of these checks are done without holding the usual
2892 * locks.  This is safe because destroy_rtable(), rp_addfree(),
2893 * etc. will redo the necessary checks before actually destroying
2894 * any rnodes.
2895 */
2896int
2897check_rtable(struct vfs *vfsp)
2898{
2899	int index;
2900	rnode_t *rp;
2901	vnode_t *vp;
2902
2903	for (index = 0; index < rtablesize; index++) {
2904		rw_enter(&rtable[index].r_lock, RW_READER);
2905		for (rp = rtable[index].r_hashf;
2906		    rp != (rnode_t *)(&rtable[index]);
2907		    rp = rp->r_hashf) {
2908			vp = RTOV(rp);
2909			if (vp->v_vfsp == vfsp) {
2910				if (rp->r_freef == NULL ||
2911				    (vn_has_cached_data(vp) &&
2912				    (rp->r_flags & RDIRTY)) ||
2913				    rp->r_count > 0) {
2914					rw_exit(&rtable[index].r_lock);
2915					return (1);
2916				}
2917			}
2918		}
2919		rw_exit(&rtable[index].r_lock);
2920	}
2921	return (0);
2922}
2923
2924/*
2925 * Destroy inactive vnodes from the hash queues which belong to this
2926 * vfs.  It is essential that we destroy all inactive vnodes during a
2927 * forced unmount as well as during a normal unmount.
2928 */
2929void
2930destroy_rtable(struct vfs *vfsp, cred_t *cr)
2931{
2932	int index;
2933	rnode_t *rp;
2934	rnode_t *rlist;
2935	rnode_t *r_hashf;
2936	vnode_t *vp;
2937
2938	rlist = NULL;
2939
2940	for (index = 0; index < rtablesize; index++) {
2941		rw_enter(&rtable[index].r_lock, RW_WRITER);
2942		for (rp = rtable[index].r_hashf;
2943		    rp != (rnode_t *)(&rtable[index]);
2944		    rp = r_hashf) {
2945			/* save the hash pointer before destroying */
2946			r_hashf = rp->r_hashf;
2947			vp = RTOV(rp);
2948			if (vp->v_vfsp == vfsp) {
2949				mutex_enter(&rpfreelist_lock);
2950				if (rp->r_freef != NULL) {
2951					rp_rmfree(rp);
2952					mutex_exit(&rpfreelist_lock);
2953					rp_rmhash_locked(rp);
2954					rp->r_hashf = rlist;
2955					rlist = rp;
2956				} else
2957					mutex_exit(&rpfreelist_lock);
2958			}
2959		}
2960		rw_exit(&rtable[index].r_lock);
2961	}
2962
2963	for (rp = rlist; rp != NULL; rp = rlist) {
2964		rlist = rp->r_hashf;
2965		/*
2966		 * This call to rp_addfree will end up destroying the
2967		 * rnode, but in a safe way with the appropriate set
2968		 * of checks done.
2969		 */
2970		rp_addfree(rp, cr);
2971	}
2972
2973}
2974
2975/*
2976 * This routine destroys all the resources associated with the rnode
2977 * and then the rnode itself.
2978 */
2979static void
2980destroy_rnode(rnode_t *rp)
2981{
2982	vnode_t *vp;
2983	vfs_t *vfsp;
2984
2985	vp = RTOV(rp);
2986	vfsp = vp->v_vfsp;
2987
2988	ASSERT(vp->v_count == 1);
2989	ASSERT(rp->r_count == 0);
2990	ASSERT(rp->r_lmpl == NULL);
2991	ASSERT(rp->r_mapcnt == 0);
2992	ASSERT(!(rp->r_flags & RHASHED));
2993	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2994	atomic_add_long((ulong_t *)&rnew, -1);
2995#ifdef DEBUG
2996	clstat_debug.nrnode.value.ui64--;
2997#endif
2998	nfs_rw_destroy(&rp->r_rwlock);
2999	nfs_rw_destroy(&rp->r_lkserlock);
3000	mutex_destroy(&rp->r_statelock);
3001	cv_destroy(&rp->r_cv);
3002	cv_destroy(&rp->r_commit.c_cv);
3003	if (rp->r_flags & RDELMAPLIST)
3004		list_destroy(&rp->r_indelmap);
3005	nfs_free_r_path(rp);
3006	avl_destroy(&rp->r_dir);
3007	vn_invalid(vp);
3008	vn_free(vp);
3009	kmem_cache_free(rnode_cache, rp);
3010	VFS_RELE(vfsp);
3011}
3012
3013/*
3014 * Flush all vnodes in this (or every) vfs.
3015 * Used by nfs_sync and by nfs_unmount.
3016 */
3017void
3018rflush(struct vfs *vfsp, cred_t *cr)
3019{
3020	int index;
3021	rnode_t *rp;
3022	vnode_t *vp, **vplist;
3023	long num, cnt;
3024
3025	/*
3026	 * Check to see whether there is anything to do.
3027	 */
3028	num = rnew;
3029	if (num == 0)
3030		return;
3031
3032	/*
3033	 * Allocate a slot for all currently active rnodes on the
3034	 * supposition that they all may need flushing.
3035	 */
3036	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3037	cnt = 0;
3038
3039	/*
3040	 * Walk the hash queues looking for rnodes with page
3041	 * lists associated with them.  Make a list of these
3042	 * files.
3043	 */
3044	for (index = 0; index < rtablesize; index++) {
3045		rw_enter(&rtable[index].r_lock, RW_READER);
3046		for (rp = rtable[index].r_hashf;
3047		    rp != (rnode_t *)(&rtable[index]);
3048		    rp = rp->r_hashf) {
3049			vp = RTOV(rp);
3050			/*
3051			 * Don't bother sync'ing a vp if it
3052			 * is part of the virtual swap device or
3053			 * if the VFS is read-only
3054			 */
3055			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3056				continue;
3057			/*
3058			 * If flushing all mounted file systems or
3059			 * the vnode belongs to this vfs, has pages
3060			 * and is marked as either dirty or mmap'd,
3061			 * hold and add this vnode to the list of
3062			 * vnodes to flush.
3063			 */
3064			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3065			    vn_has_cached_data(vp) &&
3066			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3067				VN_HOLD(vp);
3068				vplist[cnt++] = vp;
3069				if (cnt == num) {
3070					rw_exit(&rtable[index].r_lock);
3071					goto toomany;
3072				}
3073			}
3074		}
3075		rw_exit(&rtable[index].r_lock);
3076	}
3077toomany:
3078
3079	/*
3080	 * Flush and release all of the files on the list.
3081	 */
3082	while (cnt-- > 0) {
3083		vp = vplist[cnt];
3084		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr);
3085		VN_RELE(vp);
3086	}
3087
3088	/*
3089	 * Free the space allocated to hold the list.
3090	 */
3091	kmem_free(vplist, num * sizeof (*vplist));
3092}
3093
3094/*
3095 * This probably needs to be larger than or equal to
3096 * log2(sizeof (struct rnode)) due to the way that rnodes are
3097 * allocated.
3098 */
3099#define	ACACHE_SHIFT_BITS	9
3100
3101static int
3102acachehash(rnode_t *rp, cred_t *cr)
3103{
3104
3105	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3106	    acachemask);
3107}
3108
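/*
 * Both nfs_access_check() and nfs_access_cache() below hash on the
 * (rnode, credential uid) pair, so cached access bits for the same
 * file but different users will normally live in different hash
 * buckets, and a full crcmp() is only done within a bucket.
 */
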
3109#ifdef DEBUG
3110static long nfs_access_cache_hits = 0;
3111static long nfs_access_cache_misses = 0;
3112#endif
3113
3114nfs_access_type_t
3115nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3116{
3117	vnode_t *vp;
3118	acache_t *ap;
3119	acache_hash_t *hp;
3120	nfs_access_type_t all;
3121
3122	vp = RTOV(rp);
3123	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3124		return (NFS_ACCESS_UNKNOWN);
3125
3126	if (rp->r_acache != NULL) {
3127		hp = &acache[acachehash(rp, cr)];
3128		rw_enter(&hp->lock, RW_READER);
3129		ap = hp->next;
3130		while (ap != (acache_t *)hp) {
3131			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3132				if ((ap->known & acc) == acc) {
3133#ifdef DEBUG
3134					nfs_access_cache_hits++;
3135#endif
3136					if ((ap->allowed & acc) == acc)
3137						all = NFS_ACCESS_ALLOWED;
3138					else
3139						all = NFS_ACCESS_DENIED;
3140				} else {
3141#ifdef DEBUG
3142					nfs_access_cache_misses++;
3143#endif
3144					all = NFS_ACCESS_UNKNOWN;
3145				}
3146				rw_exit(&hp->lock);
3147				return (all);
3148			}
3149			ap = ap->next;
3150		}
3151		rw_exit(&hp->lock);
3152	}
3153
3154#ifdef DEBUG
3155	nfs_access_cache_misses++;
3156#endif
3157	return (NFS_ACCESS_UNKNOWN);
3158}
3159
3160void
3161nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3162{
3163	acache_t *ap;
3164	acache_t *nap;
3165	acache_hash_t *hp;
3166
3167	hp = &acache[acachehash(rp, cr)];
3168
3169	/*
3170	 * Allocate now, assuming that an allocation will usually be
3171	 * required.  This allows the allocation to happen without
3172	 * holding the hash bucket locked.
3173	 */
3174	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3175	if (nap != NULL) {
3176		nap->known = acc;
3177		nap->allowed = resacc;
3178		nap->rnode = rp;
3179		crhold(cr);
3180		nap->cred = cr;
3181		nap->hashq = hp;
3182	}
3183
3184	rw_enter(&hp->lock, RW_WRITER);
3185
3186	if (rp->r_acache != NULL) {
3187		ap = hp->next;
3188		while (ap != (acache_t *)hp) {
3189			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3190				ap->known |= acc;
3191				ap->allowed &= ~acc;
3192				ap->allowed |= resacc;
3193				rw_exit(&hp->lock);
3194				if (nap != NULL) {
3195					crfree(nap->cred);
3196					kmem_cache_free(acache_cache, nap);
3197				}
3198				return;
3199			}
3200			ap = ap->next;
3201		}
3202	}
3203
3204	if (nap != NULL) {
3205#ifdef DEBUG
3206		clstat_debug.access.value.ui64++;
3207#endif
3208		nap->next = hp->next;
3209		hp->next = nap;
3210		nap->next->prev = nap;
3211		nap->prev = (acache_t *)hp;
3212
3213		mutex_enter(&rp->r_statelock);
3214		nap->list = rp->r_acache;
3215		rp->r_acache = nap;
3216		mutex_exit(&rp->r_statelock);
3217	}
3218
3219	rw_exit(&hp->lock);
3220}
3221
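/*
 * A minimal sketch (not part of this file, never compiled) of how a
 * caller is expected to combine nfs_access_check() and
 * nfs_access_cache().  The over-the-wire ACCESS request is elided and
 * resacc below is only a placeholder for the server's reply.
 */
#if 0 /* illustrative sketch, not compiled */
static int
example_access(rnode_t *rp, uint32_t acc, cred_t *cr)
{
	nfs_access_type_t cacheval;
	uint32_t resacc;

	cacheval = nfs_access_check(rp, acc, cr);
	if (cacheval == NFS_ACCESS_UNKNOWN) {
		/*
		 * Ask the server which of the bits in acc are
		 * permitted (elided) and cache the reply so that the
		 * next check for this (rnode, cred) pair stays local.
		 */
		resacc = acc;		/* placeholder server reply */
		nfs_access_cache(rp, acc, resacc, cr);
		cacheval = ((resacc & acc) == acc) ?
		    NFS_ACCESS_ALLOWED : NFS_ACCESS_DENIED;
	}
	return (cacheval == NFS_ACCESS_ALLOWED ? 0 : EACCES);
}
#endif
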
3222int
3223nfs_access_purge_rp(rnode_t *rp)
3224{
3225	acache_t *ap;
3226	acache_t *tmpap;
3227	acache_t *rplist;
3228
3229	/*
3230	 * If there aren't any cached entries, then there is nothing
3231	 * to free.
3232	 */
3233	if (rp->r_acache == NULL)
3234		return (0);
3235
3236	mutex_enter(&rp->r_statelock);
3237	rplist = rp->r_acache;
3238	rp->r_acache = NULL;
3239	mutex_exit(&rp->r_statelock);
3240
3241	/*
3242	 * Loop through each entry in the list pointed to by the
3243	 * rnode.  Remove each of these entries from the hash
3244	 * queue that it is on and remove it from the list in
3245	 * the rnode.
3246	 */
3247	for (ap = rplist; ap != NULL; ap = tmpap) {
3248		rw_enter(&ap->hashq->lock, RW_WRITER);
3249		ap->prev->next = ap->next;
3250		ap->next->prev = ap->prev;
3251		rw_exit(&ap->hashq->lock);
3252
3253		tmpap = ap->list;
3254		crfree(ap->cred);
3255		kmem_cache_free(acache_cache, ap);
3256#ifdef DEBUG
3257		clstat_debug.access.value.ui64--;
3258#endif
3259	}
3260
3261	return (1);
3262}
3263
3264static const char prefix[] = ".nfs";
3265
3266static kmutex_t newnum_lock;
3267
3268int
3269newnum(void)
3270{
3271	static uint_t newnum = 0;
3272	uint_t id;
3273
3274	mutex_enter(&newnum_lock);
3275	if (newnum == 0)
3276		newnum = gethrestime_sec() & 0xffff;
3277	id = newnum++;
3278	mutex_exit(&newnum_lock);
3279	return (id);
3280}
3281
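/*
 * Generate a temporary name of the form ".nfsXXXX" (used, for example,
 * to rename a file that is removed while still open).  Note that the
 * hex digits are emitted least-significant nibble first, so an id of
 * 0x1234 produces ".nfs4321".
 */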
3282char *
3283newname(void)
3284{
3285	char *news;
3286	char *s;
3287	const char *p;
3288	uint_t id;
3289
3290	id = newnum();
3291	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3292	s = news;
3293	p = prefix;
3294	while (*p != '\0')
3295		*s++ = *p++;
3296	while (id != 0) {
3297		*s++ = "0123456789ABCDEF"[id & 0x0f];
3298		id >>= 4;
3299	}
3300	*s = '\0';
3301	return (news);
3302}
3303
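/*
 * Convert a string of decimal digits to an int.  The caller is trusted
 * to pass only digits; there is no sign, whitespace, or overflow
 * handling here.
 */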
3304int
3305nfs_atoi(char *cp)
3306{
3307	int n;
3308
3309	n = 0;
3310	while (*cp != '\0') {
3311		n = n * 10 + (*cp - '0');
3312		cp++;
3313	}
3314
3315	return (n);
3316}
3317
3318/*
3319 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3320 * framework.
3321 */
3322static int
3323cl_snapshot(kstat_t *ksp, void *buf, int rw)
3324{
3325	ksp->ks_snaptime = gethrtime();
3326	if (rw == KSTAT_WRITE) {
3327		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3328#ifdef DEBUG
3329		/*
3330		 * Currently only the global zone can write to kstats, but we
3331		 * add the check just for paranoia.
3332		 */
3333		if (INGLOBALZONE(curproc))
3334			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3335			    sizeof (clstat_debug));
3336#endif
3337	} else {
3338		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3339#ifdef DEBUG
3340		/*
3341		 * If we're displaying the "global" debug kstat values, we
3342		 * display them as-is to all zones since in fact they apply to
3343		 * the system as a whole.
3344		 */
3345		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3346		    sizeof (clstat_debug));
3347#endif
3348	}
3349	return (0);
3350}
3351
3352static void *
3353clinit_zone(zoneid_t zoneid)
3354{
3355	kstat_t *nfs_client_kstat;
3356	struct nfs_clnt *nfscl;
3357	uint_t ndata;
3358
3359	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3360	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3361	nfscl->nfscl_chtable = NULL;
3362	nfscl->nfscl_zoneid = zoneid;
3363
3364	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3365	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3366#ifdef DEBUG
3367	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3368#endif
3369	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3370	    "misc", KSTAT_TYPE_NAMED, ndata,
3371	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3372		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3373		nfs_client_kstat->ks_snapshot = cl_snapshot;
3374		kstat_install(nfs_client_kstat);
3375	}
3376	mutex_enter(&nfs_clnt_list_lock);
3377	list_insert_head(&nfs_clnt_list, nfscl);
3378	mutex_exit(&nfs_clnt_list_lock);
3379	return (nfscl);
3380}
3381
3382/*ARGSUSED*/
3383static void
3384clfini_zone(zoneid_t zoneid, void *arg)
3385{
3386	struct nfs_clnt *nfscl = arg;
3387	chhead_t *chp, *next;
3388
3389	if (nfscl == NULL)
3390		return;
3391	mutex_enter(&nfs_clnt_list_lock);
3392	list_remove(&nfs_clnt_list, nfscl);
3393	mutex_exit(&nfs_clnt_list_lock);
3394	clreclaim_zone(nfscl, 0);
3395	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3396		ASSERT(chp->ch_list == NULL);
3397		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3398		next = chp->ch_next;
3399		kmem_free(chp, sizeof (*chp));
3400	}
3401	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3402	mutex_destroy(&nfscl->nfscl_chtable_lock);
3403	kmem_free(nfscl, sizeof (*nfscl));
3404}
3405
3406/*
3407 * Called by endpnt_destructor to make sure the client handles are
3408 * cleaned up before the RPC endpoints.  This becomes a no-op if
3409 * clfini_zone (above) is called first.  This function is needed
3410 * (rather than relying on clfini_zone to clean up) because the ZSD
3411 * callbacks have no ordering mechanism, so we have no way to ensure
3412 * that clfini_zone is called before endpnt_destructor.
3413 */
3414void
3415clcleanup_zone(zoneid_t zoneid)
3416{
3417	struct nfs_clnt *nfscl;
3418
3419	mutex_enter(&nfs_clnt_list_lock);
3420	nfscl = list_head(&nfs_clnt_list);
3421	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3422		if (nfscl->nfscl_zoneid == zoneid) {
3423			clreclaim_zone(nfscl, 0);
3424			break;
3425		}
3426	}
3427	mutex_exit(&nfs_clnt_list_lock);
3428}
3429
3430int
3431nfs_subrinit(void)
3432{
3433	int i;
3434	ulong_t nrnode_max;
3435
3436	/*
3437	 * Allocate and initialize the rnode hash queues
3438	 */
3439	if (nrnode <= 0)
3440		nrnode = ncsize;
3441	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3442	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3443		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3444		    "setting nrnode to max value of %ld", nrnode_max);
3445		nrnode = nrnode_max;
3446	}
3447
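	/*
	 * Size the rnode hash table to a power of two based on the
	 * desired average hash chain length.  As an illustrative example
	 * (the tunable values here are assumptions, not the defaults):
	 * with nrnode of 4000 and hashlen of 4, nrnode / hashlen is 1000,
	 * highbit(1000) is 10, and rtablesize becomes 1024 buckets.
	 */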
3448	rtablesize = 1 << highbit(nrnode / hashlen);
3449	rtablemask = rtablesize - 1;
3450	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3451	for (i = 0; i < rtablesize; i++) {
3452		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3453		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3454		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3455	}
3456	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3457	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3458
3459	/*
3460	 * Allocate and initialize the access cache
3461	 */
3462
3463	/*
3464	 * The initial guess is one access cache entry per rnode, unless
3465	 * nacache is set to a non-zero value, in which case nacache is
3466	 * used as the guess at the number of access cache entries.
3467	 */
3468	if (nacache > 0)
3469		acachesize = 1 << highbit(nacache / hashlen);
3470	else
3471		acachesize = rtablesize;
3472	acachemask = acachesize - 1;
3473	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3474	for (i = 0; i < acachesize; i++) {
3475		acache[i].next = (acache_t *)&acache[i];
3476		acache[i].prev = (acache_t *)&acache[i];
3477		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3478	}
3479	acache_cache = kmem_cache_create("nfs_access_cache",
3480	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3481	/*
3482	 * Allocate and initialize the client handle cache
3483	 */
3484	chtab_cache = kmem_cache_create("client_handle_cache",
3485	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL,
3486	    NULL, 0);
3487	/*
3488	 * Initialize the list of per-zone client handles (and associated data).
3489	 * This needs to be done before we call zone_key_create().
3490	 */
3491	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3492	    offsetof(struct nfs_clnt, nfscl_node));
3493	/*
3494	 * Initialize the zone_key for per-zone client handle lists.
3495	 */
3496	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3497	/*
3498	 * Initialize the various mutexes and reader/writer locks
3499	 */
3500	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3501	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3502	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3503
3504	/*
3505	 * Assign unique major number for all nfs mounts
3506	 */
3507	if ((nfs_major = getudev()) == -1) {
3508		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3509		    "nfs: init: can't get unique device number");
3510		nfs_major = 0;
3511	}
3512	nfs_minor = 0;
3513
3514	if (nfs3_jukebox_delay == 0)
3515		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3516
3517	return (0);
3518}
3519
3520void
3521nfs_subrfini(void)
3522{
3523	int i;
3524
3525	/*
3526	 * Deallocate the rnode hash queues
3527	 */
3528	kmem_cache_destroy(rnode_cache);
3529
3530	for (i = 0; i < rtablesize; i++)
3531		rw_destroy(&rtable[i].r_lock);
3532	kmem_free(rtable, rtablesize * sizeof (*rtable));
3533
3534	/*
3535	 * Deallocate the access cache
3536	 */
3537	kmem_cache_destroy(acache_cache);
3538
3539	for (i = 0; i < acachesize; i++)
3540		rw_destroy(&acache[i].lock);
3541	kmem_free(acache, acachesize * sizeof (*acache));
3542
3543	/*
3544	 * Deallocate the client handle cache
3545	 */
3546	kmem_cache_destroy(chtab_cache);
3547
3548	/*
3549	 * Destroy the various mutexes and reader/writer locks
3550	 */
3551	mutex_destroy(&rpfreelist_lock);
3552	mutex_destroy(&newnum_lock);
3553	mutex_destroy(&nfs_minor_lock);
3554	(void) zone_key_delete(nfsclnt_zone_key);
3555}
3556
3557enum nfsstat
3558puterrno(int error)
3559{
3560
3561	switch (error) {
3562	case EOPNOTSUPP:
3563		return (NFSERR_OPNOTSUPP);
3564	case ENAMETOOLONG:
3565		return (NFSERR_NAMETOOLONG);
3566	case ENOTEMPTY:
3567		return (NFSERR_NOTEMPTY);
3568	case EDQUOT:
3569		return (NFSERR_DQUOT);
3570	case ESTALE:
3571		return (NFSERR_STALE);
3572	case EREMOTE:
3573		return (NFSERR_REMOTE);
3574	case ENOSYS:
3575		return (NFSERR_OPNOTSUPP);
3576	case EOVERFLOW:
3577		return (NFSERR_INVAL);
3578	default:
3579		return ((enum nfsstat)error);
3580	}
3581	/* NOTREACHED */
3582}
3583
3584int
3585geterrno(enum nfsstat status)
3586{
3587
3588	switch (status) {
3589	case NFSERR_OPNOTSUPP:
3590		return (EOPNOTSUPP);
3591	case NFSERR_NAMETOOLONG:
3592		return (ENAMETOOLONG);
3593	case NFSERR_NOTEMPTY:
3594		return (ENOTEMPTY);
3595	case NFSERR_DQUOT:
3596		return (EDQUOT);
3597	case NFSERR_STALE:
3598		return (ESTALE);
3599	case NFSERR_REMOTE:
3600		return (EREMOTE);
3601	case NFSERR_WFLUSH:
3602		return (EIO);
3603	default:
3604		return ((int)status);
3605	}
3606	/* NOTREACHED */
3607}
3608
3609enum nfsstat3
3610puterrno3(int error)
3611{
3612
3613#ifdef DEBUG
3614	switch (error) {
3615	case 0:
3616		return (NFS3_OK);
3617	case EPERM:
3618		return (NFS3ERR_PERM);
3619	case ENOENT:
3620		return (NFS3ERR_NOENT);
3621	case EIO:
3622		return (NFS3ERR_IO);
3623	case ENXIO:
3624		return (NFS3ERR_NXIO);
3625	case EACCES:
3626		return (NFS3ERR_ACCES);
3627	case EEXIST:
3628		return (NFS3ERR_EXIST);
3629	case EXDEV:
3630		return (NFS3ERR_XDEV);
3631	case ENODEV:
3632		return (NFS3ERR_NODEV);
3633	case ENOTDIR:
3634		return (NFS3ERR_NOTDIR);
3635	case EISDIR:
3636		return (NFS3ERR_ISDIR);
3637	case EINVAL:
3638		return (NFS3ERR_INVAL);
3639	case EFBIG:
3640		return (NFS3ERR_FBIG);
3641	case ENOSPC:
3642		return (NFS3ERR_NOSPC);
3643	case EROFS:
3644		return (NFS3ERR_ROFS);
3645	case EMLINK:
3646		return (NFS3ERR_MLINK);
3647	case ENAMETOOLONG:
3648		return (NFS3ERR_NAMETOOLONG);
3649	case ENOTEMPTY:
3650		return (NFS3ERR_NOTEMPTY);
3651	case EDQUOT:
3652		return (NFS3ERR_DQUOT);
3653	case ESTALE:
3654		return (NFS3ERR_STALE);
3655	case EREMOTE:
3656		return (NFS3ERR_REMOTE);
3657	case EOPNOTSUPP:
3658		return (NFS3ERR_NOTSUPP);
3659	case EOVERFLOW:
3660		return (NFS3ERR_INVAL);
3661	default:
3662		zcmn_err(getzoneid(), CE_WARN,
3663		    "puterrno3: got error %d", error);
3664		return ((enum nfsstat3)error);
3665	}
3666#else
3667	switch (error) {
3668	case ENAMETOOLONG:
3669		return (NFS3ERR_NAMETOOLONG);
3670	case ENOTEMPTY:
3671		return (NFS3ERR_NOTEMPTY);
3672	case EDQUOT:
3673		return (NFS3ERR_DQUOT);
3674	case ESTALE:
3675		return (NFS3ERR_STALE);
3676	case EOPNOTSUPP:
3677		return (NFS3ERR_NOTSUPP);
3678	case EREMOTE:
3679		return (NFS3ERR_REMOTE);
3680	case EOVERFLOW:
3681		return (NFS3ERR_INVAL);
3682	default:
3683		return ((enum nfsstat3)error);
3684	}
3685#endif
3686}
3687
3688int
3689geterrno3(enum nfsstat3 status)
3690{
3691
3692#ifdef DEBUG
3693	switch (status) {
3694	case NFS3_OK:
3695		return (0);
3696	case NFS3ERR_PERM:
3697		return (EPERM);
3698	case NFS3ERR_NOENT:
3699		return (ENOENT);
3700	case NFS3ERR_IO:
3701		return (EIO);
3702	case NFS3ERR_NXIO:
3703		return (ENXIO);
3704	case NFS3ERR_ACCES:
3705		return (EACCES);
3706	case NFS3ERR_EXIST:
3707		return (EEXIST);
3708	case NFS3ERR_XDEV:
3709		return (EXDEV);
3710	case NFS3ERR_NODEV:
3711		return (ENODEV);
3712	case NFS3ERR_NOTDIR:
3713		return (ENOTDIR);
3714	case NFS3ERR_ISDIR:
3715		return (EISDIR);
3716	case NFS3ERR_INVAL:
3717		return (EINVAL);
3718	case NFS3ERR_FBIG:
3719		return (EFBIG);
3720	case NFS3ERR_NOSPC:
3721		return (ENOSPC);
3722	case NFS3ERR_ROFS:
3723		return (EROFS);
3724	case NFS3ERR_MLINK:
3725		return (EMLINK);
3726	case NFS3ERR_NAMETOOLONG:
3727		return (ENAMETOOLONG);
3728	case NFS3ERR_NOTEMPTY:
3729		return (ENOTEMPTY);
3730	case NFS3ERR_DQUOT:
3731		return (EDQUOT);
3732	case NFS3ERR_STALE:
3733		return (ESTALE);
3734	case NFS3ERR_REMOTE:
3735		return (EREMOTE);
3736	case NFS3ERR_BADHANDLE:
3737		return (ESTALE);
3738	case NFS3ERR_NOT_SYNC:
3739		return (EINVAL);
3740	case NFS3ERR_BAD_COOKIE:
3741		return (ENOENT);
3742	case NFS3ERR_NOTSUPP:
3743		return (EOPNOTSUPP);
3744	case NFS3ERR_TOOSMALL:
3745		return (EINVAL);
3746	case NFS3ERR_SERVERFAULT:
3747		return (EIO);
3748	case NFS3ERR_BADTYPE:
3749		return (EINVAL);
3750	case NFS3ERR_JUKEBOX:
3751		return (ENXIO);
3752	default:
3753		zcmn_err(getzoneid(), CE_WARN,
3754		    "geterrno3: got status %d", status);
3755		return ((int)status);
3756	}
3757#else
3758	switch (status) {
3759	case NFS3ERR_NAMETOOLONG:
3760		return (ENAMETOOLONG);
3761	case NFS3ERR_NOTEMPTY:
3762		return (ENOTEMPTY);
3763	case NFS3ERR_DQUOT:
3764		return (EDQUOT);
3765	case NFS3ERR_STALE:
3766	case NFS3ERR_BADHANDLE:
3767		return (ESTALE);
3768	case NFS3ERR_NOTSUPP:
3769		return (EOPNOTSUPP);
3770	case NFS3ERR_REMOTE:
3771		return (EREMOTE);
3772	case NFS3ERR_NOT_SYNC:
3773	case NFS3ERR_TOOSMALL:
3774	case NFS3ERR_BADTYPE:
3775		return (EINVAL);
3776	case NFS3ERR_BAD_COOKIE:
3777		return (ENOENT);
3778	case NFS3ERR_SERVERFAULT:
3779		return (EIO);
3780	case NFS3ERR_JUKEBOX:
3781		return (ENXIO);
3782	default:
3783		return ((int)status);
3784	}
3785#endif
3786}
3787
3788rddir_cache *
3789rddir_cache_alloc(int flags)
3790{
3791	rddir_cache *rc;
3792
3793	rc = kmem_alloc(sizeof (*rc), flags);
3794	if (rc != NULL) {
3795		rc->entries = NULL;
3796		rc->flags = RDDIR;
3797		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3798		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3799		rc->count = 1;
3800#ifdef DEBUG
3801		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3802#endif
3803	}
3804	return (rc);
3805}
3806
3807static void
3808rddir_cache_free(rddir_cache *rc)
3809{
3810
3811#ifdef DEBUG
3812	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3813#endif
3814	if (rc->entries != NULL) {
3815#ifdef DEBUG
3816		rddir_cache_buf_free(rc->entries, rc->buflen);
3817#else
3818		kmem_free(rc->entries, rc->buflen);
3819#endif
3820	}
3821	cv_destroy(&rc->cv);
3822	mutex_destroy(&rc->lock);
3823	kmem_free(rc, sizeof (*rc));
3824}
3825
3826void
3827rddir_cache_hold(rddir_cache *rc)
3828{
3829
3830	mutex_enter(&rc->lock);
3831	rc->count++;
3832	mutex_exit(&rc->lock);
3833}
3834
3835void
3836rddir_cache_rele(rddir_cache *rc)
3837{
3838
3839	mutex_enter(&rc->lock);
3840	ASSERT(rc->count > 0);
3841	if (--rc->count == 0) {
3842		mutex_exit(&rc->lock);
3843		rddir_cache_free(rc);
3844	} else
3845		mutex_exit(&rc->lock);
3846}
3847
3848#ifdef DEBUG
3849char *
3850rddir_cache_buf_alloc(size_t size, int flags)
3851{
3852	char *rc;
3853
3854	rc = kmem_alloc(size, flags);
3855	if (rc != NULL)
3856		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3857	return (rc);
3858}
3859
3860void
3861rddir_cache_buf_free(void *addr, size_t size)
3862{
3863
3864	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3865	kmem_free(addr, size);
3866}
3867#endif
3868
3869static int
3870nfs_free_data_reclaim(rnode_t *rp)
3871{
3872	char *contents;
3873	int size;
3874	vsecattr_t *vsp;
3875	nfs3_pathconf_info *info;
3876	int freed;
3877	cred_t *cred;
3878
3879	/*
3880	 * Free any held credentials and caches which
3881	 * may be associated with this rnode.
3882	 */
3883	mutex_enter(&rp->r_statelock);
3884	cred = rp->r_cred;
3885	rp->r_cred = NULL;
3886	contents = rp->r_symlink.contents;
3887	size = rp->r_symlink.size;
3888	rp->r_symlink.contents = NULL;
3889	vsp = rp->r_secattr;
3890	rp->r_secattr = NULL;
3891	info = rp->r_pathconf;
3892	rp->r_pathconf = NULL;
3893	mutex_exit(&rp->r_statelock);
3894
3895	if (cred != NULL)
3896		crfree(cred);
3897
3898	/*
3899	 * Free the access cache entries.
3900	 */
3901	freed = nfs_access_purge_rp(rp);
3902
3903	if (!HAVE_RDDIR_CACHE(rp) &&
3904	    contents == NULL &&
3905	    vsp == NULL &&
3906	    info == NULL)
3907		return (freed);
3908
3909	/*
3910	 * Free the readdir cache entries
3911	 */
3912	if (HAVE_RDDIR_CACHE(rp))
3913		nfs_purge_rddir_cache(RTOV(rp));
3914
3915	/*
3916	 * Free the symbolic link cache.
3917	 */
3918	if (contents != NULL) {
3919
3920		kmem_free((void *)contents, size);
3921	}
3922
3923	/*
3924	 * Free any cached ACL.
3925	 */
3926	if (vsp != NULL)
3927		nfs_acl_free(vsp);
3928
3929	/*
3930	 * Free any cached pathconf information.
3931	 */
3932	if (info != NULL)
3933		kmem_free(info, sizeof (*info));
3934
3935	return (1);
3936}
3937
3938static int
3939nfs_active_data_reclaim(rnode_t *rp)
3940{
3941	char *contents;
3942	int size;
3943	vsecattr_t *vsp;
3944	nfs3_pathconf_info *info;
3945	int freed;
3946
3947	/*
3948	 * Free any held credentials and caches which
3949	 * may be associated with this rnode.
3950	 */
3951	if (!mutex_tryenter(&rp->r_statelock))
3952		return (0);
3953	contents = rp->r_symlink.contents;
3954	size = rp->r_symlink.size;
3955	rp->r_symlink.contents = NULL;
3956	vsp = rp->r_secattr;
3957	rp->r_secattr = NULL;
3958	info = rp->r_pathconf;
3959	rp->r_pathconf = NULL;
3960	mutex_exit(&rp->r_statelock);
3961
3962	/*
3963	 * Free the access cache entries.
3964	 */
3965	freed = nfs_access_purge_rp(rp);
3966
3967	if (!HAVE_RDDIR_CACHE(rp) &&
3968	    contents == NULL &&
3969	    vsp == NULL &&
3970	    info == NULL)
3971		return (freed);
3972
3973	/*
3974	 * Free the readdir cache entries
3975	 */
3976	if (HAVE_RDDIR_CACHE(rp))
3977		nfs_purge_rddir_cache(RTOV(rp));
3978
3979	/*
3980	 * Free the symbolic link cache.
3981	 */
3982	if (contents != NULL) {
3983
3984		kmem_free((void *)contents, size);
3985	}
3986
3987	/*
3988	 * Free any cached ACL.
3989	 */
3990	if (vsp != NULL)
3991		nfs_acl_free(vsp);
3992
3993	/*
3994	 * Free any cached pathconf information.
3995	 */
3996	if (info != NULL)
3997		kmem_free(info, sizeof (*info));
3998
3999	return (1);
4000}
4001
4002static int
4003nfs_free_reclaim(void)
4004{
4005	int freed;
4006	rnode_t *rp;
4007
4008#ifdef DEBUG
4009	clstat_debug.f_reclaim.value.ui64++;
4010#endif
4011	freed = 0;
4012	mutex_enter(&rpfreelist_lock);
4013	rp = rpfreelist;
4014	if (rp != NULL) {
4015		do {
4016			if (nfs_free_data_reclaim(rp))
4017				freed = 1;
4018		} while ((rp = rp->r_freef) != rpfreelist);
4019	}
4020	mutex_exit(&rpfreelist_lock);
4021	return (freed);
4022}
4023
4024static int
4025nfs_active_reclaim(void)
4026{
4027	int freed;
4028	int index;
4029	rnode_t *rp;
4030
4031#ifdef DEBUG
4032	clstat_debug.a_reclaim.value.ui64++;
4033#endif
4034	freed = 0;
4035	for (index = 0; index < rtablesize; index++) {
4036		rw_enter(&rtable[index].r_lock, RW_READER);
4037		for (rp = rtable[index].r_hashf;
4038		    rp != (rnode_t *)(&rtable[index]);
4039		    rp = rp->r_hashf) {
4040			if (nfs_active_data_reclaim(rp))
4041				freed = 1;
4042		}
4043		rw_exit(&rtable[index].r_lock);
4044	}
4045	return (freed);
4046}
4047
4048static int
4049nfs_rnode_reclaim(void)
4050{
4051	int freed;
4052	rnode_t *rp;
4053	vnode_t *vp;
4054
4055#ifdef DEBUG
4056	clstat_debug.r_reclaim.value.ui64++;
4057#endif
4058	freed = 0;
4059	mutex_enter(&rpfreelist_lock);
4060	while ((rp = rpfreelist) != NULL) {
4061		rp_rmfree(rp);
4062		mutex_exit(&rpfreelist_lock);
4063		if (rp->r_flags & RHASHED) {
4064			vp = RTOV(rp);
4065			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4066			mutex_enter(&vp->v_lock);
4067			if (vp->v_count > 1) {
4068				vp->v_count--;
4069				mutex_exit(&vp->v_lock);
4070				rw_exit(&rp->r_hashq->r_lock);
4071				mutex_enter(&rpfreelist_lock);
4072				continue;
4073			}
4074			mutex_exit(&vp->v_lock);
4075			rp_rmhash_locked(rp);
4076			rw_exit(&rp->r_hashq->r_lock);
4077		}
4078		/*
4079		 * This call to rp_addfree will end up destroying the
4080		 * rnode, but in a safe way with the appropriate set
4081		 * of checks done.
4082		 */
4083		rp_addfree(rp, CRED());
4084		mutex_enter(&rpfreelist_lock);
4085	}
4086	mutex_exit(&rpfreelist_lock);
4087	return (freed);
4088}
4089
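/*
 * nfs_reclaim() below is the entry point invoked when the kernel memory
 * allocator asks the client to give memory back (it is registered as the
 * rnode cache's reclaim callback elsewhere in this file).  It tries the
 * cheapest work first: strip cached data from rnodes on the freelist
 * (nfs_free_reclaim), then from active rnodes (nfs_active_reclaim), and
 * only if neither of those frees anything does it destroy freelist
 * rnodes outright (nfs_rnode_reclaim).
 */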
4090/*ARGSUSED*/
4091static void
4092nfs_reclaim(void *cdrarg)
4093{
4094
4095#ifdef DEBUG
4096	clstat_debug.reclaim.value.ui64++;
4097#endif
4098	if (nfs_free_reclaim())
4099		return;
4100
4101	if (nfs_active_reclaim())
4102		return;
4103
4104	(void) nfs_rnode_reclaim();
4105}
4106
4107/*
4108 * NFS client failover support
4109 *
4110 * Routines to copy filehandles
4111 */
4112void
4113nfscopyfh(caddr_t fhp, vnode_t *vp)
4114{
4115	fhandle_t *dest = (fhandle_t *)fhp;
4116
4117	if (dest != NULL)
4118		*dest = *VTOFH(vp);
4119}
4120
4121void
4122nfs3copyfh(caddr_t fhp, vnode_t *vp)
4123{
4124	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4125
4126	if (dest != NULL)
4127		*dest = *VTOFH3(vp);
4128}
4129
4130/*
4131 * NFS client failover support
4132 *
4133 * failover_safe() will test various conditions to ensure that
4134 * failover is permitted for this vnode.  It will be denied
4135 * if:
4136 *	1) the operation in progress does not support failover (NULL fi)
4137 *	2) there are no available replicas (NULL mi_servers->sv_next)
4138 *	3) any locks are outstanding on this file
4139 */
4140static int
4141failover_safe(failinfo_t *fi)
4142{
4143
4144	/*
4145	 * Does this op permit failover?
4146	 */
4147	if (fi == NULL || fi->vp == NULL)
4148		return (0);
4149
4150	/*
4151	 * Are there any alternates to failover to?
4152	 */
4153	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4154		return (0);
4155
4156	/*
4157	 * Disable check; we've forced local locking
4158	 *
4159	 * if (flk_has_remote_locks(fi->vp))
4160	 *	return (0);
4161	 */
4162
4163	/*
4164	 * If we have no partial path, we can't do anything
4165	 */
4166	if (VTOR(fi->vp)->r_path == NULL)
4167		return (0);
4168
4169	return (1);
4170}
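/*
 * Illustrative sketch only (not part of the original source): a caller
 * about to retry an over-the-wire operation on a replicated mount is
 * expected to gate the retry on failover_safe(), roughly:
 *
 *	if (failover_safe(fi)) {
 *		failover_newserver(mi);		start (or join) a search
 *		...				wait, remap, and retry
 *	}
 *
 * The real retry logic lives in the rfscall()-style paths elsewhere in
 * this file; the fragment above only shows how this predicate is used.
 */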
4171
4172#include <sys/thread.h>
4173
4174/*
4175 * NFS client failover support
4176 *
4177 * failover_newserver() will start a search for a new server,
4178 * preferably by starting an async thread to do the work.  If
4179 * someone is already doing this (recognizable by MI_BINDINPROG
4180 * being set), it will simply return and the calling thread
4181 * will queue on the mi_failover_cv condition variable.
4182 */
4183static void
4184failover_newserver(mntinfo_t *mi)
4185{
4186	/*
4187	 * Check if someone else is doing this already
4188	 */
4189	mutex_enter(&mi->mi_lock);
4190	if (mi->mi_flags & MI_BINDINPROG) {
4191		mutex_exit(&mi->mi_lock);
4192		return;
4193	}
4194	mi->mi_flags |= MI_BINDINPROG;
4195
4196	/*
4197	 * Need to hold the vfs struct so that it can't be released
4198	 * while the failover thread is selecting a new server.
4199	 */
4200	VFS_HOLD(mi->mi_vfsp);
4201
4202	/*
4203	 * Start a thread to do the real searching.
4204	 */
4205	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4206
4207	mutex_exit(&mi->mi_lock);
4208}
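/*
 * Note: the thread created above is failover_thread(), defined next.
 * When it finishes it clears MI_BINDINPROG, updates mi_curr_serv on
 * success, broadcasts mi_failover_cv, and releases the VFS hold taken
 * here, so callers that saw MI_BINDINPROG already set simply block in
 * failover_wait() until that broadcast.
 */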
4209
4210/*
4211 * NFS client failover support
4212 *
4213 * failover_thread() will find a new server to replace the one
4214 * currently in use, wake up other threads waiting on this mount
4215 * point, and die.  It will start at the head of the server list
4216 * and poll servers until it finds one with an NFS server which is
4217 * registered and responds to a NULL procedure ping.
4218 *
4219 * XXX failover_thread is unsafe under the present model that
4220 * cpr uses to suspend the system; specifically, the
4221 * over-the-wire calls made by the thread are not cpr-safe.
4222 * The thread needs to be reevaluated in case of future updates
4223 * to the cpr suspend model.
4224 */
4225static void
4226failover_thread(mntinfo_t *mi)
4227{
4228	servinfo_t *svp = NULL;
4229	CLIENT *cl;
4230	enum clnt_stat status;
4231	struct timeval tv;
4232	int error;
4233	int oncethru = 0;
4234	callb_cpr_t cprinfo;
4235	rnode_t *rp;
4236	int index;
4237	char *srvnames;
4238	size_t srvnames_len;
4239	struct nfs_clnt *nfscl = NULL;
4240	zoneid_t zoneid = getzoneid();
4241
4242#ifdef DEBUG
4243	/*
4244	 * This is currently only needed to access counters which exist on
4245	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4246	 * on non-DEBUG kernels.
4247	 */
4248	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4249	ASSERT(nfscl != NULL);
4250#endif
4251
4252	/*
4253	 * It's safe to piggyback on mi_lock since failover_newserver()
4254	 * guarantees that there will be only one failover thread
4255	 * per mntinfo at any given time.
4256	 */
4257	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4258	    "failover_thread");
4259
4260	mutex_enter(&mi->mi_lock);
4261	while (mi->mi_readers) {
4262		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4263		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4264		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4265	}
4266	mutex_exit(&mi->mi_lock);
4267
4268	tv.tv_sec = 2;
4269	tv.tv_usec = 0;
4270
4271	/*
4272	 * Ping the null NFS procedure of every server in
4273	 * the list until one responds.  We always start
4274	 * at the head of the list and always skip the one
4275	 * that is current, since it's caused us a problem.
4276	 */
4277	while (svp == NULL) {
4278		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4279			if (!oncethru && svp == mi->mi_curr_serv)
4280				continue;
4281
4282			/*
4283			 * If the file system was forcibly umounted
4284			 * while trying to do a failover, then just
4285			 * give up on the failover.  It won't matter
4286			 * what the server is.
4287			 */
4288			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4289				svp = NULL;
4290				goto done;
4291			}
4292
4293			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4294			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4295			if (error)
4296				continue;
4297
4298			if (!(mi->mi_flags & MI_INT))
4299				cl->cl_nosignal = TRUE;
4300			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4301			    xdr_void, NULL, tv);
4302			if (!(mi->mi_flags & MI_INT))
4303				cl->cl_nosignal = FALSE;
4304			AUTH_DESTROY(cl->cl_auth);
4305			CLNT_DESTROY(cl);
4306			if (status == RPC_SUCCESS) {
4307				if (svp == mi->mi_curr_serv) {
4308#ifdef DEBUG
4309					zcmn_err(zoneid, CE_NOTE,
4310			"NFS%d: failing over: selecting original server %s",
4311					    mi->mi_vers, svp->sv_hostname);
4312#else
4313					zcmn_err(zoneid, CE_NOTE,
4314			"NFS: failing over: selecting original server %s",
4315					    svp->sv_hostname);
4316#endif
4317				} else {
4318#ifdef DEBUG
4319					zcmn_err(zoneid, CE_NOTE,
4320				    "NFS%d: failing over from %s to %s",
4321					    mi->mi_vers,
4322					    mi->mi_curr_serv->sv_hostname,
4323					    svp->sv_hostname);
4324#else
4325					zcmn_err(zoneid, CE_NOTE,
4326				    "NFS: failing over from %s to %s",
4327					    mi->mi_curr_serv->sv_hostname,
4328					    svp->sv_hostname);
4329#endif
4330				}
4331				break;
4332			}
4333		}
4334
4335		if (svp == NULL) {
4336			if (!oncethru) {
4337				srvnames = nfs_getsrvnames(mi, &srvnames_len);
4338#ifdef DEBUG
4339				zprintf(zoneid,
4340				    "NFS%d servers %s not responding "
4341				    "still trying\n", mi->mi_vers, srvnames);
4342#else
4343				zprintf(zoneid, "NFS servers %s not responding "
4344				    "still trying\n", srvnames);
4345#endif
4346				oncethru = 1;
4347			}
4348			mutex_enter(&mi->mi_lock);
4349			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4350			mutex_exit(&mi->mi_lock);
4351			delay(hz);
4352			mutex_enter(&mi->mi_lock);
4353			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4354			mutex_exit(&mi->mi_lock);
4355		}
4356	}
4357
4358	if (oncethru) {
4359#ifdef DEBUG
4360		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4361#else
4362		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4363#endif
4364	}
4365
4366	if (svp != mi->mi_curr_serv) {
4367		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4368		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4369		rw_enter(&rtable[index].r_lock, RW_WRITER);
4370		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4371		    mi->mi_vfsp);
4372		if (rp != NULL) {
4373			if (rp->r_flags & RHASHED)
4374				rp_rmhash_locked(rp);
4375			rw_exit(&rtable[index].r_lock);
4376			rp->r_server = svp;
4377			rp->r_fh = svp->sv_fhandle;
4378			(void) nfs_free_data_reclaim(rp);
4379			index = rtablehash(&rp->r_fh);
4380			rp->r_hashq = &rtable[index];
4381			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4382			vn_exists(RTOV(rp));
4383			rp_addhash(rp);
4384			rw_exit(&rp->r_hashq->r_lock);
4385			VN_RELE(RTOV(rp));
4386		} else
4387			rw_exit(&rtable[index].r_lock);
4388	}
4389
4390done:
4391	if (oncethru)
4392		kmem_free(srvnames, srvnames_len);
4393	mutex_enter(&mi->mi_lock);
4394	mi->mi_flags &= ~MI_BINDINPROG;
4395	if (svp != NULL) {
4396		mi->mi_curr_serv = svp;
4397		mi->mi_failover++;
4398#ifdef DEBUG
4399		nfscl->nfscl_stat.failover.value.ui64++;
4400#endif
4401	}
4402	cv_broadcast(&mi->mi_failover_cv);
4403	CALLB_CPR_EXIT(&cprinfo);
4404	VFS_RELE(mi->mi_vfsp);
4405	zthread_exit();
4406	/* NOTREACHED */
4407}
4408
4409/*
4410 * NFS client failover support
4411 *
4412 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4413 * is cleared, meaning that failover is complete.  Called with
4414 * mi_lock mutex held.
4415 */
4416static int
4417failover_wait(mntinfo_t *mi)
4418{
4419	k_sigset_t smask;
4420
4421	/*
4422	 * If someone else is hunting for a living server,
4423	 * sleep until it's done.  After our sleep, we may
4424	 * be bound to the right server and get off cheaply.
4425	 */
4426	while (mi->mi_flags & MI_BINDINPROG) {
4427		/*
4428		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4429		 * and SIGTERM. (Preserving the existing masks).
4430		 * Mask out SIGINT if mount option nointr is specified.
4431		 */
4432		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4433		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4434			/*
4435			 * restore original signal mask
4436			 */
4437			sigunintr(&smask);
4438			return (EINTR);
4439		}
4440		/*
4441		 * restore original signal mask
4442		 */
4443		sigunintr(&smask);
4444	}
4445	return (0);
4446}
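/*
 * A minimal sketch of how failover_newserver() and failover_wait() are
 * meant to be combined (illustration only; the real call sites add more
 * bookkeeping):
 *
 *	failover_newserver(mi);		returns at once if a search is
 *					already in progress
 *	mutex_enter(&mi->mi_lock);
 *	error = failover_wait(mi);	sleeps until MI_BINDINPROG clears
 *	mutex_exit(&mi->mi_lock);
 *	if (error == EINTR)
 *		return (EINTR);		interrupted by a signal
 *
 * Note that failover_newserver() takes and drops mi_lock internally, so
 * it must not be called with mi_lock already held.
 */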
4447
4448/*
4449 * NFS client failover support
4450 *
4451 * failover_remap() will do a partial pathname lookup and find the
4452 * desired vnode on the current server.  The interim vnode will be
4453 * discarded after we pilfer the new filehandle.
4454 *
4455 * Side effects:
4456 * - This routine will also update the filehandle in the args structure
4457 *    pointed to by the fi->fhp pointer if it is non-NULL.
4458 */
4459
4460static int
4461failover_remap(failinfo_t *fi)
4462{
4463	vnode_t *vp, *nvp, *rootvp;
4464	rnode_t *rp, *nrp;
4465	mntinfo_t *mi;
4466	int error;
4467#ifdef DEBUG
4468	struct nfs_clnt *nfscl;
4469
4470	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4471	ASSERT(nfscl != NULL);
4472#endif
4473	/*
4474	 * Sanity check
4475	 */
4476	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4477		return (EINVAL);
4478	vp = fi->vp;
4479	rp = VTOR(vp);
4480	mi = VTOMI(vp);
4481
4482	if (!(vp->v_flag & VROOT)) {
4483		/*
4484		 * Given the root fh, use the path stored in
4485		 * the rnode to find the fh for the new server.
4486		 */
4487		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4488		if (error)
4489			return (error);
4490
4491		error = failover_lookup(rp->r_path, rootvp,
4492		    fi->lookupproc, fi->xattrdirproc, &nvp);
4493
4494		VN_RELE(rootvp);
4495
4496		if (error)
4497			return (error);
4498
4499		/*
4500		 * If we found the same rnode, we're done now
4501		 */
4502		if (nvp == vp) {
4503			/*
4504			 * Failover found the same rnode: the new server may be
4505			 * physically the same machine or may share the same
4506			 * disk subsystem, in which case the file handle for a
4507			 * given path does not change and the filehandle lookup
4508			 * locates the same rnode as the existing one.  All we
4509			 * may need to do is update r_server with the current
4510			 * servinfo.
4511			 */
4512			if (!VALID_FH(fi)) {
4513				rp->r_server = mi->mi_curr_serv;
4514			}
4515			VN_RELE(nvp);
4516			return (0);
4517		}
4518
4519		/*
4520		 * Try to make it so that no one else will find this
4521		 * vnode because it is just a temporary to hold the
4522		 * new file handle until that file handle can be
4523		 * copied to the original vnode/rnode.
4524		 */
4525		nrp = VTOR(nvp);
4526		mutex_enter(&mi->mi_remap_lock);
4527		/*
4528		 * Some other thread could have raced in and already done
4529		 * the remap for this particular rnode.  Check rp->r_server
4530		 * against mi->mi_curr_serv and return if they are the
4531		 * same.
4532		 */
4533		if (VALID_FH(fi)) {
4534			mutex_exit(&mi->mi_remap_lock);
4535			VN_RELE(nvp);
4536			return (0);
4537		}
4538
4539		if (nrp->r_flags & RHASHED)
4540			rp_rmhash(nrp);
4541
4542		/*
4543		 * As a heuristic check on the validity of the new
4544		 * file, check that the size and type match against
4545		 * that we remember from the old version.
4546		 */
4547		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4548			mutex_exit(&mi->mi_remap_lock);
4549			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4550			    "NFS replicas %s and %s: file %s not same.",
4551			    rp->r_server->sv_hostname,
4552			    nrp->r_server->sv_hostname, rp->r_path);
4553			VN_RELE(nvp);
4554			return (EINVAL);
4555		}
4556
4557		/*
4558		 * snarf the filehandle from the new rnode
4559		 * then release it, again while updating the
4560		 * hash queues for the rnode.
4561		 */
4562		if (rp->r_flags & RHASHED)
4563			rp_rmhash(rp);
4564		rp->r_server = mi->mi_curr_serv;
4565		rp->r_fh = nrp->r_fh;
4566		rp->r_hashq = nrp->r_hashq;
4567		/*
4568		 * Copy the attributes from the new rnode to the old
4569		 * rnode.  This will help to reduce unnecessary page
4570		 * cache flushes.
4571		 */
4572		rp->r_attr = nrp->r_attr;
4573		rp->r_attrtime = nrp->r_attrtime;
4574		rp->r_mtime = nrp->r_mtime;
4575		(void) nfs_free_data_reclaim(rp);
4576		nfs_setswaplike(vp, &rp->r_attr);
4577		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4578		rp_addhash(rp);
4579		rw_exit(&rp->r_hashq->r_lock);
4580		mutex_exit(&mi->mi_remap_lock);
4581		VN_RELE(nvp);
4582	}
4583
4584	/*
4585	 * Update successful failover remap count
4586	 */
4587	mutex_enter(&mi->mi_lock);
4588	mi->mi_remap++;
4589	mutex_exit(&mi->mi_lock);
4590#ifdef DEBUG
4591	nfscl->nfscl_stat.remap.value.ui64++;
4592#endif
4593
4594	/*
4595	 * If we have a copied filehandle to update, do it now.
4596	 */
4597	if (fi->fhp != NULL && fi->copyproc != NULL)
4598		(*fi->copyproc)(fi->fhp, vp);
4599
4600	return (0);
4601}
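/*
 * Illustrative usage sketch (not the actual retry logic): once a
 * failover has switched mi_curr_serv, a caller holding a failinfo for
 * an outstanding operation would remap its filehandle before retrying,
 * along the lines of:
 *
 *	if (fi != NULL && !VALID_FH(fi)) {
 *		error = failover_remap(fi);
 *		if (error)
 *			return (error);
 *	}
 *
 * VALID_FH() is the same check used inside failover_remap() above to
 * decide whether the rnode is already bound to the current server.
 */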
4602
4603/*
4604 * NFS client failover support
4605 *
4606 * We want a simple pathname lookup routine to parse the pieces
4607 * of path in rp->r_path.  We know that the path was a created
4608 * as rnodes were made, so we know we have only to deal with
4609 * paths that look like:
4610 *	dir1/dir2/dir3/file
4611 * Any evidence of things like "..", symlinks, or ENOTDIR is a
4612 * hard error, because it means something in this filesystem is
4613 * different from the one we came from, or has changed under
4614 * us in some way.  If this is true, we want the failure.
4615 *
4616 * Extended attributes: if the filesystem is mounted with extended
4617 * attributes enabled (-o xattr), the attribute directory will be
4618 * represented in the r_path as the magic name XATTR_RPATH. So if
4619 * we see that name in the pathname, it must be because this node
4620 * is an extended attribute.  Therefore, look it up that way.
4621 */
4622static int
4623failover_lookup(char *path, vnode_t *root,
4624    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4625	vnode_t *, cred_t *, int),
4626    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4627    vnode_t **new)
4628{
4629	vnode_t *dvp, *nvp;
4630	int error = EINVAL;
4631	char *s, *p, *tmppath;
4632	size_t len;
4633	mntinfo_t *mi;
4634	bool_t xattr;
4635
4636	/* Make local copy of path */
4637	len = strlen(path) + 1;
4638	tmppath = kmem_alloc(len, KM_SLEEP);
4639	(void) strcpy(tmppath, path);
4640	s = tmppath;
4641
4642	dvp = root;
4643	VN_HOLD(dvp);
4644	mi = VTOMI(root);
4645	xattr = mi->mi_flags & MI_EXTATTR;
4646
4647	do {
4648		p = strchr(s, '/');
4649		if (p != NULL)
4650			*p = '\0';
4651		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4652			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4653			    RFSCALL_SOFT);
4654		} else {
4655			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4656			    CRED(), RFSCALL_SOFT);
4657		}
4658		if (p != NULL)
4659			*p++ = '/';
4660		if (error) {
4661			VN_RELE(dvp);
4662			kmem_free(tmppath, len);
4663			return (error);
4664		}
4665		s = p;
4666		VN_RELE(dvp);
4667		dvp = nvp;
4668	} while (p != NULL);
4669
4670	if (nvp != NULL && new != NULL)
4671		*new = nvp;
4672	kmem_free(tmppath, len);
4673	return (0);
4674}
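/*
 * Worked example of the loop above: for an r_path of "dir1/dir2/file",
 * successive iterations temporarily NUL-terminate the copy at each '/'
 * and look up "dir1" in the root, "dir2" in that result, and finally
 * "file", leaving *new set to the vnode for "file".  A component equal
 * to XATTR_RPATH is resolved through *xattrdirproc instead of
 * *lookupproc.
 */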
4675
4676/*
4677 * NFS client failover support
4678 *
4679 * sv_free() frees the malloc'd portion of a "servinfo_t".
4680 */
4681void
4682sv_free(servinfo_t *svp)
4683{
4684	servinfo_t *next;
4685	struct knetconfig *knconf;
4686
4687	while (svp != NULL) {
4688		next = svp->sv_next;
4689		if (svp->sv_secdata)
4690			sec_clnt_freeinfo(svp->sv_secdata);
4691		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4692			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4693		knconf = svp->sv_knconf;
4694		if (knconf != NULL) {
4695			if (knconf->knc_protofmly != NULL)
4696				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4697			if (knconf->knc_proto != NULL)
4698				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4699			kmem_free(knconf, sizeof (*knconf));
4700		}
4701		knconf = svp->sv_origknconf;
4702		if (knconf != NULL) {
4703			if (knconf->knc_protofmly != NULL)
4704				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4705			if (knconf->knc_proto != NULL)
4706				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4707			kmem_free(knconf, sizeof (*knconf));
4708		}
4709		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4710			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4711		mutex_destroy(&svp->sv_lock);
4712		kmem_free(svp, sizeof (*svp));
4713		svp = next;
4714	}
4715}
4716
4717/*
4718 * Can only return non-zero (EINTR) if intr != 0.
4719 */
4720int
4721nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4722{
4723
4724	mutex_enter(&l->lock);
4725
4726	/*
4727	 * If this is a nested enter, then allow it.  There
4728	 * must be as many exits as there were enters.
4729	 */
4730	if (l->owner == curthread) {
4731		/* lock is held for writing by current thread */
4732		ASSERT(rw == RW_READER || rw == RW_WRITER);
4733		l->count--;
4734	} else if (rw == RW_READER) {
4735		/*
4736		 * While there is a writer active or writers waiting,
4737		 * then wait for them to finish up and move on.  Then,
4738		 * increment the count to indicate that a reader is
4739		 * active.
4740		 */
4741		while (l->count < 0 || l->waiters > 0) {
4742			if (intr) {
4743				klwp_t *lwp = ttolwp(curthread);
4744
4745				if (lwp != NULL)
4746					lwp->lwp_nostop++;
4747				if (!cv_wait_sig(&l->cv, &l->lock)) {
4748					if (lwp != NULL)
4749						lwp->lwp_nostop--;
4750					mutex_exit(&l->lock);
4751					return (EINTR);
4752				}
4753				if (lwp != NULL)
4754					lwp->lwp_nostop--;
4755			} else
4756				cv_wait(&l->cv, &l->lock);
4757		}
4758		ASSERT(l->count < INT_MAX);
4759#ifdef	DEBUG
4760		if ((l->count % 10000) == 9999)
4761			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4762				"rwlock @ %p\n", l->count, (void *)l);
4763#endif
4764		l->count++;
4765	} else {
4766		ASSERT(rw == RW_WRITER);
4767		/*
4768		 * While there are readers active or a writer
4769		 * active, then wait for all of the readers
4770		 * to finish or for the writer to finish.
4771		 * Then, set the owner field to curthread and
4772		 * decrement count to indicate that a writer
4773		 * is active.
4774		 */
4775		while (l->count > 0 || l->owner != NULL) {
4776			l->waiters++;
4777			if (intr) {
4778				klwp_t *lwp = ttolwp(curthread);
4779
4780				if (lwp != NULL)
4781					lwp->lwp_nostop++;
4782				if (!cv_wait_sig(&l->cv, &l->lock)) {
4783					if (lwp != NULL)
4784						lwp->lwp_nostop--;
4785					l->waiters--;
4786					cv_broadcast(&l->cv);
4787					mutex_exit(&l->lock);
4788					return (EINTR);
4789				}
4790				if (lwp != NULL)
4791					lwp->lwp_nostop--;
4792			} else
4793				cv_wait(&l->cv, &l->lock);
4794			l->waiters--;
4795		}
4796		l->owner = curthread;
4797		l->count--;
4798	}
4799
4800	mutex_exit(&l->lock);
4801
4802	return (0);
4803}
4804
4805/*
4806 * If the lock is available, obtain it and return non-zero.  If there is
4807 * already a conflicting lock, return 0 immediately.
4808 */
4809
4810int
4811nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4812{
4813	mutex_enter(&l->lock);
4814
4815	/*
4816	 * If this is a nested enter, then allow it.  There
4817	 * must be as many exits as there were enters.
4818	 */
4819	if (l->owner == curthread) {
4820		/* lock is held for writing by current thread */
4821		ASSERT(rw == RW_READER || rw == RW_WRITER);
4822		l->count--;
4823	} else if (rw == RW_READER) {
4824		/*
4825		 * If there is a writer active or writers waiting, deny the
4826		 * lock.  Otherwise, bump the count of readers.
4827		 */
4828		if (l->count < 0 || l->waiters > 0) {
4829			mutex_exit(&l->lock);
4830			return (0);
4831		}
4832		l->count++;
4833	} else {
4834		ASSERT(rw == RW_WRITER);
4835		/*
4836		 * If there are readers active or a writer active, deny the
4837		 * lock.  Otherwise, set the owner field to curthread and
4838		 * decrement count to indicate that a writer is active.
4839		 */
4840		if (l->count > 0 || l->owner != NULL) {
4841			mutex_exit(&l->lock);
4842			return (0);
4843		}
4844		l->owner = curthread;
4845		l->count--;
4846	}
4847
4848	mutex_exit(&l->lock);
4849
4850	return (1);
4851}
4852
4853void
4854nfs_rw_exit(nfs_rwlock_t *l)
4855{
4856
4857	mutex_enter(&l->lock);
4858	/*
4859	 * If this is releasing a writer lock, then increment count to
4860	 * indicate that there is one less writer active.  If this was
4861	 * the last of possibly nested writer locks, then clear the owner
4862	 * field as well to indicate that there is no writer active
4863	 * and wakeup any possible waiting writers or readers.
4864	 *
4865	 * If releasing a reader lock, then just decrement count to
4866	 * indicate that there is one less reader active.  If this was
4867	 * the last active reader and there are writer(s) waiting,
4868	 * then wake up the first.
4869	 */
4870	if (l->owner != NULL) {
4871		ASSERT(l->owner == curthread);
4872		l->count++;
4873		if (l->count == 0) {
4874			l->owner = NULL;
4875			cv_broadcast(&l->cv);
4876		}
4877	} else {
4878		ASSERT(l->count > 0);
4879		l->count--;
4880		if (l->count == 0 && l->waiters > 0)
4881			cv_broadcast(&l->cv);
4882	}
4883	mutex_exit(&l->lock);
4884}
4885
4886int
4887nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4888{
4889
4890	if (rw == RW_READER)
4891		return (l->count > 0);
4892	ASSERT(rw == RW_WRITER);
4893	return (l->count < 0);
4894}
4895
4896/* ARGSUSED */
4897void
4898nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4899{
4900
4901	l->count = 0;
4902	l->waiters = 0;
4903	l->owner = NULL;
4904	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4905	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4906}
4907
4908void
4909nfs_rw_destroy(nfs_rwlock_t *l)
4910{
4911
4912	mutex_destroy(&l->lock);
4913	cv_destroy(&l->cv);
4914}
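/*
 * Hedged usage sketch of the nfs_rwlock_t primitives above (a made-up
 * caller, not code from this file; "interruptible" below stands for
 * whatever MI_INT-derived flag the caller actually has):
 *
 *	nfs_rwlock_t lk;
 *
 *	nfs_rw_init(&lk, NULL, RW_DEFAULT, NULL);
 *
 *	if (nfs_rw_enter_sig(&lk, RW_READER, interruptible))
 *		return (EINTR);			only possible if intr != 0
 *	...read-side work...
 *	nfs_rw_exit(&lk);
 *
 *	(void) nfs_rw_enter_sig(&lk, RW_WRITER, 0);	cannot fail
 *	...write-side work; the owner may re-enter recursively...
 *	nfs_rw_exit(&lk);
 *
 *	nfs_rw_destroy(&lk);
 */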
4915
4916int
4917nfs3_rddir_compar(const void *x, const void *y)
4918{
4919	rddir_cache *a = (rddir_cache *)x;
4920	rddir_cache *b = (rddir_cache *)y;
4921
4922	if (a->nfs3_cookie == b->nfs3_cookie) {
4923		if (a->buflen == b->buflen)
4924			return (0);
4925		if (a->buflen < b->buflen)
4926			return (-1);
4927		return (1);
4928	}
4929
4930	if (a->nfs3_cookie < b->nfs3_cookie)
4931		return (-1);
4932
4933	return (1);
4934}
4935
4936int
4937nfs_rddir_compar(const void *x, const void *y)
4938{
4939	rddir_cache *a = (rddir_cache *)x;
4940	rddir_cache *b = (rddir_cache *)y;
4941
4942	if (a->nfs_cookie == b->nfs_cookie) {
4943		if (a->buflen == b->buflen)
4944			return (0);
4945		if (a->buflen < b->buflen)
4946			return (-1);
4947		return (1);
4948	}
4949
4950	if (a->nfs_cookie < b->nfs_cookie)
4951		return (-1);
4952
4953	return (1);
4954}
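/*
 * These comparators are intended to be passed to avl_create() when a
 * directory rnode's readdir cache tree is built.  A hedged sketch (the
 * exact field names come from rnode.h and may differ):
 *
 *	avl_create(&rp->r_dir, nfs_rddir_compar, sizeof (rddir_cache),
 *	    offsetof(rddir_cache, tree));
 *
 * Entries are ordered by cookie first and buffer length second, so a
 * lookup can find the cached reply for a particular (cookie, buflen)
 * pair.
 */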
4955
4956static char *
4957nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4958{
4959	servinfo_t *s;
4960	char *srvnames;
4961	char *namep;
4962	size_t length;
4963
4964	/*
4965	 * Calculate the length of the string required to hold all
4966	 * of the server names plus either a comma or a null
4967	 * character following each individual one.
4968	 */
4969	length = 0;
4970	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4971		length += s->sv_hostnamelen;
4972
4973	srvnames = kmem_alloc(length, KM_SLEEP);
4974
4975	namep = srvnames;
4976	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4977		(void) strcpy(namep, s->sv_hostname);
4978		namep += s->sv_hostnamelen - 1;
4979		*namep++ = ',';
4980	}
4981	*--namep = '\0';
4982
4983	*len = length;
4984
4985	return (srvnames);
4986}
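/*
 * Worked example: with two servers whose (sv_hostname, sv_hostnamelen)
 * pairs are ("alpha", 6) and ("beta", 5) -- the lengths include the
 * terminating NUL -- the buffer is 11 bytes and ends up holding
 * "alpha,beta\0": each name's NUL slot is reused for the ',' separator,
 * and the final ',' is overwritten with '\0'.
 */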
4987
4988/*
4989 * These two functions are temporary and designed for the upgrade-workaround
4990 * only.  They cannot be used for general zone-crossing NFS client support, and
4991 * will be removed shortly.
4992 *
4993 * When the workaround is enabled, all NFS traffic is forced into the global
4994 * zone.  These functions are called when the code needs to refer to the state
4995 * of the underlying network connection.  They're not called when the function
4996 * needs to refer to the state of the process that invoked the system call.
4997 * (E.g., when checking whether the zone is shutting down during the mount()
4998 * call.)
4999 */
5000
5001struct zone *
5002nfs_zone(void)
5003{
5004	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5005}
5006
5007zoneid_t
5008nfs_zoneid(void)
5009{
5010	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5011}
5012
5013/*
5014 * nfs_mount_label_policy:
5015 *	Determine whether the mount is allowed according to MAC check,
5016 *	by comparing (where appropriate) label of the remote server
5017 *	against the label of the zone being mounted into.
5018 *
5019 *	Returns:
5020 *		 0 :	access allowed
5021 *		-1 :	read-only access allowed (i.e., read-down)
5022 *		>0 :	error code, such as EACCES
5023 */
5024int
5025nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5026    struct knetconfig *knconf, cred_t *cr)
5027{
5028	int		addr_type;
5029	void		*ipaddr;
5030	bslabel_t	*server_sl, *mntlabel;
5031	zone_t		*mntzone = NULL;
5032	ts_label_t	*zlabel;
5033	tsol_tpc_t	*tp;
5034	ts_label_t	*tsl = NULL;
5035	int		retv;
5036
5037	/*
5038	 * Get the zone's label.  Each zone on a labeled system has a label.
5039	 */
5040	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5041	zlabel = mntzone->zone_slabel;
5042	ASSERT(zlabel != NULL);
5043	label_hold(zlabel);
5044
5045	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5046		addr_type = IPV4_VERSION;
5047		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5048	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5049		addr_type = IPV6_VERSION;
5050		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5051	} else {
5052		retv = 0;
5053		goto out;
5054	}
5055
5056	retv = EACCES;				/* assume the worst */
5057
5058	/*
5059	 * Next, get the assigned label of the remote server.
5060	 */
5061	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5062	if (tp == NULL)
5063		goto out;			/* error getting host entry */
5064
5065	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5066		goto rel_tpc;			/* invalid domain */
5067	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5068	    (tp->tpc_tp.host_type != UNLABELED))
5069		goto rel_tpc;			/* invalid hosttype */
5070
5071	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5072		tsl = getflabel_cipso(vfsp);
5073		if (tsl == NULL)
5074			goto rel_tpc;		/* error getting server lbl */
5075
5076		server_sl = label2bslabel(tsl);
5077	} else {	/* UNLABELED */
5078		server_sl = &tp->tpc_tp.tp_def_label;
5079	}
5080
5081	mntlabel = label2bslabel(zlabel);
5082
5083	/*
5084	 * Now compare labels to complete the MAC check.  If the labels
5085	 * are equal or if the requestor is in the global zone and has
5086	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5087	 * mounts into the global zone itself; restrict these to
5088	 * read-only.)
5089	 *
5090	 * If the requestor is in some other zone, but his label
5091	 * dominates the server, then allow read-down.
5092	 *
5093	 * Otherwise, access is denied.
5094	 */
5095	if (blequal(mntlabel, server_sl) ||
5096	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5097	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5098		if ((mntzone == global_zone) ||
5099		    !blequal(mntlabel, server_sl))
5100			retv = -1;		/* read-only */
5101		else
5102			retv = 0;		/* access OK */
5103	} else if (bldominates(mntlabel, server_sl)) {
5104		retv = -1;			/* read-only */
5105	} else {
5106		retv = EACCES;
5107	}
5108
5109	if (tsl != NULL)
5110		label_rele(tsl);
5111
5112rel_tpc:
5113	TPC_RELE(tp);
5114out:
5115	if (mntzone)
5116		zone_rele(mntzone);
5117	label_rele(zlabel);
5118	return (retv);
5119}
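/*
 * Illustrative sketch of how a mount path might consume the return
 * value above (not the actual nfs_mount() code):
 *
 *	error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
 *	    svp->sv_knconf, cr);
 *	if (error > 0)
 *		return (EACCES);		MAC check failed
 *	if (error == -1)
 *		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);   force read-only
 */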
5120
5121boolean_t
5122nfs_has_ctty(void)
5123{
5124	boolean_t rv;
5125	mutex_enter(&curproc->p_splock);
5126	rv = (curproc->p_sessp->s_vp != NULL);
5127	mutex_exit(&curproc->p_splock);
5128	return (rv);
5129}
5130