1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28 *	All Rights Reserved
29 */
30
31#include <sys/param.h>
32#include <sys/types.h>
33#include <sys/systm.h>
34#include <sys/cmn_err.h>
35#include <sys/vtrace.h>
36#include <sys/session.h>
37#include <sys/thread.h>
38#include <sys/dnlc.h>
39#include <sys/cred.h>
40#include <sys/priv.h>
41#include <sys/list.h>
42#include <sys/sdt.h>
43#include <sys/policy.h>
44
45#include <rpc/types.h>
46#include <rpc/xdr.h>
47
48#include <nfs/nfs.h>
49
50#include <nfs/nfs_clnt.h>
51
52#include <nfs/nfs4.h>
53#include <nfs/rnode4.h>
54#include <nfs/nfs4_clnt.h>
55
/*
 * Client side statistics.
 *
 * Initializer for a struct clstat4: a set of named kstat counters, each
 * declared as a 64-bit unsigned value.  The entries following "cltoomany"
 * are compiled in only when DEBUG is defined.
 */
static const struct clstat4 clstat4_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "referrals",	KSTAT_DATA_UINT64 },
	{ "referlinks",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};
73
#ifdef DEBUG
/*
 * Extra named kstat counters (all 64-bit unsigned) that exist only in
 * DEBUG builds.  clreclaim4_zone() bumps the "clreclaim" entry each time
 * it runs.
 */
struct clstat4_debug clstat4_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif
88
/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.  clreclaim4() walks nfs4_clnt_list while holding
 * nfs4_clnt_list_lock.
 */
static list_t nfs4_clnt_list;
static kmutex_t nfs4_clnt_list_lock;
zone_key_t nfs4clnt_zone_key;

/* kmem cache from which clget4() allocates its struct chtab entries */
static struct kmem_cache *chtab4_cache;

#ifdef DEBUG
/* passed to NFS4_DEBUG() in nfs4_rfscall() to gate its debug output */
static int nfs4_rfscall_debug;
static int nfs4_try_failover_any;
/* when non-zero, utf8_to_fn() warns about names it rejects */
int nfs4_utf8_debug = 0;
#endif
104
/*
 * NFSv4 readdir cache implementation
 *
 * Pairs a rddir4_cache element with the bookkeeping declared alongside
 * it: a reference count, the mutex guarding that count, and an AVL tree
 * link.
 */
typedef struct rddir4_cache_impl {
	rddir4_cache	rc;		/* readdir cache element */
	kmutex_t	lock;		/* lock protects count */
	uint_t		count;		/* reference count */
	avl_node_t	tree;		/* AVL tree link */
} rddir4_cache_impl;
114
115static int rddir4_cache_compar(const void *, const void *);
116static void rddir4_cache_free(rddir4_cache_impl *);
117static rddir4_cache *rddir4_cache_alloc(int);
118static void rddir4_cache_hold(rddir4_cache *);
119static int try_failover(enum clnt_stat);
120
121static int nfs4_readdir_cache_hits = 0;
122static int nfs4_readdir_cache_waits = 0;
123static int nfs4_readdir_cache_misses = 0;
124
125/*
126 * Shared nfs4 functions
127 */
128
129/*
130 * Copy an nfs_fh4.  The destination storage (to->nfs_fh4_val) must already
131 * be allocated.
132 */
133
134void
135nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
136{
137	to->nfs_fh4_len = from->nfs_fh4_len;
138	bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
139}
140
141/*
142 * nfs4cmpfh - compare 2 filehandles.
143 * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
144 * "less" than the second, +1 if the first is "greater" than the second.
145 */
146
147int
148nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
149{
150	const char *c1, *c2;
151
152	if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
153		return (-1);
154	if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
155		return (1);
156	for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
157	    c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
158	    c1++, c2++) {
159		if (*c1 < *c2)
160			return (-1);
161		if (*c1 > *c2)
162			return (1);
163	}
164
165	return (0);
166}
167
168/*
169 * Compare two v4 filehandles.  Return zero if they're the same, non-zero
170 * if they're not.  Like nfs4cmpfh(), but different filehandle
171 * representation, and doesn't provide information about greater than or
172 * less than.
173 */
174
175int
176nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
177{
178	if (fh1->fh_len == fh2->fh_len)
179		return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
180
181	return (1);
182}
183
184int
185stateid4_cmp(stateid4 *s1, stateid4 *s2)
186{
187	if (bcmp(s1, s2, sizeof (stateid4)) == 0)
188		return (1);
189	else
190		return (0);
191}
192
193nfsstat4
194puterrno4(int error)
195{
196	switch (error) {
197	case 0:
198		return (NFS4_OK);
199	case EPERM:
200		return (NFS4ERR_PERM);
201	case ENOENT:
202		return (NFS4ERR_NOENT);
203	case EINTR:
204		return (NFS4ERR_IO);
205	case EIO:
206		return (NFS4ERR_IO);
207	case ENXIO:
208		return (NFS4ERR_NXIO);
209	case ENOMEM:
210		return (NFS4ERR_RESOURCE);
211	case EACCES:
212		return (NFS4ERR_ACCESS);
213	case EBUSY:
214		return (NFS4ERR_IO);
215	case EEXIST:
216		return (NFS4ERR_EXIST);
217	case EXDEV:
218		return (NFS4ERR_XDEV);
219	case ENODEV:
220		return (NFS4ERR_IO);
221	case ENOTDIR:
222		return (NFS4ERR_NOTDIR);
223	case EISDIR:
224		return (NFS4ERR_ISDIR);
225	case EINVAL:
226		return (NFS4ERR_INVAL);
227	case EMFILE:
228		return (NFS4ERR_RESOURCE);
229	case EFBIG:
230		return (NFS4ERR_FBIG);
231	case ENOSPC:
232		return (NFS4ERR_NOSPC);
233	case EROFS:
234		return (NFS4ERR_ROFS);
235	case EMLINK:
236		return (NFS4ERR_MLINK);
237	case EDEADLK:
238		return (NFS4ERR_DEADLOCK);
239	case ENOLCK:
240		return (NFS4ERR_DENIED);
241	case EREMOTE:
242		return (NFS4ERR_SERVERFAULT);
243	case ENOTSUP:
244		return (NFS4ERR_NOTSUPP);
245	case EDQUOT:
246		return (NFS4ERR_DQUOT);
247	case ENAMETOOLONG:
248		return (NFS4ERR_NAMETOOLONG);
249	case EOVERFLOW:
250		return (NFS4ERR_INVAL);
251	case ENOSYS:
252		return (NFS4ERR_NOTSUPP);
253	case ENOTEMPTY:
254		return (NFS4ERR_NOTEMPTY);
255	case EOPNOTSUPP:
256		return (NFS4ERR_NOTSUPP);
257	case ESTALE:
258		return (NFS4ERR_STALE);
259	case EAGAIN:
260		if (curthread->t_flag & T_WOULDBLOCK) {
261			curthread->t_flag &= ~T_WOULDBLOCK;
262			return (NFS4ERR_DELAY);
263		}
264		return (NFS4ERR_LOCKED);
265	default:
266		return ((enum nfsstat4)error);
267	}
268}
269
270int
271geterrno4(enum nfsstat4 status)
272{
273	switch (status) {
274	case NFS4_OK:
275		return (0);
276	case NFS4ERR_PERM:
277		return (EPERM);
278	case NFS4ERR_NOENT:
279		return (ENOENT);
280	case NFS4ERR_IO:
281		return (EIO);
282	case NFS4ERR_NXIO:
283		return (ENXIO);
284	case NFS4ERR_ACCESS:
285		return (EACCES);
286	case NFS4ERR_EXIST:
287		return (EEXIST);
288	case NFS4ERR_XDEV:
289		return (EXDEV);
290	case NFS4ERR_NOTDIR:
291		return (ENOTDIR);
292	case NFS4ERR_ISDIR:
293		return (EISDIR);
294	case NFS4ERR_INVAL:
295		return (EINVAL);
296	case NFS4ERR_FBIG:
297		return (EFBIG);
298	case NFS4ERR_NOSPC:
299		return (ENOSPC);
300	case NFS4ERR_ROFS:
301		return (EROFS);
302	case NFS4ERR_MLINK:
303		return (EMLINK);
304	case NFS4ERR_NAMETOOLONG:
305		return (ENAMETOOLONG);
306	case NFS4ERR_NOTEMPTY:
307		return (ENOTEMPTY);
308	case NFS4ERR_DQUOT:
309		return (EDQUOT);
310	case NFS4ERR_STALE:
311		return (ESTALE);
312	case NFS4ERR_BADHANDLE:
313		return (ESTALE);
314	case NFS4ERR_BAD_COOKIE:
315		return (EINVAL);
316	case NFS4ERR_NOTSUPP:
317		return (EOPNOTSUPP);
318	case NFS4ERR_TOOSMALL:
319		return (EINVAL);
320	case NFS4ERR_SERVERFAULT:
321		return (EIO);
322	case NFS4ERR_BADTYPE:
323		return (EINVAL);
324	case NFS4ERR_DELAY:
325		return (ENXIO);
326	case NFS4ERR_SAME:
327		return (EPROTO);
328	case NFS4ERR_DENIED:
329		return (ENOLCK);
330	case NFS4ERR_EXPIRED:
331		return (EPROTO);
332	case NFS4ERR_LOCKED:
333		return (EACCES);
334	case NFS4ERR_GRACE:
335		return (EAGAIN);
336	case NFS4ERR_FHEXPIRED:	/* if got here, failed to get a new fh */
337		return (ESTALE);
338	case NFS4ERR_SHARE_DENIED:
339		return (EACCES);
340	case NFS4ERR_WRONGSEC:
341		return (EPERM);
342	case NFS4ERR_CLID_INUSE:
343		return (EAGAIN);
344	case NFS4ERR_RESOURCE:
345		return (EAGAIN);
346	case NFS4ERR_MOVED:
347		return (EPROTO);
348	case NFS4ERR_NOFILEHANDLE:
349		return (EIO);
350	case NFS4ERR_MINOR_VERS_MISMATCH:
351		return (ENOTSUP);
352	case NFS4ERR_STALE_CLIENTID:
353		return (EIO);
354	case NFS4ERR_STALE_STATEID:
355		return (EIO);
356	case NFS4ERR_OLD_STATEID:
357		return (EIO);
358	case NFS4ERR_BAD_STATEID:
359		return (EIO);
360	case NFS4ERR_BAD_SEQID:
361		return (EIO);
362	case NFS4ERR_NOT_SAME:
363		return (EPROTO);
364	case NFS4ERR_LOCK_RANGE:
365		return (EPROTO);
366	case NFS4ERR_SYMLINK:
367		return (EPROTO);
368	case NFS4ERR_RESTOREFH:
369		return (EPROTO);
370	case NFS4ERR_LEASE_MOVED:
371		return (EPROTO);
372	case NFS4ERR_ATTRNOTSUPP:
373		return (ENOTSUP);
374	case NFS4ERR_NO_GRACE:
375		return (EPROTO);
376	case NFS4ERR_RECLAIM_BAD:
377		return (EPROTO);
378	case NFS4ERR_RECLAIM_CONFLICT:
379		return (EPROTO);
380	case NFS4ERR_BADXDR:
381		return (EINVAL);
382	case NFS4ERR_LOCKS_HELD:
383		return (EIO);
384	case NFS4ERR_OPENMODE:
385		return (EACCES);
386	case NFS4ERR_BADOWNER:
387		/*
388		 * Client and server are in different DNS domains
389		 * and the NFSMAPID_DOMAIN in /etc/default/nfs
390		 * doesn't match.  No good answer here.  Return
391		 * EACCESS, which translates to "permission denied".
392		 */
393		return (EACCES);
394	case NFS4ERR_BADCHAR:
395		return (EINVAL);
396	case NFS4ERR_BADNAME:
397		return (EINVAL);
398	case NFS4ERR_BAD_RANGE:
399		return (EIO);
400	case NFS4ERR_LOCK_NOTSUPP:
401		return (ENOTSUP);
402	case NFS4ERR_OP_ILLEGAL:
403		return (EINVAL);
404	case NFS4ERR_DEADLOCK:
405		return (EDEADLK);
406	case NFS4ERR_FILE_OPEN:
407		return (EACCES);
408	case NFS4ERR_ADMIN_REVOKED:
409		return (EPROTO);
410	case NFS4ERR_CB_PATH_DOWN:
411		return (EPROTO);
412	default:
413#ifdef DEBUG
414		zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
415		    status);
416#endif
417		return ((int)status);
418	}
419}
420
421void
422nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
423{
424	nfs4_server_t *server;
425
426	/*
427	 * Return if already printed/queued a msg
428	 * for this mount point.
429	 */
430	if (mi->mi_flags & MI4_BADOWNER_DEBUG)
431		return;
432	/*
433	 * Happens once per client <-> server pair.
434	 */
435	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
436	    mi->mi_flags & MI4_INT))
437		return;
438
439	server = find_nfs4_server(mi);
440	if (server == NULL) {
441		nfs_rw_exit(&mi->mi_recovlock);
442		return;
443	}
444
445	if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
446		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
447		    "!NFSMAPID_DOMAIN does not match"
448		    " the server: %s domain.\n"
449		    "Please check configuration",
450		    mi->mi_curr_serv->sv_hostname);
451		server->s_flags |= N4S_BADOWNER_DEBUG;
452	}
453	mutex_exit(&server->s_lock);
454	nfs4_server_rele(server);
455	nfs_rw_exit(&mi->mi_recovlock);
456
457	/*
458	 * Happens once per mntinfo4_t.
459	 * This error is deemed as one of the recovery facts "RF_BADOWNER",
460	 * queue this in the mesg queue for this mount_info. This message
461	 * is not printed, meaning its absent from id_to_dump_solo_fact()
462	 * but its there for inspection if the queue is ever dumped/inspected.
463	 */
464	mutex_enter(&mi->mi_lock);
465	if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
466		nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
467		    FALSE, NULL, 0, NULL);
468		mi->mi_flags |= MI4_BADOWNER_DEBUG;
469	}
470	mutex_exit(&mi->mi_lock);
471}
472
473int
474nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
475{
476	int64_t sec;
477	int32_t nsec;
478
479	/*
480	 * Here check that the nfsv4 time is valid for the system.
481	 * nfsv4 time value is a signed 64-bit, and the system time
482	 * may be either int64_t or int32_t (depends on the kernel),
483	 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
484	 */
485#ifndef _LP64
486	if (! NFS4_TIME_OK(ntime->seconds)) {
487		return (EOVERFLOW);
488	}
489#endif
490
491	/* Invalid to specify 1 billion (or more) nsecs */
492	if (ntime->nseconds >= 1000000000)
493		return (EINVAL);
494
495	if (ntime->seconds < 0) {
496		sec = ntime->seconds + 1;
497		nsec = -1000000000 + ntime->nseconds;
498	} else {
499		sec = ntime->seconds;
500		nsec = ntime->nseconds;
501	}
502
503	vatime->tv_sec = sec;
504	vatime->tv_nsec = nsec;
505
506	return (0);
507}
508
509int
510nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
511{
512	int64_t sec;
513	uint32_t nsec;
514
515	/*
516	 * nfsv4 time value is a signed 64-bit, and the system time
517	 * may be either int64_t or int32_t (depends on the kernel),
518	 * so all system time values will fit.
519	 */
520	if (vatime->tv_nsec >= 0) {
521		sec = vatime->tv_sec;
522		nsec = vatime->tv_nsec;
523	} else {
524		sec = vatime->tv_sec - 1;
525		nsec = 1000000000 + vatime->tv_nsec;
526	}
527	ntime->seconds = sec;
528	ntime->nseconds = nsec;
529
530	return (0);
531}
532
533/*
534 * Converts a utf8 string to a valid null terminated filename string.
535 *
536 * XXX - Not actually translating the UTF-8 string as per RFC 2279.
537 *	 For now, just validate that the UTF-8 string off the wire
538 *	 does not have characters that will freak out UFS, and leave
539 *	 it at that.
540 */
541char *
542utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
543{
544	ASSERT(lenp != NULL);
545
546	if (u8s == NULL || u8s->utf8string_len <= 0 ||
547	    u8s->utf8string_val == NULL)
548		return (NULL);
549
550	/*
551	 * Check for obvious illegal filename chars
552	 */
553	if (utf8_strchr(u8s, '/') != NULL) {
554#ifdef DEBUG
555		if (nfs4_utf8_debug) {
556			char *path;
557			int len = u8s->utf8string_len;
558
559			path = kmem_alloc(len + 1, KM_SLEEP);
560			bcopy(u8s->utf8string_val, path, len);
561			path[len] = '\0';
562
563			zcmn_err(getzoneid(), CE_WARN,
564			    "Invalid UTF-8 filename: %s", path);
565
566			kmem_free(path, len + 1);
567		}
568#endif
569		return (NULL);
570	}
571
572	return (utf8_to_str(u8s, lenp, s));
573}
574
575/*
576 * Converts a utf8 string to a C string.
577 * kmem_allocs a new string if not supplied
578 */
579char *
580utf8_to_str(utf8string *str, uint_t *lenp, char *s)
581{
582	char	*sp;
583	char	*u8p;
584	int	len;
585	int	 i;
586
587	ASSERT(lenp != NULL);
588
589	if (str == NULL)
590		return (NULL);
591
592	u8p = str->utf8string_val;
593	len = str->utf8string_len;
594	if (len <= 0 || u8p == NULL) {
595		if (s)
596			*s = '\0';
597		return (NULL);
598	}
599
600	sp = s;
601	if (sp == NULL)
602		sp = kmem_alloc(len + 1, KM_SLEEP);
603
604	/*
605	 * At least check for embedded nulls
606	 */
607	for (i = 0; i < len; i++) {
608		sp[i] = u8p[i];
609		if (u8p[i] == '\0') {
610#ifdef	DEBUG
611			zcmn_err(getzoneid(), CE_WARN,
612			    "Embedded NULL in UTF-8 string");
613#endif
614			if (s == NULL)
615				kmem_free(sp, len + 1);
616			return (NULL);
617		}
618	}
619	sp[len] = '\0';
620	*lenp = len + 1;
621
622	return (sp);
623}
624
625/*
626 * str_to_utf8 - converts a null-terminated C string to a utf8 string
627 */
628utf8string *
629str_to_utf8(char *nm, utf8string *str)
630{
631	int len;
632
633	if (str == NULL)
634		return (NULL);
635
636	if (nm == NULL || *nm == '\0') {
637		str->utf8string_len = 0;
638		str->utf8string_val = NULL;
639	}
640
641	len = strlen(nm);
642
643	str->utf8string_val = kmem_alloc(len, KM_SLEEP);
644	str->utf8string_len = len;
645	bcopy(nm, str->utf8string_val, len);
646
647	return (str);
648}
649
650utf8string *
651utf8_copy(utf8string *src, utf8string *dest)
652{
653	if (src == NULL)
654		return (NULL);
655	if (dest == NULL)
656		return (NULL);
657
658	if (src->utf8string_len > 0) {
659		dest->utf8string_val = kmem_alloc(src->utf8string_len,
660		    KM_SLEEP);
661		bcopy(src->utf8string_val, dest->utf8string_val,
662		    src->utf8string_len);
663		dest->utf8string_len = src->utf8string_len;
664	} else {
665		dest->utf8string_val = NULL;
666		dest->utf8string_len = 0;
667	}
668
669	return (dest);
670}
671
672int
673utf8_compare(const utf8string *a, const utf8string *b)
674{
675	int mlen, cmp;
676	int alen, blen;
677	char *aval, *bval;
678
679	if ((a == NULL) && (b == NULL))
680		return (0);
681	else if (a == NULL)
682		return (-1);
683	else if (b == NULL)
684		return (1);
685
686	alen = a->utf8string_len;
687	blen = b->utf8string_len;
688	aval = a->utf8string_val;
689	bval = b->utf8string_val;
690
691	if (((alen == 0) || (aval == NULL)) &&
692	    ((blen == 0) || (bval == NULL)))
693		return (0);
694	else if ((alen == 0) || (aval == NULL))
695		return (-1);
696	else if ((blen == 0) || (bval == NULL))
697		return (1);
698
699	mlen = MIN(alen, blen);
700	cmp = strncmp(aval, bval, mlen);
701
702	if ((cmp == 0) && (alen == blen))
703		return (0);
704	else if ((cmp == 0) && (alen < blen))
705		return (-1);
706	else if (cmp == 0)
707		return (1);
708	else if (cmp < 0)
709		return (-1);
710	return (1);
711}
712
713/*
714 * utf8_dir_verify - checks that the utf8 string is valid
715 */
716int
717utf8_dir_verify(utf8string *str)
718{
719	char *nm;
720	int len;
721
722	if (str == NULL)
723		return (0);
724
725	nm = str->utf8string_val;
726	len = str->utf8string_len;
727	if (nm == NULL || len == 0) {
728		return (0);
729	}
730
731	if (len == 1 && nm[0] == '.')
732		return (0);
733	if (len == 2 && nm[0] == '.' && nm[1] == '.')
734		return (0);
735
736	if (utf8_strchr(str, '/') != NULL)
737		return (0);
738
739	if (utf8_strchr(str, '\0') != NULL)
740		return (0);
741
742	return (1);
743}
744
745/*
746 * from rpcsec module (common/rpcsec)
747 */
748extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
749extern void sec_clnt_freeh(AUTH *);
750extern void sec_clnt_freeinfo(struct sec_data *);
751
752/*
753 * authget() gets an auth handle based on the security
754 * information from the servinfo in mountinfo.
755 * The auth handle is stored in ch_client->cl_auth.
756 *
757 * First security flavor of choice is to use sv_secdata
758 * which is initiated by the client. If that fails, get
759 * secinfo from the server and then select one from the
760 * server secinfo list .
761 *
762 * For RPCSEC_GSS flavor, upon success, a secure context is
763 * established between client and server.
764 */
765int
766authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
767{
768	int error, i;
769
770	/*
771	 * SV4_TRYSECINFO indicates to try the secinfo list from
772	 * sv_secinfo until a successful one is reached. Point
773	 * sv_currsec to the selected security mechanism for
774	 * later sessions.
775	 */
776	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
777	if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
778		for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
779		    i++) {
780			if (!(error = sec_clnt_geth(ch_client,
781			    &svp->sv_secinfo->sdata[i],
782			    cr, &ch_client->cl_auth))) {
783
784				svp->sv_currsec = &svp->sv_secinfo->sdata[i];
785				svp->sv_secinfo->index = i;
786				/* done */
787				svp->sv_flags &= ~SV4_TRYSECINFO;
788				break;
789			}
790
791			/*
792			 * Allow the caller retry with the security flavor
793			 * pointed by svp->sv_secinfo->index when
794			 * ETIMEDOUT/ECONNRESET occurs.
795			 */
796			if (error == ETIMEDOUT || error == ECONNRESET) {
797				svp->sv_secinfo->index = i;
798				break;
799			}
800		}
801	} else {
802		/* sv_currsec points to one of the entries in sv_secinfo */
803		if (svp->sv_currsec) {
804			error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
805			    &ch_client->cl_auth);
806		} else {
807			/* If it's null, use sv_secdata. */
808			error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
809			    &ch_client->cl_auth);
810		}
811	}
812	nfs_rw_exit(&svp->sv_lock);
813
814	return (error);
815}
816
817/*
818 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
819 */
820int
821clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
822    struct chtab **chp, struct nfs4_clnt *nfscl)
823{
824	struct chhead *ch, *newch;
825	struct chhead **plistp;
826	struct chtab *cp;
827	int error;
828	k_sigset_t smask;
829
830	if (newcl == NULL || chp == NULL || ci == NULL)
831		return (EINVAL);
832
833	*newcl = NULL;
834	*chp = NULL;
835
836	/*
837	 * Find an unused handle or create one
838	 */
839	newch = NULL;
840	nfscl->nfscl_stat.clgets.value.ui64++;
841top:
842	/*
843	 * Find the correct entry in the cache to check for free
844	 * client handles.  The search is based on the RPC program
845	 * number, program version number, dev_t for the transport
846	 * device, and the protocol family.
847	 */
848	mutex_enter(&nfscl->nfscl_chtable4_lock);
849	plistp = &nfscl->nfscl_chtable4;
850	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
851		if (ch->ch_prog == ci->cl_prog &&
852		    ch->ch_vers == ci->cl_vers &&
853		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
854		    (strcmp(ch->ch_protofmly,
855		    svp->sv_knconf->knc_protofmly) == 0))
856			break;
857		plistp = &ch->ch_next;
858	}
859
860	/*
861	 * If we didn't find a cache entry for this quadruple, then
862	 * create one.  If we don't have one already preallocated,
863	 * then drop the cache lock, create one, and then start over.
864	 * If we did have a preallocated entry, then just add it to
865	 * the front of the list.
866	 */
867	if (ch == NULL) {
868		if (newch == NULL) {
869			mutex_exit(&nfscl->nfscl_chtable4_lock);
870			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
871			newch->ch_timesused = 0;
872			newch->ch_prog = ci->cl_prog;
873			newch->ch_vers = ci->cl_vers;
874			newch->ch_dev = svp->sv_knconf->knc_rdev;
875			newch->ch_protofmly = kmem_alloc(
876			    strlen(svp->sv_knconf->knc_protofmly) + 1,
877			    KM_SLEEP);
878			(void) strcpy(newch->ch_protofmly,
879			    svp->sv_knconf->knc_protofmly);
880			newch->ch_list = NULL;
881			goto top;
882		}
883		ch = newch;
884		newch = NULL;
885		ch->ch_next = nfscl->nfscl_chtable4;
886		nfscl->nfscl_chtable4 = ch;
887	/*
888	 * We found a cache entry, but if it isn't on the front of the
889	 * list, then move it to the front of the list to try to take
890	 * advantage of locality of operations.
891	 */
892	} else if (ch != nfscl->nfscl_chtable4) {
893		*plistp = ch->ch_next;
894		ch->ch_next = nfscl->nfscl_chtable4;
895		nfscl->nfscl_chtable4 = ch;
896	}
897
898	/*
899	 * If there was a free client handle cached, then remove it
900	 * from the list, init it, and use it.
901	 */
902	if (ch->ch_list != NULL) {
903		cp = ch->ch_list;
904		ch->ch_list = cp->ch_list;
905		mutex_exit(&nfscl->nfscl_chtable4_lock);
906		if (newch != NULL) {
907			kmem_free(newch->ch_protofmly,
908			    strlen(newch->ch_protofmly) + 1);
909			kmem_free(newch, sizeof (*newch));
910		}
911		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
912		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
913
914		/*
915		 * Get an auth handle.
916		 */
917		error = authget(svp, cp->ch_client, cr);
918		if (error || cp->ch_client->cl_auth == NULL) {
919			CLNT_DESTROY(cp->ch_client);
920			kmem_cache_free(chtab4_cache, cp);
921			return ((error != 0) ? error : EINTR);
922		}
923		ch->ch_timesused++;
924		*newcl = cp->ch_client;
925		*chp = cp;
926		return (0);
927	}
928
929	/*
930	 * There weren't any free client handles which fit, so allocate
931	 * a new one and use that.
932	 */
933#ifdef DEBUG
934	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
935#endif
936	mutex_exit(&nfscl->nfscl_chtable4_lock);
937
938	nfscl->nfscl_stat.cltoomany.value.ui64++;
939	if (newch != NULL) {
940		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
941		kmem_free(newch, sizeof (*newch));
942	}
943
944	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
945	cp->ch_head = ch;
946
947	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
948	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
949	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
950	sigunintr(&smask);
951
952	if (error != 0) {
953		kmem_cache_free(chtab4_cache, cp);
954#ifdef DEBUG
955		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
956#endif
957		/*
958		 * Warning is unnecessary if error is EINTR.
959		 */
960		if (error != EINTR) {
961			nfs_cmn_err(error, CE_WARN,
962			    "clget: couldn't create handle: %m\n");
963		}
964		return (error);
965	}
966	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
967	auth_destroy(cp->ch_client->cl_auth);
968
969	/*
970	 * Get an auth handle.
971	 */
972	error = authget(svp, cp->ch_client, cr);
973	if (error || cp->ch_client->cl_auth == NULL) {
974		CLNT_DESTROY(cp->ch_client);
975		kmem_cache_free(chtab4_cache, cp);
976#ifdef DEBUG
977		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
978#endif
979		return ((error != 0) ? error : EINTR);
980	}
981	ch->ch_timesused++;
982	*newcl = cp->ch_client;
983	ASSERT(cp->ch_client->cl_nosignal == FALSE);
984	*chp = cp;
985	return (0);
986}
987
988static int
989nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
990    struct chtab **chp, struct nfs4_clnt *nfscl)
991{
992	clinfo_t ci;
993	bool_t is_recov;
994	int firstcall, error = 0;
995
996	/*
997	 * Set read buffer size to rsize
998	 * and add room for RPC headers.
999	 */
1000	ci.cl_readsize = mi->mi_tsize;
1001	if (ci.cl_readsize != 0)
1002		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
1003
1004	/*
1005	 * If soft mount and server is down just try once.
1006	 * meaning: do not retransmit.
1007	 */
1008	if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
1009		ci.cl_retrans = 0;
1010	else
1011		ci.cl_retrans = mi->mi_retrans;
1012
1013	ci.cl_prog = mi->mi_prog;
1014	ci.cl_vers = mi->mi_vers;
1015	ci.cl_flags = mi->mi_flags;
1016
1017	/*
1018	 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
1019	 * security flavor, the client tries to establish a security context
1020	 * by contacting the server. If the connection is timed out or reset,
1021	 * e.g. server reboot, we will try again.
1022	 */
1023	is_recov = (curthread == mi->mi_recovthread);
1024	firstcall = 1;
1025
1026	do {
1027		error = clget4(&ci, svp, cr, newcl, chp, nfscl);
1028
1029		if (error == 0)
1030			break;
1031
1032		/*
1033		 * For forced unmount and zone shutdown, bail out but
1034		 * let the recovery thread do one more transmission.
1035		 */
1036		if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
1037		    (!is_recov || !firstcall)) {
1038			error = EIO;
1039			break;
1040		}
1041
1042		/* do not retry for soft mount */
1043		if (!(mi->mi_flags & MI4_HARD))
1044			break;
1045
1046		/* let the caller deal with the failover case */
1047		if (FAILOVER_MOUNT4(mi))
1048			break;
1049
1050		firstcall = 0;
1051
1052	} while (error == ETIMEDOUT || error == ECONNRESET);
1053
1054	return (error);
1055}
1056
1057void
1058clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
1059{
1060	if (cl->cl_auth != NULL) {
1061		sec_clnt_freeh(cl->cl_auth);
1062		cl->cl_auth = NULL;
1063	}
1064
1065	/*
1066	 * Timestamp this cache entry so that we know when it was last
1067	 * used.
1068	 */
1069	cp->ch_freed = gethrestime_sec();
1070
1071	/*
1072	 * Add the free client handle to the front of the list.
1073	 * This way, the list will be sorted in youngest to oldest
1074	 * order.
1075	 */
1076	mutex_enter(&nfscl->nfscl_chtable4_lock);
1077	cp->ch_list = cp->ch_head->ch_list;
1078	cp->ch_head->ch_list = cp;
1079	mutex_exit(&nfscl->nfscl_chtable4_lock);
1080}
1081
1082#define	CL_HOLDTIME	60	/* time to hold client handles */
1083
1084static void
1085clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
1086{
1087	struct chhead *ch;
1088	struct chtab *cp;	/* list of objects that can be reclaimed */
1089	struct chtab *cpe;
1090	struct chtab *cpl;
1091	struct chtab **cpp;
1092#ifdef DEBUG
1093	int n = 0;
1094	clstat4_debug.clreclaim.value.ui64++;
1095#endif
1096
1097	/*
1098	 * Need to reclaim some memory, so step through the cache
1099	 * looking through the lists for entries which can be freed.
1100	 */
1101	cp = NULL;
1102
1103	mutex_enter(&nfscl->nfscl_chtable4_lock);
1104
1105	/*
1106	 * Here we step through each non-NULL quadruple and start to
1107	 * construct the reclaim list pointed to by cp.  Note that
1108	 * cp will contain all eligible chtab entries.  When this traversal
1109	 * completes, chtab entries from the last quadruple will be at the
1110	 * front of cp and entries from previously inspected quadruples have
1111	 * been appended to the rear of cp.
1112	 */
1113	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
1114		if (ch->ch_list == NULL)
1115			continue;
1116		/*
1117		 * Search each list for entries older then
1118		 * cl_holdtime seconds.  The lists are maintained
1119		 * in youngest to oldest order so that when the
1120		 * first entry is found which is old enough, then
1121		 * all of the rest of the entries on the list will
1122		 * be old enough as well.
1123		 */
1124		cpl = ch->ch_list;
1125		cpp = &ch->ch_list;
1126		while (cpl != NULL &&
1127		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
1128			cpp = &cpl->ch_list;
1129			cpl = cpl->ch_list;
1130		}
1131		if (cpl != NULL) {
1132			*cpp = NULL;
1133			if (cp != NULL) {
1134				cpe = cpl;
1135				while (cpe->ch_list != NULL)
1136					cpe = cpe->ch_list;
1137				cpe->ch_list = cp;
1138			}
1139			cp = cpl;
1140		}
1141	}
1142
1143	mutex_exit(&nfscl->nfscl_chtable4_lock);
1144
1145	/*
1146	 * If cp is empty, then there is nothing to reclaim here.
1147	 */
1148	if (cp == NULL)
1149		return;
1150
1151	/*
1152	 * Step through the list of entries to free, destroying each client
1153	 * handle and kmem_free'ing the memory for each entry.
1154	 */
1155	while (cp != NULL) {
1156#ifdef DEBUG
1157		n++;
1158#endif
1159		CLNT_DESTROY(cp->ch_client);
1160		cpl = cp->ch_list;
1161		kmem_cache_free(chtab4_cache, cp);
1162		cp = cpl;
1163	}
1164
1165#ifdef DEBUG
1166	/*
1167	 * Update clalloc so that nfsstat shows the current number
1168	 * of allocated client handles.
1169	 */
1170	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
1171#endif
1172}
1173
1174/* ARGSUSED */
1175static void
1176clreclaim4(void *all)
1177{
1178	struct nfs4_clnt *nfscl;
1179
1180	/*
1181	 * The system is low on memory; go through and try to reclaim some from
1182	 * every zone on the system.
1183	 */
1184	mutex_enter(&nfs4_clnt_list_lock);
1185	nfscl = list_head(&nfs4_clnt_list);
1186	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
1187		clreclaim4_zone(nfscl, CL_HOLDTIME);
1188	mutex_exit(&nfs4_clnt_list_lock);
1189}
1190
/*
 * Minimum time-out values indexed by call type
 * These units are in "eighths" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

#define	SHORTWAIT	(NFS_COTS_TIMEO / 10)

/*
 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
 *
 * backoff(tim) leaves tim unchanged once it has reached MAXTIMO;
 * below that, dobackoff(tim) doubles it and clamps the result to
 * MAXTIMO.  Both macros evaluate their argument more than once.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
1207
1208static int
1209nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1210    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
1211    enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
1212{
1213	CLIENT *client;
1214	struct chtab *ch;
1215	cred_t *cr = icr;
1216	struct rpc_err rpcerr, rpcerr_tmp;
1217	enum clnt_stat status;
1218	int error;
1219	struct timeval wait;
1220	int timeo;		/* in units of hz */
1221	bool_t tryagain, is_recov;
1222	bool_t cred_cloned = FALSE;
1223	k_sigset_t smask;
1224	servinfo4_t *svp;
1225#ifdef DEBUG
1226	char *bufp;
1227#endif
1228	int firstcall;
1229
1230	rpcerr.re_status = RPC_SUCCESS;
1231
1232	/*
1233	 * If we know that we are rebooting then let's
1234	 * not bother with doing any over the wireness.
1235	 */
1236	mutex_enter(&mi->mi_lock);
1237	if (mi->mi_flags & MI4_SHUTDOWN) {
1238		mutex_exit(&mi->mi_lock);
1239		return (EIO);
1240	}
1241	mutex_exit(&mi->mi_lock);
1242
1243	/* For TSOL, use a new cred which has net_mac_aware flag */
1244	if (!cred_cloned && is_system_labeled()) {
1245		cred_cloned = TRUE;
1246		cr = crdup(icr);
1247		(void) setpflags(NET_MAC_AWARE, 1, cr);
1248	}
1249
1250	/*
1251	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1252	 * are guaranteed to reprocess the retry as a new request.
1253	 */
1254	svp = mi->mi_curr_serv;
1255	rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
1256	if (rpcerr.re_errno != 0)
1257		return (rpcerr.re_errno);
1258
1259	timeo = (mi->mi_timeo * hz) / 10;
1260
1261	/*
1262	 * If hard mounted fs, retry call forever unless hard error
1263	 * occurs.
1264	 *
1265	 * For forced unmount, let the recovery thread through but return
1266	 * an error for all others.  This is so that user processes can
1267	 * exit quickly.  The recovery thread bails out after one
1268	 * transmission so that it can tell if it needs to continue.
1269	 *
1270	 * For zone shutdown, behave as above to encourage quick
1271	 * process exit, but also fail quickly when servers have
1272	 * timed out before and reduce the timeouts.
1273	 */
1274	is_recov = (curthread == mi->mi_recovthread);
1275	firstcall = 1;
1276	do {
1277		tryagain = FALSE;
1278
1279		NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
1280		    "nfs4_rfscall: vfs_flag=0x%x, %s",
1281		    mi->mi_vfsp->vfs_flag,
1282		    is_recov ? "recov thread" : "not recov thread"));
1283
1284		/*
1285		 * It's possible while we're retrying the admin
1286		 * decided to reboot.
1287		 */
1288		mutex_enter(&mi->mi_lock);
1289		if (mi->mi_flags & MI4_SHUTDOWN) {
1290			mutex_exit(&mi->mi_lock);
1291			clfree4(client, ch, nfscl);
1292			if (cred_cloned)
1293				crfree(cr);
1294			return (EIO);
1295		}
1296		mutex_exit(&mi->mi_lock);
1297
1298		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1299		    (!is_recov || !firstcall)) {
1300			clfree4(client, ch, nfscl);
1301			if (cred_cloned)
1302				crfree(cr);
1303			return (EIO);
1304		}
1305
1306		if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
1307			mutex_enter(&mi->mi_lock);
1308			if ((mi->mi_flags & MI4_TIMEDOUT) ||
1309			    !is_recov || !firstcall) {
1310				mutex_exit(&mi->mi_lock);
1311				clfree4(client, ch, nfscl);
1312				if (cred_cloned)
1313					crfree(cr);
1314				return (EIO);
1315			}
1316			mutex_exit(&mi->mi_lock);
1317			timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
1318		}
1319
1320		firstcall = 0;
1321		TICK_TO_TIMEVAL(timeo, &wait);
1322
1323		/*
1324		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1325		 * and SIGTERM. (Preserving the existing masks).
1326		 * Mask out SIGINT if mount option nointr is specified.
1327		 */
1328		sigintr(&smask, (int)mi->mi_flags & MI4_INT);
1329		if (!(mi->mi_flags & MI4_INT))
1330			client->cl_nosignal = TRUE;
1331
1332		/*
1333		 * If there is a current signal, then don't bother
1334		 * even trying to send out the request because we
1335		 * won't be able to block waiting for the response.
1336		 * Simply assume RPC_INTR and get on with it.
1337		 */
1338		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1339			status = RPC_INTR;
1340		else {
1341			status = CLNT_CALL(client, which, xdrargs, argsp,
1342			    xdrres, resp, wait);
1343		}
1344
1345		if (!(mi->mi_flags & MI4_INT))
1346			client->cl_nosignal = FALSE;
1347		/*
1348		 * restore original signal mask
1349		 */
1350		sigunintr(&smask);
1351
1352		switch (status) {
1353		case RPC_SUCCESS:
1354			break;
1355
1356		case RPC_INTR:
1357			/*
1358			 * There is no way to recover from this error,
1359			 * even if mount option nointr is specified.
1360			 * SIGKILL, for example, cannot be blocked.
1361			 */
1362			rpcerr.re_status = RPC_INTR;
1363			rpcerr.re_errno = EINTR;
1364			break;
1365
1366		case RPC_UDERROR:
1367			/*
1368			 * If the NFS server is local (vold) and
1369			 * it goes away then we get RPC_UDERROR.
1370			 * This is a retryable error, so we would
1371			 * loop, so check to see if the specific
1372			 * error was ECONNRESET, indicating that
1373			 * target did not exist at all.  If so,
1374			 * return with RPC_PROGUNAVAIL and
1375			 * ECONNRESET to indicate why.
1376			 */
1377			CLNT_GETERR(client, &rpcerr);
1378			if (rpcerr.re_errno == ECONNRESET) {
1379				rpcerr.re_status = RPC_PROGUNAVAIL;
1380				rpcerr.re_errno = ECONNRESET;
1381				break;
1382			}
1383			/*FALLTHROUGH*/
1384
1385		default:		/* probably RPC_TIMEDOUT */
1386
1387			if (IS_UNRECOVERABLE_RPC(status))
1388				break;
1389
1390			/*
1391			 * increment server not responding count
1392			 */
1393			mutex_enter(&mi->mi_lock);
1394			mi->mi_noresponse++;
1395			mutex_exit(&mi->mi_lock);
1396#ifdef DEBUG
1397			nfscl->nfscl_stat.noresponse.value.ui64++;
1398#endif
1399			/*
1400			 * On zone shutdown, mark server dead and move on.
1401			 */
1402			if (zone_status_get(curproc->p_zone) >=
1403			    ZONE_IS_SHUTTING_DOWN) {
1404				mutex_enter(&mi->mi_lock);
1405				mi->mi_flags |= MI4_TIMEDOUT;
1406				mutex_exit(&mi->mi_lock);
1407				clfree4(client, ch, nfscl);
1408				if (cred_cloned)
1409					crfree(cr);
1410				return (EIO);
1411			}
1412
1413			/*
1414			 * NFS client failover support:
1415			 * return and let the caller take care of
1416			 * failover.  We only return for failover mounts
1417			 * because otherwise we want the "not responding"
1418			 * message, the timer updates, etc.
1419			 */
1420			if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
1421			    (error = try_failover(status)) != 0) {
1422				clfree4(client, ch, nfscl);
1423				if (cred_cloned)
1424					crfree(cr);
1425				*rpc_statusp = status;
1426				return (error);
1427			}
1428
1429			if (flags & RFSCALL_SOFT)
1430				break;
1431
1432			tryagain = TRUE;
1433
1434			/*
1435			 * The call is in progress (over COTS).
1436			 * Try the CLNT_CALL again, but don't
1437			 * print a noisy error message.
1438			 */
1439			if (status == RPC_INPROGRESS)
1440				break;
1441
1442			timeo = backoff(timeo);
1443			CLNT_GETERR(client, &rpcerr_tmp);
1444
1445			mutex_enter(&mi->mi_lock);
1446			if (!(mi->mi_flags & MI4_PRINTED)) {
1447				mi->mi_flags |= MI4_PRINTED;
1448				mutex_exit(&mi->mi_lock);
1449				if ((status == RPC_CANTSEND) &&
1450				    (rpcerr_tmp.re_errno == ENOBUFS))
1451					nfs4_queue_fact(RF_SENDQ_FULL, mi, 0,
1452					    0, 0, FALSE, NULL, 0, NULL);
1453				else
1454					nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
1455					    0, 0, 0, FALSE, NULL, 0, NULL);
1456			} else
1457				mutex_exit(&mi->mi_lock);
1458
1459			if (*doqueue && nfs_has_ctty()) {
1460				*doqueue = 0;
1461				if (!(mi->mi_flags & MI4_NOPRINT)) {
1462					if ((status == RPC_CANTSEND) &&
1463					    (rpcerr_tmp.re_errno == ENOBUFS))
1464						nfs4_queue_fact(RF_SENDQ_FULL,
1465						    mi, 0, 0, 0, FALSE, NULL,
1466						    0, NULL);
1467					else
1468						nfs4_queue_fact(
1469						    RF_SRV_NOT_RESPOND, mi, 0,
1470						    0, 0, FALSE, NULL, 0, NULL);
1471				}
1472			}
1473		}
1474	} while (tryagain);
1475
1476	DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
1477	    int, rpcerr.re_errno);
1478
1479	if (status != RPC_SUCCESS) {
1480		zoneid_t zoneid = mi->mi_zone->zone_id;
1481
1482		/*
1483		 * Let soft mounts use the timed out message.
1484		 */
1485		if (status == RPC_INPROGRESS)
1486			status = RPC_TIMEDOUT;
1487		nfscl->nfscl_stat.badcalls.value.ui64++;
1488		if (status != RPC_INTR) {
1489			mutex_enter(&mi->mi_lock);
1490			mi->mi_flags |= MI4_DOWN;
1491			mutex_exit(&mi->mi_lock);
1492			CLNT_GETERR(client, &rpcerr);
1493#ifdef DEBUG
1494			bufp = clnt_sperror(client, svp->sv_hostname);
1495			zprintf(zoneid, "NFS%d %s failed for %s\n",
1496			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1497			if (nfs_has_ctty()) {
1498				if (!(mi->mi_flags & MI4_NOPRINT)) {
1499					uprintf("NFS%d %s failed for %s\n",
1500					    mi->mi_vers, mi->mi_rfsnames[which],
1501					    bufp);
1502				}
1503			}
1504			kmem_free(bufp, MAXPATHLEN);
1505#else
1506			zprintf(zoneid,
1507			    "NFS %s failed for server %s: error %d (%s)\n",
1508			    mi->mi_rfsnames[which], svp->sv_hostname,
1509			    status, clnt_sperrno(status));
1510			if (nfs_has_ctty()) {
1511				if (!(mi->mi_flags & MI4_NOPRINT)) {
1512					uprintf(
1513				"NFS %s failed for server %s: error %d (%s)\n",
1514					    mi->mi_rfsnames[which],
1515					    svp->sv_hostname, status,
1516					    clnt_sperrno(status));
1517				}
1518			}
1519#endif
1520			/*
1521			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1522			 * re_errno is set appropriately depending on
1523			 * the authentication error
1524			 */
1525			if (status == RPC_VERSMISMATCH ||
1526			    status == RPC_PROGVERSMISMATCH)
1527				rpcerr.re_errno = EIO;
1528		}
1529	} else {
1530		/*
1531		 * Test the value of mi_down and mi_printed without
1532		 * holding the mi_lock mutex.  If they are both zero,
1533		 * then it is okay to skip the down and printed
1534		 * processing.  This saves on a mutex_enter and
1535		 * mutex_exit pair for a normal, successful RPC.
1536		 * This was just complete overhead.
1537		 */
1538		if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
1539			mutex_enter(&mi->mi_lock);
1540			mi->mi_flags &= ~MI4_DOWN;
1541			if (mi->mi_flags & MI4_PRINTED) {
1542				mi->mi_flags &= ~MI4_PRINTED;
1543				mutex_exit(&mi->mi_lock);
1544				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1545					nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
1546					    0, FALSE, NULL, 0, NULL);
1547			} else
1548				mutex_exit(&mi->mi_lock);
1549		}
1550
1551		if (*doqueue == 0) {
1552			if (!(mi->mi_flags & MI4_NOPRINT) &&
1553			    !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1554				nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
1555				    FALSE, NULL, 0, NULL);
1556
1557			*doqueue = 1;
1558		}
1559	}
1560
1561	clfree4(client, ch, nfscl);
1562	if (cred_cloned)
1563		crfree(cr);
1564
1565	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1566
1567	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
1568	    rpcerr.re_errno);
1569
1570	*rpc_statusp = status;
1571	return (rpcerr.re_errno);
1572}
1573
1574/*
1575 * rfs4call - general wrapper for RPC calls initiated by the client
1576 */
1577void
1578rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
1579    cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
1580{
1581	int i, error;
1582	enum clnt_stat rpc_status = NFS4_OK;
1583	int num_resops;
1584	struct nfs4_clnt *nfscl;
1585
1586	ASSERT(nfs_zone() == mi->mi_zone);
1587	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1588	ASSERT(nfscl != NULL);
1589
1590	nfscl->nfscl_stat.calls.value.ui64++;
1591	mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;
1592
1593	/* Set up the results struct for XDR usage */
1594	resp->argsp = argsp;
1595	resp->array = NULL;
1596	resp->status = 0;
1597	resp->decode_len = 0;
1598
1599	error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
1600	    xdr_COMPOUND4args_clnt, (caddr_t)argsp,
1601	    xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
1602	    doqueue, &rpc_status, flags, nfscl);
1603
1604	/* Return now if it was an RPC error */
1605	if (error) {
1606		ep->error = error;
1607		ep->stat = resp->status;
1608		ep->rpc_status = rpc_status;
1609		return;
1610	}
1611
1612	/* else we'll count the processed operations */
1613	num_resops = resp->decode_len;
1614	for (i = 0; i < num_resops; i++) {
1615		/*
1616		 * Count the individual operations
1617		 * processed by the server.
1618		 */
1619		if (resp->array[i].resop >= NFSPROC4_NULL &&
1620		    resp->array[i].resop <= OP_WRITE)
1621			mi->mi_reqs[resp->array[i].resop].value.ui64++;
1622	}
1623
1624	ep->error = 0;
1625	ep->stat = resp->status;
1626	ep->rpc_status = rpc_status;
1627}
1628
1629/*
1630 * nfs4rename_update - updates stored state after a rename.  Currently this
1631 * is the path of the object and anything under it, and the filehandle of
1632 * the renamed object.
1633 */
1634void
1635nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
1636{
1637	sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
1638	fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
1639}
1640
1641/*
1642 * Routine to look up the filehandle for the given path and rootvp.
1643 *
1644 * Return values:
1645 * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
1646 *   updated.
1647 * - error: return value (errno value) and/or *statp is set appropriately.
1648 */
1649#define	RML_ORDINARY	1
1650#define	RML_NAMED_ATTR	2
1651#define	RML_ATTRDIR	3
1652
1653static void
1654remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
1655    int filetype, cred_t *cr,
1656    nfs_fh4 *fhp, nfs4_ga_res_t *garp,		/* fh, attrs for object */
1657    nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,	/* fh, attrs for parent */
1658    nfs4_error_t *ep)
1659{
1660	COMPOUND4args_clnt args;
1661	COMPOUND4res_clnt res;
1662	nfs_argop4 *argop;
1663	nfs_resop4 *resop;
1664	int num_argops;
1665	lookup4_param_t lookuparg;
1666	nfs_fh4 *tmpfhp;
1667	int doqueue = 1;
1668	char *path;
1669	mntinfo4_t *mi;
1670
1671	ASSERT(fname != NULL);
1672	ASSERT(rootvp->v_type == VDIR);
1673
1674	mi = VTOMI4(rootvp);
1675	path = fn_path(fname);
1676	switch (filetype) {
1677	case RML_NAMED_ATTR:
1678		lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
1679		args.ctag = TAG_REMAP_LOOKUP_NA;
1680		break;
1681	case RML_ATTRDIR:
1682		lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
1683		args.ctag = TAG_REMAP_LOOKUP_AD;
1684		break;
1685	case RML_ORDINARY:
1686		lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1687		args.ctag = TAG_REMAP_LOOKUP;
1688		break;
1689	default:
1690		ep->error = EINVAL;
1691		return;
1692	}
1693	lookuparg.argsp = &args;
1694	lookuparg.resp = &res;
1695	lookuparg.header_len = 1;	/* Putfh */
1696	lookuparg.trailer_len = 0;
1697	lookuparg.ga_bits = NFS4_VATTR_MASK;
1698	lookuparg.mi = VTOMI4(rootvp);
1699
1700	(void) nfs4lookup_setup(path, &lookuparg, 1);
1701
1702	/* 0: putfh directory */
1703	argop = args.array;
1704	argop[0].argop = OP_CPUTFH;
1705	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;
1706
1707	num_argops = args.array_len;
1708
1709	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1710
1711	if (ep->error || res.status != NFS4_OK)
1712		goto exit;
1713
1714	/* get the object filehandle */
1715	resop = &res.array[res.array_len - 2];
1716	if (resop->resop != OP_GETFH) {
1717		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1718		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1719		ep->stat = NFS4ERR_SERVERFAULT;
1720		goto exit;
1721	}
1722	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1723	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1724		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1725		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1726		    TAG_NONE, 0, 0);
1727		ep->stat = NFS4ERR_SERVERFAULT;
1728		goto exit;
1729	}
1730	fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1731	nfs_fh4_copy(tmpfhp, fhp);
1732
1733	/* get the object attributes */
1734	resop = &res.array[res.array_len - 1];
1735	if (garp && resop->resop == OP_GETATTR)
1736		*garp = resop->nfs_resop4_u.opgetattr.ga_res;
1737
1738	/* See if there are enough fields in the response for parent info */
1739	if ((int)res.array_len - 5 <= 0)
1740		goto exit;
1741
1742	/* get the parent filehandle */
1743	resop = &res.array[res.array_len - 5];
1744	if (resop->resop != OP_GETFH) {
1745		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1746		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1747		ep->stat = NFS4ERR_SERVERFAULT;
1748		goto exit;
1749	}
1750	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1751	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1752		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1753		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1754		    TAG_NONE, 0, 0);
1755		ep->stat = NFS4ERR_SERVERFAULT;
1756		goto exit;
1757	}
1758	pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1759	nfs_fh4_copy(tmpfhp, pfhp);
1760
1761	/* get the parent attributes */
1762	resop = &res.array[res.array_len - 4];
1763	if (pgarp && resop->resop == OP_GETATTR)
1764		*pgarp = resop->nfs_resop4_u.opgetattr.ga_res;
1765
1766exit:
1767	/*
1768	 * It is too hard to remember where all the OP_LOOKUPs are
1769	 */
1770	nfs4args_lookup_free(argop, num_argops);
1771	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1772
1773	if (!ep->error)
1774		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1775	kmem_free(path, strlen(path)+1);
1776}
1777
1778/*
1779 * NFS client failover / volatile filehandle support
1780 *
1781 * Recover the filehandle for the given rnode.
1782 *
1783 * Errors are returned via the nfs4_error_t parameter.
1784 */
1785
1786void
1787nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
1788{
1789	int is_stub;
1790	rnode4_t *rp = VTOR4(vp);
1791	vnode_t *rootvp = NULL;
1792	vnode_t *dvp = NULL;
1793	cred_t *cr, *cred_otw;
1794	nfs4_ga_res_t gar, pgar;
1795	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
1796	int filetype = RML_ORDINARY;
1797	nfs4_recov_state_t recov = {NULL, 0, 0};
1798	int badfhcount = 0;
1799	nfs4_open_stream_t *osp = NULL;
1800	bool_t first_time = TRUE;	/* first time getting OTW cred */
1801	bool_t last_time = FALSE;	/* last time getting OTW cred */
1802
1803	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1804	    "nfs4_remap_file: remapping %s", rnode4info(rp)));
1805	ASSERT(nfs4_consistent_type(vp));
1806
1807	if (vp->v_flag & VROOT) {
1808		nfs4_remap_root(mi, ep, flags);
1809		return;
1810	}
1811
1812	/*
1813	 * Given the root fh, use the path stored in
1814	 * the rnode to find the fh for the new server.
1815	 */
1816	ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1817	if (ep->error != 0)
1818		return;
1819
1820	cr = curthread->t_cred;
1821	ASSERT(cr != NULL);
1822get_remap_cred:
1823	/*
1824	 * Releases the osp, if it is provided.
1825	 * Puts a hold on the cred_otw and the new osp (if found).
1826	 */
1827	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
1828	    &first_time, &last_time);
1829	ASSERT(cred_otw != NULL);
1830
1831	if (rp->r_flags & R4ISXATTR) {
1832		filetype = RML_NAMED_ATTR;
1833		(void) vtodv(vp, &dvp, cred_otw, FALSE);
1834	}
1835
1836	if (vp->v_flag & V_XATTRDIR) {
1837		filetype = RML_ATTRDIR;
1838	}
1839
1840	if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
1841		/* file mount, doesn't need a remap */
1842		goto done;
1843	}
1844
1845again:
1846	remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
1847	    &newfh, &gar, &newpfh, &pgar, ep);
1848
1849	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1850	    "nfs4_remap_file: remap_lookup returned %d/%d",
1851	    ep->error, ep->stat));
1852
1853	if (last_time == FALSE && ep->error == EACCES) {
1854		crfree(cred_otw);
1855		if (dvp != NULL)
1856			VN_RELE(dvp);
1857		goto get_remap_cred;
1858	}
1859	if (ep->error != 0)
1860		goto done;
1861
1862	switch (ep->stat) {
1863	case NFS4_OK:
1864		badfhcount = 0;
1865		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1866			mutex_enter(&rp->r_statelock);
1867			rp->r_delay_interval = 0;
1868			mutex_exit(&rp->r_statelock);
1869			uprintf("NFS File Available..\n");
1870		}
1871		break;
1872	case NFS4ERR_FHEXPIRED:
1873	case NFS4ERR_BADHANDLE:
1874	case NFS4ERR_STALE:
1875		/*
1876		 * If we ran into filehandle problems, we should try to
1877		 * remap the root vnode first and hope life gets better.
1878		 * But we need to avoid loops.
1879		 */
1880		if (badfhcount++ > 0)
1881			goto done;
1882		if (newfh.nfs_fh4_len != 0) {
1883			kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1884			newfh.nfs_fh4_len = 0;
1885		}
1886		if (newpfh.nfs_fh4_len != 0) {
1887			kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1888			newpfh.nfs_fh4_len = 0;
1889		}
1890		/* relative path - remap rootvp then retry */
1891		VN_RELE(rootvp);
1892		rootvp = NULL;
1893		nfs4_remap_root(mi, ep, flags);
1894		if (ep->error != 0 || ep->stat != NFS4_OK)
1895			goto done;
1896		ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1897		if (ep->error != 0)
1898			goto done;
1899		goto again;
1900	case NFS4ERR_DELAY:
1901		badfhcount = 0;
1902		nfs4_set_delay_wait(vp);
1903		ep->error = nfs4_wait_for_delay(vp, &recov);
1904		if (ep->error != 0)
1905			goto done;
1906		goto again;
1907	case NFS4ERR_ACCESS:
1908		/* get new cred, try again */
1909		if (last_time == TRUE)
1910			goto done;
1911		if (dvp != NULL)
1912			VN_RELE(dvp);
1913		crfree(cred_otw);
1914		goto get_remap_cred;
1915	default:
1916		goto done;
1917	}
1918
1919	/*
1920	 * Check on the new and old rnodes before updating;
1921	 * if the vnode type or size changes, issue a warning
1922	 * and mark the file dead.
1923	 */
1924	mutex_enter(&rp->r_statelock);
1925	if (flags & NFS4_REMAP_CKATTRS) {
1926		if (vp->v_type != gar.n4g_va.va_type ||
1927		    (vp->v_type != VDIR &&
1928		    rp->r_size != gar.n4g_va.va_size)) {
1929			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1930			    "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
1931			    (int)rp->r_size, (int)gar.n4g_va.va_size,
1932			    vp->v_type, gar.n4g_va.va_type));
1933			mutex_exit(&rp->r_statelock);
1934			nfs4_queue_event(RE_FILE_DIFF, mi,
1935			    rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
1936			    TAG_NONE, TAG_NONE, 0, 0);
1937			nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
1938			goto done;
1939		}
1940	}
1941	ASSERT(gar.n4g_va.va_type != VNON);
1942	rp->r_server = mi->mi_curr_serv;
1943
1944	/*
1945	 * Turn this object into a "stub" object if we
1946	 * crossed an underlying server fs boundary.
1947	 *
1948	 * This stub will be for a mirror-mount.
1949	 * A referral would look like a boundary crossing
1950	 * as well, but would not be the same type of object,
1951	 * so we would expect to mark the object dead.
1952	 *
1953	 * See comment in r4_do_attrcache() for more details.
1954	 */
1955	is_stub = 0;
1956	if (gar.n4g_fsid_valid) {
1957		(void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
1958		rp->r_srv_fsid = gar.n4g_fsid;
1959		if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
1960			is_stub = 1;
1961		nfs_rw_exit(&rp->r_server->sv_lock);
1962#ifdef DEBUG
1963	} else {
1964		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1965		    "remap_file: fsid attr not provided by server.  rp=%p",
1966		    (void *)rp));
1967#endif
1968	}
1969	if (is_stub)
1970		r4_stub_mirrormount(rp);
1971	else
1972		r4_stub_none(rp);
1973	mutex_exit(&rp->r_statelock);
1974	nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
1975	sfh4_update(rp->r_fh, &newfh);
1976	ASSERT(nfs4_consistent_type(vp));
1977
1978	/*
1979	 * If we got parent info, use it to update the parent
1980	 */
1981	if (newpfh.nfs_fh4_len != 0) {
1982		if (rp->r_svnode.sv_dfh != NULL)
1983			sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
1984		if (dvp != NULL) {
1985			/* force update of attrs */
1986			nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
1987		}
1988	}
1989done:
1990	if (newfh.nfs_fh4_len != 0)
1991		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1992	if (newpfh.nfs_fh4_len != 0)
1993		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1994	if (cred_otw != NULL)
1995		crfree(cred_otw);
1996	if (rootvp != NULL)
1997		VN_RELE(rootvp);
1998	if (dvp != NULL)
1999		VN_RELE(dvp);
2000	if (osp != NULL)
2001		open_stream_rele(osp, rp);
2002}
2003
2004/*
2005 * Client-side failover support: remap the filehandle for vp if it appears
2006 * necessary.  errors are returned via the nfs4_error_t parameter; though,
2007 * if there is a problem, we will just try again later.
2008 */
2009
2010void
2011nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
2012{
2013	if (vp == NULL)
2014		return;
2015
2016	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
2017		return;
2018
2019	if (VTOR4(vp)->r_server == mi->mi_curr_serv)
2020		return;
2021
2022	nfs4_remap_file(mi, vp, flags, ep);
2023}
2024
2025/*
2026 * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
2027 *
2028 * Our caller has a filehandle for ".." relative to a particular
2029 * directory object.  We want to find or create a parent vnode
2030 * with that filehandle and return it.  We can of course create
2031 * a vnode from this filehandle, but we need to also make sure
2032 * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
2033 * that we have a parent FH for future reopens as well.  If
2034 * we have a remap failure, we won't be able to reopen this
2035 * file, but we won't treat that as fatal because a reopen
2036 * is at least unlikely.  Someday nfs4_reopen() should look
2037 * for a missing parent FH and try a remap to recover from it.
2038 *
2039 * need_start_op argument indicates whether this function should
2040 * do a start_op before calling remap_lookup().  This should
2041 * be FALSE, if you are the recovery thread or in an op; otherwise,
2042 * set it to TRUE.
2043 */
2044int
2045nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
2046    cred_t *cr, vnode_t **vpp, int need_start_op)
2047{
2048	mntinfo4_t *mi = VTOMI4(dvp);
2049	nfs4_fname_t *np = NULL, *pnp = NULL;
2050	vnode_t *vp = NULL, *rootvp = NULL;
2051	rnode4_t *rp;
2052	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
2053	nfs4_ga_res_t gar, pgar;
2054	vattr_t va, pva;
2055	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2056	nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
2057	nfs4_recov_state_t recov_state;
2058
2059#ifdef DEBUG
2060	/*
2061	 * ensure need_start_op is correct
2062	 */
2063	{
2064		int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
2065		    (curthread == mi->mi_recovthread));
2066		/* C needs a ^^ operator! */
2067		ASSERT(((need_start_op) && (!no_need_start_op)) ||
2068		    ((! need_start_op) && (no_need_start_op)));
2069	}
2070#endif
2071	ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());
2072
2073	NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
2074	    "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
2075	    rnode4info(VTOR4(dvp))));
2076
2077	/*
2078	 * rootvp might be needed eventually. Holding it now will
2079	 * ensure that r4find_unlocked() will find it, if ".." is the root.
2080	 */
2081	e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
2082	if (e.error != 0)
2083		goto out;
2084	rp = r4find_unlocked(fhp, mi->mi_vfsp);
2085	if (rp != NULL) {
2086		*vpp = RTOV4(rp);
2087		VN_RELE(rootvp);
2088		return (0);
2089	}
2090
2091	/*
2092	 * Since we don't have the rnode, we have to go over the wire.
2093	 * remap_lookup() can get all of the filehandles and attributes
2094	 * we need in one operation.
2095	 */
2096	np = fn_parent(VTOSV(dvp)->sv_name);
2097	/* if a parent was not found return an error */
2098	if (np == NULL) {
2099		e.error = ENOENT;
2100		goto out;
2101	}
2102
2103	recov_state.rs_flags = 0;
2104	recov_state.rs_num_retry_despite_err = 0;
2105recov_retry:
2106	if (need_start_op) {
2107		e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
2108		    &recov_state, NULL);
2109		if (e.error != 0) {
2110			goto out;
2111		}
2112	}
2113	va.va_type = VNON;
2114	pva.va_type = VNON;
2115	remap_lookup(np, rootvp, RML_ORDINARY, cr,
2116	    &newfh, &gar, &newpfh, &pgar, &e);
2117	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2118		if (need_start_op) {
2119			bool_t abort;
2120
2121			abort = nfs4_start_recovery(&e, mi,
2122			    rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL, NULL,
2123			    NULL);
2124			if (abort) {
2125				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2126				    &recov_state, FALSE);
2127				if (e.error == 0)
2128					e.error = EIO;
2129				goto out;
2130			}
2131			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2132			    &recov_state, TRUE);
2133			goto recov_retry;
2134		}
2135		if (e.error == 0)
2136			e.error = EIO;
2137		goto out;
2138	}
2139
2140	if (!e.error) {
2141		va = gar.n4g_va;
2142		pva = pgar.n4g_va;
2143	}
2144
2145	if ((e.error != 0) ||
2146	    (va.va_type != VDIR)) {
2147		if (need_start_op)
2148			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2149			    &recov_state, FALSE);
2150		if (e.error == 0)
2151			e.error = EIO;
2152		goto out;
2153	}
2154
2155	if (e.stat != NFS4_OK) {
2156		if (need_start_op)
2157			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2158			    &recov_state, FALSE);
2159		e.error = EIO;
2160		goto out;
2161	}
2162
2163	/*
2164	 * It is possible for remap_lookup() to return with no error,
2165	 * but without providing the parent filehandle and attrs.
2166	 */
2167	if (pva.va_type != VDIR) {
2168		/*
2169		 * Call remap_lookup() again, this time with the
2170		 * newpfh and pgar args in the first position.
2171		 */
2172		pnp = fn_parent(np);
2173		if (pnp != NULL) {
2174			remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
2175			    &newpfh, &pgar, NULL, NULL, &e);
2176			if (nfs4_needs_recovery(&e, FALSE,
2177			    mi->mi_vfsp)) {
2178				if (need_start_op) {
2179					bool_t abort;
2180
2181					abort = nfs4_start_recovery(&e, mi,
2182					    rootvp, NULL, NULL, NULL,
2183					    OP_LOOKUP, NULL, NULL, NULL);
2184					if (abort) {
2185						nfs4_end_fop(mi, rootvp, NULL,
2186						    OH_LOOKUP, &recov_state,
2187						    FALSE);
2188						if (e.error == 0)
2189							e.error = EIO;
2190						goto out;
2191					}
2192					nfs4_end_fop(mi, rootvp, NULL,
2193					    OH_LOOKUP, &recov_state, TRUE);
2194					goto recov_retry;
2195				}
2196				if (e.error == 0)
2197					e.error = EIO;
2198				goto out;
2199			}
2200
2201			if (e.stat != NFS4_OK) {
2202				if (need_start_op)
2203					nfs4_end_fop(mi, rootvp, NULL,
2204					    OH_LOOKUP, &recov_state, FALSE);
2205				e.error = EIO;
2206				goto out;
2207			}
2208		}
2209		if ((pnp == NULL) ||
2210		    (e.error != 0) ||
2211		    (pva.va_type == VNON)) {
2212			if (need_start_op)
2213				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2214				    &recov_state, FALSE);
2215			if (e.error == 0)
2216				e.error = EIO;
2217			goto out;
2218		}
2219	}
2220	ASSERT(newpfh.nfs_fh4_len != 0);
2221	if (need_start_op)
2222		nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
2223	psfh = sfh4_get(&newpfh, mi);
2224
2225	sfh = sfh4_get(&newfh, mi);
2226	vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);
2227
2228out:
2229	if (np != NULL)
2230		fn_rele(&np);
2231	if (pnp != NULL)
2232		fn_rele(&pnp);
2233	if (newfh.nfs_fh4_len != 0)
2234		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
2235	if (newpfh.nfs_fh4_len != 0)
2236		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
2237	if (sfh != NULL)
2238		sfh4_rele(&sfh);
2239	if (psfh != NULL)
2240		sfh4_rele(&psfh);
2241	if (rootvp != NULL)
2242		VN_RELE(rootvp);
2243	*vpp = vp;
2244	return (e.error);
2245}
2246
#ifdef DEBUG
/*
 * DEBUG-only global, starts at zero.
 * NOTE(review): presumably a running total of memory used for rnode path
 * strings, maintained elsewhere in the file -- confirm against its users.
 */
size_t r_path_memuse = 0;
#endif
2250
2251/*
2252 * NFS client failover support
2253 *
2254 * sv4_free() frees the malloc'd portion of a "servinfo_t".
2255 */
2256void
2257sv4_free(servinfo4_t *svp)
2258{
2259	servinfo4_t *next;
2260	struct knetconfig *knconf;
2261
2262	while (svp != NULL) {
2263		next = svp->sv_next;
2264		if (svp->sv_dhsec)
2265			sec_clnt_freeinfo(svp->sv_dhsec);
2266		if (svp->sv_secdata)
2267			sec_clnt_freeinfo(svp->sv_secdata);
2268		if (svp->sv_save_secinfo &&
2269		    svp->sv_save_secinfo != svp->sv_secinfo)
2270			secinfo_free(svp->sv_save_secinfo);
2271		if (svp->sv_secinfo)
2272			secinfo_free(svp->sv_secinfo);
2273		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
2274			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
2275		knconf = svp->sv_knconf;
2276		if (knconf != NULL) {
2277			if (knconf->knc_protofmly != NULL)
2278				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2279			if (knconf->knc_proto != NULL)
2280				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2281			kmem_free(knconf, sizeof (*knconf));
2282		}
2283		knconf = svp->sv_origknconf;
2284		if (knconf != NULL) {
2285			if (knconf->knc_protofmly != NULL)
2286				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2287			if (knconf->knc_proto != NULL)
2288				kmem_free(knconf->knc_proto, KNC_STRSIZE);
2289			kmem_free(knconf, sizeof (*knconf));
2290		}
2291		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
2292			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
2293		if (svp->sv_path != NULL) {
2294			kmem_free(svp->sv_path, svp->sv_pathlen);
2295		}
2296		nfs_rw_destroy(&svp->sv_lock);
2297		kmem_free(svp, sizeof (*svp));
2298		svp = next;
2299	}
2300}
2301
2302void
2303nfs4_printfhandle(nfs4_fhandle_t *fhp)
2304{
2305	int *ip;
2306	char *buf;
2307	size_t bufsize;
2308	char *cp;
2309
2310	/*
2311	 * 13 == "(file handle:"
2312	 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
2313	 *	1 == ' '
2314	 *	8 == maximum strlen of "%x"
2315	 * 3 == ")\n\0"
2316	 */
2317	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2318	buf = kmem_alloc(bufsize, KM_NOSLEEP);
2319	if (buf == NULL)
2320		return;
2321
2322	cp = buf;
2323	(void) strcpy(cp, "(file handle:");
2324	while (*cp != '\0')
2325		cp++;
2326	for (ip = (int *)fhp->fh_buf;
2327	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
2328	    ip++) {
2329		(void) sprintf(cp, " %x", *ip);
2330		while (*cp != '\0')
2331			cp++;
2332	}
2333	(void) strcpy(cp, ")\n");
2334
2335	zcmn_err(getzoneid(), CE_CONT, "%s", buf);
2336
2337	kmem_free(buf, bufsize);
2338}
2339
2340/*
2341 * The NFSv4 readdir cache subsystem.
2342 *
2343 * We provide a set of interfaces to allow the rest of the system to utilize
2344 * a caching mechanism while encapsulating the details of the actual
2345 * implementation.  This should allow for better maintainability and
2346 * extensibility by consolidating the implementation details in one location.
2347 */
2348
2349/*
2350 * Comparator used by AVL routines.
2351 */
2352static int
2353rddir4_cache_compar(const void *x, const void *y)
2354{
2355	rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
2356	rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
2357	rddir4_cache *a = &ai->rc;
2358	rddir4_cache *b = &bi->rc;
2359
2360	if (a->nfs4_cookie == b->nfs4_cookie) {
2361		if (a->buflen == b->buflen)
2362			return (0);
2363		if (a->buflen < b->buflen)
2364			return (-1);
2365		return (1);
2366	}
2367
2368	if (a->nfs4_cookie < b->nfs4_cookie)
2369			return (-1);
2370
2371	return (1);
2372}
2373
2374/*
2375 * Allocate an opaque handle for the readdir cache.
2376 */
2377void
2378rddir4_cache_create(rnode4_t *rp)
2379{
2380	ASSERT(rp->r_dir == NULL);
2381
2382	rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
2383
2384	avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
2385	    offsetof(rddir4_cache_impl, tree));
2386}
2387
2388/*
2389 *  Purge the cache of all cached readdir responses.
2390 */
2391void
2392rddir4_cache_purge(rnode4_t *rp)
2393{
2394	rddir4_cache_impl	*rdip;
2395	rddir4_cache_impl	*nrdip;
2396
2397	ASSERT(MUTEX_HELD(&rp->r_statelock));
2398
2399	if (rp->r_dir == NULL)
2400		return;
2401
2402	rdip = avl_first(rp->r_dir);
2403
2404	while (rdip != NULL) {
2405		nrdip = AVL_NEXT(rp->r_dir, rdip);
2406		avl_remove(rp->r_dir, rdip);
2407		rdip->rc.flags &= ~RDDIRCACHED;
2408		rddir4_cache_rele(rp, &rdip->rc);
2409		rdip = nrdip;
2410	}
2411	ASSERT(avl_numnodes(rp->r_dir) == 0);
2412}
2413
2414/*
2415 * Destroy the readdir cache.
2416 */
2417void
2418rddir4_cache_destroy(rnode4_t *rp)
2419{
2420	ASSERT(MUTEX_HELD(&rp->r_statelock));
2421	if (rp->r_dir == NULL)
2422		return;
2423
2424	rddir4_cache_purge(rp);
2425	avl_destroy(rp->r_dir);
2426	kmem_free(rp->r_dir, sizeof (avl_tree_t));
2427	rp->r_dir = NULL;
2428}
2429
2430/*
2431 * Locate a readdir response from the readdir cache.
2432 *
2433 * Return values:
2434 *
2435 * NULL - If there is an unrecoverable situation like the operation may have
2436 *	  been interrupted.
2437 *
2438 * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
2439 *		    The flags are set approprately, such that the caller knows
2440 *		    what state the entry is in.
2441 */
2442rddir4_cache *
2443rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
2444{
2445	rddir4_cache_impl	*rdip = NULL;
2446	rddir4_cache_impl	srdip;
2447	rddir4_cache		*srdc;
2448	rddir4_cache		*rdc = NULL;
2449	rddir4_cache		*nrdc = NULL;
2450	avl_index_t		where;
2451
2452top:
2453	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2454	ASSERT(MUTEX_HELD(&rp->r_statelock));
2455	/*
2456	 * Check to see if the readdir cache has been disabled.  If so, then
2457	 * simply allocate an rddir4_cache entry and return it, since caching
2458	 * operations do not apply.
2459	 */
2460	if (rp->r_dir == NULL) {
2461		if (nrdc == NULL) {
2462			/*
2463			 * Drop the lock because we are doing a sleeping
2464			 * allocation.
2465			 */
2466			mutex_exit(&rp->r_statelock);
2467			rdc = rddir4_cache_alloc(KM_SLEEP);
2468			rdc->nfs4_cookie = cookie;
2469			rdc->buflen = count;
2470			mutex_enter(&rp->r_statelock);
2471			return (rdc);
2472		}
2473		return (nrdc);
2474	}
2475
2476	srdc = &srdip.rc;
2477	srdc->nfs4_cookie = cookie;
2478	srdc->buflen = count;
2479
2480	rdip = avl_find(rp->r_dir, &srdip, &where);
2481
2482	/*
2483	 * If we didn't find an entry then create one and insert it
2484	 * into the cache.
2485	 */
2486	if (rdip == NULL) {
2487		/*
2488		 * Check for the case where we have made a second pass through
2489		 * the cache due to a lockless allocation.  If we find that no
2490		 * thread has already inserted this entry, do the insert now
2491		 * and return.
2492		 */
2493		if (nrdc != NULL) {
2494			avl_insert(rp->r_dir, nrdc->data, where);
2495			nrdc->flags |= RDDIRCACHED;
2496			rddir4_cache_hold(nrdc);
2497			return (nrdc);
2498		}
2499
2500#ifdef DEBUG
2501		nfs4_readdir_cache_misses++;
2502#endif
2503		/*
2504		 * First, try to allocate an entry without sleeping.  If that
2505		 * fails then drop the lock and do a sleeping allocation.
2506		 */
2507		nrdc = rddir4_cache_alloc(KM_NOSLEEP);
2508		if (nrdc != NULL) {
2509			nrdc->nfs4_cookie = cookie;
2510			nrdc->buflen = count;
2511			avl_insert(rp->r_dir, nrdc->data, where);
2512			nrdc->flags |= RDDIRCACHED;
2513			rddir4_cache_hold(nrdc);
2514			return (nrdc);
2515		}
2516
2517		/*
2518		 * Drop the lock and do a sleeping allocation.	We incur
2519		 * additional overhead by having to search the cache again,
2520		 * but this case should be rare.
2521		 */
2522		mutex_exit(&rp->r_statelock);
2523		nrdc = rddir4_cache_alloc(KM_SLEEP);
2524		nrdc->nfs4_cookie = cookie;
2525		nrdc->buflen = count;
2526		mutex_enter(&rp->r_statelock);
2527		/*
2528		 * We need to take another pass through the cache
2529		 * since we dropped our lock to perform the alloc.
2530		 * Another thread may have come by and inserted the
2531		 * entry we are interested in.
2532		 */
2533		goto top;
2534	}
2535
2536	/*
2537	 * Check to see if we need to free our entry.  This can happen if
2538	 * another thread came along beat us to the insert.  We can
2539	 * safely call rddir4_cache_free directly because no other thread
2540	 * would have a reference to this entry.
2541	 */
2542	if (nrdc != NULL)
2543		rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
2544
2545#ifdef DEBUG
2546	nfs4_readdir_cache_hits++;
2547#endif
2548	/*
2549	 * Found something.  Make sure it's ready to return.
2550	 */
2551	rdc = &rdip->rc;
2552	rddir4_cache_hold(rdc);
2553	/*
2554	 * If the cache entry is in the process of being filled in, wait
2555	 * until this completes.  The RDDIRWAIT bit is set to indicate that
2556	 * someone is waiting and when the thread currently filling the entry
2557	 * is done, it should do a cv_broadcast to wakeup all of the threads
2558	 * waiting for it to finish. If the thread wakes up to find that
2559	 * someone new is now trying to complete the the entry, go back
2560	 * to sleep.
2561	 */
2562	while (rdc->flags & RDDIR) {
2563		/*
2564		 * The entry is not complete.
2565		 */
2566		nfs_rw_exit(&rp->r_rwlock);
2567		rdc->flags |= RDDIRWAIT;
2568#ifdef DEBUG
2569		nfs4_readdir_cache_waits++;
2570#endif
2571		while (rdc->flags & RDDIRWAIT) {
2572			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2573				/*
2574				 * We got interrupted, probably the user
2575				 * typed ^C or an alarm fired.  We free the
2576				 * new entry if we allocated one.
2577				 */
2578				rddir4_cache_rele(rp, rdc);
2579				mutex_exit(&rp->r_statelock);
2580				(void) nfs_rw_enter_sig(&rp->r_rwlock,
2581				    RW_READER, FALSE);
2582				mutex_enter(&rp->r_statelock);
2583				return (NULL);
2584			}
2585		}
2586		mutex_exit(&rp->r_statelock);
2587		(void) nfs_rw_enter_sig(&rp->r_rwlock,
2588		    RW_READER, FALSE);
2589		mutex_enter(&rp->r_statelock);
2590	}
2591
2592	/*
2593	 * The entry we were waiting on may have been purged from
2594	 * the cache and should no longer be used, release it and
2595	 * start over.
2596	 */
2597	if (!(rdc->flags & RDDIRCACHED)) {
2598		rddir4_cache_rele(rp, rdc);
2599		goto top;
2600	}
2601
2602	/*
2603	 * The entry is completed.  Return it.
2604	 */
2605	return (rdc);
2606}
2607
2608/*
2609 * Allocate a cache element and return it.  Can return NULL if memory is
2610 * low.
2611 */
2612static rddir4_cache *
2613rddir4_cache_alloc(int flags)
2614{
2615	rddir4_cache_impl	*rdip = NULL;
2616	rddir4_cache		*rc = NULL;
2617
2618	rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);
2619
2620	if (rdip != NULL) {
2621		rc = &rdip->rc;
2622		rc->data = (void *)rdip;
2623		rc->nfs4_cookie = 0;
2624		rc->nfs4_ncookie = 0;
2625		rc->entries = NULL;
2626		rc->eof = 0;
2627		rc->entlen = 0;
2628		rc->buflen = 0;
2629		rc->actlen = 0;
2630		/*
2631		 * A readdir is required so set the flag.
2632		 */
2633		rc->flags = RDDIRREQ;
2634		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
2635		rc->error = 0;
2636		mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
2637		rdip->count = 1;
2638#ifdef DEBUG
2639		atomic_add_64(&clstat4_debug.dirent.value.ui64, 1);
2640#endif
2641	}
2642	return (rc);
2643}
2644
2645/*
2646 * Increment the reference count to this cache element.
2647 */
2648static void
2649rddir4_cache_hold(rddir4_cache *rc)
2650{
2651	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;
2652
2653	mutex_enter(&rdip->lock);
2654	rdip->count++;
2655	mutex_exit(&rdip->lock);
2656}
2657
2658/*
2659 * Release a reference to this cache element.  If the count is zero then
2660 * free the element.
2661 */
2662void
2663rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
2664{
2665	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;
2666
2667	ASSERT(MUTEX_HELD(&rp->r_statelock));
2668
2669	/*
2670	 * Check to see if we have any waiters.  If so, we can wake them
2671	 * so that they can proceed.
2672	 */
2673	if (rdc->flags & RDDIRWAIT) {
2674		rdc->flags &= ~RDDIRWAIT;
2675		cv_broadcast(&rdc->cv);
2676	}
2677
2678	mutex_enter(&rdip->lock);
2679	ASSERT(rdip->count > 0);
2680	if (--rdip->count == 0) {
2681		mutex_exit(&rdip->lock);
2682		rddir4_cache_free(rdip);
2683	} else
2684		mutex_exit(&rdip->lock);
2685}
2686
2687/*
2688 * Free a cache element.
2689 */
2690static void
2691rddir4_cache_free(rddir4_cache_impl *rdip)
2692{
2693	rddir4_cache *rc = &rdip->rc;
2694
2695#ifdef DEBUG
2696	atomic_add_64(&clstat4_debug.dirent.value.ui64, -1);
2697#endif
2698	if (rc->entries != NULL)
2699		kmem_free(rc->entries, rc->buflen);
2700	cv_destroy(&rc->cv);
2701	mutex_destroy(&rdip->lock);
2702	kmem_free(rdip, sizeof (*rdip));
2703}
2704
2705/*
2706 * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
2707 * framework.
2708 */
2709static int
2710cl4_snapshot(kstat_t *ksp, void *buf, int rw)
2711{
2712	ksp->ks_snaptime = gethrtime();
2713	if (rw == KSTAT_WRITE) {
2714		bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
2715#ifdef DEBUG
2716		/*
2717		 * Currently only the global zone can write to kstats, but we
2718		 * add the check just for paranoia.
2719		 */
2720		if (INGLOBALZONE(curproc))
2721			bcopy((char *)buf + sizeof (clstat4_tmpl),
2722			    &clstat4_debug, sizeof (clstat4_debug));
2723#endif
2724	} else {
2725		bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
2726#ifdef DEBUG
2727		/*
2728		 * If we're displaying the "global" debug kstat values, we
2729		 * display them as-is to all zones since in fact they apply to
2730		 * the system as a whole.
2731		 */
2732		bcopy(&clstat4_debug, (char *)buf + sizeof (clstat4_tmpl),
2733		    sizeof (clstat4_debug));
2734#endif
2735	}
2736	return (0);
2737}
2738
2739
2740
2741/*
2742 * Zone support
2743 */
2744static void *
2745clinit4_zone(zoneid_t zoneid)
2746{
2747	kstat_t *nfs4_client_kstat;
2748	struct nfs4_clnt *nfscl;
2749	uint_t ndata;
2750
2751	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
2752	mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
2753	nfscl->nfscl_chtable4 = NULL;
2754	nfscl->nfscl_zoneid = zoneid;
2755
2756	bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));
2757	ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
2758#ifdef DEBUG
2759	ndata += sizeof (clstat4_debug) / sizeof (kstat_named_t);
2760#endif
2761	if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
2762	    "misc", KSTAT_TYPE_NAMED, ndata,
2763	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
2764		nfs4_client_kstat->ks_private = &nfscl->nfscl_stat;
2765		nfs4_client_kstat->ks_snapshot = cl4_snapshot;
2766		kstat_install(nfs4_client_kstat);
2767	}
2768	mutex_enter(&nfs4_clnt_list_lock);
2769	list_insert_head(&nfs4_clnt_list, nfscl);
2770	mutex_exit(&nfs4_clnt_list_lock);
2771
2772	return (nfscl);
2773}
2774
2775/*ARGSUSED*/
2776static void
2777clfini4_zone(zoneid_t zoneid, void *arg)
2778{
2779	struct nfs4_clnt *nfscl = arg;
2780	chhead_t *chp, *next;
2781
2782	if (nfscl == NULL)
2783		return;
2784	mutex_enter(&nfs4_clnt_list_lock);
2785	list_remove(&nfs4_clnt_list, nfscl);
2786	mutex_exit(&nfs4_clnt_list_lock);
2787	clreclaim4_zone(nfscl, 0);
2788	for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
2789		ASSERT(chp->ch_list == NULL);
2790		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
2791		next = chp->ch_next;
2792		kmem_free(chp, sizeof (*chp));
2793	}
2794	kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
2795	mutex_destroy(&nfscl->nfscl_chtable4_lock);
2796	kmem_free(nfscl, sizeof (*nfscl));
2797}
2798
2799/*
2800 * Called by endpnt_destructor to make sure the client handles are
2801 * cleaned up before the RPC endpoints.  This becomes a no-op if
2802 * clfini_zone (above) is called first.  This function is needed
2803 * (rather than relying on clfini_zone to clean up) because the ZSD
2804 * callbacks have no ordering mechanism, so we have no way to ensure
2805 * that clfini_zone is called before endpnt_destructor.
2806 */
2807void
2808clcleanup4_zone(zoneid_t zoneid)
2809{
2810	struct nfs4_clnt *nfscl;
2811
2812	mutex_enter(&nfs4_clnt_list_lock);
2813	nfscl = list_head(&nfs4_clnt_list);
2814	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
2815		if (nfscl->nfscl_zoneid == zoneid) {
2816			clreclaim4_zone(nfscl, 0);
2817			break;
2818		}
2819	}
2820	mutex_exit(&nfs4_clnt_list_lock);
2821}
2822
2823int
2824nfs4_subr_init(void)
2825{
2826	/*
2827	 * Allocate and initialize the client handle cache
2828	 */
2829	chtab4_cache = kmem_cache_create("client_handle4_cache",
2830	    sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
2831	    NULL, 0);
2832
2833	/*
2834	 * Initialize the list of per-zone client handles (and associated data).
2835	 * This needs to be done before we call zone_key_create().
2836	 */
2837	list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
2838	    offsetof(struct nfs4_clnt, nfscl_node));
2839
2840	/*
2841	 * Initialize the zone_key for per-zone client handle lists.
2842	 */
2843	zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);
2844
2845	if (nfs4err_delay_time == 0)
2846		nfs4err_delay_time = NFS4ERR_DELAY_TIME;
2847
2848	return (0);
2849}
2850
/*
 * Tear down what nfs4_subr_init() set up: destroy the client handle kmem
 * cache and delete the per-zone client zone key.  Always returns 0; the
 * result of zone_key_delete() is deliberately ignored.
 *
 * NOTE(review): clfini4_zone() is registered as the key's destroy callback
 * and calls clreclaim4_zone().  If zone_key_delete() runs that callback and
 * the reclaim path returns handles to chtab4_cache, the cache would already
 * be gone at that point -- confirm the intended ordering.
 */
int
nfs4_subr_fini(void)
{
	/*
	 * Deallocate the client handle cache
	 */
	kmem_cache_destroy(chtab4_cache);

	/*
	 * Destroy the zone_key
	 */
	(void) zone_key_delete(nfs4clnt_zone_key);

	return (0);
}
2866/*
2867 * Set or Clear direct I/O flag
2868 * VOP_RWLOCK() is held for write access to prevent a race condition
2869 * which would occur if a process is in the middle of a write when
2870 * directio flag gets set. It is possible that all pages may not get flushed.
2871 *
2872 * This is a copy of nfs_directio, changes here may need to be made
2873 * there and vice versa.
2874 */
2875
2876int
2877nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
2878{
2879	int	error = 0;
2880	rnode4_t *rp;
2881
2882	rp = VTOR4(vp);
2883
2884	if (cmd == DIRECTIO_ON) {
2885
2886		if (rp->r_flags & R4DIRECTIO)
2887			return (0);
2888
2889		/*
2890		 * Flush the page cache.
2891		 */
2892
2893		(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
2894
2895		if (rp->r_flags & R4DIRECTIO) {
2896			VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2897			return (0);
2898		}
2899
2900		if (nfs4_has_pages(vp) &&
2901		    ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
2902			error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
2903			    B_INVAL, cr, NULL);
2904			if (error) {
2905				if (error == ENOSPC || error == EDQUOT) {
2906					mutex_enter(&rp->r_statelock);
2907					if (!rp->r_error)
2908						rp->r_error = error;
2909					mutex_exit(&rp->r_statelock);
2910				}
2911				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2912				return (error);
2913			}
2914		}
2915
2916		mutex_enter(&rp->r_statelock);
2917		rp->r_flags |= R4DIRECTIO;
2918		mutex_exit(&rp->r_statelock);
2919		VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2920		return (0);
2921	}
2922
2923	if (cmd == DIRECTIO_OFF) {
2924		mutex_enter(&rp->r_statelock);
2925		rp->r_flags &= ~R4DIRECTIO;	/* disable direct mode */
2926		mutex_exit(&rp->r_statelock);
2927		return (0);
2928	}
2929
2930	return (EINVAL);
2931}
2932
2933/*
2934 * Return TRUE if the file has any pages.  Always go back to
2935 * the master vnode to check v_pages since none of the shadows
2936 * can have pages.
2937 */
2938
2939bool_t
2940nfs4_has_pages(vnode_t *vp)
2941{
2942	rnode4_t *rp;
2943
2944	rp = VTOR4(vp);
2945	if (IS_SHADOW(vp, rp))
2946		vp = RTOV4(rp);	/* RTOV4 always gives the master */
2947
2948	return (vn_has_cached_data(vp));
2949}
2950
2951/*
2952 * This table is used to determine whether the client should attempt
2953 * failover based on the clnt_stat value returned by CLNT_CALL.  The
2954 * clnt_stat is used as an index into the table.  If
2955 * the error value that corresponds to the clnt_stat value in the
2956 * table is non-zero, then that is the error to be returned AND
2957 * that signals that failover should be attempted.
2958 *
2959 * Special note: If the RPC_ values change, then direct indexing of the
2960 * table is no longer valid, but having the RPC_ values in the table
2961 * allow the functions to detect the change and issue a warning.
2962 * In this case, the code will always attempt failover as a defensive
2963 * measure.
2964 */
2965
2966static struct try_failover_tab {
2967	enum clnt_stat	cstat;
2968	int		error;
2969} try_failover_table [] = {
2970
2971	RPC_SUCCESS,		0,
2972	RPC_CANTENCODEARGS,	0,
2973	RPC_CANTDECODERES,	0,
2974	RPC_CANTSEND,		ECOMM,
2975	RPC_CANTRECV,		ECOMM,
2976	RPC_TIMEDOUT,		ETIMEDOUT,
2977	RPC_VERSMISMATCH,	0,
2978	RPC_AUTHERROR,		0,
2979	RPC_PROGUNAVAIL,	0,
2980	RPC_PROGVERSMISMATCH,	0,
2981	RPC_PROCUNAVAIL,	0,
2982	RPC_CANTDECODEARGS,	0,
2983	RPC_SYSTEMERROR,	ENOSR,
2984	RPC_UNKNOWNHOST,	EHOSTUNREACH,
2985	RPC_RPCBFAILURE,	ENETUNREACH,
2986	RPC_PROGNOTREGISTERED,	ECONNREFUSED,
2987	RPC_FAILED,		ETIMEDOUT,
2988	RPC_UNKNOWNPROTO,	EHOSTUNREACH,
2989	RPC_INTR,		0,
2990	RPC_UNKNOWNADDR,	EHOSTUNREACH,
2991	RPC_TLIERROR,		0,
2992	RPC_NOBROADCAST,	EHOSTUNREACH,
2993	RPC_N2AXLATEFAILURE,	ECONNREFUSED,
2994	RPC_UDERROR,		0,
2995	RPC_INPROGRESS,		0,
2996	RPC_STALERACHANDLE,	EINVAL,
2997	RPC_CANTCONNECT,	ECONNREFUSED,
2998	RPC_XPRTFAILED,		ECONNABORTED,
2999	RPC_CANTCREATESTREAM,	ECONNREFUSED,
3000	RPC_CANTSTORE,		ENOBUFS
3001};
3002
3003/*
3004 * nfs4_try_failover - determine whether the client should
3005 * attempt failover based on the values stored in the nfs4_error_t.
3006 */
3007int
3008nfs4_try_failover(nfs4_error_t *ep)
3009{
3010	if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
3011		return (TRUE);
3012
3013	if (ep->error && ep->rpc_status != RPC_SUCCESS)
3014		return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);
3015
3016	return (FALSE);
3017}
3018
3019/*
3020 * try_failover - internal version of nfs4_try_failover, called
3021 * only by rfscall and aclcall.  Determine if failover is warranted
3022 * based on the clnt_stat and return the error number if it is.
3023 */
3024static int
3025try_failover(enum clnt_stat rpc_status)
3026{
3027	int err = 0;
3028
3029	if (rpc_status == RPC_SUCCESS)
3030		return (0);
3031
3032#ifdef	DEBUG
3033	if (rpc_status != 0 && nfs4_try_failover_any) {
3034		err = ETIMEDOUT;
3035		goto done;
3036	}
3037#endif
3038	/*
3039	 * The rpc status is used as an index into the table.
3040	 * If the rpc status is outside of the range of the
3041	 * table or if the rpc error numbers have been changed
3042	 * since the table was constructed, then print a warning
3043	 * (DEBUG only) and try failover anyway.  Otherwise, just
3044	 * grab the resulting error number out of the table.
3045	 */
3046	if (rpc_status < RPC_SUCCESS || rpc_status >=
3047	    sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
3048	    try_failover_table[rpc_status].cstat != rpc_status) {
3049
3050		err = ETIMEDOUT;
3051#ifdef	DEBUG
3052		cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
3053		    rpc_status);
3054#endif
3055	} else
3056		err = try_failover_table[rpc_status].error;
3057
3058done:
3059	if (rpc_status)
3060		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
3061		    "nfs4_try_failover: %strying failover on error %d",
3062		    err ? "" : "NOT ", rpc_status));
3063
3064	return (err);
3065}
3066
3067void
3068nfs4_error_zinit(nfs4_error_t *ep)
3069{
3070	ep->error = 0;
3071	ep->stat = NFS4_OK;
3072	ep->rpc_status = RPC_SUCCESS;
3073}
3074
3075void
3076nfs4_error_init(nfs4_error_t *ep, int error)
3077{
3078	ep->error = error;
3079	ep->stat = NFS4_OK;
3080	ep->rpc_status = RPC_SUCCESS;
3081}
3082
3083
3084#ifdef DEBUG
3085
3086/*
3087 * Return a 16-bit hash for filehandle, stateid, clientid, owner.
3088 * use the same algorithm as for NFS v3.
3089 *
3090 */
3091int
3092hash16(void *p, int len)
3093{
3094	int i, rem;
3095	uint_t *wp;
3096	uint_t key = 0;
3097
3098	/* protect against non word aligned */
3099	if ((rem = len & 3) != 0)
3100		len &= ~3;
3101
3102	for (i = 0, wp = (uint_t *)p; i < len; i += 4, wp++) {
3103		key ^= (*wp >> 16) ^ *wp;
3104	}
3105
3106	/* hash left-over bytes */
3107	for (i = 0; i < rem; i++)
3108		key ^= *((uchar_t *)p + i);
3109
3110	return (key & 0xffff);
3111}
3112
3113/*
3114 * rnode4info - return filehandle and path information for an rnode.
3115 * XXX MT issues: uses a single static buffer, no locking of path.
3116 */
3117char *
3118rnode4info(rnode4_t *rp)
3119{
3120	static char buf[80];
3121	nfs4_fhandle_t fhandle;
3122	char *path;
3123	char *type;
3124
3125	if (rp == NULL)
3126		return ("null");
3127	if (rp->r_flags & R4ISXATTR)
3128		type = "attr";
3129	else if (RTOV4(rp)->v_flag & V_XATTRDIR)
3130		type = "attrdir";
3131	else if (RTOV4(rp)->v_flag & VROOT)
3132		type = "root";
3133	else if (RTOV4(rp)->v_type == VDIR)
3134		type = "dir";
3135	else if (RTOV4(rp)->v_type == VREG)
3136		type = "file";
3137	else
3138		type = "other";
3139	sfh4_copyval(rp->r_fh, &fhandle);
3140	path = fn_path(rp->r_svnode.sv_name);
3141	(void) snprintf(buf, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
3142	    (void *)rp, path, type, rp->r_flags,
3143	    hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
3144	kmem_free(path, strlen(path)+1);
3145	return (buf);
3146}
3147#endif
3148