nfs4_callback.c revision 6204:c8334d6e3e0f
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mkdev.h>
#include <sys/mount.h>
#include <sys/statvfs.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/utsname.h>
#include <sys/bootconf.h>
#include <sys/modctl.h>
#include <sys/acl.h>
#include <sys/flock.h>
#include <sys/kstr.h>
#include <sys/stropts.h>
#include <sys/strsubr.h>
#include <sys/atomic.h>
#include <sys/disp.h>
#include <sys/policy.h>
#include <sys/list.h>
#include <sys/zone.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/rpcsec_gss.h>
#include <rpc/clnt.h>
#include <rpc/xdr.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/mount.h>
#include <nfs/nfs_acl.h>

#include <fs/fs_subr.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>
#include <nfs/nfssys.h>

#ifdef	DEBUG
/*
 * These are "special" state IDs and file handles that
 * match any delegation state ID or file handle.  This
 * is for testing purposes only.
 */

stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
nfsstat4 cb4_getattr_fail = NFS4_OK;
nfsstat4 cb4_recall_fail = NFS4_OK;

int nfs4_callback_debug;
int nfs4_recall_debug;
int nfs4_drat_debug;

#endif

#define	CB_NOTE(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
#define	CB_WARN(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
#define	CB_WARN1(x, y)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))

enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;

static zone_key_t nfs4_callback_zone_key;

/*
 * NFS4_MAPSIZE is the number of bytes we are willing to consume
 * for the block allocation map when the server grants an NFS_LIMIT_BLOCK
 * style delegation.
 */

#define	NFS4_MAPSIZE	8192
#define	NFS4_MAPWORDS	(NFS4_MAPSIZE/sizeof (uint_t))
#define	NbPW		(NBBY*sizeof (uint_t))
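
/*
 * Illustrative sketch only (these macros are hypothetical and not used
 * elsewhere in this file): a block-allocation map sized as above would
 * be indexed one bit per block, e.g.
 *
 *	uint_t map[NFS4_MAPWORDS];
 *	#define	MAP_ISSET(map, i)	((map)[(i)/NbPW] & (1U << ((i)%NbPW)))
 *	#define	MAP_SET(map, i)		((map)[(i)/NbPW] |= (1U << ((i)%NbPW)))
 *
 * NFS4_MAPSIZE bytes of map cover NFS4_MAPSIZE * NBBY blocks.
 */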

static int nfs4_num_prognums = 1024;
static SVC_CALLOUT_TABLE nfs4_cb_sct;

struct nfs4_dnode {
	list_node_t	linkage;
	rnode4_t	*rnodep;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
};

static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
	{ "delegations",	KSTAT_DATA_UINT64 },
	{ "cb_getattr",		KSTAT_DATA_UINT64 },
	{ "cb_recall",		KSTAT_DATA_UINT64 },
	{ "cb_null",		KSTAT_DATA_UINT64 },
	{ "cb_dispatch",	KSTAT_DATA_UINT64 },
	{ "delegaccept_r",	KSTAT_DATA_UINT64 },
	{ "delegaccept_rw",	KSTAT_DATA_UINT64 },
	{ "delegreturn",	KSTAT_DATA_UINT64 },
	{ "callbacks",		KSTAT_DATA_UINT64 },
	{ "claim_cur",		KSTAT_DATA_UINT64 },
	{ "claim_cur_ok",	KSTAT_DATA_UINT64 },
	{ "recall_trunc",	KSTAT_DATA_UINT64 },
	{ "recall_failed",	KSTAT_DATA_UINT64 },
	{ "return_limit_write",	KSTAT_DATA_UINT64 },
	{ "return_limit_addmap", KSTAT_DATA_UINT64 },
	{ "deleg_recover",	KSTAT_DATA_UINT64 },
	{ "cb_illegal",		KSTAT_DATA_UINT64 }
};

struct nfs4_cb_port {
	list_node_t		linkage; /* linkage into per-zone port list */
	char			netid[KNC_STRSIZE];
	char			uaddr[KNC_STRSIZE];
	char			protofmly[KNC_STRSIZE];
	char			proto[KNC_STRSIZE];
};
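/*
 * The netid/uaddr pair saved here is what nfs4_cb_args() later hands to
 * the server in SETCLIENTID4args (cb_location.r_netid / r_addr).
 */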

static int cb_getattr_bytes;

struct cb_recall_pass {
	rnode4_t	*rp;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
	bool_t		truncate;
};

static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
static void nfs4delegreturn_thread(struct cb_recall_pass *);
static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
    int);
static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
static int nfs4delegreturn_impl(rnode4_t *, int,
    struct nfs4_callback_globals *);
static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
    struct nfs4_callback_globals *);

static void
cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
	struct compound_state *cs, struct nfs4_callback_globals *ncg)
{
	CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
	CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
	rnode4_t *rp;
	vnode_t *vp;
	bool_t found = FALSE;
	struct nfs4_server *sp;
	struct fattr4 *fap;
	rpc_inline_t *fdata;
	long mapcnt;
	fattr4_change change;
	fattr4_size size;
	uint_t rflag;

	ncg->nfs4_callback_stats.cb_getattr.value.ui64++;

#ifdef DEBUG
	/*
	 * error injection hook: set the cb4_getattr_fail global to
	 * the NFS4 protocol error to be returned
	 */
	if (cb4_getattr_fail != NFS4_OK) {
		*cs->statusp = resp->status = cb4_getattr_fail;
		return;
	}
#endif

	resp->obj_attributes.attrmask = 0;

	mutex_enter(&ncg->nfs4_cb_lock);
	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
	mutex_exit(&ncg->nfs4_cb_lock);

	if (nfs4_server_vlock(sp, 0) == FALSE) {

		CB_WARN("cb_getattr: cannot find server\n");

		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
		return;
	}

	/*
	 * In cb_compound, callback_ident was validated against rq_prog,
	 * but we couldn't verify that it was set to the value we provided
	 * at setclientid time (because we didn't have the server struct
	 * yet).  Now we have the server struct, but don't have
	 * callback_ident handy.  So, validate the server struct's program
	 * number against the request RPC's prog number.  At this point, we
	 * know the RPC prog num is valid (else we wouldn't be here);
	 * however, we don't know that it was the prog number we supplied
	 * to this server at setclientid time.  If the prog numbers aren't
	 * equivalent, then log the problem and fail the request because
	 * either the callback server and/or the callback client is
	 * confused.  This will probably never happen.
	 */
	if (sp->s_program != req->rq_prog) {
#ifdef DEBUG
		zcmn_err(getzoneid(), CE_WARN,
		    "cb_getattr: wrong server program number srv=%d req=%d\n",
		    sp->s_program, req->rq_prog);
#else
		zcmn_err(getzoneid(), CE_WARN,
		    "cb_getattr: wrong server program number\n");
#endif
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
		return;
	}

	/*
	 * Search the delegation list for a matching file handle;
	 * mutex on sp prevents the list from changing.
	 */

	rp = list_head(&sp->s_deleg_list);
	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
		nfs4_fhandle_t fhandle;

		sfh4_copyval(rp->r_fh, &fhandle);

		if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
		    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
		    fhandle.fh_len) == 0)) {

			found = TRUE;
			break;
		}
#ifdef	DEBUG
		if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
		    bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
		    args->fh.nfs_fh4_len) == 0) {

			found = TRUE;
			break;
		}
#endif
	}

	/*
	 * VN_HOLD the vnode before releasing s_lock to guarantee
	 * we have a valid vnode reference.
	 */
	if (found == TRUE) {
		vp = RTOV4(rp);
		VN_HOLD(vp);
	}

	mutex_exit(&sp->s_lock);
	nfs4_server_rele(sp);

	if (found == FALSE) {

		CB_WARN("cb_getattr: bad fhandle\n");

		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
		return;
	}

	/*
	 * Figure out which attributes the server wants.  We only
	 * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
	 */
	fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);

	/*
	 * Don't actually need to create XDR to encode these
	 * simple data structures.
	 * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
	 */
	fap = &resp->obj_attributes;

	fap->attrmask = 0;
	/* attrlist4_len starts at 0 and increases as attrs are processed */
	fap->attrlist4 = (char *)fdata;
	fap->attrlist4_len = 0;

	/* don't supply attrs if request was zero */
	if (args->attr_request != 0) {
		if (args->attr_request & FATTR4_CHANGE_MASK) {
			/*
			 * If the file is mmapped, then increment the change
			 * attribute and return it.  This will guarantee that
			 * the server will perceive that the file has changed
			 * if there is any chance that the client application
			 * has changed it.  Otherwise, just return the change
			 * attribute as it has been updated by nfs4write_deleg.
			 */

			mutex_enter(&rp->r_statelock);
			mapcnt = rp->r_mapcnt;
			rflag = rp->r_flags;
			mutex_exit(&rp->r_statelock);

			mutex_enter(&rp->r_statev4_lock);
			/*
			 * If the object is mapped, then always return the
			 * new change attribute.  Otherwise, return it if the
			 * object has dirty pages.  If the object doesn't
			 * have any dirty pages, then all changes have been
			 * pushed to the server, so reset the change
			 * attribute to the grant change.
			 */
			if (mapcnt)
				rp->r_deleg_change++;
			else if (! (rflag & R4DIRTY))
				rp->r_deleg_change = rp->r_deleg_change_grant;
			change = rp->r_deleg_change;
			mutex_exit(&rp->r_statev4_lock);

			/*
			 * Use inline XDR code directly; we know that we
			 * are going to a memory buffer and it has enough
			 * space, so it cannot fail.
			 */
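			/*
			 * (An XDR "hyper" is two 32-bit XDR units, which is
			 * why attrlist4_len grows by 2 * BYTES_PER_XDR_UNIT
			 * for each 64-bit attribute encoded below.)
			 */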
			IXDR_PUT_U_HYPER(fdata, change);
			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
			fap->attrmask |= FATTR4_CHANGE_MASK;
		}

		if (args->attr_request & FATTR4_SIZE_MASK) {
			/*
			 * Use an atomic add of 0 to fetch a consistent view
			 * of r_size; this avoids having to take rw_lock
			 * which could cause a deadlock.
			 */
			size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);

			/*
			 * Use inline XDR code directly; we know that we
			 * are going to a memory buffer and it has enough
			 * space, so it cannot fail.
			 */
			IXDR_PUT_U_HYPER(fdata, size);
			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
			fap->attrmask |= FATTR4_SIZE_MASK;
		}
	}

	VN_RELE(vp);

	*cs->statusp = resp->status = NFS4_OK;
}

static void
cb_getattr_free(nfs_cb_resop4 *resop)
{
	if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
		kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
		    obj_attributes.attrlist4, cb_getattr_bytes);
}

static void
cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
	struct compound_state *cs, struct nfs4_callback_globals *ncg)
{
	CB_RECALL4args *args = &argop->nfs_cb_argop4_u.opcbrecall;
	CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
	rnode4_t *rp;
	vnode_t *vp;
	struct nfs4_server *sp;
	bool_t found = FALSE;

	ncg->nfs4_callback_stats.cb_recall.value.ui64++;

	ASSERT(req->rq_prog >= NFS4_CALLBACK);
	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);

#ifdef DEBUG
	/*
	 * error injection hook: set the cb4_recall_fail global to
	 * the NFS4 protocol error to be returned
	 */
	if (cb4_recall_fail != NFS4_OK) {
		*cs->statusp = resp->status = cb4_recall_fail;
		return;
	}
#endif

	mutex_enter(&ncg->nfs4_cb_lock);
	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
	mutex_exit(&ncg->nfs4_cb_lock);

	if (nfs4_server_vlock(sp, 0) == FALSE) {

		CB_WARN("cb_recall: cannot find server\n");

		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
		return;
	}

	/*
	 * Search the delegation list for a matching file handle
	 * AND stateid; mutex on sp prevents the list from changing.
	 */

	rp = list_head(&sp->s_deleg_list);
	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
		mutex_enter(&rp->r_statev4_lock);

		/* check both state id and file handle! */

		if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
		    sizeof (stateid4)) == 0)) {
			nfs4_fhandle_t fhandle;

			sfh4_copyval(rp->r_fh, &fhandle);
			if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
			    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
			    fhandle.fh_len) == 0)) {

				found = TRUE;
				break;
			} else {
#ifdef	DEBUG
				CB_WARN("cb_recall: stateid OK, bad fh\n");
#endif
			}
		}
#ifdef	DEBUG
		if (bcmp(&args->stateid, &nfs4_deleg_any,
		    sizeof (stateid4)) == 0) {

			found = TRUE;
			break;
		}
#endif
		mutex_exit(&rp->r_statev4_lock);
	}

	/*
	 * VN_HOLD the vnode before releasing s_lock to guarantee
	 * we have a valid vnode reference.  The async thread will
	 * release the hold when it's done.
	 */
	if (found == TRUE) {
		mutex_exit(&rp->r_statev4_lock);
		vp = RTOV4(rp);
		VN_HOLD(vp);
	}
	mutex_exit(&sp->s_lock);
	nfs4_server_rele(sp);

	if (found == FALSE) {

		CB_WARN("cb_recall: bad stateid\n");

		*cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
		return;
	}

	/* Fire up a thread to do the delegreturn */
	nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
	    args->truncate);

	*cs->statusp = resp->status = 0;
}

/* ARGSUSED */
static void
cb_recall_free(nfs_cb_resop4 *resop)
{
	/* nothing to do here, cb_recall doesn't kmem_alloc */
}

/*
 * This function handles the CB_NULL proc call from an NFSv4 Server.
 *
 * We take note that the server has sent a CB_NULL for later processing
 * in the recovery logic.  It is noted so we may pause slightly after the
 * setclientid and before reopening files.  The pause is to allow the
 * NFSv4 Server time to receive the CB_NULL reply and adjust any of
 * its internal structures such that it has the opportunity to grant
 * delegations to reopened files.
 */

/* ARGSUSED */
static void
cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
    struct nfs4_callback_globals *ncg)
{
	struct nfs4_server *sp;

	ncg->nfs4_callback_stats.cb_null.value.ui64++;

	ASSERT(req->rq_prog >= NFS4_CALLBACK);
	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);

	mutex_enter(&ncg->nfs4_cb_lock);
	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
	mutex_exit(&ncg->nfs4_cb_lock);

	if (nfs4_server_vlock(sp, 0) != FALSE) {
		sp->s_flags |= N4S_CB_PINGED;
		cv_broadcast(&sp->wait_cb_null);
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	}
}

/*
 * cb_illegal	args: void
 *		res : status (NFS4ERR_OP_CB_ILLEGAL)
 */
/* ARGSUSED */
static void
cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
	struct compound_state *cs, struct nfs4_callback_globals *ncg)
{
	CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;

	ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
	resop->resop = OP_CB_ILLEGAL;
	*cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
}

static void
cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
	struct nfs4_callback_globals *ncg)
{
	uint_t i;
	struct compound_state cs;
	nfs_cb_argop4 *argop;
	nfs_cb_resop4 *resop, *new_res;
	uint_t op;

	bzero(&cs, sizeof (cs));
	cs.statusp = &resp->status;
	cs.cont = TRUE;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	resp->tag.utf8string_len = args->tag.utf8string_len;
	resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
	    KM_SLEEP);
	bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
	    args->tag.utf8string_len);

	/*
	 * XXX for now, minorversion should be zero
	 */
	if (args->minorversion != CB4_MINORVERSION) {
		resp->array_len = 0;
		resp->array = NULL;
		resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
		return;
	}

#ifdef DEBUG
	/*
	 * Verify callback_ident.  It doesn't really matter if it's wrong
	 * because we don't really use callback_ident -- we use the prog
	 * number of the RPC request instead.  In this case, just print a
	 * DEBUG console message to reveal brokenness of cbclient (at
	 * bkoff/cthon).
	 */
	if (args->callback_ident != req->rq_prog)
		zcmn_err(getzoneid(), CE_WARN,
		    "cb_compound: cb_client using wrong "
		    "callback_ident(%d), should be %d",
		    args->callback_ident, req->rq_prog);
#endif

	resp->array_len = args->array_len;
	resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
	    KM_SLEEP);

	for (i = 0; i < args->array_len && cs.cont; i++) {

		argop = &args->array[i];
		resop = &resp->array[i];
		resop->resop = argop->argop;
		op = (uint_t)resop->resop;

		switch (op) {

		case OP_CB_GETATTR:

			cb_getattr(argop, resop, req, &cs, ncg);
			break;

		case OP_CB_RECALL:

			cb_recall(argop, resop, req, &cs, ncg);
			break;

		case OP_CB_ILLEGAL:

			/* fall through */

		default:
			/*
			 * Handle OP_CB_ILLEGAL and any undefined opcode.
			 * Currently, the XDR code will return BADXDR
			 * if cb op doesn't decode to legal value, so
			 * it really only handles OP_CB_ILLEGAL.
			 */
			op = OP_CB_ILLEGAL;
			cb_illegal(argop, resop, req, &cs, ncg);
		}

		if (*cs.statusp != NFS4_OK)
			cs.cont = FALSE;

		/*
		 * If not at last op, and if we are to stop, then
		 * compact the results array.
		 */
		if ((i + 1) < args->array_len && !cs.cont) {

			new_res = kmem_alloc(
			    (i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
			bcopy(resp->array,
			    new_res, (i+1) * sizeof (nfs_cb_resop4));
			kmem_free(resp->array,
			    args->array_len * sizeof (nfs_cb_resop4));

			resp->array_len = i + 1;
			resp->array = new_res;
		}
	}
}

static void
cb_compound_free(CB_COMPOUND4res *resp)
{
	uint_t i, op;
	nfs_cb_resop4 *resop;

	if (resp->tag.utf8string_val) {
		UTF8STRING_FREE(resp->tag)
	}

	for (i = 0; i < resp->array_len; i++) {

		resop = &resp->array[i];
		op = (uint_t)resop->resop;

		switch (op) {

		case OP_CB_GETATTR:

			cb_getattr_free(resop);
			break;

		case OP_CB_RECALL:

			cb_recall_free(resop);
			break;

		default:
			break;
		}
	}

	if (resp->array != NULL) {
		kmem_free(resp->array,
		    resp->array_len * sizeof (nfs_cb_resop4));
	}
}

static void
cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
{
	CB_COMPOUND4args args;
	CB_COMPOUND4res res;
	struct nfs4_callback_globals *ncg;

	bool_t (*xdr_args)(), (*xdr_res)();
	void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
	    struct nfs4_callback_globals *);
	void (*freeproc)(CB_COMPOUND4res *);

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;

	switch (req->rq_proc) {
	case CB_NULL:
		xdr_args = xdr_void;
		xdr_res = xdr_void;
		proc = cb_null;
		freeproc = NULL;
		break;

	case CB_COMPOUND:
		xdr_args = xdr_CB_COMPOUND4args_clnt;
		xdr_res = xdr_CB_COMPOUND4res;
		proc = cb_compound;
		freeproc = cb_compound_free;
		break;

	default:
		CB_WARN("cb_dispatch: no proc\n");
		svcerr_noproc(xprt);
		return;
	}

	args.tag.utf8string_val = NULL;
	args.array = NULL;

	if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {

		CB_WARN("cb_dispatch: cannot getargs\n");
		svcerr_decode(xprt);
		return;
	}

	(*proc)(&args, &res, req, ncg);

	if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {

		CB_WARN("cb_dispatch: bad sendreply\n");
		svcerr_systemerr(xprt);
	}

	if (freeproc)
		(*freeproc)(&res);

	if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {

		CB_WARN("cb_dispatch: bad freeargs\n");
	}
}

static rpcprog_t
nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
{
	int i, j;

	j = ncg->nfs4_program_hint;
	for (i = 0; i < nfs4_num_prognums; i++, j++) {

		if (j >= nfs4_num_prognums)
			j = 0;

		if (ncg->nfs4prog2server[j] == NULL) {
			ncg->nfs4_program_hint = j+1;
			return (j+NFS4_CALLBACK);
		}
	}

	return (0);
}

void
nfs4callback_destroy(nfs4_server_t *np)
{
	struct nfs4_callback_globals *ncg;
	int i;

	if (np->s_program == 0)
		return;

	ncg = np->zone_globals;
	i = np->s_program - NFS4_CALLBACK;

	mutex_enter(&ncg->nfs4_cb_lock);

	ASSERT(ncg->nfs4prog2server[i] == np);

	ncg->nfs4prog2server[i] = NULL;

	if (i < ncg->nfs4_program_hint)
		ncg->nfs4_program_hint = i;

	mutex_exit(&ncg->nfs4_cb_lock);
}

/*
 * nfs4_setport - This function saves a netid and universal address for
 * the callback program.  These values will be used during setclientid.
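 *
 * For illustration (hypothetical values, not taken from this file): with
 * an IPv4/TCP transport the pair might be netid "tcp" and universal
 * address "10.0.0.5.8.1", where the trailing "8.1" encodes TCP port
 * 8 * 256 + 1 = 2049 in the standard RPC universal-address form.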
 */
static void
nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
	struct nfs4_callback_globals *ncg)
{
	struct nfs4_cb_port *p;
	bool_t found = FALSE;

	ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));

	p = list_head(&ncg->nfs4_cb_ports);
	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
		if (strcmp(p->netid, netid) == 0) {
			found = TRUE;
			break;
		}
	}
	if (found == TRUE)
		(void) strcpy(p->uaddr, uaddr);
	else {
		p = kmem_alloc(sizeof (*p), KM_SLEEP);

		(void) strcpy(p->uaddr, uaddr);
		(void) strcpy(p->netid, netid);
		(void) strcpy(p->protofmly, protofmly);
		(void) strcpy(p->proto, proto);
		list_insert_head(&ncg->nfs4_cb_ports, p);
	}
}

/*
 * nfs4_cb_args - This function is used to construct the callback
 * portion of the arguments needed for setclientid.
 */

void
nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
{
	struct nfs4_cb_port *p;
	bool_t found = FALSE;
	rpcprog_t pgm;
	struct nfs4_callback_globals *ncg = np->zone_globals;

	/*
	 * This server structure may already have a program number
	 * assigned to it.  This happens when the client has to
	 * re-issue SETCLIENTID.  Just re-use the information.
	 */
	if (np->s_program >= NFS4_CALLBACK &&
	    np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
		nfs4callback_destroy(np);

	mutex_enter(&ncg->nfs4_cb_lock);

	p = list_head(&ncg->nfs4_cb_ports);
	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
		if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
		    strcmp(p->proto, knc->knc_proto) == 0) {
			found = TRUE;
			break;
		}
	}

	if (found == FALSE) {

		NFS4_DEBUG(nfs4_callback_debug,
		    (CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
		    knc->knc_protofmly, knc->knc_proto));

		args->callback.cb_program = 0;
		args->callback.cb_location.r_netid = NULL;
		args->callback.cb_location.r_addr = NULL;
		args->callback_ident = 0;
		mutex_exit(&ncg->nfs4_cb_lock);
		return;
	}

	if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
		CB_WARN("nfs4_cb_args: out of program numbers\n");

		args->callback.cb_program = 0;
		args->callback.cb_location.r_netid = NULL;
		args->callback.cb_location.r_addr = NULL;
		args->callback_ident = 0;
		mutex_exit(&ncg->nfs4_cb_lock);
		return;
	}

	ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
	args->callback.cb_program = pgm;
	args->callback.cb_location.r_netid = p->netid;
	args->callback.cb_location.r_addr = p->uaddr;
	args->callback_ident = pgm;

	np->s_program = pgm;

	mutex_exit(&ncg->nfs4_cb_lock);
}

static int
nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
{
	file_t *fp;
	vnode_t *vp;
	rnode4_t *rp;
	int error;
	STRUCT_HANDLE(nfs4_svc_args, uap);

	STRUCT_SET_HANDLE(uap, model, arg);

	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
		return (EBADF);

	vp = fp->f_vnode;

	if (vp == NULL || vp->v_type != VREG ||
	    !vn_matchops(vp, nfs4_vnodeops)) {
		releasef(STRUCT_FGET(uap, fd));
		return (EBADF);
	}

	rp = VTOR4(vp);

	/*
	 * I can't convince myself that we need locking here.  The
	 * rnode cannot disappear and the value returned is instantly
	 * stale anyway, so why bother?
	 */

	error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
	releasef(STRUCT_FGET(uap, fd));
	return (error);
}


/*
 * NFS4 client system call.  This service does the
 * necessary initialization for the callback program.
 * This is fashioned after the server side interaction
 * between nfsd and the kernel.  On the client, the
 * mount command forks and the child process does the
 * necessary interaction with the kernel.
 *
 * uap->fd is the fd of an open transport provider
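 *
 * A rough sketch of the expected userland sequence (illustrative only;
 * the transport setup details are assumptions, not defined by this file):
 *
 *	fd = t_open("/dev/tcp", O_RDWR, NULL);	 open a transport provider
 *	t_bind(fd, &req, &ret);			 bind a callback address
 *	args.fd = fd;
 *	args.cmd = NFS4_SETPORT | NFS4_KRPC_START;
 *	nfssys(NFS4_SVC, &args);		 enters this function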
 */
int
nfs4_svc(struct nfs4_svc_args *arg, model_t model)
{
	file_t *fp;
	int error;
	int readsize;
	char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
	char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
	size_t len;
	STRUCT_HANDLE(nfs4_svc_args, uap);
	struct netbuf addrmask;
	int cmd;
	SVCMASTERXPRT *cb_xprt;
	struct nfs4_callback_globals *ncg;

#ifdef lint
	model = model;		/* STRUCT macros don't always refer to it */
#endif

	STRUCT_SET_HANDLE(uap, model, arg);

	if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
		return (nfs4_dquery(arg, model));

	if (secpolicy_nfs(CRED()) != 0)
		return (EPERM);

	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
		return (EBADF);

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
	if (readsize < RPC_MAXDATASIZE)
		readsize = RPC_MAXDATASIZE;

	error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
	    KNC_STRSIZE, &len);
	if (error) {
		releasef(STRUCT_FGET(uap, fd));
		return (error);
	}

	cmd = STRUCT_FGET(uap, cmd);

	if (cmd & NFS4_KRPC_START) {
		addrmask.len = STRUCT_FGET(uap, addrmask.len);
		addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
		addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
		error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
		    addrmask.len);
		if (error) {
			releasef(STRUCT_FGET(uap, fd));
			kmem_free(addrmask.buf, addrmask.maxlen);
			return (error);
		}
	} else
		addrmask.buf = NULL;

	error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
	    sizeof (uaddr), &len);
	if (error) {
		releasef(STRUCT_FGET(uap, fd));
		if (addrmask.buf)
			kmem_free(addrmask.buf, addrmask.maxlen);
		return (error);
	}

	error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
	    sizeof (protofmly), &len);
	if (error) {
		releasef(STRUCT_FGET(uap, fd));
		if (addrmask.buf)
			kmem_free(addrmask.buf, addrmask.maxlen);
		return (error);
	}

	error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
	    sizeof (proto), &len);
	if (error) {
		releasef(STRUCT_FGET(uap, fd));
		if (addrmask.buf)
			kmem_free(addrmask.buf, addrmask.maxlen);
		return (error);
	}

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	mutex_enter(&ncg->nfs4_cb_lock);
	if (cmd & NFS4_SETPORT)
		nfs4_setport(buf, uaddr, protofmly, proto, ncg);

	if (cmd & NFS4_KRPC_START) {
		error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
		    &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
		if (error) {
			CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
			    error);
			kmem_free(addrmask.buf, addrmask.maxlen);
		}
	}

	mutex_exit(&ncg->nfs4_cb_lock);
	releasef(STRUCT_FGET(uap, fd));
	return (error);
}

struct nfs4_callback_globals *
nfs4_get_callback_globals(void)
{
	return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
}

static void *
nfs4_callback_init_zone(zoneid_t zoneid)
{
	kstat_t *nfs4_callback_kstat;
	struct nfs4_callback_globals *ncg;

	ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);

	ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
	    sizeof (struct nfs4_server *), KM_SLEEP);

	/* initialize the dlist */
	mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
	    offsetof(struct nfs4_dnode, linkage));

	/* initialize cb_port list */
	mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
	    offsetof(struct nfs4_cb_port, linkage));

	/* get our own copy of the kstats */
	bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
	    sizeof (nfs4_callback_stats_tmpl));
	/* register "nfs:0:nfs4_callback_stats" for this zone */
	if ((nfs4_callback_kstat =
	    kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
	    zoneid)) != NULL) {
		nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
		kstat_install(nfs4_callback_kstat);
	}
	return (ncg);
}

static void
nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
{
	nfs4_server_t *sp;
	int i, num_removed;

	/*
	 * It's OK here to just run through the registered "programs", as
	 * servers without programs won't have any delegations to handle.
	 */
	for (i = 0; i < nfs4_num_prognums; i++) {
		rnode4_t *rp;

		mutex_enter(&ncg->nfs4_cb_lock);
		sp = ncg->nfs4prog2server[i];
		mutex_exit(&ncg->nfs4_cb_lock);

		if (nfs4_server_vlock(sp, 1) == FALSE)
			continue;
		num_removed = 0;
		while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
				/*
				 * We need to take matters into our own hands,
				 * as nfs4delegreturn_cleanup_impl() won't
				 * remove this from the list.
				 */
				list_remove(&sp->s_deleg_list, rp);
				mutex_exit(&rp->r_statev4_lock);
				nfs4_dec_state_ref_count_nolock(sp,
				    VTOMI4(RTOV4(rp)));
				num_removed++;
				continue;
			}
			mutex_exit(&rp->r_statev4_lock);
			VN_HOLD(RTOV4(rp));
			mutex_exit(&sp->s_lock);
			/*
			 * The following will remove the node from the list.
			 */
			nfs4delegreturn_cleanup_impl(rp, sp, ncg);
			VN_RELE(RTOV4(rp));
			mutex_enter(&sp->s_lock);
		}
		mutex_exit(&sp->s_lock);
		/* each removed list node reles a reference */
		while (num_removed-- > 0)
			nfs4_server_rele(sp);
		/* remove our reference for nfs4_server_vlock */
		nfs4_server_rele(sp);
	}
}

/* ARGSUSED */
static void
nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
{
	struct nfs4_callback_globals *ncg = data;

	/*
	 * Clean pending delegation return list.
	 */
	nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);

	/*
	 * Discard all delegations.
	 */
	nfs4_discard_delegations(ncg);
}

static void
nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
{
	struct nfs4_callback_globals *ncg = data;
	struct nfs4_cb_port *p;
	nfs4_server_t *sp, *next;
	nfs4_server_t freelist;
	int i;

	kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);

	/*
	 * Discard all delegations that may have crept in since we did the
	 * _shutdown.
	 */
	nfs4_discard_delegations(ncg);
	/*
	 * We're completely done with this zone and all associated
	 * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
	 * more reference outstanding -- the reference we didn't release in
	 * nfs4_renew_lease_thread().
	 *
	 * Here we need to run through the global nfs4_server_lst as we need to
	 * deal with nfs4_server_ts without programs, as they also have threads
	 * created for them, and so have outstanding references that we need to
	 * release.
	 */
	freelist.forw = &freelist;
	freelist.back = &freelist;
	mutex_enter(&nfs4_server_lst_lock);
	sp = nfs4_server_lst.forw;
	while (sp != &nfs4_server_lst) {
		next = sp->forw;
		if (sp->zoneid == zoneid) {
			remque(sp);
			insque(sp, &freelist);
		}
		sp = next;
	}
	mutex_exit(&nfs4_server_lst_lock);

	sp = freelist.forw;
	while (sp != &freelist) {
		next = sp->forw;
		nfs4_server_rele(sp);	/* free the list's reference */
		sp = next;
	}

#ifdef DEBUG
	for (i = 0; i < nfs4_num_prognums; i++) {
		ASSERT(ncg->nfs4prog2server[i] == NULL);
	}
#endif
	kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
	    sizeof (struct nfs4_server *));

	mutex_enter(&ncg->nfs4_cb_lock);
	while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
		list_remove(&ncg->nfs4_cb_ports, p);
		kmem_free(p, sizeof (*p));
	}
	list_destroy(&ncg->nfs4_cb_ports);
	mutex_destroy(&ncg->nfs4_cb_lock);
	list_destroy(&ncg->nfs4_dlist);
	mutex_destroy(&ncg->nfs4_dlist_lock);
	kmem_free(ncg, sizeof (*ncg));
}

void
nfs4_callback_init(void)
{
	int i;
	SVC_CALLOUT *nfs4_cb_sc;

	/* initialize the callback table */
	nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
	    sizeof (SVC_CALLOUT), KM_SLEEP);

	for (i = 0; i < nfs4_num_prognums; i++) {
		nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
		nfs4_cb_sc[i].sc_versmin = NFS_CB;
		nfs4_cb_sc[i].sc_versmax = NFS_CB;
		nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
	}

	nfs4_cb_sct.sct_size = nfs4_num_prognums;
	nfs4_cb_sct.sct_free = FALSE;
	nfs4_cb_sct.sct_sc = nfs4_cb_sc;

	/*
	 * Compute the max bytes required for the dynamically allocated parts
	 * of the cb_getattr reply.  Only size and change are supported now.
	 * If CB_GETATTR is changed to reply with additional attrs,
	 * additional sizes must be added below.
	 *
	 * fattr4_change + fattr4_size == uint64_t + uint64_t
	 */
	cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;

	zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
	    nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
}

void
nfs4_callback_fini(void)
{
}

/*
 * NB: This function can be called from the *wrong* zone (i.e., the zone that
 * 'rp' belongs to and the caller's zone may not be the same).  This can happen
 * if the zone is going away and we get called from nfs4_async_inactive().  In
 * this case the globals will be NULL and we won't update the counters, which
 * doesn't matter as the zone is going away anyhow.
 */
static void
nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
	struct nfs4_callback_globals *ncg)
{
	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
	boolean_t need_rele = B_FALSE;

	/*
	 * Caller must be holding mi_recovlock in read mode
	 * to call here.  This is provided by start_op.
	 * Delegation management requires us to grab s_lock
	 * first and then r_statev4_lock.
	 */

	if (np == NULL) {
		np = find_nfs4_server_all(mi, 1);
		ASSERT(np != NULL);
		need_rele = B_TRUE;
	} else {
		mutex_enter(&np->s_lock);
	}

	mutex_enter(&rp->r_statev4_lock);

	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&np->s_lock);
		if (need_rele)
			nfs4_server_rele(np);
		return;
	}

	/*
	 * Free the cred originally held when
	 * the delegation was granted.  Caller must
	 * hold this cred if it wants to use it after
	 * this call.
	 */
	crfree(rp->r_deleg_cred);
	rp->r_deleg_cred = NULL;
	rp->r_deleg_type = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recall = FALSE;
	rp->r_deleg_return_pending = FALSE;

	/*
	 * Remove the rnode from the server's list and
	 * update the ref counts.
	 */
	list_remove(&np->s_deleg_list, rp);
	mutex_exit(&rp->r_statev4_lock);
	nfs4_dec_state_ref_count_nolock(np, mi);
	mutex_exit(&np->s_lock);
	/* removed list node removes a reference */
	nfs4_server_rele(np);
	if (need_rele)
		nfs4_server_rele(np);
	if (ncg != NULL)
		ncg->nfs4_callback_stats.delegations.value.ui64--;
}

void
nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
{
	struct nfs4_callback_globals *ncg;

	if (np != NULL) {
		ncg = np->zone_globals;
	} else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
		ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
		ASSERT(ncg != NULL);
	} else {
		/*
		 * Request coming from the wrong zone.
		 */
		ASSERT(getzoneid() == GLOBAL_ZONEID);
		ncg = NULL;
	}

	nfs4delegreturn_cleanup_impl(rp, np, ncg);
}

static void
nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
	cred_t *cr, vnode_t *vp)
{
	if (error != ETIMEDOUT && error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
		lost_rqstp->lr_op = 0;
		return;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4delegreturn_save_lost_rqst: error %d", error));

	lost_rqstp->lr_op = OP_DELEGRETURN;
	/*
	 * The vp is held and rele'd via the recovery code.
	 * See nfs4_save_lost_rqst.
	 */
	lost_rqstp->lr_vp = vp;
	lost_rqstp->lr_dvp = NULL;
	lost_rqstp->lr_oop = NULL;
	lost_rqstp->lr_osp = NULL;
	lost_rqstp->lr_lop = NULL;
	lost_rqstp->lr_cr = cr;
	lost_rqstp->lr_flk = NULL;
	lost_rqstp->lr_putfirst = FALSE;
}

static void
nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argops[3];
	nfs4_ga_res_t *garp = NULL;
	hrtime_t t;
	int numops;
	int doqueue = 1;

	args.ctag = TAG_DELEGRETURN;

	numops = 3;		/* PUTFH, GETATTR, DELEGRETURN */

	args.array = argops;
	args.array_len = numops;

	argops[0].argop = OP_CPUTFH;
	argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argops[1].argop = OP_GETATTR;
	argops[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));

	argops[2].argop = OP_DELEGRETURN;
	argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
	    rp->r_deleg_stateid;

	t = gethrtime();
	rfs4call(VTOMI4(RTOV4(rp)), &args, &res, cr, &doqueue, 0, ep);

	if (ep->error)
		return;

	if (res.status == NFS4_OK) {
		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
		nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}

int
nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
	struct nfs4_callback_globals *ncg)
{
	vnode_t *vp = RTOV4(rp);
	mntinfo4_t *mi = VTOMI4(vp);
	nfs4_lost_rqst_t lost_rqst;
	nfs4_recov_state_t recov_state;
	bool_t needrecov = FALSE, recovonly, done = FALSE;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ncg->nfs4_callback_stats.delegreturn.value.ui64++;

	while (!done) {
		e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
		    &recov_state, &recovonly);

		if (e.error) {
			if (flags & NFS4_DR_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, 0);
				nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
				nfs_rw_exit(&mi->mi_recovlock);
			}
			break;
		}

		/*
		 * Check to see if the delegation has already been
		 * returned by the recovery thread.  The state of
		 * the delegation cannot change at this point due
		 * to start_fop and the r_deleg_recall_lock.
		 */
		if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
			e.error = 0;
			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
			break;
		}

		if (recovonly) {
			/*
			 * Delegation will be returned via the
			 * recovery framework.  Build a lost request
			 * structure, start recovery and get out.
			 */
			nfs4_error_init(&e, EINTR);
			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
			    cr, vp);
			(void) nfs4_start_recovery(&e, mi, vp,
			    NULL, &rp->r_deleg_stateid,
			    lost_rqst.lr_op == OP_DELEGRETURN ?
			    &lost_rqst : NULL, OP_DELEGRETURN, NULL);
			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
			break;
		}

		nfs4delegreturn_otw(rp, cr, &e);

		/*
		 * Ignore some errors on delegreturn; no point in marking
		 * the file dead on a state destroying operation.
		 */
		if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
		    e.stat == NFS4ERR_BADHANDLE ||
		    e.stat == NFS4ERR_STALE))
			needrecov = FALSE;
		else
			needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);

		if (needrecov) {
			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
			    cr, vp);
			(void) nfs4_start_recovery(&e, mi, vp,
			    NULL, &rp->r_deleg_stateid,
			    lost_rqst.lr_op == OP_DELEGRETURN ?
			    &lost_rqst : NULL, OP_DELEGRETURN, NULL);
		} else {
			nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
			done = TRUE;
		}

		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
	}
	return (e.error);
}

/*
 * nfs4_resend_delegreturn - used to drive the delegreturn
 * operation via the recovery thread.
 */
void
nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
	nfs4_server_t *np)
{
	rnode4_t *rp = VTOR4(lorp->lr_vp);

	/* If the file failed recovery, just quit. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		ep->error = EIO;
	}
	mutex_exit(&rp->r_statelock);

	if (!ep->error)
		nfs4delegreturn_otw(rp, lorp->lr_cr, ep);

	/*
	 * If recovery is now needed, then return the error
	 * and status and let the recovery thread handle it,
	 * including re-driving another delegreturn.  Otherwise,
	 * just give up and clean up the delegation.
	 */
	if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
		return;

	if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
		nfs4delegreturn_cleanup(rp, np);

	nfs4_error_zinit(ep);
}

/*
 * nfs4delegreturn - general function to return a delegation.
 *
 * NFS4_DR_FORCE - return the delegation even if start_op fails
 * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
 * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
 * NFS4_DR_DID_OP - calling function already did nfs4_start_op
 * NFS4_DR_RECALL - delegreturn initiated via CB_RECALL
 * NFS4_DR_REOPEN - do file reopens, if applicable
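 *
 * The flags may be combined; for example, cb_recall() returns a recalled
 * delegation with (NFS4_DR_RECALL | NFS4_DR_REOPEN), and
 * delegreturn_all_thread() uses (NFS4_DR_PUSH | NFS4_DR_REOPEN).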
1576 */
1577static int
1578nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
1579{
1580	int error = 0;
1581	cred_t *cr = NULL;
1582	vnode_t *vp;
1583	bool_t needrecov = FALSE;
1584	bool_t rw_entered = FALSE;
1585	bool_t do_reopen;
1586
1587	vp = RTOV4(rp);
1588
1589	/*
1590	 * If NFS4_DR_DISCARD is set by itself, take a short-cut and
1591	 * discard without doing an otw DELEGRETURN.  This may only be used
1592	 * by the recovery thread because it bypasses the synchronization
1593	 * with r_deleg_recall_lock and mi->mi_recovlock.
1594	 */
1595	if (flags == NFS4_DR_DISCARD) {
1596		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1597		return (0);
1598	}
1599
1600	if (flags & NFS4_DR_DID_OP) {
1601		/*
1602		 * Caller had already done start_op, which means the
1603		 * r_deleg_recall_lock is already held in READ mode
1604		 * so we cannot take it in write mode.  Return the
1605		 * delegation asynchronously.
1606		 *
1607		 * Remove the NFS4_DR_DID_OP flag so we don't
1608		 * get stuck looping through here.
1609		 */
1610		VN_HOLD(vp);
1611		nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
1612		return (0);
1613	}
1614
1615	/*
1616	 * Verify we still have a delegation and crhold the credential.
1617	 */
1618	mutex_enter(&rp->r_statev4_lock);
1619	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1620		mutex_exit(&rp->r_statev4_lock);
1621		goto out;
1622	}
1623	cr = rp->r_deleg_cred;
1624	ASSERT(cr != NULL);
1625	crhold(cr);
1626	mutex_exit(&rp->r_statev4_lock);
1627
1628	/*
1629	 * Push the modified data back to the server synchronously
1630	 * before doing DELEGRETURN.
1631	 */
1632	if (flags & NFS4_DR_PUSH)
1633		(void) VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
1634
1635	/*
1636	 * Take r_deleg_recall_lock in WRITE mode, this will prevent
1637	 * nfs4_is_otw_open_necessary from trying to use the delegation
1638	 * while the DELEGRETURN is in progress.
1639	 */
1640	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
1641
1642	rw_entered = TRUE;
1643
1644	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
1645		goto out;
1646
1647	if (flags & NFS4_DR_REOPEN) {
1648		/*
1649		 * If R4RECOVERRP is already set, then skip re-opening
1650		 * the delegation open streams and go straight to doing
1651		 * delegreturn.  (XXX if the file has failed recovery, then the
1652		 * delegreturn attempt is likely to be futile.)
1653		 */
1654		mutex_enter(&rp->r_statelock);
1655		do_reopen = !(rp->r_flags & R4RECOVERRP);
1656		mutex_exit(&rp->r_statelock);
1657
1658		if (do_reopen) {
1659			error = deleg_reopen(vp, &needrecov, ncg, flags);
1660			if (error != 0) {
1661				if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
1662				    == 0)
1663					goto out;
1664			} else if (needrecov) {
1665				if ((flags & NFS4_DR_FORCE) == 0)
1666					goto out;
1667			}
1668		}
1669	}
1670
1671	if (flags & NFS4_DR_DISCARD) {
1672		mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1673
1674		mutex_enter(&rp->r_statelock);
1675		/*
1676		 * deleg_return_pending is cleared inside of delegation_accept
1677		 * when a delegation is accepted.  if this flag has been
1678		 * cleared, then a new delegation has overwritten the one we
1679		 * were about to throw away.
1680		 */
1681		if (!rp->r_deleg_return_pending) {
1682			mutex_exit(&rp->r_statelock);
1683			goto out;
1684		}
1685		mutex_exit(&rp->r_statelock);
1686		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
1687		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1688		nfs_rw_exit(&mi->mi_recovlock);
1689	} else {
1690		error = nfs4_do_delegreturn(rp, flags, cr, ncg);
1691	}
1692
1693out:
1694	if (cr)
1695		crfree(cr);
1696	if (rw_entered)
1697		nfs_rw_exit(&rp->r_deleg_recall_lock);
1698	return (error);
1699}
1700
1701int
1702nfs4delegreturn(rnode4_t *rp, int flags)
1703{
1704	struct nfs4_callback_globals *ncg;
1705
1706	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1707	ASSERT(ncg != NULL);
1708
1709	return (nfs4delegreturn_impl(rp, flags, ncg));
1710}
1711
1712void
1713nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
1714{
1715	struct cb_recall_pass *pp;
1716
1717	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
1718	pp->rp = rp;
1719	pp->flags = flags;
1720	pp->truncate = trunc;
1721
1722	/*
1723	 * Fire up a thread to do the actual delegreturn
1724	 * Caller must guarantee that the rnode doesn't
1725	 * vanish (by calling VN_HOLD).
1726	 */
1727
1728	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
1729	    minclsyspri);
1730}
1731
1732static void
1733delegreturn_all_thread(rpcprog_t *pp)
1734{
1735	nfs4_server_t *np;
1736	bool_t found = FALSE;
1737	rpcprog_t prog;
1738	rnode4_t *rp;
1739	vnode_t *vp;
1740	zoneid_t zoneid = getzoneid();
1741	struct nfs4_callback_globals *ncg;
1742
1743	NFS4_DEBUG(nfs4_drat_debug,
1744	    (CE_NOTE, "delereturn_all_thread: prog %d\n", *pp));
1745
1746	prog = *pp;
1747	kmem_free(pp, sizeof (*pp));
1748	pp = NULL;
1749
1750	mutex_enter(&nfs4_server_lst_lock);
1751	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
1752		if (np->zoneid == zoneid && np->s_program == prog) {
1753			mutex_enter(&np->s_lock);
1754			found = TRUE;
1755			break;
1756		}
1757	}
1758	mutex_exit(&nfs4_server_lst_lock);
1759
1760	/*
1761	 * It's possible that the nfs4_server which was using this
1762	 * program number has vanished since this thread is async.
1763	 * If so, just return.  Your work here is finished, my friend.
1764	 */
1765	if (!found)
1766		goto out;
1767
1768	ncg = np->zone_globals;
1769	while ((rp = list_head(&np->s_deleg_list)) != NULL) {
1770		vp = RTOV4(rp);
1771		VN_HOLD(vp);
1772		mutex_exit(&np->s_lock);
1773		(void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
1774		    ncg);
1775		VN_RELE(vp);
1776
1777		/* retake the s_lock for next trip through the loop */
1778		mutex_enter(&np->s_lock);
1779	}
1780	mutex_exit(&np->s_lock);
1781out:
1782	NFS4_DEBUG(nfs4_drat_debug,
1783	    (CE_NOTE, "delereturn_all_thread: complete\n"));
1784	zthread_exit();
1785}
1786
1787void
1788nfs4_delegreturn_all(nfs4_server_t *sp)
1789{
1790	rpcprog_t pro, *pp;
1791
1792	mutex_enter(&sp->s_lock);
1793
1794	/* Check to see if the delegation list is empty */
1795
1796	if (list_head(&sp->s_deleg_list) == NULL) {
1797		mutex_exit(&sp->s_lock);
1798		return;
1799	}
1800	/*
1801	 * Grab the program number; the async thread will use this
1802	 * to find the nfs4_server.
1803	 */
1804	pro = sp->s_program;
1805	mutex_exit(&sp->s_lock);
1806	pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
1807	*pp = pro;
1808	(void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
1809	    minclsyspri);
1810}
1811
1812
1813/*
1814 * Discard any delegations
1815 *
1816 * Iterate over the servers s_deleg_list and
1817 * for matching mount-point rnodes discard
1818 * the delegation.
1819 */
1820void
1821nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
1822{
1823	rnode4_t *rp, *next;
1824	mntinfo4_t *r_mi;
1825	struct nfs4_callback_globals *ncg;
1826
1827	ASSERT(mutex_owned(&sp->s_lock));
1828	ncg = sp->zone_globals;
1829
1830	for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
1831		r_mi = VTOMI4(RTOV4(rp));
1832		next = list_next(&sp->s_deleg_list, rp);
1833
1834		if (r_mi != mi) {
1835			/*
1836			 * Skip if this rnode is in not on the
1837			 * same mount-point
1838			 */
1839			continue;
1840		}
1841
1842		ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);
1843
1844#ifdef DEBUG
1845		if (nfs4_client_recov_debug) {
1846			zprintf(getzoneid(),
1847			    "nfs4_deleg_discard: matched rnode %p "
1848			"-- discarding delegation\n", (void *)rp);
1849		}
1850#endif
1851		mutex_enter(&rp->r_statev4_lock);
1852		/*
1853		 * Free the cred originally held when the delegation
1854		 * was granted. Also need to decrement the refcnt
1855		 * on this server for each delegation we discard
1856		 */
1857		if (rp->r_deleg_cred)
1858			crfree(rp->r_deleg_cred);
1859		rp->r_deleg_cred = NULL;
1860		rp->r_deleg_type = OPEN_DELEGATE_NONE;
1861		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1862		rp->r_deleg_needs_recall = FALSE;
1863		ASSERT(sp->s_refcnt > 1);
1864		sp->s_refcnt--;
1865		list_remove(&sp->s_deleg_list, rp);
1866		mutex_exit(&rp->r_statev4_lock);
1867		nfs4_dec_state_ref_count_nolock(sp, mi);
1868		ncg->nfs4_callback_stats.delegations.value.ui64--;
1869	}
1870}
1871
1872/*
1873 * Reopen any open streams that were covered by the given file's
1874 * delegation.
1875 * Returns zero or an errno value.  If there was no error, *recovp
1876 * indicates whether recovery was initiated.
1877 */
1878
1879static int
1880deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
1881	int flags)
1882{
1883	nfs4_open_stream_t *osp;
1884	nfs4_recov_state_t recov_state;
1885	bool_t needrecov = FALSE;
1886	mntinfo4_t *mi;
1887	rnode4_t *rp;
1888	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1889	int claimnull;
1890
1891	mi = VTOMI4(vp);
1892	rp = VTOR4(vp);
1893
1894	recov_state.rs_flags = 0;
1895	recov_state.rs_num_retry_despite_err = 0;
1896
1897retry:
1898	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
1899		return (e.error);
1900	}
1901
1902	/*
1903	 * if we mean to discard the delegation, it must be BAD, so don't
1904	 * use it when doing the reopen or it will fail too.
1905	 */
1906	claimnull = (flags & NFS4_DR_DISCARD);
1907	/*
1908	 * Loop through the open streams for this rnode to find
1909	 * all of the ones created using the delegation state ID.
1910	 * Each of these needs to be re-opened.
1911	 */
1912
1913	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
1914
1915		if (claimnull) {
1916			nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
1917		} else {
1918			ncg->nfs4_callback_stats.claim_cur.value.ui64++;
1919
1920			nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
1921			    FALSE);
1922			if (e.error == 0 && e.stat == NFS4_OK)
1923				ncg->nfs4_callback_stats.
1924				    claim_cur_ok.value.ui64++;
1925		}
1926
1927		if (e.error == EAGAIN) {
1928			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
1929			goto retry;
1930		}
1931
1932		/*
1933		 * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
1934		 * recovery has already been started inside of nfs4_reopen.
1935		 */
1936		if (e.error == EINTR || e.error == ETIMEDOUT ||
1937		    NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
1938			open_stream_rele(osp, rp);
1939			break;
1940		}
1941
1942		needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1943
1944		if (e.error != 0 && !needrecov) {
1945			/*
1946			 * Recovery is not possible, but don't give up yet;
1947			 * we'd still like to do delegreturn after
1948			 * reopening as many streams as possible.
1949			 * Continue processing the open streams.
1950			 */
1951
1952			ncg->nfs4_callback_stats.recall_failed.value.ui64++;
1953
1954		} else if (needrecov) {
1955			/*
1956			 * Start recovery and bail out.  The recovery
1957			 * thread will take it from here.
1958			 */
1959			(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
1960			    NULL, OP_OPEN, NULL);
1961			open_stream_rele(osp, rp);
1962			*recovp = TRUE;
1963			break;
1964		}
1965
1966		open_stream_rele(osp, rp);
1967	}
1968
1969	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1970
1971	return (e.error);
1972}
1973
1974/*
1975 * get_next_deleg_stream - returns the next open stream which
1976 * represents a delegation for this rnode.  In order to ensure
1977 * forward progress, the caller must guarantee that each open
1978 * stream returned is changed so that a future call won't return
1979 * it again.
1980 *
1981 * There are several ways for the open stream to change.  If the open
1982 * stream is !os_delegation, then we aren't interested in it.  Also, if
1983 * either os_failed_reopen or !os_valid, then don't return the osp.
1984 *
1985 * If claimnull is false (doing a reopen with CLAIM_DELEGATE_CUR),
1986 * then return the osp if it is an os_delegation open stream.  Also,
1987 * if the rnode still has r_deleg_return_pending set, return the
1988 * os_delegation osp.  Lastly, if the rnode's r_deleg_stateid differs
1989 * from the osp's open_stateid, return the osp.
1990 *
1991 * We have already taken the 'r_deleg_recall_lock' as WRITER, which
1992 * prevents new OPENs from going OTW (as start_fop takes this
1993 * lock in READ mode); thus, no new open streams can be created
1994 * (which inherently means no new delegation open streams are
1995 * being created).
1996 */
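/*
 * A typical caller loop, sketched here for reference (cf. deleg_reopen
 * above); the caller must change or release each returned stream so
 * that a later call does not return it again:
 *
 *	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
 *		... reopen osp, or mark it failed or invalid ...
 *		open_stream_rele(osp, rp);
 *	}
 */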
1997
1998static nfs4_open_stream_t *
1999get_next_deleg_stream(rnode4_t *rp, int claimnull)
2000{
2001	nfs4_open_stream_t	*osp;
2002
2003	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2004
2005	/*
2006	 * Search through the list of open streams looking for
2007	 * one that was created while holding the delegation.
2008	 */
2009	mutex_enter(&rp->r_os_lock);
2010	for (osp = list_head(&rp->r_open_streams); osp != NULL;
2011	    osp = list_next(&rp->r_open_streams, osp)) {
2012		mutex_enter(&osp->os_sync_lock);
2013		if (!osp->os_delegation || osp->os_failed_reopen ||
2014		    !osp->os_valid) {
2015			mutex_exit(&osp->os_sync_lock);
2016			continue;
2017		}
2018		if (!claimnull || rp->r_deleg_return_pending ||
2019		    !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2020			osp->os_ref_count++;
2021			mutex_exit(&osp->os_sync_lock);
2022			mutex_exit(&rp->r_os_lock);
2023			return (osp);
2024		}
2025		mutex_exit(&osp->os_sync_lock);
2026	}
2027	mutex_exit(&rp->r_os_lock);
2028
2029	return (NULL);
2030}
2031
2032static void
2033nfs4delegreturn_thread(struct cb_recall_pass *args)
2034{
2035	rnode4_t *rp;
2036	vnode_t *vp;
2037	cred_t *cr;
2038	int dtype, error, flags;
2039	bool_t rdirty, rip;
2040	kmutex_t cpr_lock;
2041	callb_cpr_t cpr_info;
2042	struct nfs4_callback_globals *ncg;
2043
2044	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2045	ASSERT(ncg != NULL);
2046
2047	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
2048
2049	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
2050	    "nfsv4delegRtn");
2051
2052	rp = args->rp;
2053	vp = RTOV4(rp);
2054
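	/* Bail out early if the delegation has already been returned. */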
2055	mutex_enter(&rp->r_statev4_lock);
2056	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2057		mutex_exit(&rp->r_statev4_lock);
2058		goto out;
2059	}
2060	mutex_exit(&rp->r_statev4_lock);
2061
2062	/*
2063	 * Take the read-write lock in read mode to prevent other
2064	 * threads from modifying the data during the recall.  This
2065	 * doesn't affect mmappers.
2066	 */
2067	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
2068
2069	/* Proceed with delegreturn */
2070
2071	mutex_enter(&rp->r_statev4_lock);
2072	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2073		mutex_exit(&rp->r_statev4_lock);
2074		nfs_rw_exit(&rp->r_rwlock);
2075		goto out;
2076	}
2077	dtype = rp->r_deleg_type;
2078	cr = rp->r_deleg_cred;
2079	ASSERT(cr != NULL);
2080	crhold(cr);
2081	mutex_exit(&rp->r_statev4_lock);
2082
2083	flags = args->flags;
2084
2085	/*
2086	 * If the file is being truncated at the server, then throw
2087	 * away all of the pages, it doesn't matter what flavor of
2088	 * away all of the pages; it doesn't matter what flavor of
2089	 */
2090
2091	if (args->truncate) {
2092		ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
2093		nfs4_invalidate_pages(vp, 0, cr);
2094	} else if (dtype == OPEN_DELEGATE_WRITE) {
2095
2096		mutex_enter(&rp->r_statelock);
2097		rdirty = rp->r_flags & R4DIRTY;
2098		mutex_exit(&rp->r_statelock);
2099
2100		if (rdirty) {
2101			error = VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
2102
2103			if (error)
2104				CB_WARN1("nfs4delegreturn_thread:"
2105				" VOP_PUTPAGE: %d\n", error);
2106		}
2107		/* Turn off NFS4_DR_PUSH; the dirty pages were just pushed above. */
2108		flags &= ~NFS4_DR_PUSH;
2109	}
2110
2111	mutex_enter(&rp->r_statelock);
2112	rip = rp->r_flags & R4RECOVERRP;
2113	mutex_exit(&rp->r_statelock);
2114
2115	/* If a failed recovery is indicated, discard the pages */
2116
2117	if (rip) {
2118
2119		error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr, NULL);
2120
2121		if (error)
2122			CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
2123			    error);
2124	}
2125
2126	/*
2127	 * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
2128	 * NFS4_DR_DID_OP, which just calls nfs4delegreturn_async again.
2129	 */
2130	flags &= ~NFS4_DR_DID_OP;
2131
2132	(void) nfs4delegreturn_impl(rp, flags, ncg);
2133
2134	nfs_rw_exit(&rp->r_rwlock);
2135	crfree(cr);
2136out:
2137	kmem_free(args, sizeof (struct cb_recall_pass));
2138	VN_RELE(vp);
2139	mutex_enter(&cpr_lock);
2140	CALLB_CPR_EXIT(&cpr_info);
2141	mutex_destroy(&cpr_lock);
2142	zthread_exit();
2143}
2144
2145/*
2146 * This function assumes that its caller is either doing recovery
2147 * (and therefore cannot call nfs4_start_op) or has already called
2148 * nfs4_start_op().
2149 */
2150void
2151nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim, OPEN4res *res,
2152	nfs4_ga_res_t *garp, cred_t *cr)
2153{
2154	open_read_delegation4 *orp;
2155	open_write_delegation4 *owp;
2156	nfs4_server_t *np;
2157	bool_t already = FALSE;
2158	bool_t recall = FALSE;
2159	bool_t valid_garp = TRUE;
2160	bool_t delegation_granted = FALSE;
2161	bool_t dr_needed = FALSE;
2162	bool_t recov;
2163	int dr_flags = 0;
2164	long mapcnt;
2165	uint_t rflag;
2166	mntinfo4_t *mi;
2167	struct nfs4_callback_globals *ncg;
2168	open_delegation_type4 odt;
2169
2170	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2171	ASSERT(ncg != NULL);
2172
2173	mi = VTOMI4(RTOV4(rp));
2174
2175	/*
2176	 * Accept a delegation granted to the client via an OPEN.
2177	 * Set the delegation fields in the rnode and insert the
2178	 * rnode onto the list anchored in the nfs4_server_t.  The
2179	 * proper locking order requires the nfs4_server_t first,
2180	 * even though it may not be needed in all cases.
2181	 *
2182	 * NB: find_nfs4_server returns with s_lock held.
2183	 */
2184
2185	if ((np = find_nfs4_server(mi)) == NULL)
2186		return;
2187
2188	/* grab the statelock too, for examining r_mapcnt */
2189	mutex_enter(&rp->r_statelock);
2190	mutex_enter(&rp->r_statev4_lock);
2191
2192	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
2193	    rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2194		already = TRUE;
2195
2196	odt = res->delegation.delegation_type;
2197
2198	if (odt == OPEN_DELEGATE_READ) {
2199
2200		rp->r_deleg_type = res->delegation.delegation_type;
2201		orp = &res->delegation.open_delegation4_u.read;
2202		rp->r_deleg_stateid = orp->stateid;
2203		rp->r_deleg_perms = orp->permissions;
2204		if (claim == CLAIM_PREVIOUS)
2205			if ((recall = orp->recall) != 0)
2206				dr_needed = TRUE;
2207
2208		delegation_granted = TRUE;
2209
2210		ncg->nfs4_callback_stats.delegations.value.ui64++;
2211		ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;
2212
2213	} else if (odt == OPEN_DELEGATE_WRITE) {
2214
2215		rp->r_deleg_type = res->delegation.delegation_type;
2216		owp = &res->delegation.open_delegation4_u.write;
2217		rp->r_deleg_stateid = owp->stateid;
2218		rp->r_deleg_perms = owp->permissions;
2219		rp->r_deleg_limit = owp->space_limit;
2220		if (claim == CLAIM_PREVIOUS)
2221			if ((recall = owp->recall) != 0)
2222				dr_needed = TRUE;
2223
2224		delegation_granted = TRUE;
2225
2226		if (garp == NULL || !garp->n4g_change_valid) {
2227			valid_garp = FALSE;
2228			rp->r_deleg_change = 0;
2229			rp->r_deleg_change_grant = 0;
2230		} else {
2231			rp->r_deleg_change = garp->n4g_change;
2232			rp->r_deleg_change_grant = garp->n4g_change;
2233		}
2234		mapcnt = rp->r_mapcnt;
2235		rflag = rp->r_flags;
2236
2237		/*
2238		 * Update the delegation change attribute if
2239		 * there are mappers or the file is dirty.  This
2240		 * might be the case during recovery after a server
2241		 * reboot.
2242		 */
2243		if (mapcnt > 0 || rflag & R4DIRTY)
2244			rp->r_deleg_change++;
2245
2246		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2247		    "nfs4_delegation_accept: r_deleg_change: 0x%x\n",
2248		    (int)(rp->r_deleg_change >> 32)));
2249		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2250		    "nfs4_delegation_accept: r_deleg_change_grant: 0x%x\n",
2251		    (int)(rp->r_deleg_change_grant >> 32)));
2252
2253
2254		ncg->nfs4_callback_stats.delegations.value.ui64++;
2255		ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
2256	} else if (already) {
2257		 * No delegation was granted.  If the rnode currently
2258		 * has one, then consider it tainted and return it.
2259		 * has one, then consider it tainted and return it.
2260		 */
2261		dr_needed = TRUE;
2262	}
2263
2264	if (delegation_granted) {
2265		/* Add the rnode to the list. */
2266		if (!already) {
2267			crhold(cr);
2268			rp->r_deleg_cred = cr;
2269
2270			ASSERT(mutex_owned(&np->s_lock));
2271			list_insert_head(&np->s_deleg_list, rp);
2272			/* added list node gets a reference */
2273			np->s_refcnt++;
2274			nfs4_inc_state_ref_count_nolock(np, mi);
2275		}
2276		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2277	}
2278
2279	/*
2280	 * We've now safely accepted the delegation, if any.  Drop the
2281	 * locks and figure out what post-processing is needed.  We'd
2282	 * like to retain r_statev4_lock, but nfs4_server_rele takes
2283	 * s_lock, which would be a lock-ordering violation.
2284	 */
2285	mutex_exit(&rp->r_statev4_lock);
2286	mutex_exit(&rp->r_statelock);
2287	mutex_exit(&np->s_lock);
2288	nfs4_server_rele(np);
2289
2290	/*
2291	 * Check to see if we are in recovery.  Remember that
2292	 * this function is protected by start_op, so a recovery
2293	 * cannot begin until we are out of here.
2294	 */
2295	mutex_enter(&mi->mi_lock);
2296	recov = mi->mi_recovflags & MI4_RECOV_ACTIV;
2297	mutex_exit(&mi->mi_lock);
2298
2299	mutex_enter(&rp->r_statev4_lock);
2300
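	/*
	 * Under the IMMEDIATE policy a delegation is returned as soon
	 * as it is granted; also return it if the change attribute
	 * could not be obtained (!valid_garp).
	 */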
2301	if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
2302		dr_needed = TRUE;
2303
2304	if (dr_needed && rp->r_deleg_return_pending == FALSE) {
2305		if (recov) {
2306			/*
2307			 * We cannot call delegreturn from inside
2308			 * of recovery or VOP_PUTPAGE will hang
2309			 * due to the nfs4_start_fop call in
2310			 * nfs4write.  Use dlistadd to add the
2311			 * rnode to the list of rnodes needing
2312			 * cleaning.  We do not need to do reopen
2313			 * here because recov_openfiles will do it.
2314			 * In the non-recall case, just discard the
2315			 * delegation as it is no longer valid.
2316			 */
2317			if (recall)
2318				dr_flags = NFS4_DR_PUSH;
2319			else
2320				dr_flags = NFS4_DR_PUSH|NFS4_DR_DISCARD;
2321
2322			nfs4_dlistadd(rp, ncg, dr_flags);
2323			dr_flags = 0;
2324		} else {
2325			/*
2326			 * Push the modified data back to the server,
2327			 * reopen any delegation open streams, and return
2328			 * the delegation.  Drop the statev4_lock first!
2329			 */
2330			dr_flags = NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN;
2331		}
2332	}
2333	mutex_exit(&rp->r_statev4_lock);
2334	if (dr_flags)
2335		(void) nfs4delegreturn_impl(rp, dr_flags, ncg);
2336}
2337
2338/*
2339 * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
2340 * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
2341 * or BADSEQID and the recovery code is unable to recover.  Push any
2342 * dirty data back to the server and return the delegation (if any).
2343 */
2344
2345void
2346nfs4delegabandon(rnode4_t *rp)
2347{
2348	vnode_t *vp;
2349	struct cb_recall_pass *pp;
2350	open_delegation_type4 dt;
2351
2352	mutex_enter(&rp->r_statev4_lock);
2353	dt = rp->r_deleg_type;
2354	mutex_exit(&rp->r_statev4_lock);
2355
2356	if (dt == OPEN_DELEGATE_NONE)
2357		return;
2358
2359	vp = RTOV4(rp);
2360	VN_HOLD(vp);
2361
2362	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2363	pp->rp = rp;
2364	/*
2365	 * Recovery on the file has failed, and we want to return
2366	 * the delegation.  We don't want to reopen files;
2367	 * nfs4delegreturn_thread() figures out what to do about
2368	 * the data.  The only thing left to do is attempt to
2369	 * return the delegation.
2370	 */
2371	pp->flags = 0;
2372	pp->truncate = FALSE;
2373
2374	/*
2375	 * Fire up a thread to do the delegreturn; this is
2376	 * necessary because we could be inside a GETPAGE or
2377	 * PUTPAGE and we cannot do another one.
2378	 */
2379
2380	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2381	    minclsyspri);
2382}
2383
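/*
 * wait_for_recall1 - if the vnode is a regular file, take its
 * r_deleg_recall_lock as READER so that any delegation recall or
 * return in progress completes first.  On success, record flg in
 * rsp->rs_flags so that nfs4_end_op_recall() knows to drop the lock.
 */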
2384static int
2385wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
2386	int flg)
2387{
2388	rnode4_t *rp;
2389	int error = 0;
2390
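	/* The op hint is unused; the self-assignment only keeps lint quiet. */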
2391#ifdef lint
2392	op = op;
2393#endif
2394
2395	if (vp && vp->v_type == VREG) {
2396		rp = VTOR4(vp);
2397
2398		/*
2399		 * Take r_deleg_recall_lock in read mode to synchronize
2400		 * with delegreturn.
2401		 */
2402		error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
2403		    RW_READER, INTR4(vp));
2404
2405		if (error == 0)
2406			rsp->rs_flags |= flg;
2407
2408	}
2409	return (error);
2410}
2411
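/*
 * nfs4_end_op_recall - release any recall locks recorded in
 * rsp->rs_flags by wait_for_recall(), dropping them in the reverse
 * of the order in which they were acquired.
 */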
2412void
2413nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
2414{
2415	NFS4_DEBUG(nfs4_recall_debug,
2416	    (CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
2417	    (void *)vp1, (void *)vp2));
2418
2419	if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
2420		nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
2421	if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
2422		nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2423}
2424
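/*
 * wait_for_recall - acquire the delegation recall lock for up to two
 * vnodes.  If the second acquisition fails, the first lock is dropped
 * again, so the caller never has to clean up a partial hold.
 */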
2425int
2426wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
2427	nfs4_recov_state_t *rsp)
2428{
2429	int error;
2430
2431	NFS4_DEBUG(nfs4_recall_debug,
2432	    (CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
2433	    (void *)vp1, (void *)vp2));
2434
2435	rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
2436
2437	if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
2438		return (error);
2439
2440	if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
2441	    != 0) {
2442		if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
2443			nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2444			rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
2445		}
2446
2447		return (error);
2448	}
2449
2450	return (0);
2451}
2452
2453/*
2454 * nfs4_dlistadd - Add this rnode to a list of rnodes to be
2455 * DELEGRETURN'd at the end of recovery.
2456 */
2457
2458static void
2459nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
2460{
2461	struct nfs4_dnode *dp;
2462
2463	ASSERT(mutex_owned(&rp->r_statev4_lock));
2464	/*
2465	 * Mark the delegation as having a return pending.
2466	 * This will prevent the use of the delegation stateID
2467	 * by read, write, setattr and open.
2468	 */
2469	rp->r_deleg_return_pending = TRUE;
2470	dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
2471	VN_HOLD(RTOV4(rp));
2472	dp->rnodep = rp;
2473	dp->flags = flags;
2474	mutex_enter(&ncg->nfs4_dlist_lock);
2475	list_insert_head(&ncg->nfs4_dlist, dp);
2476#ifdef	DEBUG
2477	ncg->nfs4_dlistadd_c++;
2478#endif
2479	mutex_exit(&ncg->nfs4_dlist_lock);
2480}
2481
2482/*
2483 * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list
2484 * of files awaiting cleaning.  If the override_flags are non-zero,
2485 * then use them rather than the flags that were set when the rnode
2486 * was added to the dlist.
2487 */
2488static void
2489nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
2490{
2491	rnode4_t *rp;
2492	struct nfs4_dnode *dp;
2493	int flags;
2494
2495	ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
2496
2497	mutex_enter(&ncg->nfs4_dlist_lock);
2498	while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
2499#ifdef	DEBUG
2500		ncg->nfs4_dlistclean_c++;
2501#endif
2502		list_remove(&ncg->nfs4_dlist, dp);
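		/*
		 * Drop the list lock across the delegreturn, which may
		 * block; it is re-taken before the list is examined again.
		 */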
2503		mutex_exit(&ncg->nfs4_dlist_lock);
2504		rp = dp->rnodep;
2505		flags = (override_flags != 0) ? override_flags : dp->flags;
2506		kmem_free(dp, sizeof (*dp));
2507		(void) nfs4delegreturn_impl(rp, flags, ncg);
2508		VN_RELE(RTOV4(rp));
2509		mutex_enter(&ncg->nfs4_dlist_lock);
2510	}
2511	mutex_exit(&ncg->nfs4_dlist_lock);
2512}
2513
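/*
 * nfs4_dlistclean - clean this zone's dlist, using the flags that
 * were recorded when each rnode was added (override_flags == 0).
 */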
2514void
2515nfs4_dlistclean(void)
2516{
2517	struct nfs4_callback_globals *ncg;
2518
2519	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2520	ASSERT(ncg != NULL);
2521
2522	nfs4_dlistclean_impl(ncg, 0);
2523}
2524