/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mkdev.h>
#include <sys/mount.h>
#include <sys/statvfs.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/utsname.h>
#include <sys/bootconf.h>
#include <sys/modctl.h>
#include <sys/acl.h>
#include <sys/flock.h>
#include <sys/kstr.h>
#include <sys/stropts.h>
#include <sys/strsubr.h>
#include <sys/atomic.h>
#include <sys/disp.h>
#include <sys/policy.h>
#include <sys/list.h>
#include <sys/zone.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/rpcsec_gss.h>
#include <rpc/clnt.h>
#include <rpc/xdr.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/mount.h>
#include <nfs/nfs_acl.h>

#include <fs/fs_subr.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>
#include <nfs/nfssys.h>

#ifdef	DEBUG
/*
 * These are "special" state IDs and file handles that
 * match any delegation state ID or file handle.  This
 * is for testing purposes only.
 */

stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
nfsstat4 cb4_getattr_fail = NFS4_OK;
nfsstat4 cb4_recall_fail = NFS4_OK;

int nfs4_callback_debug;
int nfs4_recall_debug;
int nfs4_drat_debug;

#endif

#define	CB_NOTE(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
#define	CB_WARN(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
#define	CB_WARN1(x, y)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))

enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;

static zone_key_t nfs4_callback_zone_key;

/*
 * NFS4_MAPSIZE is the number of bytes we are willing to consume
 * for the block allocation map when the server grants a NFS_LIMIT_BLOCK
 * style delegation.
 */

#define	NFS4_MAPSIZE	8192
#define	NFS4_MAPWORDS	(NFS4_MAPSIZE/sizeof (uint_t))
#define	NbPW		(NBBY*sizeof (uint_t))

static int nfs4_num_prognums = 1024;
static SVC_CALLOUT_TABLE nfs4_cb_sct;

struct nfs4_dnode {
	list_node_t	linkage;
	rnode4_t	*rnodep;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
};

static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
	{ "delegations",	KSTAT_DATA_UINT64 },
	{ "cb_getattr",		KSTAT_DATA_UINT64 },
	{ "cb_recall",		KSTAT_DATA_UINT64 },
	{ "cb_null",		KSTAT_DATA_UINT64 },
	{ "cb_dispatch",	KSTAT_DATA_UINT64 },
	{ "delegaccept_r",	KSTAT_DATA_UINT64 },
	{ "delegaccept_rw",	KSTAT_DATA_UINT64 },
	{ "delegreturn",	KSTAT_DATA_UINT64 },
	{ "callbacks",		KSTAT_DATA_UINT64 },
	{ "claim_cur",		KSTAT_DATA_UINT64 },
	{ "claim_cur_ok",	KSTAT_DATA_UINT64 },
	{ "recall_trunc",	KSTAT_DATA_UINT64 },
	{ "recall_failed",	KSTAT_DATA_UINT64 },
	{ "return_limit_write",	KSTAT_DATA_UINT64 },
	{ "return_limit_addmap", KSTAT_DATA_UINT64 },
	{ "deleg_recover",	KSTAT_DATA_UINT64 },
	{ "cb_illegal",		KSTAT_DATA_UINT64 }
};

struct nfs4_cb_port {
	list_node_t		linkage; /* linkage into per-zone port list */
	char			netid[KNC_STRSIZE];
	char			uaddr[KNC_STRSIZE];
	char			protofmly[KNC_STRSIZE];
	char			proto[KNC_STRSIZE];
};

static int cb_getattr_bytes;

struct cb_recall_pass {
	rnode4_t	*rp;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
	bool_t		truncate;
};

static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
static void nfs4delegreturn_thread(struct cb_recall_pass *);
static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
    int);
static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
static int nfs4delegreturn_impl(rnode4_t *, int,
    struct nfs4_callback_globals *);
static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
    struct nfs4_callback_globals *);

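/*
 * cb_getattr - service a CB_GETATTR request from the server.  Look up
 * the delegation by file handle and, if found, reply with the change
 * and size attributes (the only attributes supported here) from the
 * client's cached delegation state.
 */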
static void
cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
	struct compound_state *cs, struct nfs4_callback_globals *ncg)
{
	CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
	CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
	rnode4_t *rp;
	vnode_t *vp;
	bool_t found = FALSE;
	struct nfs4_server *sp;
	struct fattr4 *fap;
	rpc_inline_t *fdata;
	long mapcnt;
	fattr4_change change;
	fattr4_size size;
	uint_t rflag;

	ncg->nfs4_callback_stats.cb_getattr.value.ui64++;

#ifdef DEBUG
	/*
	 * error injection hook: set the cb4_getattr_fail global to
	 * the NFS4 protocol error to be returned
	 */
	if (cb4_getattr_fail != NFS4_OK) {
		*cs->statusp = resp->status = cb4_getattr_fail;
		return;
	}
#endif

	resp->obj_attributes.attrmask = 0;

	mutex_enter(&ncg->nfs4_cb_lock);
	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
	mutex_exit(&ncg->nfs4_cb_lock);

	if (nfs4_server_vlock(sp, 0) == FALSE) {

		CB_WARN("cb_getattr: cannot find server\n");

		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
		return;
	}

	/*
	 * In cb_compound, callback_ident was validated against rq_prog,
	 * but we couldn't verify that it was set to the value we provided
	 * at setclientid time (because we didn't have the server struct
	 * yet).  Now we have the server struct, but don't have
	 * callback_ident handy.  So, validate the server struct's program
	 * number against the RPC request's prog number.  At this point, we
	 * know the RPC prog num is valid (else we wouldn't be here);
	 * however, we don't know that it was the prog number we supplied
	 * to this server at setclientid time.  If the prog numbers aren't
	 * equivalent, then log the problem and fail the request because
	 * cbserv and/or cbclient are confused.  This will probably never
	 * happen.
	 */
	if (sp->s_program != req->rq_prog) {
#ifdef DEBUG
		zcmn_err(getzoneid(), CE_WARN,
		    "cb_getattr: wrong server program number srv=%d req=%d\n",
		    sp->s_program, req->rq_prog);
#else
		zcmn_err(getzoneid(), CE_WARN,
		    "cb_getattr: wrong server program number\n");
#endif
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
		return;
	}

	/*
	 * Search the delegation list for a matching file handle;
	 * mutex on sp prevents the list from changing.
	 */

	rp = list_head(&sp->s_deleg_list);
	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
		nfs4_fhandle_t fhandle;

		sfh4_copyval(rp->r_fh, &fhandle);

		if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
		    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
		    fhandle.fh_len) == 0)) {

			found = TRUE;
			break;
		}
#ifdef	DEBUG
		if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
		    bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
		    args->fh.nfs_fh4_len) == 0) {

			found = TRUE;
			break;
		}
#endif
	}

	/*
	 * VN_HOLD the vnode before releasing s_lock to guarantee
	 * we have a valid vnode reference.
	 */
	if (found == TRUE) {
		vp = RTOV4(rp);
		VN_HOLD(vp);
	}

	mutex_exit(&sp->s_lock);
	nfs4_server_rele(sp);

	if (found == FALSE) {

		CB_WARN("cb_getattr: bad fhandle\n");

		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
		return;
	}

	/*
	 * Figure out which attributes the server wants.  We only
	 * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
	 */
	fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);

	/*
	 * Don't actually need to create XDR to encode these
	 * simple data structures.
	 * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
	 */
	fap = &resp->obj_attributes;

	fap->attrmask = 0;
	/* attrlist4_len starts at 0 and increases as attrs are processed */
	fap->attrlist4 = (char *)fdata;
	fap->attrlist4_len = 0;

	/* don't supply attrs if request was zero */
	if (args->attr_request != 0) {
		if (args->attr_request & FATTR4_CHANGE_MASK) {
			/*
			 * If the file is mmapped, then increment the change
			 * attribute and return it.  This will guarantee that
			 * the server will perceive that the file has changed
			 * if there is any chance that the client application
			 * has changed it.  Otherwise, just return the change
			 * attribute as it has been updated by nfs4write_deleg.
			 */

			mutex_enter(&rp->r_statelock);
			mapcnt = rp->r_mapcnt;
			rflag = rp->r_flags;
			mutex_exit(&rp->r_statelock);

			mutex_enter(&rp->r_statev4_lock);
			/*
			 * If the object is mapped, then always return the
			 * new change attribute.  Otherwise, return it if the
			 * object has dirty pages.  If the object doesn't
			 * have any dirty pages, then all changes have been
			 * pushed to the server, so reset change to the grant
			 * change.
			 */
			if (mapcnt)
				rp->r_deleg_change++;
			else if (! (rflag & R4DIRTY))
				rp->r_deleg_change = rp->r_deleg_change_grant;
			change = rp->r_deleg_change;
			mutex_exit(&rp->r_statev4_lock);

			/*
			 * Use inline XDR code directly; we know that we
			 * are going to a memory buffer and it has enough
			 * space, so it cannot fail.
			 */
			IXDR_PUT_U_HYPER(fdata, change);
			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
			fap->attrmask |= FATTR4_CHANGE_MASK;
		}

		if (args->attr_request & FATTR4_SIZE_MASK) {
			/*
			 * Use an atomic add of 0 to fetch a consistent view
			 * of r_size; this avoids having to take rw_lock
			 * which could cause a deadlock.
			 */
			size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);

			/*
			 * Use inline XDR code directly; we know that we
			 * are going to a memory buffer and it has enough
			 * space, so it cannot fail.
			 */
			IXDR_PUT_U_HYPER(fdata, size);
			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
			fap->attrmask |= FATTR4_SIZE_MASK;
		}
	}

	VN_RELE(vp);

	*cs->statusp = resp->status = NFS4_OK;
}

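/*
 * Free the attribute buffer allocated by cb_getattr.
 */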
static void
cb_getattr_free(nfs_cb_resop4 *resop)
{
	if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
		kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
		    obj_attributes.attrlist4, cb_getattr_bytes);
}

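/*
 * cb_recall - service a CB_RECALL request from the server.  Find the
 * delegation by state ID and file handle, then fire up an async
 * thread to do the actual DELEGRETURN.
 */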
static void
cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
	struct compound_state *cs, struct nfs4_callback_globals *ncg)
{
	CB_RECALL4args *args = &argop->nfs_cb_argop4_u.opcbrecall;
	CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
	rnode4_t *rp;
	vnode_t *vp;
	struct nfs4_server *sp;
	bool_t found = FALSE;

	ncg->nfs4_callback_stats.cb_recall.value.ui64++;

	ASSERT(req->rq_prog >= NFS4_CALLBACK);
	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);

#ifdef DEBUG
	/*
	 * error injection hook: set the cb4_recall_fail global to
	 * the NFS4 protocol error to be returned
	 */
	if (cb4_recall_fail != NFS4_OK) {
		*cs->statusp = resp->status = cb4_recall_fail;
		return;
	}
#endif

	mutex_enter(&ncg->nfs4_cb_lock);
	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
	mutex_exit(&ncg->nfs4_cb_lock);

	if (nfs4_server_vlock(sp, 0) == FALSE) {

		CB_WARN("cb_recall: cannot find server\n");

		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
		return;
	}

	/*
	 * Search the delegation list for a matching file handle
	 * AND stateid; mutex on sp prevents the list from changing.
	 */

	rp = list_head(&sp->s_deleg_list);
	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
		mutex_enter(&rp->r_statev4_lock);

		/* check both the state ID and the file handle! */

		if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
		    sizeof (stateid4)) == 0)) {
			nfs4_fhandle_t fhandle;

			sfh4_copyval(rp->r_fh, &fhandle);
			if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
			    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
			    fhandle.fh_len) == 0)) {

				found = TRUE;
				break;
			} else {
#ifdef	DEBUG
				CB_WARN("cb_recall: stateid OK, bad fh");
#endif
			}
		}
#ifdef	DEBUG
		if (bcmp(&args->stateid, &nfs4_deleg_any,
		    sizeof (stateid4)) == 0) {

			found = TRUE;
			break;
		}
#endif
		mutex_exit(&rp->r_statev4_lock);
	}

	/*
	 * VN_HOLD the vnode before releasing s_lock to guarantee
	 * we have a valid vnode reference.  The async thread will
	 * release the hold when it's done.
	 */
	if (found == TRUE) {
		mutex_exit(&rp->r_statev4_lock);
		vp = RTOV4(rp);
		VN_HOLD(vp);
	}
	mutex_exit(&sp->s_lock);
	nfs4_server_rele(sp);

	if (found == FALSE) {

		CB_WARN("cb_recall: bad stateid\n");

		*cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
		return;
	}

	/* Fire up a thread to do the delegreturn */
	nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
	    args->truncate);

	*cs->statusp = resp->status = NFS4_OK;
}

/* ARGSUSED */
static void
cb_recall_free(nfs_cb_resop4 *resop)
{
	/* nothing to do here, cb_recall doesn't kmem_alloc */
}

/*
 * This function handles the CB_NULL proc call from an NFSv4 Server.
 *
 * We take note that the server has sent a CB_NULL for later processing
 * in the recovery logic.  It is noted so we may pause slightly after the
 * setclientid and before reopening files.  The pause is to allow the
 * NFSv4 Server time to receive the CB_NULL reply and adjust any of
 * its internal structures such that it has the opportunity to grant
 * delegations to reopened files.
 */

/* ARGSUSED */
static void
cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
    struct nfs4_callback_globals *ncg)
{
	struct nfs4_server *sp;

	ncg->nfs4_callback_stats.cb_null.value.ui64++;

	ASSERT(req->rq_prog >= NFS4_CALLBACK);
	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);

	mutex_enter(&ncg->nfs4_cb_lock);
	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
	mutex_exit(&ncg->nfs4_cb_lock);

	if (nfs4_server_vlock(sp, 0) != FALSE) {
		sp->s_flags |= N4S_CB_PINGED;
		cv_broadcast(&sp->wait_cb_null);
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	}
}

/*
 * cb_illegal	args: void
 *		res : status (NFS4ERR_OP_CB_ILLEGAL)
 */
/* ARGSUSED */
static void
cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
	struct compound_state *cs, struct nfs4_callback_globals *ncg)
{
	CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;

	ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
	resop->resop = OP_CB_ILLEGAL;
	*cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
}

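/*
 * cb_compound - decode and execute the operations in a CB_COMPOUND
 * request.  Processing stops at the first operation that fails, and
 * the results array is compacted to cover only the operations that
 * were actually executed.
 */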
static void
cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
	struct nfs4_callback_globals *ncg)
{
	uint_t i;
	struct compound_state cs;
	nfs_cb_argop4 *argop;
	nfs_cb_resop4 *resop, *new_res;
	uint_t op;

	bzero(&cs, sizeof (cs));
	cs.statusp = &resp->status;
	cs.cont = TRUE;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	resp->tag.utf8string_len = args->tag.utf8string_len;
	resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
	    KM_SLEEP);
	bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
	    args->tag.utf8string_len);

	/*
	 * XXX for now, minorversion should be zero
	 */
	if (args->minorversion != CB4_MINORVERSION) {
		resp->array_len = 0;
		resp->array = NULL;
		resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
		return;
	}

#ifdef DEBUG
	/*
	 * Verify callback_ident.  It doesn't really matter if it's wrong
	 * because we don't really use callback_ident -- we use the prog
	 * number of the RPC request instead.  In this case, just print a
	 * DEBUG console message to reveal brokenness of cbclient (at
	 * bkoff/cthon).
	 */
	if (args->callback_ident != req->rq_prog)
		zcmn_err(getzoneid(), CE_WARN,
		    "cb_compound: cb_client using wrong "
		    "callback_ident(%d), should be %d",
		    args->callback_ident, req->rq_prog);
#endif

	resp->array_len = args->array_len;
	resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
	    KM_SLEEP);

	for (i = 0; i < args->array_len && cs.cont; i++) {

		argop = &args->array[i];
		resop = &resp->array[i];
		resop->resop = argop->argop;
		op = (uint_t)resop->resop;

		switch (op) {

		case OP_CB_GETATTR:

			cb_getattr(argop, resop, req, &cs, ncg);
			break;

		case OP_CB_RECALL:

			cb_recall(argop, resop, req, &cs, ncg);
			break;

		case OP_CB_ILLEGAL:

			/* fall through */

		default:
			/*
			 * Handle OP_CB_ILLEGAL and any undefined opcode.
			 * Currently, the XDR code will return BADXDR
			 * if cb op doesn't decode to legal value, so
			 * it really only handles OP_CB_ILLEGAL.
			 */
			op = OP_CB_ILLEGAL;
			cb_illegal(argop, resop, req, &cs, ncg);
		}

		if (*cs.statusp != NFS4_OK)
			cs.cont = FALSE;

		/*
		 * If not at last op, and if we are to stop, then
		 * compact the results array.
		 */
		if ((i + 1) < args->array_len && !cs.cont) {

			new_res = kmem_alloc(
			    (i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
			bcopy(resp->array,
			    new_res, (i+1) * sizeof (nfs_cb_resop4));
			kmem_free(resp->array,
			    args->array_len * sizeof (nfs_cb_resop4));

			resp->array_len = i + 1;
			resp->array = new_res;
		}
	}
}

static void
cb_compound_free(CB_COMPOUND4res *resp)
{
	uint_t i, op;
	nfs_cb_resop4 *resop;

	if (resp->tag.utf8string_val) {
		UTF8STRING_FREE(resp->tag)
	}

	for (i = 0; i < resp->array_len; i++) {

		resop = &resp->array[i];
		op = (uint_t)resop->resop;

		switch (op) {

		case OP_CB_GETATTR:

			cb_getattr_free(resop);
			break;

		case OP_CB_RECALL:

			cb_recall_free(resop);
			break;

		default:
			break;
		}
	}

	if (resp->array != NULL) {
		kmem_free(resp->array,
		    resp->array_len * sizeof (nfs_cb_resop4));
	}
}

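/*
 * cb_dispatch - RPC dispatch routine for the callback program.
 * Decodes the arguments, calls the proc for CB_NULL or CB_COMPOUND,
 * sends the reply, and frees any results.
 */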
static void
cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
{
	CB_COMPOUND4args args;
	CB_COMPOUND4res res;
	struct nfs4_callback_globals *ncg;

	bool_t (*xdr_args)(), (*xdr_res)();
	void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
	    struct nfs4_callback_globals *);
	void (*freeproc)(CB_COMPOUND4res *);

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;

	switch (req->rq_proc) {
	case CB_NULL:
		xdr_args = xdr_void;
		xdr_res = xdr_void;
		proc = cb_null;
		freeproc = NULL;
		break;

	case CB_COMPOUND:
		xdr_args = xdr_CB_COMPOUND4args_clnt;
		xdr_res = xdr_CB_COMPOUND4res;
		proc = cb_compound;
		freeproc = cb_compound_free;
		break;

	default:
		CB_WARN("cb_dispatch: no proc\n");
		svcerr_noproc(xprt);
		return;
	}

	args.tag.utf8string_val = NULL;
	args.array = NULL;

	if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {

		CB_WARN("cb_dispatch: cannot getargs\n");
		svcerr_decode(xprt);
		return;
	}

	(*proc)(&args, &res, req, ncg);

	if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {

		CB_WARN("cb_dispatch: bad sendreply\n");
		svcerr_systemerr(xprt);
	}

	if (freeproc)
		(*freeproc)(&res);

	if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {

		CB_WARN("cb_dispatch: bad freeargs\n");
	}
}

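/*
 * Allocate the next unused callback program number, starting the
 * search at nfs4_program_hint.  Returns 0 if all program numbers
 * are in use.
 */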
static rpcprog_t
nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
{
	int i, j;

	j = ncg->nfs4_program_hint;
	for (i = 0; i < nfs4_num_prognums; i++, j++) {

		if (j >= nfs4_num_prognums)
			j = 0;

		if (ncg->nfs4prog2server[j] == NULL) {
			ncg->nfs4_program_hint = j+1;
			return (j+NFS4_CALLBACK);
		}
	}

	return (0);
}

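/*
 * Release the callback program number assigned to this nfs4_server
 * so that it may be reused.
 */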
void
nfs4callback_destroy(nfs4_server_t *np)
{
	struct nfs4_callback_globals *ncg;
	int i;

	if (np->s_program == 0)
		return;

	ncg = np->zone_globals;
	i = np->s_program - NFS4_CALLBACK;

	mutex_enter(&ncg->nfs4_cb_lock);

	ASSERT(ncg->nfs4prog2server[i] == np);

	ncg->nfs4prog2server[i] = NULL;

	if (i < ncg->nfs4_program_hint)
		ncg->nfs4_program_hint = i;

	mutex_exit(&ncg->nfs4_cb_lock);
}

/*
 * nfs4_setport - This function saves a netid and universal address for
 * the callback program.  These values will be used during setclientid.
 */
static void
nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
	struct nfs4_callback_globals *ncg)
{
	struct nfs4_cb_port *p;
	bool_t found = FALSE;

	ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));

	p = list_head(&ncg->nfs4_cb_ports);
	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
		if (strcmp(p->netid, netid) == 0) {
			found = TRUE;
			break;
		}
	}
	if (found == TRUE)
		(void) strcpy(p->uaddr, uaddr);
	else {
		p = kmem_alloc(sizeof (*p), KM_SLEEP);

		(void) strcpy(p->uaddr, uaddr);
		(void) strcpy(p->netid, netid);
		(void) strcpy(p->protofmly, protofmly);
		(void) strcpy(p->proto, proto);
		list_insert_head(&ncg->nfs4_cb_ports, p);
	}
}

/*
 * nfs4_cb_args - This function is used to construct the callback
 * portion of the arguments needed for setclientid.
 */

void
nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
{
	struct nfs4_cb_port *p;
	bool_t found = FALSE;
	rpcprog_t pgm;
	struct nfs4_callback_globals *ncg = np->zone_globals;

	/*
	 * This server structure may already have a program number
	 * assigned to it.  This happens when the client has to
	 * re-issue SETCLIENTID.  Just re-use the information.
	 */
	if (np->s_program >= NFS4_CALLBACK &&
	    np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
		nfs4callback_destroy(np);

	mutex_enter(&ncg->nfs4_cb_lock);

	p = list_head(&ncg->nfs4_cb_ports);
	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
		if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
		    strcmp(p->proto, knc->knc_proto) == 0) {
			found = TRUE;
			break;
		}
	}

	if (found == FALSE) {

		NFS4_DEBUG(nfs4_callback_debug,
		    (CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
		    knc->knc_protofmly, knc->knc_proto));

		args->callback.cb_program = 0;
		args->callback.cb_location.r_netid = NULL;
		args->callback.cb_location.r_addr = NULL;
		args->callback_ident = 0;
		mutex_exit(&ncg->nfs4_cb_lock);
		return;
	}

	if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
		CB_WARN("nfs4_cb_args: out of program numbers\n");

		args->callback.cb_program = 0;
		args->callback.cb_location.r_netid = NULL;
		args->callback.cb_location.r_addr = NULL;
		args->callback_ident = 0;
		mutex_exit(&ncg->nfs4_cb_lock);
		return;
	}

	ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
	args->callback.cb_program = pgm;
	args->callback.cb_location.r_netid = p->netid;
	args->callback.cb_location.r_addr = p->uaddr;
	args->callback_ident = pgm;

	np->s_program = pgm;

	mutex_exit(&ncg->nfs4_cb_lock);
}

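/*
 * nfs4_dquery - handle the NFS4_DQUERY subcommand of nfs4_svc by
 * copying the delegation type of the file referenced by uap->fd
 * out to the user.
 */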
static int
nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
{
	file_t *fp;
	vnode_t *vp;
	rnode4_t *rp;
	int error;
	STRUCT_HANDLE(nfs4_svc_args, uap);

	STRUCT_SET_HANDLE(uap, model, arg);

	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
		return (EBADF);

	vp = fp->f_vnode;

	if (vp == NULL || vp->v_type != VREG ||
	    !vn_matchops(vp, nfs4_vnodeops)) {
		releasef(STRUCT_FGET(uap, fd));
		return (EBADF);
	}

	rp = VTOR4(vp);

	/*
	 * I can't convince myself that we need locking here.  The
	 * rnode cannot disappear and the value returned is instantly
	 * stale anyway, so why bother?
	 */

	error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
	releasef(STRUCT_FGET(uap, fd));
	return (error);
}

/*
 * NFS4 client system call.  This service does the
 * necessary initialization for the callback program.
 * This is fashioned after the server side interaction
 * between nfsd and the kernel.  On the client, the
 * mount command forks and the child process does the
 * necessary interaction with the kernel.
 *
 * uap->fd is the fd of an open transport provider
 */
int
nfs4_svc(struct nfs4_svc_args *arg, model_t model)
{
	file_t *fp;
	int error;
	int readsize;
	char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
	char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
	size_t len;
	STRUCT_HANDLE(nfs4_svc_args, uap);
	struct netbuf addrmask;
	int cmd;
	SVCMASTERXPRT *cb_xprt;
	struct nfs4_callback_globals *ncg;

#ifdef lint
	model = model;		/* STRUCT macros don't always refer to it */
#endif

	STRUCT_SET_HANDLE(uap, model, arg);

	if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
		return (nfs4_dquery(arg, model));

	if (secpolicy_nfs(CRED()) != 0)
		return (EPERM);

	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
		return (EBADF);

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
	if (readsize < RPC_MAXDATASIZE)
		readsize = RPC_MAXDATASIZE;

	error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
	    KNC_STRSIZE, &len);
	if (error) {
		releasef(STRUCT_FGET(uap, fd));
		return (error);
	}

	cmd = STRUCT_FGET(uap, cmd);

	if (cmd & NFS4_KRPC_START) {
		addrmask.len = STRUCT_FGET(uap, addrmask.len);
		addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
		addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
		error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
		    addrmask.len);
		if (error) {
			releasef(STRUCT_FGET(uap, fd));
			kmem_free(addrmask.buf, addrmask.maxlen);
			return (error);
		}
	} else
		addrmask.buf = NULL;

	error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
	    sizeof (uaddr), &len);
	if (error) {
		releasef(STRUCT_FGET(uap, fd));
		if (addrmask.buf)
			kmem_free(addrmask.buf, addrmask.maxlen);
		return (error);
	}

	error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
	    sizeof (protofmly), &len);
	if (error) {
		releasef(STRUCT_FGET(uap, fd));
		if (addrmask.buf)
			kmem_free(addrmask.buf, addrmask.maxlen);
		return (error);
	}

	error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
	    sizeof (proto), &len);
	if (error) {
		releasef(STRUCT_FGET(uap, fd));
		if (addrmask.buf)
			kmem_free(addrmask.buf, addrmask.maxlen);
		return (error);
	}

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	mutex_enter(&ncg->nfs4_cb_lock);
	if (cmd & NFS4_SETPORT)
		nfs4_setport(buf, uaddr, protofmly, proto, ncg);

	if (cmd & NFS4_KRPC_START) {
		error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
		    &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
		if (error) {
			CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
			    error);
			kmem_free(addrmask.buf, addrmask.maxlen);
		}
	}

	mutex_exit(&ncg->nfs4_cb_lock);
	releasef(STRUCT_FGET(uap, fd));
	return (error);
}


struct nfs4_callback_globals *
nfs4_get_callback_globals(void)
{
	return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
}

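/*
 * Per-zone constructor: allocate and initialize the callback globals,
 * including the program-to-server map, the delegation and cb_port
 * lists, and the per-zone "nfs4_callback_stats" kstat.
 */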
static void *
nfs4_callback_init_zone(zoneid_t zoneid)
{
	kstat_t *nfs4_callback_kstat;
	struct nfs4_callback_globals *ncg;

	ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);

	ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
	    sizeof (struct nfs4_server *), KM_SLEEP);

	/* initialize the dlist */
	mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
	    offsetof(struct nfs4_dnode, linkage));

	/* initialize cb_port list */
	mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
	    offsetof(struct nfs4_cb_port, linkage));

	/* get our own copy of the kstats */
	bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
	    sizeof (nfs4_callback_stats_tmpl));
	/* register "nfs:0:nfs4_callback_stats" for this zone */
	if ((nfs4_callback_kstat =
	    kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
	    zoneid)) != NULL) {
		nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
		kstat_install(nfs4_callback_kstat);
	}
	return (ncg);
}

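/*
 * Walk every registered callback program for this zone and discard
 * (without an OTW DELEGRETURN) any delegations its nfs4_server still
 * holds.
 */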
static void
nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
{
	nfs4_server_t *sp;
	int i, num_removed;

	/*
	 * It's OK here to just run through the registered "programs", as
	 * servers without programs won't have any delegations to handle.
	 */
	for (i = 0; i < nfs4_num_prognums; i++) {
		rnode4_t *rp;

		mutex_enter(&ncg->nfs4_cb_lock);
		sp = ncg->nfs4prog2server[i];
		mutex_exit(&ncg->nfs4_cb_lock);

		if (nfs4_server_vlock(sp, 1) == FALSE)
			continue;
		num_removed = 0;
		while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
				/*
				 * We need to take matters into our own hands,
				 * as nfs4delegreturn_cleanup_impl() won't
				 * remove this from the list.
				 */
				list_remove(&sp->s_deleg_list, rp);
				mutex_exit(&rp->r_statev4_lock);
				nfs4_dec_state_ref_count_nolock(sp,
				    VTOMI4(RTOV4(rp)));
				num_removed++;
				continue;
			}
			mutex_exit(&rp->r_statev4_lock);
			VN_HOLD(RTOV4(rp));
			mutex_exit(&sp->s_lock);
			/*
			 * The following will remove the node from the list.
			 */
			nfs4delegreturn_cleanup_impl(rp, sp, ncg);
			VN_RELE(RTOV4(rp));
			mutex_enter(&sp->s_lock);
		}
		mutex_exit(&sp->s_lock);
		/* each removed list node releases a reference */
		while (num_removed-- > 0)
			nfs4_server_rele(sp);
		/* remove our reference for nfs4_server_vlock */
		nfs4_server_rele(sp);
	}
}

/* ARGSUSED */
static void
nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
{
	struct nfs4_callback_globals *ncg = data;

	/*
	 * Clean the pending delegation return list.
	 */
	nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);

	/*
	 * Discard all delegations.
	 */
	nfs4_discard_delegations(ncg);
}

static void
nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
{
	struct nfs4_callback_globals *ncg = data;
	struct nfs4_cb_port *p;
	nfs4_server_t *sp, *next;
	nfs4_server_t freelist;
	int i;

	kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);

	/*
	 * Discard all delegations that may have crept in since we did the
	 * _shutdown.
	 */
	nfs4_discard_delegations(ncg);
	/*
	 * We're completely done with this zone and all associated
	 * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
	 * more reference outstanding -- the reference we didn't release in
	 * nfs4_renew_lease_thread().
	 *
	 * Here we need to run through the global nfs4_server_lst as we need to
	 * deal with nfs4_server_ts without programs, as they also have threads
	 * created for them, and so have outstanding references that we need to
	 * release.
	 */
	freelist.forw = &freelist;
	freelist.back = &freelist;
	mutex_enter(&nfs4_server_lst_lock);
	sp = nfs4_server_lst.forw;
	while (sp != &nfs4_server_lst) {
		next = sp->forw;
		if (sp->zoneid == zoneid) {
			remque(sp);
			insque(sp, &freelist);
		}
		sp = next;
	}
	mutex_exit(&nfs4_server_lst_lock);

	sp = freelist.forw;
	while (sp != &freelist) {
		next = sp->forw;
		nfs4_server_rele(sp);	/* free the list's reference */
		sp = next;
	}

#ifdef DEBUG
	for (i = 0; i < nfs4_num_prognums; i++) {
		ASSERT(ncg->nfs4prog2server[i] == NULL);
	}
#endif
	kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
	    sizeof (struct nfs4_server *));

	mutex_enter(&ncg->nfs4_cb_lock);
	while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
		list_remove(&ncg->nfs4_cb_ports, p);
		kmem_free(p, sizeof (*p));
	}
	list_destroy(&ncg->nfs4_cb_ports);
	mutex_destroy(&ncg->nfs4_cb_lock);
	list_destroy(&ncg->nfs4_dlist);
	mutex_destroy(&ncg->nfs4_dlist_lock);
	kmem_free(ncg, sizeof (*ncg));
}

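/*
 * Global initialization: build the SVC callout table covering the
 * range of callback program numbers, compute the cb_getattr reply
 * buffer size, and register the per-zone init/shutdown/fini callbacks.
 */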
void
nfs4_callback_init(void)
{
	int i;
	SVC_CALLOUT *nfs4_cb_sc;

	/* initialize the callback table */
	nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
	    sizeof (SVC_CALLOUT), KM_SLEEP);

	for (i = 0; i < nfs4_num_prognums; i++) {
		nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
		nfs4_cb_sc[i].sc_versmin = NFS_CB;
		nfs4_cb_sc[i].sc_versmax = NFS_CB;
		nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
	}

	nfs4_cb_sct.sct_size = nfs4_num_prognums;
	nfs4_cb_sct.sct_free = FALSE;
	nfs4_cb_sct.sct_sc = nfs4_cb_sc;

	/*
	 * Compute max bytes required for dynamically allocated parts
	 * of cb_getattr reply.  Only size and change are supported now.
	 * If CB_GETATTR is changed to reply with additional attrs,
	 * additional sizes must be added below.
	 *
	 * fattr4_change + fattr4_size == uint64_t + uint64_t
	 */
	cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;

	zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
	    nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
}

void
nfs4_callback_fini(void)
{
}

/*
 * NB: This function can be called from the *wrong* zone (i.e., the zone
 * that 'rp' belongs to and the caller's zone may not be the same).  This
 * can happen if the zone is going away and we get called from
 * nfs4_async_inactive().  In this case the globals will be NULL and we
 * won't update the counters, which doesn't matter as the zone is going
 * away anyhow.
 */
static void
nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
	struct nfs4_callback_globals *ncg)
{
	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
	boolean_t need_rele = B_FALSE;

	/*
	 * Caller must be holding mi_recovlock in read mode
	 * to call here.  This is provided by start_op.
	 * Delegation management requires grabbing s_lock
	 * first and then r_statev4_lock.
	 */

	if (np == NULL) {
		np = find_nfs4_server_all(mi, 1);
		if (np == NULL)
			return;
		need_rele = B_TRUE;
	} else {
		mutex_enter(&np->s_lock);
	}

	mutex_enter(&rp->r_statev4_lock);

	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&np->s_lock);
		if (need_rele)
			nfs4_server_rele(np);
		return;
	}

	/*
	 * Free the cred originally held when
	 * the delegation was granted.  Caller must
	 * hold this cred if it wants to use it after
	 * this call.
	 */
	crfree(rp->r_deleg_cred);
	rp->r_deleg_cred = NULL;
	rp->r_deleg_type = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recall = FALSE;
	rp->r_deleg_return_pending = FALSE;

	/*
	 * Remove the rnode from the server's list and
	 * update the ref counts.
	 */
	list_remove(&np->s_deleg_list, rp);
	mutex_exit(&rp->r_statev4_lock);
	nfs4_dec_state_ref_count_nolock(np, mi);
	mutex_exit(&np->s_lock);
	/* removing the list node releases a reference */
	nfs4_server_rele(np);
	if (need_rele)
		nfs4_server_rele(np);
	if (ncg != NULL)
		ncg->nfs4_callback_stats.delegations.value.ui64--;
}

void
nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
{
	struct nfs4_callback_globals *ncg;

	if (np != NULL) {
		ncg = np->zone_globals;
	} else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
		ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
		ASSERT(ncg != NULL);
	} else {
		/*
		 * Request coming from the wrong zone.
		 */
		ASSERT(getzoneid() == GLOBAL_ZONEID);
		ncg = NULL;
	}

	nfs4delegreturn_cleanup_impl(rp, np, ncg);
}

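/*
 * Build a lost request structure for a DELEGRETURN that failed with an
 * error the recovery framework must redrive; otherwise mark the
 * request as not lost (lr_op == 0).
 */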
static void
nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
	cred_t *cr, vnode_t *vp)
{
	if (error != ETIMEDOUT && error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
		lost_rqstp->lr_op = 0;
		return;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4delegreturn_save_lost_rqst: error %d", error));

	lost_rqstp->lr_op = OP_DELEGRETURN;
	/*
	 * The vp is held and rele'd via the recovery code.
	 * See nfs4_save_lost_rqst.
	 */
	lost_rqstp->lr_vp = vp;
	lost_rqstp->lr_dvp = NULL;
	lost_rqstp->lr_oop = NULL;
	lost_rqstp->lr_osp = NULL;
	lost_rqstp->lr_lop = NULL;
	lost_rqstp->lr_cr = cr;
	lost_rqstp->lr_flk = NULL;
	lost_rqstp->lr_putfirst = FALSE;
}

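/*
 * Do the over-the-wire DELEGRETURN: a compound of PUTFH, GETATTR and
 * DELEGRETURN.  The GETATTR reply is used to refresh the attribute
 * cache before the delegation is given up.
 */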
static void
nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argops[3];
	nfs4_ga_res_t *garp = NULL;
	hrtime_t t;
	int numops;
	int doqueue = 1;

	args.ctag = TAG_DELEGRETURN;

	numops = 3;		/* PUTFH, GETATTR, DELEGRETURN */

	args.array = argops;
	args.array_len = numops;

	argops[0].argop = OP_CPUTFH;
	argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argops[1].argop = OP_GETATTR;
	argops[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));

	argops[2].argop = OP_DELEGRETURN;
	argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
	    rp->r_deleg_stateid;

	t = gethrtime();
	rfs4call(VTOMI4(RTOV4(rp)), &args, &res, cr, &doqueue, 0, ep);

	if (ep->error)
		return;

	if (res.status == NFS4_OK) {
		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
		nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}

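/*
 * Return the delegation for this rnode, driving recovery (or building
 * a lost request) as needed, and clean up the client's delegation
 * state on success.
 */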
int
nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
	struct nfs4_callback_globals *ncg)
{
	vnode_t *vp = RTOV4(rp);
	mntinfo4_t *mi = VTOMI4(vp);
	nfs4_lost_rqst_t lost_rqst;
	nfs4_recov_state_t recov_state;
	bool_t needrecov = FALSE, recovonly, done = FALSE;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ncg->nfs4_callback_stats.delegreturn.value.ui64++;

	while (!done) {
		e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
		    &recov_state, &recovonly);

		if (e.error) {
			if (flags & NFS4_DR_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, 0);
				nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
				nfs_rw_exit(&mi->mi_recovlock);
			}
			break;
		}

		/*
		 * Check to see if the delegation has already been
		 * returned by the recovery thread.  The state of
		 * the delegation cannot change at this point due
		 * to start_fop and the r_deleg_recall_lock.
		 */
		if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
			e.error = 0;
			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
			break;
		}

		if (recovonly) {
			/*
			 * Delegation will be returned via the
			 * recovery framework.  Build a lost request
			 * structure, start recovery and get out.
			 */
			nfs4_error_init(&e, EINTR);
			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
			    cr, vp);
			(void) nfs4_start_recovery(&e, mi, vp,
			    NULL, &rp->r_deleg_stateid,
			    lost_rqst.lr_op == OP_DELEGRETURN ?
			    &lost_rqst : NULL, OP_DELEGRETURN, NULL,
			    NULL, NULL);
			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
			break;
		}

		nfs4delegreturn_otw(rp, cr, &e);

		/*
		 * Ignore some errors on delegreturn; no point in marking
		 * the file dead on a state destroying operation.
		 */
		if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
		    e.stat == NFS4ERR_BADHANDLE ||
		    e.stat == NFS4ERR_STALE))
			needrecov = FALSE;
		else
			needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);

		if (needrecov) {
			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
			    cr, vp);
			(void) nfs4_start_recovery(&e, mi, vp,
			    NULL, &rp->r_deleg_stateid,
			    lost_rqst.lr_op == OP_DELEGRETURN ?
			    &lost_rqst : NULL, OP_DELEGRETURN, NULL,
			    NULL, NULL);
		} else {
			nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
			done = TRUE;
		}

		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
	}
	return (e.error);
}

/*
 * nfs4_resend_delegreturn - used to drive the delegreturn
 * operation via the recovery thread.
 */
void
nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
	nfs4_server_t *np)
{
	rnode4_t *rp = VTOR4(lorp->lr_vp);

	/* If the file failed recovery, just quit. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		ep->error = EIO;
	}
	mutex_exit(&rp->r_statelock);

	if (!ep->error)
		nfs4delegreturn_otw(rp, lorp->lr_cr, ep);

	/*
	 * If recovery is now needed, then return the error
	 * and status and let the recovery thread handle it,
	 * including re-driving another delegreturn.  Otherwise,
	 * just give up and clean up the delegation.
	 */
	if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
		return;

	if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
		nfs4delegreturn_cleanup(rp, np);

	nfs4_error_zinit(ep);
}

/*
 * nfs4delegreturn - general function to return a delegation.
 *
 * NFS4_DR_FORCE - return the delegation even if start_op fails
 * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
 * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
 * NFS4_DR_DID_OP - calling function already did nfs4_start_op
 * NFS4_DR_RECALL - delegreturn initiated via CB_RECALL
 * NFS4_DR_REOPEN - do file reopens, if applicable
 */
static int
nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
{
	int error = 0;
	cred_t *cr = NULL;
	vnode_t *vp;
	bool_t needrecov = FALSE;
	bool_t rw_entered = FALSE;
	bool_t do_reopen;

	vp = RTOV4(rp);

	/*
	 * If NFS4_DR_DISCARD is set by itself, take a short-cut and
	 * discard without doing an otw DELEGRETURN.  This may only be used
	 * by the recovery thread because it bypasses the synchronization
	 * with r_deleg_recall_lock and mi->mi_recovlock.
	 */
	if (flags == NFS4_DR_DISCARD) {
		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
		return (0);
	}

	if (flags & NFS4_DR_DID_OP) {
		/*
		 * Caller had already done start_op, which means the
		 * r_deleg_recall_lock is already held in READ mode
		 * so we cannot take it in write mode.  Return the
		 * delegation asynchronously.
		 *
		 * Remove the NFS4_DR_DID_OP flag so we don't
		 * get stuck looping through here.
		 */
		VN_HOLD(vp);
		nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
		return (0);
	}

	/*
	 * Verify we still have a delegation and crhold the credential.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		goto out;
	}
	cr = rp->r_deleg_cred;
	ASSERT(cr != NULL);
	crhold(cr);
	mutex_exit(&rp->r_statev4_lock);

	/*
	 * Push the modified data back to the server synchronously
	 * before doing DELEGRETURN.
	 */
	if (flags & NFS4_DR_PUSH)
		(void) VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);

	/*
	 * Take r_deleg_recall_lock in WRITE mode; this will prevent
	 * nfs4_is_otw_open_necessary from trying to use the delegation
	 * while the DELEGRETURN is in progress.
	 */
	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);

	rw_entered = TRUE;

	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
		goto out;

	if (flags & NFS4_DR_REOPEN) {
		/*
		 * If R4RECOVERRP is already set, then skip re-opening
		 * the delegation open streams and go straight to doing
		 * delegreturn.  (XXX if the file has failed recovery, then the
		 * delegreturn attempt is likely to be futile.)
		 */
		mutex_enter(&rp->r_statelock);
		do_reopen = !(rp->r_flags & R4RECOVERRP);
		mutex_exit(&rp->r_statelock);

		if (do_reopen) {
			error = deleg_reopen(vp, &needrecov, ncg, flags);
			if (error != 0) {
				if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
				    == 0)
					goto out;
			} else if (needrecov) {
				if ((flags & NFS4_DR_FORCE) == 0)
					goto out;
			}
		}
	}

	if (flags & NFS4_DR_DISCARD) {
		mntinfo4_t *mi = VTOMI4(RTOV4(rp));

		mutex_enter(&rp->r_statelock);
		/*
		 * deleg_return_pending is cleared inside of delegation_accept
		 * when a delegation is accepted.  If this flag has been
		 * cleared, then a new delegation has overwritten the one we
		 * were about to throw away.
		 */
		if (!rp->r_deleg_return_pending) {
			mutex_exit(&rp->r_statelock);
			goto out;
		}
		mutex_exit(&rp->r_statelock);
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
		nfs_rw_exit(&mi->mi_recovlock);
	} else {
		error = nfs4_do_delegreturn(rp, flags, cr, ncg);
	}

out:
	if (cr)
		crfree(cr);
	if (rw_entered)
		nfs_rw_exit(&rp->r_deleg_recall_lock);
	return (error);
}

int
nfs4delegreturn(rnode4_t *rp, int flags)
{
	struct nfs4_callback_globals *ncg;

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	return (nfs4delegreturn_impl(rp, flags, ncg));
}

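/*
 * Return the delegation asynchronously by handing the work off to
 * nfs4delegreturn_thread.
 */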
void
nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
{
	struct cb_recall_pass *pp;

	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
	pp->rp = rp;
	pp->flags = flags;
	pp->truncate = trunc;

	/*
	 * Fire up a thread to do the actual delegreturn.  The caller
	 * must guarantee that the rnode doesn't vanish (by calling
	 * VN_HOLD).
	 */

	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
	    minclsyspri);
}

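/*
 * Async thread that finds the nfs4_server by callback program number
 * and returns every delegation on its s_deleg_list.
 */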
static void
delegreturn_all_thread(rpcprog_t *pp)
{
	nfs4_server_t *np;
	bool_t found = FALSE;
	rpcprog_t prog;
	rnode4_t *rp;
	vnode_t *vp;
	zoneid_t zoneid = getzoneid();
	struct nfs4_callback_globals *ncg;

	NFS4_DEBUG(nfs4_drat_debug,
	    (CE_NOTE, "delegreturn_all_thread: prog %d\n", *pp));

	prog = *pp;
	kmem_free(pp, sizeof (*pp));
	pp = NULL;

	mutex_enter(&nfs4_server_lst_lock);
	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
		if (np->zoneid == zoneid && np->s_program == prog) {
			mutex_enter(&np->s_lock);
			found = TRUE;
			break;
		}
	}
	mutex_exit(&nfs4_server_lst_lock);

	/*
	 * It's possible that the nfs4_server which was using this
	 * program number has vanished since this thread is async.
	 * If so, just return.  Your work here is finished, my friend.
	 */
	if (!found)
		goto out;

	ncg = np->zone_globals;
	while ((rp = list_head(&np->s_deleg_list)) != NULL) {
		vp = RTOV4(rp);
		VN_HOLD(vp);
		mutex_exit(&np->s_lock);
		(void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
		    ncg);
		VN_RELE(vp);

		/* retake the s_lock for next trip through the loop */
		mutex_enter(&np->s_lock);
	}
	mutex_exit(&np->s_lock);
out:
	NFS4_DEBUG(nfs4_drat_debug,
	    (CE_NOTE, "delegreturn_all_thread: complete\n"));
	zthread_exit();
}

void
nfs4_delegreturn_all(nfs4_server_t *sp)
{
	rpcprog_t pro, *pp;

	mutex_enter(&sp->s_lock);

	/* Check to see if the delegation list is empty */

	if (list_head(&sp->s_deleg_list) == NULL) {
		mutex_exit(&sp->s_lock);
		return;
	}
	/*
	 * Grab the program number; the async thread will use this
	 * to find the nfs4_server.
	 */
	pro = sp->s_program;
	mutex_exit(&sp->s_lock);
	pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
	*pp = pro;
	(void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
	    minclsyspri);
}

/*
 * Discard any delegations
 *
 * Iterate over the server's s_deleg_list and
 * for matching mount-point rnodes discard
 * the delegation.
 */
void
nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
{
	rnode4_t *rp, *next;
	mntinfo4_t *r_mi;
	struct nfs4_callback_globals *ncg;

	ASSERT(mutex_owned(&sp->s_lock));
	ncg = sp->zone_globals;

	for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
		r_mi = VTOMI4(RTOV4(rp));
		next = list_next(&sp->s_deleg_list, rp);

		if (r_mi != mi) {
			/*
			 * Skip if this rnode is not on the
			 * same mount-point
			 */
			continue;
		}

		ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);

#ifdef DEBUG
		if (nfs4_client_recov_debug) {
			zprintf(getzoneid(),
			    "nfs4_deleg_discard: matched rnode %p "
			    "-- discarding delegation\n", (void *)rp);
		}
#endif
		mutex_enter(&rp->r_statev4_lock);
		/*
		 * Free the cred originally held when the delegation
		 * was granted.  Also need to decrement the refcnt
		 * on this server for each delegation we discard
		 */
		if (rp->r_deleg_cred)
			crfree(rp->r_deleg_cred);
		rp->r_deleg_cred = NULL;
		rp->r_deleg_type = OPEN_DELEGATE_NONE;
		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
		rp->r_deleg_needs_recall = FALSE;
		ASSERT(sp->s_refcnt > 1);
		sp->s_refcnt--;
		list_remove(&sp->s_deleg_list, rp);
		mutex_exit(&rp->r_statev4_lock);
		nfs4_dec_state_ref_count_nolock(sp, mi);
		ncg->nfs4_callback_stats.delegations.value.ui64--;
	}
}

/*
 * Reopen any open streams that were covered by the given file's
 * delegation.
 * Returns zero or an errno value.  If there was no error, *recovp
 * indicates whether recovery was initiated.
 */

static int
deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
	int flags)
{
	nfs4_open_stream_t *osp;
	nfs4_recov_state_t recov_state;
	bool_t needrecov = FALSE;
	mntinfo4_t *mi;
	rnode4_t *rp;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	int claimnull;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

retry:
	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
		return (e.error);
	}

	/*
	 * If we mean to discard the delegation, it must be BAD, so don't
	 * use it when doing the reopen or it will fail too.
	 */
	claimnull = (flags & NFS4_DR_DISCARD);
	/*
	 * Loop through the open streams for this rnode to find
	 * all of the ones created using the delegation state ID.
	 * Each of these needs to be re-opened.
	 */

	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {

		if (claimnull) {
			nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
		} else {
			ncg->nfs4_callback_stats.claim_cur.value.ui64++;

			nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
			    FALSE);
			if (e.error == 0 && e.stat == NFS4_OK)
				ncg->nfs4_callback_stats.
				    claim_cur_ok.value.ui64++;
		}

		if (e.error == EAGAIN) {
			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
			goto retry;
		}

		/*
		 * If the error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR,
		 * then recovery has already been started inside of
		 * nfs4_reopen.
		 */
		if (e.error == EINTR || e.error == ETIMEDOUT ||
		    NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
			open_stream_rele(osp, rp);
			break;
		}

		needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);

		if (e.error != 0 && !needrecov) {
			/*
			 * Recovery is not possible, but don't give up yet;
			 * we'd still like to do delegreturn after
			 * reopening as many streams as possible.
			 * Continue processing the open streams.
			 */

			ncg->nfs4_callback_stats.recall_failed.value.ui64++;

		} else if (needrecov) {
			/*
			 * Start recovery and bail out.  The recovery
			 * thread will take it from here.
			 */
			(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
			    NULL, OP_OPEN, NULL, NULL, NULL);
			open_stream_rele(osp, rp);
			*recovp = TRUE;
			break;
		}

		open_stream_rele(osp, rp);
	}

	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);

	return (e.error);
}

/*
 * get_next_deleg_stream - returns the next open stream which
 * represents a delegation for this rnode.  In order to assure
 * forward progress, the caller must guarantee that each open
 * stream returned is changed so that a future call won't return
 * it again.
 *
 * There are several ways for the open stream to change.  If the open
 * stream is !os_delegation, then we aren't interested in it.  Also, if
 * either os_failed_reopen or !os_valid, then don't return the osp.
 *
 * If claimnull is false (doing reopen CLAIM_DELEGATE_CUR) then return
 * the osp if it is an os_delegation open stream.  Also, if the rnode still
 * has r_deleg_return_pending, then return the os_delegation osp.  Lastly,
 * if the rnode's r_deleg_stateid is different from the osp's open_stateid,
 * then return the osp.
 *
 * We have already taken the 'r_deleg_recall_lock' as WRITER, which
 * prevents new OPENs from going OTW (as start_fop takes this
 * lock in READ mode); thus, no new open streams can be created
 * (which inherently means no new delegation open streams are
 * being created).
 */

static nfs4_open_stream_t *
get_next_deleg_stream(rnode4_t *rp, int claimnull)
{
	nfs4_open_stream_t	*osp;

	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));

	/*
	 * Search through the list of open streams looking for
	 * one that was created while holding the delegation.
	 */
	mutex_enter(&rp->r_os_lock);
	for (osp = list_head(&rp->r_open_streams); osp != NULL;
	    osp = list_next(&rp->r_open_streams, osp)) {
		mutex_enter(&osp->os_sync_lock);
		if (!osp->os_delegation || osp->os_failed_reopen ||
		    !osp->os_valid) {
			mutex_exit(&osp->os_sync_lock);
			continue;
		}
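		/*
		 * Return this stream if we are reopening with
		 * CLAIM_DELEGATE_CUR, if a delegreturn is already
		 * pending, or if the stream's open stateid no longer
		 * matches the delegation stateid.
		 */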
		if (!claimnull || rp->r_deleg_return_pending ||
		    !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
			osp->os_ref_count++;
			mutex_exit(&osp->os_sync_lock);
			mutex_exit(&rp->r_os_lock);
			return (osp);
		}
		mutex_exit(&osp->os_sync_lock);
	}
	mutex_exit(&rp->r_os_lock);

	return (NULL);
}

static void
nfs4delegreturn_thread(struct cb_recall_pass *args)
{
	rnode4_t *rp;
	vnode_t *vp;
	cred_t *cr;
	int dtype, error, flags;
	bool_t rdirty, rip;
	kmutex_t cpr_lock;
	callb_cpr_t cpr_info;
	struct nfs4_callback_globals *ncg;

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
	    "nfsv4delegRtn");
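	/*
	 * The CPR registration above lets the checkpoint/resume
	 * framework account for this kernel thread while it runs.
	 */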

	rp = args->rp;
	vp = RTOV4(rp);

	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		goto out;
	}
	mutex_exit(&rp->r_statev4_lock);

	/*
	 * Take the read-write lock in read mode to prevent other
	 * threads from modifying the data during the recall.  This
	 * doesn't affect mmappers.
	 */
	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);

	/* Proceed with delegreturn */

	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		nfs_rw_exit(&rp->r_rwlock);
		goto out;
	}
	dtype = rp->r_deleg_type;
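	/*
	 * Hold the delegation credential so it remains valid after
	 * r_statev4_lock is dropped; it is used below for pushing
	 * pages and is freed before the thread exits.
	 */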
	cr = rp->r_deleg_cred;
	ASSERT(cr != NULL);
	crhold(cr);
	mutex_exit(&rp->r_statev4_lock);

	flags = args->flags;

	/*
	 * If the file is being truncated at the server, then throw
	 * away all of the pages; it doesn't matter what flavor of
	 * delegation we have.
	 */

	if (args->truncate) {
		ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
		nfs4_invalidate_pages(vp, 0, cr);
	} else if (dtype == OPEN_DELEGATE_WRITE) {

		mutex_enter(&rp->r_statelock);
		rdirty = rp->r_flags & R4DIRTY;
		mutex_exit(&rp->r_statelock);

		if (rdirty) {
			error = VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);

			if (error)
				CB_WARN1("nfs4delegreturn_thread:"
				" VOP_PUTPAGE: %d\n", error);
		}
		/* Turn off NFS4_DR_PUSH because the pages were just pushed. */
		flags &= ~NFS4_DR_PUSH;
	}

	mutex_enter(&rp->r_statelock);
	rip = rp->r_flags & R4RECOVERRP;
	mutex_exit(&rp->r_statelock);

	/* If a failed recovery is indicated, discard the pages */

	if (rip) {

		error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr, NULL);

		if (error)
			CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
			    error);
	}

	/*
	 * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
	 * NFS4_DR_DID_OP, which just calls nfs4delegreturn_async again.
	 */
	flags &= ~NFS4_DR_DID_OP;

	(void) nfs4delegreturn_impl(rp, flags, ncg);

	nfs_rw_exit(&rp->r_rwlock);
	crfree(cr);
out:
	kmem_free(args, sizeof (struct cb_recall_pass));
	VN_RELE(vp);
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}

/*
 * This function assumes that its caller is either doing recovery
 * (and therefore cannot call nfs4_start_op) or has already called
 * nfs4_start_op().
 */
void
nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim, OPEN4res *res,
	nfs4_ga_res_t *garp, cred_t *cr)
{
	open_read_delegation4 *orp;
	open_write_delegation4 *owp;
	nfs4_server_t *np;
	bool_t already = FALSE;
	bool_t recall = FALSE;
	bool_t valid_garp = TRUE;
	bool_t delegation_granted = FALSE;
	bool_t dr_needed = FALSE;
	bool_t recov;
	int dr_flags = 0;
	long mapcnt;
	uint_t rflag;
	mntinfo4_t *mi;
	struct nfs4_callback_globals *ncg;
	open_delegation_type4 odt;

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	mi = VTOMI4(RTOV4(rp));

	/*
	 * Accept a delegation granted to the client via an OPEN.
	 * Set the delegation fields in the rnode and insert the
	 * rnode onto the list anchored in the nfs4_server_t.  The
	 * proper locking order requires the nfs4_server_t first,
	 * even though it may not be needed in all cases.
	 *
	 * NB: find_nfs4_server returns with s_lock held.
	 */

	if ((np = find_nfs4_server(mi)) == NULL)
		return;

	/* grab the statelock too, for examining r_mapcnt */
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);

	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
	    rp->r_deleg_type == OPEN_DELEGATE_WRITE)
		already = TRUE;

	odt = res->delegation.delegation_type;

	if (odt == OPEN_DELEGATE_READ) {

		rp->r_deleg_type = res->delegation.delegation_type;
		orp = &res->delegation.open_delegation4_u.read;
		rp->r_deleg_stateid = orp->stateid;
		rp->r_deleg_perms = orp->permissions;
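		/*
		 * For a CLAIM_PREVIOUS open (reboot recovery) the
		 * server may set the recall flag, indicating that it
		 * intends to recall the delegation right away; note
		 * that a delegreturn will be needed.
		 */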
		if (claim == CLAIM_PREVIOUS)
			if ((recall = orp->recall) != 0)
				dr_needed = TRUE;

		delegation_granted = TRUE;

		ncg->nfs4_callback_stats.delegations.value.ui64++;
		ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;

	} else if (odt == OPEN_DELEGATE_WRITE) {

		rp->r_deleg_type = res->delegation.delegation_type;
		owp = &res->delegation.open_delegation4_u.write;
		rp->r_deleg_stateid = owp->stateid;
		rp->r_deleg_perms = owp->permissions;
		rp->r_deleg_limit = owp->space_limit;
		if (claim == CLAIM_PREVIOUS)
			if ((recall = owp->recall) != 0)
				dr_needed = TRUE;

		delegation_granted = TRUE;

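		/*
		 * Without a valid change attribute there is no way to
		 * later verify that cached data is still current, so
		 * !valid_garp forces the delegation to be returned
		 * below.
		 */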
		if (garp == NULL || !garp->n4g_change_valid) {
			valid_garp = FALSE;
			rp->r_deleg_change = 0;
			rp->r_deleg_change_grant = 0;
		} else {
			rp->r_deleg_change = garp->n4g_change;
			rp->r_deleg_change_grant = garp->n4g_change;
		}
		mapcnt = rp->r_mapcnt;
		rflag = rp->r_flags;

		/*
		 * Update the delegation change attribute if there are
		 * mappers for the file or if the file is dirty.  This
		 * might be the case during recovery after a server
		 * reboot.
		 */
		if (mapcnt > 0 || rflag & R4DIRTY)
			rp->r_deleg_change++;

		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
		    "nfs4_delegation_accept: r_deleg_change: 0x%x\n",
		    (int)(rp->r_deleg_change >> 32)));
		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
		    "nfs4_delegation_accept: r_deleg_change_grant: 0x%x\n",
		    (int)(rp->r_deleg_change_grant >> 32)));

		ncg->nfs4_callback_stats.delegations.value.ui64++;
		ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
	} else if (already) {
		/*
		 * No delegation was granted.  If the rnode currently
		 * has one, consider it tainted and return it.
		 */
		dr_needed = TRUE;
	}

	if (delegation_granted) {
		/* Add the rnode to the list. */
		if (!already) {
			crhold(cr);
			rp->r_deleg_cred = cr;

			ASSERT(mutex_owned(&np->s_lock));
			list_insert_head(&np->s_deleg_list, rp);
			/* added list node gets a reference */
			np->s_refcnt++;
			nfs4_inc_state_ref_count_nolock(np, mi);
		}
		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	}

	/*
	 * We've now safely accepted the delegation, if any.  Drop the
	 * locks and figure out what post-processing is needed.  We'd
	 * like to retain r_statev4_lock, but nfs4_server_rele takes
	 * s_lock which would be a lock ordering violation.
	 */
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);
	mutex_exit(&np->s_lock);
	nfs4_server_rele(np);

	/*
	 * Check to see if we are in recovery.  Remember that
	 * this function is protected by start_op, so a recovery
	 * cannot begin until we are out of here.
	 */
	mutex_enter(&mi->mi_lock);
	recov = mi->mi_recovflags & MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);

	mutex_enter(&rp->r_statev4_lock);

	if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
		dr_needed = TRUE;

	if (dr_needed && rp->r_deleg_return_pending == FALSE) {
		if (recov) {
			/*
			 * We cannot call delegreturn from inside
			 * of recovery or VOP_PUTPAGE will hang
			 * due to nfs4_start_fop call in
			 * nfs4write.  Use dlistadd to add the
			 * rnode to the list of rnodes needing
			 * cleaning.  We do not need to do reopen
			 * here because recov_openfiles will do it.
			 * In the non-recall case, just discard the
			 * delegation as it is no longer valid.
			 */
			if (recall)
				dr_flags = NFS4_DR_PUSH;
			else
				dr_flags = NFS4_DR_PUSH|NFS4_DR_DISCARD;

			nfs4_dlistadd(rp, ncg, dr_flags);
			dr_flags = 0;
		} else {
			/*
			 * Push the modified data back to the server,
			 * reopen any delegation open streams, and return
			 * the delegation.  Drop the statev4_lock first!
			 */
			dr_flags = NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN;
		}
	}
	mutex_exit(&rp->r_statev4_lock);
	if (dr_flags)
		(void) nfs4delegreturn_impl(rp, dr_flags, ncg);
}

/*
 * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
 * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
 * or BADSEQID and the recovery code is unable to recover.  Push any
 * dirty data back to the server and return the delegation (if any).
 */

void
nfs4delegabandon(rnode4_t *rp)
{
	vnode_t *vp;
	struct cb_recall_pass *pp;
	open_delegation_type4 dt;

	mutex_enter(&rp->r_statev4_lock);
	dt = rp->r_deleg_type;
	mutex_exit(&rp->r_statev4_lock);

	if (dt == OPEN_DELEGATE_NONE)
		return;

	vp = RTOV4(rp);
	VN_HOLD(vp);
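	/* The delegreturn thread does the matching VN_RELE when done. */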

	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
	pp->rp = rp;
	/*
	 * Recovery on the file has failed and we want to return
	 * the delegation.  We don't want to reopen files;
	 * nfs4delegreturn_thread() will figure out what to do about
	 * the data.  The only thing to do is attempt to return
	 * the delegation.
	 */
	pp->flags = 0;
	pp->truncate = FALSE;

	/*
	 * Fire up a thread to do the delegreturn; this is
	 * necessary because we could be inside a GETPAGE or
	 * PUTPAGE and we cannot do another one.
	 */

	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
	    minclsyspri);
}

static int
wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
	int flg)
{
	rnode4_t *rp;
	int error = 0;

#ifdef lint
	op = op;	/* quiet lint; the op hint is currently unused */
#endif

	if (vp && vp->v_type == VREG) {
		rp = VTOR4(vp);

		/*
		 * Take r_deleg_recall_lock in read mode to synchronize
		 * with delegreturn.
		 */
		error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
		    RW_READER, INTR4(vp));

		if (error == 0)
			rsp->rs_flags |= flg;
	}
	return (error);
}

void
nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
{
	NFS4_DEBUG(nfs4_recall_debug,
	    (CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
	    (void *)vp1, (void *)vp2));

	if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
		nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
	if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
		nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
}

int
wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
	nfs4_recov_state_t *rsp)
{
	int error;

	NFS4_DEBUG(nfs4_recall_debug,
	    (CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
	    (void *)vp1, (void *)vp2));

	rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);

	if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
		return (error);

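	/*
	 * If the second lock cannot be taken, back out the first so
	 * the caller never returns holding only part of the set.
	 */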
	if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
	    != 0) {
		if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
			nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
			rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
		}

		return (error);
	}

	return (0);
}

/*
 * nfs4_dlistadd - Add this rnode to a list of rnodes to be
 * DELEGRETURN'd at the end of recovery.
 */

static void
nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
{
	struct nfs4_dnode *dp;

	ASSERT(mutex_owned(&rp->r_statev4_lock));
	/*
	 * Mark the delegation as having a return pending.
	 * This will prevent the use of the delegation stateID
	 * by read, write, setattr and open.
	 */
	rp->r_deleg_return_pending = TRUE;
	dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
	VN_HOLD(RTOV4(rp));
	dp->rnodep = rp;
	dp->flags = flags;
	mutex_enter(&ncg->nfs4_dlist_lock);
	list_insert_head(&ncg->nfs4_dlist, dp);
#ifdef	DEBUG
	ncg->nfs4_dlistadd_c++;
#endif
	mutex_exit(&ncg->nfs4_dlist_lock);
}

/*
 * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list
 * of files awaiting cleaning.  If the override_flags are non-zero,
 * use them rather than the flags that were set when the rnode was
 * added to the dlist.
 */
static void
nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
{
	rnode4_t *rp;
	struct nfs4_dnode *dp;
	int flags;

	ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);

	mutex_enter(&ncg->nfs4_dlist_lock);
	while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
#ifdef	DEBUG
		ncg->nfs4_dlistclean_c++;
#endif
		list_remove(&ncg->nfs4_dlist, dp);
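		/*
		 * Drop the list lock across the delegreturn below,
		 * which goes over the wire and may block.
		 */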
		mutex_exit(&ncg->nfs4_dlist_lock);
		rp = dp->rnodep;
		flags = (override_flags != 0) ? override_flags : dp->flags;
		kmem_free(dp, sizeof (*dp));
		(void) nfs4delegreturn_impl(rp, flags, ncg);
		VN_RELE(RTOV4(rp));
		mutex_enter(&ncg->nfs4_dlist_lock);
	}
	mutex_exit(&ncg->nfs4_dlist_lock);
}

void
nfs4_dlistclean(void)
{
	struct nfs4_callback_globals *ncg;

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	nfs4_dlistclean_impl(ncg, 0);
}