1/*
2 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1989, 1991, 1993, 1995
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 *    must display the following acknowledgement:
46 *	This product includes software developed by the University of
47 *	California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 *    may be used to endorse or promote products derived from this software
50 *    without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
66 */
67
68/*
69 * Socket operations for use by nfs
70 */
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/proc.h>
75#include <sys/signalvar.h>
76#include <sys/kauth.h>
77#include <sys/mount_internal.h>
78#include <sys/kernel.h>
79#include <sys/kpi_mbuf.h>
80#include <sys/malloc.h>
81#include <sys/vnode.h>
82#include <sys/domain.h>
83#include <sys/protosw.h>
84#include <sys/socket.h>
85#include <sys/syslog.h>
86#include <sys/tprintf.h>
87#include <libkern/OSAtomic.h>
88
89#include <sys/time.h>
90#include <kern/clock.h>
91#include <kern/task.h>
92#include <kern/thread.h>
93#include <kern/thread_call.h>
94#include <sys/user.h>
95#include <sys/acct.h>
96
97#include <netinet/in.h>
98#include <netinet/tcp.h>
99
100#include <nfs/rpcv2.h>
101#include <nfs/krpc.h>
102#include <nfs/nfsproto.h>
103#include <nfs/nfs.h>
104#include <nfs/xdr_subs.h>
105#include <nfs/nfsm_subs.h>
106#include <nfs/nfs_gss.h>
107#include <nfs/nfsmount.h>
108#include <nfs/nfsnode.h>
109
110/* XXX */
111boolean_t	current_thread_aborted(void);
112kern_return_t	thread_terminate(thread_t);
113
114
115#if NFSSERVER
116int nfsrv_sock_max_rec_queue_length = 128; /* max # RPC records queued on (UDP) socket */
117
118int nfsrv_getstream(struct nfsrv_sock *,int);
119int nfsrv_getreq(struct nfsrv_descript *);
120extern int nfsv3_procid[NFS_NPROCS];
121#endif /* NFSSERVER */
122
123/*
124 * compare two sockaddr structures
125 */
126int
127nfs_sockaddr_cmp(struct sockaddr *sa1, struct sockaddr *sa2)
128{
129	if (!sa1)
130		return (-1);
131	if (!sa2)
132		return (1);
133	if (sa1->sa_family != sa2->sa_family)
134		return ((sa1->sa_family < sa2->sa_family) ? -1 : 1);
135	if (sa1->sa_len != sa2->sa_len)
136		return ((sa1->sa_len < sa2->sa_len) ? -1 : 1);
137	if (sa1->sa_family == AF_INET)
138		return (bcmp(&((struct sockaddr_in*)sa1)->sin_addr,
139			     &((struct sockaddr_in*)sa2)->sin_addr, sizeof(((struct sockaddr_in*)sa1)->sin_addr)));
140	if (sa1->sa_family == AF_INET6)
141		return (bcmp(&((struct sockaddr_in6*)sa1)->sin6_addr,
142			     &((struct sockaddr_in6*)sa2)->sin6_addr, sizeof(((struct sockaddr_in6*)sa1)->sin6_addr)));
143	return (-1);
144}
145
146#if NFSCLIENT
147
148int	nfs_reconnect(struct nfsmount *);
149int	nfs_connect_setup(struct nfsmount *);
150void	nfs_mount_sock_thread(void *, wait_result_t);
151void	nfs_udp_rcv(socket_t, void*, int);
152void	nfs_tcp_rcv(socket_t, void*, int);
153void	nfs_sock_poke(struct nfsmount *);
154void	nfs_request_match_reply(struct nfsmount *, mbuf_t);
155void	nfs_reqdequeue(struct nfsreq *);
156void	nfs_reqbusy(struct nfsreq *);
157struct nfsreq *nfs_reqnext(struct nfsreq *);
158int	nfs_wait_reply(struct nfsreq *);
159void	nfs_softterm(struct nfsreq *);
160int	nfs_can_squish(struct nfsmount *);
161int	nfs_is_squishy(struct nfsmount *);
162int	nfs_is_dead(int, struct nfsmount *);
163
164#ifdef NFS_SOCKET_DEBUGGING
165#define NFS_SOCK_DBG(X)	printf X
166#else
167#define NFS_SOCK_DBG(X)
168#endif
169
170/*
171 * Estimate rto for an nfs rpc sent via. an unreliable datagram.
172 * Use the mean and mean deviation of rtt for the appropriate type of rpc
173 * for the frequent rpcs and a default for the others.
174 * The justification for doing "other" this way is that these rpcs
175 * happen so infrequently that timer est. would probably be stale.
176 * Also, since many of these rpcs are
177 * non-idempotent, a conservative timeout is desired.
178 * getattr, lookup - A+2D
179 * read, write     - A+4D
180 * other	   - nm_timeo
181 */
182#define	NFS_RTO(n, t) \
183	((t) == 0 ? (n)->nm_timeo : \
184	 ((t) < 3 ? \
185	  (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
186	  ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
187#define	NFS_SRTT(r)	(r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
188#define	NFS_SDRTT(r)	(r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
189
190/*
191 * Defines which timer to use for the procnum.
192 * 0 - default
193 * 1 - getattr
194 * 2 - lookup
195 * 3 - read
196 * 4 - write
197 */
static int proct[NFS_NPROCS] = {	/* procnum -> RTT timer class (see table in comment above) */
	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
};
201
202/*
203 * There is a congestion window for outstanding rpcs maintained per mount
204 * point. The cwnd size is adjusted in roughly the way that:
205 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
206 * SIGCOMM '88". ACM, August 1988.
207 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
208 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
209 * of rpcs is in progress.
210 * (The sent count and cwnd are scaled for integer arith.)
211 * Variants of "slow start" were tried and were found to be too much of a
212 * performance hit (ave. rtt 3 times larger),
213 * I suspect due to the large rtt that nfs rpcs have.
214 */
215#define	NFS_CWNDSCALE	256
216#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };	/* exponential backoff multipliers; presumably indexed by retransmit count - usage is outside this chunk */
218
219/*
220 * Increment location index to next address/server/location.
221 */
/*
 * Increment location index to next address/server/location.
 *
 * Advances *nlip through the mount's (location, server, address) triples
 * in order, wrapping at each level.  If a full cycle finds no server with
 * any addresses, *nlip is left unchanged (so callers can't spin forever).
 */
void
nfs_location_next(struct nfs_fs_locations *nlp, struct nfs_location_index *nlip)
{
	/* work on local copies so the caller's index is untouched if we bail out */
	uint8_t loc = nlip->nli_loc;
	uint8_t serv = nlip->nli_serv;
	uint8_t addr = nlip->nli_addr;

	/* move to next address */
	addr++;
	if (addr >= nlp->nl_locations[loc]->nl_servers[serv]->ns_addrcount) {
		/* no more addresses on current server, go to first address of next server */
next_server:
		addr = 0;
		serv++;
		if (serv >= nlp->nl_locations[loc]->nl_servcount) {
			/* no more servers on current location, go to first server of next location */
			serv = 0;
			loc++;
			if (loc >= nlp->nl_numlocs)
				loc = 0; /* after last location, wrap back around to first location */
		}
	}
	/*
	 * It's possible for this next server to not have any addresses.
	 * Check for that here and go to the next server.
	 * But bail out if we've managed to come back around to the original
	 * location that was passed in. (That would mean no servers had any
	 * addresses.  And we don't want to spin here forever.)
	 */
	if ((loc == nlip->nli_loc) && (serv == nlip->nli_serv) && (addr == nlip->nli_addr))
		return;
	if (addr >= nlp->nl_locations[loc]->nl_servers[serv]->ns_addrcount)
		goto next_server;

	/* commit the advanced index back to the caller */
	nlip->nli_loc = loc;
	nlip->nli_serv = serv;
	nlip->nli_addr = addr;
}
260
261/*
262 * Compare two location indices.
263 */
264int
265nfs_location_index_cmp(struct nfs_location_index *nlip1, struct nfs_location_index *nlip2)
266{
267	if (nlip1->nli_loc != nlip2->nli_loc)
268		return (nlip1->nli_loc - nlip2->nli_loc);
269	if (nlip1->nli_serv != nlip2->nli_serv)
270		return (nlip1->nli_serv - nlip2->nli_serv);
271	return (nlip1->nli_addr - nlip2->nli_addr);
272}
273
274/*
275 * Get the mntfromname (or path portion only) for a given location.
276 */
277void
278nfs_location_mntfromname(struct nfs_fs_locations *locs, struct nfs_location_index idx, char *s, int size, int pathonly)
279{
280	struct nfs_fs_location *fsl = locs->nl_locations[idx.nli_loc];
281	char *p;
282	int cnt, i;
283
284	p = s;
285	if (!pathonly) {
286		cnt = snprintf(p, size, "%s:", fsl->nl_servers[idx.nli_serv]->ns_name);
287		p += cnt;
288		size -= cnt;
289	}
290	if (fsl->nl_path.np_compcount == 0) {
291		/* mounting root export on server */
292		if (size > 0) {
293			*p++ = '/';
294			*p++ = '\0';
295		}
296		return;
297	}
298	/* append each server path component */
299	for (i=0; (size > 0) && (i < (int)fsl->nl_path.np_compcount); i++) {
300		cnt = snprintf(p, size, "/%s", fsl->nl_path.np_components[i]);
301		p += cnt;
302		size -= cnt;
303	}
304}
305
306/*
307 * NFS client connect socket upcall.
308 * (Used only during socket connect/search.)
309 */
/*
 * NFS client connect socket upcall.
 * (Used only during socket connect/search.)
 *
 * Receives data from a candidate socket, parses the reply to the NULL
 * RPC "ping" that the search loop sent, and marks the socket VERIFIED
 * or DEAD accordingly, waking the search thread in either case.
 */
void
nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag)
{
	struct nfs_socket *nso = arg;
	size_t rcvlen;
	mbuf_t m;
	int error = 0, recv = 1;

	/*
	 * Still connecting: nothing to read yet, just poke the search thread.
	 * NOTE(review): this flag is tested without nso_lock held - presumably
	 * benign during the connect phase, but verify against the search loop.
	 */
	if (nso->nso_flags & NSO_CONNECTING) {
		NFS_SOCK_DBG(("nfs connect - socket %p upcall - connecting\n", nso));
		wakeup(nso->nso_wake);
		return;
	}

	lck_mtx_lock(&nso->nso_lock);
	/* bail if another upcall is active, we're tearing down, or no ping is outstanding */
	if ((nso->nso_flags & (NSO_UPCALL|NSO_DISCONNECTING|NSO_DEAD)) || !(nso->nso_flags & NSO_PINGING)) {
		NFS_SOCK_DBG(("nfs connect - socket %p upcall - nevermind\n", nso));
		lck_mtx_unlock(&nso->nso_lock);
		return;
	}
	NFS_SOCK_DBG(("nfs connect - socket %p upcall\n", nso));
	nso->nso_flags |= NSO_UPCALL;

	/* loop while we make error-free progress */
	while (!error && recv) {
		/* make sure we're still interested in this socket */
		if (nso->nso_flags & (NSO_DISCONNECTING|NSO_DEAD))
			break;
		/* drop the lock across the (possibly lengthy) socket receive */
		lck_mtx_unlock(&nso->nso_lock);
		m = NULL;
		if (nso->nso_sotype == SOCK_STREAM) {
			/* TCP: reassemble RPC record marking into a complete record */
			error = nfs_rpc_record_read(so, &nso->nso_rrs, MSG_DONTWAIT, &recv, &m);
		} else {
			/* UDP: each datagram is a complete message */
			rcvlen = 1000000;
			error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen);
			recv = m ? 1 : 0;
		}
		lck_mtx_lock(&nso->nso_lock);
		if (m) {
			/* match response with request */
			struct nfsm_chain nmrep;
			uint32_t reply = 0, rxid = 0, verf_type, verf_len;
			uint32_t reply_status, rejected_status, accepted_status;

			/* parse the RPC reply header: xid, direction, status */
			nfsm_chain_dissect_init(error, &nmrep, m);
			nfsm_chain_get_32(error, &nmrep, rxid);
			nfsm_chain_get_32(error, &nmrep, reply);
			if (!error && ((reply != RPC_REPLY) || (rxid != nso->nso_pingxid)))
				error = EBADRPC;
			nfsm_chain_get_32(error, &nmrep, reply_status);
			if (!error && (reply_status == RPC_MSGDENIED)) {
				nfsm_chain_get_32(error, &nmrep, rejected_status);
				if (!error)
					error = (rejected_status == RPC_MISMATCH) ? ERPCMISMATCH : EACCES;
			}
			nfsm_chain_get_32(error, &nmrep, verf_type); /* verifier flavor */
			nfsm_chain_get_32(error, &nmrep, verf_len); /* verifier length */
			nfsmout_if(error);
			if (verf_len)
				nfsm_chain_adv(error, &nmrep, nfsm_rndup(verf_len));
			nfsm_chain_get_32(error, &nmrep, accepted_status);
			nfsmout_if(error);
			/*
			 * If the server rejected our program version and we haven't
			 * settled on one yet, use the min/max versions it advertised
			 * to negotiate a version we both support.
			 */
			if ((accepted_status == RPC_PROGMISMATCH) && !nso->nso_version) {
				uint32_t minvers, maxvers;
				nfsm_chain_get_32(error, &nmrep, minvers);
				nfsm_chain_get_32(error, &nmrep, maxvers);
				nfsmout_if(error);
				if (nso->nso_protocol == PMAPPROG) {
					if ((minvers > RPCBVERS4) || (maxvers < PMAPVERS))
						error = EPROGMISMATCH;
					else if ((nso->nso_saddr->sa_family == AF_INET) &&
						 (PMAPVERS >= minvers) && (PMAPVERS <= maxvers))
						nso->nso_version = PMAPVERS;
					else if (nso->nso_saddr->sa_family == AF_INET6) {
						/* IPv6 requires rpcbind (v3/v4) rather than the old portmap protocol */
						if ((RPCBVERS4 >= minvers) && (RPCBVERS4 <= maxvers))
							nso->nso_version = RPCBVERS4;
						else if ((RPCBVERS3 >= minvers) && (RPCBVERS3 <= maxvers))
							nso->nso_version = RPCBVERS3;
					}
				} else if (nso->nso_protocol == NFS_PROG) {
					/* prefer NFSv3, then v2, then v4 */
					if ((minvers > NFS_VER4) || (maxvers < NFS_VER2))
						error = EPROGMISMATCH;
					else if ((NFS_VER3 >= minvers) && (NFS_VER3 <= maxvers))
						nso->nso_version = NFS_VER3;
					else if ((NFS_VER2 >= minvers) && (NFS_VER2 <= maxvers))
						nso->nso_version = NFS_VER2;
					else if ((NFS_VER4 >= minvers) && (NFS_VER4 <= maxvers))
						nso->nso_version = NFS_VER4;
				}
				if (!error && nso->nso_version)
					accepted_status = RPC_SUCCESS;
			}
			if (!error) {
				/* map the RPC accept status to an errno */
				switch (accepted_status) {
				case RPC_SUCCESS:
					error = 0;
					break;
				case RPC_PROGUNAVAIL:
					error = EPROGUNAVAIL;
					break;
				case RPC_PROGMISMATCH:
					error = EPROGMISMATCH;
					break;
				case RPC_PROCUNAVAIL:
					error = EPROCUNAVAIL;
					break;
				case RPC_GARBAGE:
					error = EBADRPC;
					break;
				case RPC_SYSTEM_ERR:
				default:
					error = EIO;
					break;
				}
			}
nfsmout:
			/* ping answered (one way or another): record the verdict */
			nso->nso_flags &= ~NSO_PINGING;
			if (error) {
				nso->nso_error = error;
				nso->nso_flags |= NSO_DEAD;
			} else {
				nso->nso_flags |= NSO_VERIFIED;
			}
			mbuf_freem(m);
			/* wake up search thread */
			wakeup(nso->nso_wake);
			break;
		}
	}

	nso->nso_flags &= ~NSO_UPCALL;
	if ((error != EWOULDBLOCK) && (error || !recv)) {
		/* problems with the socket... */
		nso->nso_error = error ? error : EPIPE;
		nso->nso_flags |= NSO_DEAD;
		wakeup(nso->nso_wake);
	}
	/* nfs_socket_destroy() may be sleeping for this upcall to finish */
	if (nso->nso_flags & NSO_DISCONNECTING)
		wakeup(&nso->nso_flags);
	lck_mtx_unlock(&nso->nso_lock);
}
451
452/*
453 * Create/initialize an nfs_socket structure.
454 */
/*
 * Create/initialize an nfs_socket structure.
 *
 * Allocates the structure and a copy of the server address (with the
 * given port stuffed in), creates the kernel socket, and optionally
 * binds it to a reserved (privileged) port.  On success *nsop holds
 * the new nfs_socket; on failure it is destroyed and an errno returned.
 */
int
nfs_socket_create(
	__unused struct nfsmount *nmp,
	struct sockaddr *sa,
	int sotype,
	in_port_t port,
	uint32_t protocol,
	uint32_t vers,
	int resvport,
	struct nfs_socket **nsop)
{
	struct nfs_socket *nso;
	struct timeval now;
	int error;
#ifdef NFS_SOCKET_DEBUGGING
	/* pretty-print the address for the debug messages below */
	char naddr[MAX_IPv6_STR_LEN];
	void *sinaddr;

	if (sa->sa_family == AF_INET)
		sinaddr = &((struct sockaddr_in*)sa)->sin_addr;
	else
		sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr;
	if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr)
		strlcpy(naddr, "<unknown>", sizeof(naddr));
#endif

	*nsop = NULL;

	/* Create the socket. */
	MALLOC(nso, struct nfs_socket *, sizeof(struct nfs_socket), M_TEMP, M_WAITOK|M_ZERO);
	if (nso)
		MALLOC(nso->nso_saddr, struct sockaddr *, sa->sa_len, M_SONAME, M_WAITOK|M_ZERO);
	if (!nso || !nso->nso_saddr) {
		/* either allocation failed; free whatever we did get */
		if (nso)
			FREE(nso, M_TEMP);
		return (ENOMEM);
	}
	lck_mtx_init(&nso->nso_lock, nfs_request_grp, LCK_ATTR_NULL);
	nso->nso_sotype = sotype;
	if (nso->nso_sotype == SOCK_STREAM)
		nfs_rpc_record_state_init(&nso->nso_rrs);
	microuptime(&now);
	nso->nso_timestamp = now.tv_sec;
	/* copy the address and override its port with the one requested */
	bcopy(sa, nso->nso_saddr, sa->sa_len);
	if (sa->sa_family == AF_INET)
		((struct sockaddr_in*)nso->nso_saddr)->sin_port = htons(port);
	else if (sa->sa_family == AF_INET6)
		((struct sockaddr_in6*)nso->nso_saddr)->sin6_port = htons(port);
	nso->nso_protocol = protocol;
	nso->nso_version = vers;

	error = sock_socket(sa->sa_family, nso->nso_sotype, 0, NULL, NULL, &nso->nso_so);

	/* Some servers require that the client port be a reserved port number. */
	if (!error && resvport && ((sa->sa_family == AF_INET) || (sa->sa_family == AF_INET6))) {
		struct sockaddr_storage ss;
		int level = (sa->sa_family == AF_INET) ? IPPROTO_IP : IPPROTO_IPV6;
		int optname = (sa->sa_family == AF_INET) ? IP_PORTRANGE : IPV6_PORTRANGE;
		int portrange = IP_PORTRANGE_LOW;

		/* ask the stack for a low (privileged) local port... */
		error = sock_setsockopt(nso->nso_so, level, optname, &portrange, sizeof(portrange));
		if (!error) {	/* bind now to check for failure */
			ss.ss_len = sa->sa_len;
			ss.ss_family = sa->sa_family;
			if (ss.ss_family == AF_INET) {
				((struct sockaddr_in*)&ss)->sin_addr.s_addr = INADDR_ANY;
				((struct sockaddr_in*)&ss)->sin_port = htons(0);
			} else if (ss.ss_family == AF_INET6) {
				((struct sockaddr_in6*)&ss)->sin6_addr = in6addr_any;
				((struct sockaddr_in6*)&ss)->sin6_port = htons(0);
			} else {
				error = EINVAL;
			}
			if (!error)
				error = sock_bind(nso->nso_so, (struct sockaddr*)&ss);
		}
	}

	if (error) {
		NFS_SOCK_DBG(("nfs connect %s error %d creating socket %p %s type %d%s port %d prot %d %d\n",
			vfs_statfs(nmp->nm_mountp)->f_mntfromname, error, nso, naddr, sotype,
			resvport ? "r" : "", port, protocol, vers));
		/* tears down everything set up above, including the socket if created */
		nfs_socket_destroy(nso);
	} else {
		NFS_SOCK_DBG(("nfs connect %s created socket %p %s type %d%s port %d prot %d %d\n",
			vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, naddr,
			sotype, resvport ? "r" : "", port, protocol, vers));
		*nsop = nso;
	}
	return (error);
}
546
547/*
548 * Destroy an nfs_socket structure.
549 */
550void
551nfs_socket_destroy(struct nfs_socket *nso)
552{
553	struct timespec ts = { 4, 0 };
554
555	lck_mtx_lock(&nso->nso_lock);
556	nso->nso_flags |= NSO_DISCONNECTING;
557	if (nso->nso_flags & NSO_UPCALL) /* give upcall a chance to complete */
558		msleep(&nso->nso_flags, &nso->nso_lock, PZERO-1, "nfswaitupcall", &ts);
559	lck_mtx_unlock(&nso->nso_lock);
560	sock_shutdown(nso->nso_so, SHUT_RDWR);
561	sock_close(nso->nso_so);
562	if (nso->nso_sotype == SOCK_STREAM)
563		nfs_rpc_record_state_cleanup(&nso->nso_rrs);
564	lck_mtx_destroy(&nso->nso_lock, nfs_request_grp);
565	if (nso->nso_saddr)
566		FREE(nso->nso_saddr, M_SONAME);
567	if (nso->nso_saddr2)
568		FREE(nso->nso_saddr2, M_SONAME);
569	NFS_SOCK_DBG(("nfs connect - socket %p destroyed\n", nso));
570	FREE(nso, M_TEMP);
571}
572
573/*
574 * Set common socket options on an nfs_socket.
575 */
/*
 * Set common socket options on an nfs_socket.
 */
void
nfs_socket_options(struct nfsmount *nmp, struct nfs_socket *nso)
{
	/*
	 * Set socket send/receive timeouts
	 * - Receive timeout shouldn't matter because most receives are performed
	 *   in the socket upcall non-blocking.
	 * - Send timeout should allow us to react to a blocked socket.
	 *   Soft mounts will want to abort sooner.
	 */
	struct timeval timeo;
	int on = 1, proto;

	/* 5s for soft/squishable mounts, 60s otherwise */
	timeo.tv_usec = 0;
	timeo.tv_sec = (NMFLAG(nmp, SOFT) || nfs_can_squish(nmp)) ? 5 : 60;
	sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
	if (nso->nso_sotype == SOCK_STREAM) {
		/* Assume that SOCK_STREAM always requires a connection */
		sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
		/* set nodelay for TCP */
		sock_gettype(nso->nso_so, NULL, NULL, &proto);
		if (proto == IPPROTO_TCP)
			sock_setsockopt(nso->nso_so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
	}
	if (nso->nso_sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */
		int reserve = NFS_UDPSOCKBUF;
		sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve));
		sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve));
	}
	/* set SO_NOADDRERR to detect network changes ASAP */
	sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on));
	/* just playin' it safe with upcalls */
	sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on));
	/* socket should be interruptible if the mount is */
	if (!NMFLAG(nmp, INTR))
		sock_nointerrupt(nso->nso_so, 1);
}
614
615/*
616 * Release resources held in an nfs_socket_search.
617 */
618void
619nfs_socket_search_cleanup(struct nfs_socket_search *nss)
620{
621	struct nfs_socket *nso, *nsonext;
622
623	TAILQ_FOREACH_SAFE(nso, &nss->nss_socklist, nso_link, nsonext) {
624		TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link);
625		nss->nss_sockcnt--;
626		nfs_socket_destroy(nso);
627	}
628	if (nss->nss_sock) {
629		nfs_socket_destroy(nss->nss_sock);
630		nss->nss_sock = NULL;
631	}
632}
633
634/*
635 * Prefer returning certain errors over others.
636 * This function returns a ranking of the given error.
637 */
638int
639nfs_connect_error_class(int error)
640{
641	switch (error) {
642	case 0:
643		return (0);
644	case ETIMEDOUT:
645	case EAGAIN:
646		return (1);
647	case EPIPE:
648	case EADDRNOTAVAIL:
649	case ENETDOWN:
650	case ENETUNREACH:
651	case ENETRESET:
652	case ECONNABORTED:
653	case ECONNRESET:
654	case EISCONN:
655	case ENOTCONN:
656	case ESHUTDOWN:
657	case ECONNREFUSED:
658	case EHOSTDOWN:
659	case EHOSTUNREACH:
660		return (2);
661	case ERPCMISMATCH:
662	case EPROCUNAVAIL:
663	case EPROGMISMATCH:
664	case EPROGUNAVAIL:
665		return (3);
666	case EBADRPC:
667		return (4);
668	default:
669		return (5);
670	}
671}
672
673/*
674 * Make sure a socket search returns the best error.
675 */
676void
677nfs_socket_search_update_error(struct nfs_socket_search *nss, int error)
678{
679	if (nfs_connect_error_class(error) >= nfs_connect_error_class(nss->nss_error))
680		nss->nss_error = error;
681}
682
683/*
684 * Continue the socket search until we have something to report.
685 */
/*
 * Continue the socket search until we have something to report.
 *
 * Drives the parallel connect search: starts new candidate sockets
 * (up to 4 at a time), pushes pending connections along, sends NULL
 * RPC pings to verify connected sockets, and reaps dead/timed-out
 * candidates.  Returns 0 with nss->nss_sock set on success, 0 with no
 * socket when all candidates are exhausted, or an errno on interrupt.
 *
 * nss->nss_last values: -1 = initial state, -2 = retry starting a
 * socket without delay, >= 0 = time the last socket was started.
 */
int
nfs_connect_search_loop(struct nfsmount *nmp, struct nfs_socket_search *nss)
{
	struct nfs_socket *nso, *nsonext;
	struct timeval now;
	struct nfs_fs_location *fsl;
	struct nfs_fs_server *fss;
	struct sockaddr_storage ss;
	char *addrstr;
	int error, nomore = 0;

loop:
	microuptime(&now);
	NFS_SOCK_DBG(("nfs connect %s search %ld\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, now.tv_sec));

	/* Time to start another socket? */
	while ((nss->nss_last < 0) || (nss->nss_sockcnt == 0) ||
	       ((nss->nss_sockcnt < 4) && (now.tv_sec >= (nss->nss_last + 2)))) {
		if (nmp->nm_sockflags & NMSOCK_UNMOUNT)
			return (EINTR);
		/* Find the next address to try... */
		/* Have we run out of locations? */
		if (!nomore && (nss->nss_last != -1) && !nfs_location_index_cmp(&nss->nss_nextloc, &nss->nss_startloc))
			nomore = 1;
		if (nomore) {
			/* no more addresses to start; just record the time and stop starting */
			if (nss->nss_last < 0)
				nss->nss_last = now.tv_sec;
			break;
		}
		/* Can we convert the address to a sockaddr? */
		fsl = nmp->nm_locations.nl_locations[nss->nss_nextloc.nli_loc];
		fss = fsl->nl_servers[nss->nss_nextloc.nli_serv];
		addrstr = fss->ns_addresses[nss->nss_nextloc.nli_addr];
		if (!nfs_uaddr2sockaddr(addrstr, (struct sockaddr*)&ss)) {
			/* unusable address: skip it and retry immediately */
			nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc);
			nss->nss_last = -2;
			continue;
		}
		/* Check that socket family is acceptable. */
		if (nmp->nm_sofamily && (ss.ss_family != nmp->nm_sofamily)) {
			nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc);
			nss->nss_last = -2;
			continue;
		}

		/* Create the socket. */
		error = nfs_socket_create(nmp, (struct sockaddr*)&ss, nss->nss_sotype,
				nss->nss_port, nss->nss_protocol, nss->nss_version,
				((nss->nss_protocol == NFS_PROG) && NMFLAG(nmp, RESVPORT)), &nso);
		if (error)
			return (error);

		nso->nso_location = nss->nss_nextloc;
		nso->nso_wake = nss;
		error = sock_setupcall(nso->nso_so, nfs_connect_upcall, nso);
		if (error) {
			/* couldn't set the upcall; mark dead so the reap pass below frees it */
			lck_mtx_lock(&nso->nso_lock);
			nso->nso_error = error;
			nso->nso_flags |= NSO_DEAD;
			lck_mtx_unlock(&nso->nso_lock);
		}

		TAILQ_INSERT_TAIL(&nss->nss_socklist, nso, nso_link);
		nss->nss_sockcnt++;
		nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc);

		nss->nss_last = now.tv_sec;
	}

	/* check each active socket and try to push it along */
	TAILQ_FOREACH(nso, &nss->nss_socklist, nso_link) {
		lck_mtx_lock(&nso->nso_lock);
		if (!(nso->nso_flags & NSO_CONNECTED)) {
			if ((nso->nso_sotype != SOCK_STREAM) && NMFLAG(nmp, NOCONNECT)) {
				/* no connection needed, just say it's already connected */
				nso->nso_flags |= NSO_CONNECTED;
				NFS_SOCK_DBG(("nfs connect %s UDP socket %p noconnect\n",
					vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso));
			} else if (!(nso->nso_flags & NSO_CONNECTING)) {
				/* initiate the connection */
				nso->nso_flags |= NSO_CONNECTING;
				lck_mtx_unlock(&nso->nso_lock);
				NFS_SOCK_DBG(("nfs connect %s connecting socket %p\n",
					vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso));
				/* non-blocking connect; completion is checked below and via the upcall */
				error = sock_connect(nso->nso_so, nso->nso_saddr, MSG_DONTWAIT);
				lck_mtx_lock(&nso->nso_lock);
				if (error && (error != EINPROGRESS)) {
					nso->nso_error = error;
					nso->nso_flags |= NSO_DEAD;
					lck_mtx_unlock(&nso->nso_lock);
					continue;
				}
			}
			if (nso->nso_flags & NSO_CONNECTING) {
				/* check the connection */
				if (sock_isconnected(nso->nso_so)) {
					NFS_SOCK_DBG(("nfs connect %s socket %p is connected\n",
						vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso));
					nso->nso_flags &= ~NSO_CONNECTING;
					nso->nso_flags |= NSO_CONNECTED;
				} else {
					/* not connected yet; see if the socket has reported an error */
					int optlen = sizeof(error);
					error = 0;
					sock_getsockopt(nso->nso_so, SOL_SOCKET, SO_ERROR, &error, &optlen);
					if (error) { /* we got an error on the socket */
						NFS_SOCK_DBG(("nfs connect %s socket %p connection error %d\n",
							vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error));
						if (nss->nss_flags & NSS_VERBOSE)
							log(LOG_INFO, "nfs_connect: socket error %d for %s\n",
								error, vfs_statfs(nmp->nm_mountp)->f_mntfromname);
						nso->nso_error = error;
						nso->nso_flags |= NSO_DEAD;
						lck_mtx_unlock(&nso->nso_lock);
						continue;
					}
				}
			}
			if (nso->nso_flags & NSO_CONNECTED)
				nfs_socket_options(nmp, nso);
		}
		if (!(nso->nso_flags & NSO_CONNECTED)) {
			lck_mtx_unlock(&nso->nso_lock);
			continue;
		}
		/* connected but not yet verified: send a NULL RPC ping (re-ping UDP every 2s) */
		if (!(nso->nso_flags & (NSO_PINGING|NSO_VERIFIED)) ||
		    ((nso->nso_sotype == SOCK_DGRAM) && (now.tv_sec >= nso->nso_reqtimestamp+2))) {
			/* initiate a NULL RPC request */
			uint64_t xid = nso->nso_pingxid;
			mbuf_t m, mreq = NULL;
			struct msghdr msg;
			size_t reqlen, sentlen;
			uint32_t vers;

			/* if no version negotiated yet, pick a default for the protocol */
			if (!(vers = nso->nso_version)) {
				if (nso->nso_protocol == PMAPPROG)
					vers = (nso->nso_saddr->sa_family == AF_INET) ? PMAPVERS : RPCBVERS4;
				else if (nso->nso_protocol == NFS_PROG)
					vers = NFS_VER3;
			}
			lck_mtx_unlock(&nso->nso_lock);
			error = nfsm_rpchead2(nmp, nso->nso_sotype, nso->nso_protocol, vers, 0, RPCAUTH_SYS,
					vfs_context_ucred(vfs_context_kernel()), NULL, NULL, &xid, &mreq);
			lck_mtx_lock(&nso->nso_lock);
			if (!error) {
				nso->nso_flags |= NSO_PINGING;
				nso->nso_pingxid = R_XID32(xid);
				nso->nso_reqtimestamp = now.tv_sec;
				bzero(&msg, sizeof(msg));
				/* unconnected datagram sockets need an explicit destination */
				if ((nso->nso_sotype != SOCK_STREAM) && !sock_isconnected(nso->nso_so)) {
					msg.msg_name = nso->nso_saddr;
					msg.msg_namelen = nso->nso_saddr->sa_len;
				}
				for (reqlen=0, m=mreq; m; m = mbuf_next(m))
					reqlen += mbuf_len(m);
				lck_mtx_unlock(&nso->nso_lock);
				error = sock_sendmbuf(nso->nso_so, &msg, mreq, 0, &sentlen);
				NFS_SOCK_DBG(("nfs connect %s verifying socket %p send rv %d\n",
					vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error));
				lck_mtx_lock(&nso->nso_lock);
				if (!error && (sentlen != reqlen))
					error = ETIMEDOUT;
			}
			if (error) {
				nso->nso_error = error;
				nso->nso_flags |= NSO_DEAD;
				lck_mtx_unlock(&nso->nso_lock);
				continue;
			}
		}
		if (nso->nso_flags & NSO_VERIFIED) {
			/* WOOHOO!! This socket looks good! */
			NFS_SOCK_DBG(("nfs connect %s socket %p verified\n",
				vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso));
			if (!nso->nso_version) {
				/* If the version isn't set, the default must have worked. */
				if (nso->nso_protocol == PMAPPROG)
					nso->nso_version = (nso->nso_saddr->sa_family == AF_INET) ? PMAPVERS : RPCBVERS4;
				if (nso->nso_protocol == NFS_PROG)
					nso->nso_version = NFS_VER3;
			}
			/* claim the winner: move it off the candidate list into nss_sock */
			lck_mtx_unlock(&nso->nso_lock);
			TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link);
			nss->nss_sockcnt--;
			nss->nss_sock = nso;
			break;
		}
		lck_mtx_unlock(&nso->nso_lock);
	}

	/* reap any dead or timed-out candidates */
	TAILQ_FOREACH_SAFE(nso, &nss->nss_socklist, nso_link, nsonext) {
		lck_mtx_lock(&nso->nso_lock);
		if (now.tv_sec >= (nso->nso_timestamp + nss->nss_timeo)) {
			/* took too long */
			NFS_SOCK_DBG(("nfs connect %s socket %p timed out\n",
				vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso));
			nso->nso_error = ETIMEDOUT;
			nso->nso_flags |= NSO_DEAD;
		}
		if (!(nso->nso_flags & NSO_DEAD)) {
			lck_mtx_unlock(&nso->nso_lock);
			continue;
		}
		lck_mtx_unlock(&nso->nso_lock);
		NFS_SOCK_DBG(("nfs connect %s reaping socket %p %d\n",
			vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, nso->nso_error));
		/* remember the most interesting error seen so far */
		nfs_socket_search_update_error(nss, nso->nso_error);
		TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link);
		nss->nss_sockcnt--;
		nfs_socket_destroy(nso);
		/* a candidate died: allow a replacement to be started right away */
		if (!nomore)
			nss->nss_last = -2;
	}

	/*
	 * Keep looping if we haven't found a socket yet and we have more
	 * sockets to (continue to) try.
	 */
	error = 0;
	if (!nss->nss_sock && (!TAILQ_EMPTY(&nss->nss_socklist) || !nomore)) {
		/* log a warning if connect is taking a while */
		if (((now.tv_sec - nss->nss_timestamp) >= 30) && ((nss->nss_flags & (NSS_VERBOSE|NSS_WARNED)) == NSS_VERBOSE)) {
			log(LOG_INFO, "nfs_connect: socket connect taking a while for %s\n",
				vfs_statfs(nmp->nm_mountp)->f_mntfromname);
			nss->nss_flags |= NSS_WARNED;
		}
		if (nmp->nm_sockflags & NMSOCK_UNMOUNT)
			return (EINTR);
		if ((error = nfs_sigintr(nmp, NULL, current_thread(), 0)))
			return (error);
		/* wait up to 1 second before retrying (unless an immediate retry was requested) */
		if (nss->nss_last >= 0)
			tsleep(nss, PSOCK, "nfs_connect_search_wait", hz);
		goto loop;
	}

	NFS_SOCK_DBG(("nfs connect %s returning %d\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, error));
	return (error);
}
923
924/*
925 * Initialize a new NFS connection.
926 *
927 * Search for a location to connect a socket to and initialize the connection.
928 *
929 * An NFS mount may have multiple locations/servers/addresses available.
930 * We attempt to connect to each one asynchronously and will start
931 * several sockets in parallel if other locations are slow to answer.
932 * We'll use the first NFS socket we can successfully set up.
933 *
934 * The search may involve contacting the portmapper service first.
935 *
936 * A mount's initial connection may require negotiating some parameters such
937 * as socket type and NFS version.
938 */
939int
940nfs_connect(struct nfsmount *nmp, int verbose, int timeo)
941{
942	struct nfs_socket_search nss;
943	struct nfs_socket *nso, *nsonfs;
944	struct sockaddr_storage ss;
945	struct sockaddr *saddr, *oldsaddr;
946	sock_upcall upcall;
947	struct timeval now, start;
948	int error, savederror, nfsvers;
949	uint8_t	sotype = nmp->nm_sotype ? nmp->nm_sotype : SOCK_STREAM;
950	fhandle_t *fh = NULL;
951	char *path = NULL;
952	in_port_t port;
953
954	/* paranoia... check that we have at least one address in the locations */
955	uint32_t loc, serv;
956	for (loc=0; loc < nmp->nm_locations.nl_numlocs; loc++) {
957		for (serv=0; serv < nmp->nm_locations.nl_locations[loc]->nl_servcount; serv++) {
958			if (nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addrcount)
959				break;
960			NFS_SOCK_DBG(("nfs connect %s search, server %s has no addresses\n",
961				vfs_statfs(nmp->nm_mountp)->f_mntfromname,
962				nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_name));
963		}
964		if (serv < nmp->nm_locations.nl_locations[loc]->nl_servcount)
965			break;
966	}
967	if (loc >= nmp->nm_locations.nl_numlocs) {
968		NFS_SOCK_DBG(("nfs connect %s search failed, no addresses\n",
969			vfs_statfs(nmp->nm_mountp)->f_mntfromname));
970		return (EINVAL);
971	}
972
973	lck_mtx_lock(&nmp->nm_lock);
974	nmp->nm_sockflags |= NMSOCK_CONNECTING;
975	nmp->nm_nss = &nss;
976	lck_mtx_unlock(&nmp->nm_lock);
977	microuptime(&start);
978	savederror = error = 0;
979
980tryagain:
981	/* initialize socket search state */
982	bzero(&nss, sizeof(nss));
983	nss.nss_error = savederror;
984	TAILQ_INIT(&nss.nss_socklist);
985	nss.nss_sotype = sotype;
986	nss.nss_startloc = nmp->nm_locations.nl_current;
987	nss.nss_timestamp = start.tv_sec;
988	nss.nss_timeo = timeo;
989	if (verbose)
990		nss.nss_flags |= NSS_VERBOSE;
991
992	/* First time connecting, we may need to negotiate some things */
993	if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) {
994		if (!nmp->nm_vers) {
995			/* No NFS version specified... */
996			if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) {
997				/* ...connect to portmapper first if we (may) need any ports. */
998				nss.nss_port = PMAPPORT;
999				nss.nss_protocol = PMAPPROG;
1000				nss.nss_version = 0;
1001			} else {
1002				/* ...connect to NFS port first. */
1003				nss.nss_port = nmp->nm_nfsport;
1004				nss.nss_protocol = NFS_PROG;
1005				nss.nss_version = 0;
1006			}
1007		} else if (nmp->nm_vers >= NFS_VER4) {
1008			/* For NFSv4, we use the given (or default) port. */
1009			nss.nss_port = nmp->nm_nfsport ? nmp->nm_nfsport : NFS_PORT;
1010			nss.nss_protocol = NFS_PROG;
1011			nss.nss_version = 4;
1012		} else {
1013			/* For NFSv3/v2... */
1014			if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) {
1015				/* ...connect to portmapper first if we need any ports. */
1016				nss.nss_port = PMAPPORT;
1017				nss.nss_protocol = PMAPPROG;
1018				nss.nss_version = 0;
1019			} else {
1020				/* ...connect to NFS port first. */
1021				nss.nss_port = nmp->nm_nfsport;
1022				nss.nss_protocol = NFS_PROG;
1023				nss.nss_version = nmp->nm_vers;
1024			}
1025		}
1026		NFS_SOCK_DBG(("nfs connect first %s, so type %d port %d prot %d %d\n",
1027			vfs_statfs(nmp->nm_mountp)->f_mntfromname, nss.nss_sotype, nss.nss_port,
1028			nss.nss_protocol, nss.nss_version));
1029	} else {
1030		/* we've connected before, just connect to NFS port */
1031		if (!nmp->nm_nfsport) {
1032			/* need to ask portmapper which port that would be */
1033			nss.nss_port = PMAPPORT;
1034			nss.nss_protocol = PMAPPROG;
1035			nss.nss_version = 0;
1036		} else {
1037			nss.nss_port = nmp->nm_nfsport;
1038			nss.nss_protocol = NFS_PROG;
1039			nss.nss_version = nmp->nm_vers;
1040		}
1041		NFS_SOCK_DBG(("nfs connect %s, so type %d port %d prot %d %d\n",
1042			vfs_statfs(nmp->nm_mountp)->f_mntfromname, nss.nss_sotype, nss.nss_port,
1043			nss.nss_protocol, nss.nss_version));
1044	}
1045
1046	/* Set next location to first valid location. */
1047	/* If start location is invalid, find next location. */
1048	nss.nss_nextloc = nss.nss_startloc;
1049	if ((nss.nss_nextloc.nli_serv >= nmp->nm_locations.nl_locations[nss.nss_nextloc.nli_loc]->nl_servcount) ||
1050	    (nss.nss_nextloc.nli_addr >= nmp->nm_locations.nl_locations[nss.nss_nextloc.nli_loc]->nl_servers[nss.nss_nextloc.nli_serv]->ns_addrcount)) {
1051		nfs_location_next(&nmp->nm_locations, &nss.nss_nextloc);
1052		if (!nfs_location_index_cmp(&nss.nss_nextloc, &nss.nss_startloc)) {
1053			NFS_SOCK_DBG(("nfs connect %s search failed, couldn't find a valid location index\n",
1054				vfs_statfs(nmp->nm_mountp)->f_mntfromname));
1055			return (ENOENT);
1056		}
1057	}
1058	nss.nss_last = -1;
1059
1060keepsearching:
1061
1062	error = nfs_connect_search_loop(nmp, &nss);
1063	if (error || !nss.nss_sock) {
1064		/* search failed */
1065		nfs_socket_search_cleanup(&nss);
1066		if (!error && (nss.nss_sotype == SOCK_STREAM) && !nmp->nm_sotype && (nmp->nm_vers < NFS_VER4)) {
1067			/* Try using UDP */
1068			sotype = SOCK_DGRAM;
1069			savederror = nss.nss_error;
1070			NFS_SOCK_DBG(("nfs connect %s TCP failed %d %d, trying UDP\n",
1071				vfs_statfs(nmp->nm_mountp)->f_mntfromname, error, nss.nss_error));
1072			goto tryagain;
1073		}
1074		if (!error)
1075			error = nss.nss_error ? nss.nss_error : ETIMEDOUT;
1076		lck_mtx_lock(&nmp->nm_lock);
1077		nmp->nm_sockflags &= ~NMSOCK_CONNECTING;
1078		nmp->nm_nss = NULL;
1079		lck_mtx_unlock(&nmp->nm_lock);
1080		if (nss.nss_flags & NSS_WARNED)
1081			log(LOG_INFO, "nfs_connect: socket connect aborted for %s\n",
1082				vfs_statfs(nmp->nm_mountp)->f_mntfromname);
1083		if (fh)
1084			FREE(fh, M_TEMP);
1085		if (path)
1086			FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
1087		NFS_SOCK_DBG(("nfs connect %s search failed, returning %d\n",
1088			vfs_statfs(nmp->nm_mountp)->f_mntfromname, error));
1089		return (error);
1090	}
1091
1092	/* try to use nss_sock */
1093	nso = nss.nss_sock;
1094	nss.nss_sock = NULL;
1095
1096	/* We may be speaking to portmap first... to determine port(s). */
1097	if (nso->nso_saddr->sa_family == AF_INET)
1098		port = ntohs(((struct sockaddr_in*)nso->nso_saddr)->sin_port);
1099	else
1100		port = ntohs(((struct sockaddr_in6*)nso->nso_saddr)->sin6_port);
1101	if (port == PMAPPORT) {
1102		/* Use this portmapper port to get the port #s we need. */
1103		NFS_SOCK_DBG(("nfs connect %s got portmapper socket %p\n",
1104			vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso));
1105
1106		/* remove the connect upcall so nfs_portmap_lookup() can use this socket */
1107		sock_setupcall(nso->nso_so, NULL, NULL);
1108
1109		/* Set up socket address and port for NFS socket. */
1110		bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len);
1111
1112		/* If NFS version not set, try NFSv3 then NFSv2. */
1113		nfsvers = nmp->nm_vers ? nmp->nm_vers : NFS_VER3;
1114
1115		if (!(port = nmp->nm_nfsport)) {
1116			if (ss.ss_family == AF_INET)
1117				((struct sockaddr_in*)&ss)->sin_port = htons(0);
1118			else if (ss.ss_family == AF_INET6)
1119				((struct sockaddr_in6*)&ss)->sin6_port = htons(0);
1120			error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss,
1121					nso->nso_so, NFS_PROG, nfsvers,
1122					(nso->nso_sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP, timeo);
1123			if (!error) {
1124				if (ss.ss_family == AF_INET)
1125					port = ntohs(((struct sockaddr_in*)&ss)->sin_port);
1126				else if (ss.ss_family == AF_INET6)
1127					port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port);
1128				if (!port)
1129					error = EPROGUNAVAIL;
1130			}
1131			if (error && !nmp->nm_vers) {
1132				nfsvers = NFS_VER2;
1133				error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss,
1134						nso->nso_so, NFS_PROG, nfsvers,
1135						(nso->nso_sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP, timeo);
1136				if (!error) {
1137					if (ss.ss_family == AF_INET)
1138						port = ntohs(((struct sockaddr_in*)&ss)->sin_port);
1139					else if (ss.ss_family == AF_INET6)
1140						port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port);
1141					if (!port)
1142						error = EPROGUNAVAIL;
1143				}
1144			}
1145			if (error) {
1146				nfs_socket_search_update_error(&nss, error);
1147				nfs_socket_destroy(nso);
1148				goto keepsearching;
1149			}
1150		}
1151		/* Create NFS protocol socket and add it to the list of sockets. */
1152		error = nfs_socket_create(nmp, (struct sockaddr*)&ss, nso->nso_sotype, port,
1153				NFS_PROG, nfsvers, NMFLAG(nmp, RESVPORT), &nsonfs);
1154		if (error) {
1155			nfs_socket_search_update_error(&nss, error);
1156			nfs_socket_destroy(nso);
1157			goto keepsearching;
1158		}
1159		nsonfs->nso_location = nso->nso_location;
1160		nsonfs->nso_wake = &nss;
1161		error = sock_setupcall(nsonfs->nso_so, nfs_connect_upcall, nsonfs);
1162		if (error) {
1163			nfs_socket_search_update_error(&nss, error);
1164			nfs_socket_destroy(nsonfs);
1165			nfs_socket_destroy(nso);
1166			goto keepsearching;
1167		}
1168		TAILQ_INSERT_TAIL(&nss.nss_socklist, nsonfs, nso_link);
1169		nss.nss_sockcnt++;
1170		if ((nfsvers < NFS_VER4) && !(nmp->nm_sockflags & NMSOCK_HASCONNECTED) && !NM_OMATTR_GIVEN(nmp, FH)) {
1171			/* Set up socket address and port for MOUNT socket. */
1172			error = 0;
1173			bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len);
1174			port = nmp->nm_mountport;
1175			if (ss.ss_family == AF_INET)
1176				((struct sockaddr_in*)&ss)->sin_port = htons(port);
1177			else if (ss.ss_family == AF_INET6)
1178				((struct sockaddr_in6*)&ss)->sin6_port = htons(port);
1179			if (!port) {
1180				/* Get port/sockaddr for MOUNT version corresponding to NFS version. */
1181				/* If NFS version is unknown, optimistically choose for NFSv3. */
1182				int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3;
1183				int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP;
1184				error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss,
1185						nso->nso_so, RPCPROG_MNT, mntvers, mntproto, timeo);
1186			}
1187			if (!error) {
1188				if (ss.ss_family == AF_INET)
1189					port = ntohs(((struct sockaddr_in*)&ss)->sin_port);
1190				else if (ss.ss_family == AF_INET6)
1191					port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port);
1192				if (!port)
1193					error = EPROGUNAVAIL;
1194			}
1195			/* create sockaddr for MOUNT */
1196			if (!error)
1197				MALLOC(nsonfs->nso_saddr2, struct sockaddr *, ss.ss_len, M_SONAME, M_WAITOK|M_ZERO);
1198			if (!error && !nsonfs->nso_saddr2)
1199				error = ENOMEM;
1200			if (!error)
1201				bcopy(&ss, nsonfs->nso_saddr2, ss.ss_len);
1202			if (error) {
1203				lck_mtx_lock(&nsonfs->nso_lock);
1204				nsonfs->nso_error = error;
1205				nsonfs->nso_flags |= NSO_DEAD;
1206				lck_mtx_unlock(&nsonfs->nso_lock);
1207			}
1208		}
1209		nfs_socket_destroy(nso);
1210		goto keepsearching;
1211	}
1212
1213	/* nso is an NFS socket */
1214	NFS_SOCK_DBG(("nfs connect %s got NFS socket %p\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso));
1215
1216	/* If NFS version wasn't specified, it was determined during the connect. */
1217	nfsvers = nmp->nm_vers ? nmp->nm_vers : (int)nso->nso_version;
1218
1219	/* Perform MOUNT call for initial NFSv2/v3 connection/mount. */
1220	if ((nfsvers < NFS_VER4) && !(nmp->nm_sockflags & NMSOCK_HASCONNECTED) && !NM_OMATTR_GIVEN(nmp, FH)) {
1221		error = 0;
1222		saddr = nso->nso_saddr2;
1223		if (!saddr) {
1224			/* Need sockaddr for MOUNT port */
1225			bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len);
1226			port = nmp->nm_mountport;
1227			if (ss.ss_family == AF_INET)
1228				((struct sockaddr_in*)&ss)->sin_port = htons(port);
1229			else if (ss.ss_family == AF_INET6)
1230				((struct sockaddr_in6*)&ss)->sin6_port = htons(port);
1231			if (!port) {
1232				/* Get port/sockaddr for MOUNT version corresponding to NFS version. */
1233				int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3;
1234				int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP;
1235				error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss,
1236						NULL, RPCPROG_MNT, mntvers, mntproto, timeo);
1237				if (ss.ss_family == AF_INET)
1238					port = ntohs(((struct sockaddr_in*)&ss)->sin_port);
1239				else if (ss.ss_family == AF_INET6)
1240					port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port);
1241			}
1242			if (!error) {
1243				if (port)
1244					saddr = (struct sockaddr*)&ss;
1245				else
1246					error = EPROGUNAVAIL;
1247			}
1248		}
1249		if (saddr)
1250			MALLOC(fh, fhandle_t *, sizeof(fhandle_t), M_TEMP, M_WAITOK|M_ZERO);
1251		if (saddr && fh)
1252			MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
1253		if (!saddr || !fh || !path) {
1254			if (!error)
1255				error = ENOMEM;
1256			if (fh)
1257				FREE(fh, M_TEMP);
1258			if (path)
1259				FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
1260			fh = NULL;
1261			path = NULL;
1262			nfs_socket_search_update_error(&nss, error);
1263			nfs_socket_destroy(nso);
1264			goto keepsearching;
1265		}
1266		nfs_location_mntfromname(&nmp->nm_locations, nso->nso_location, path, MAXPATHLEN, 1);
1267		error = nfs3_mount_rpc(nmp, saddr, nso->nso_sotype, nfsvers,
1268				path, vfs_context_current(), timeo, fh, &nmp->nm_servsec);
1269		NFS_SOCK_DBG(("nfs connect %s socket %p mount %d\n",
1270			vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error));
1271		if (!error) {
1272			/* Make sure we can agree on a security flavor. */
1273			int o, s;  /* indices into mount option and server security flavor lists */
1274			int found = 0;
1275
1276			if ((nfsvers == NFS_VER3) && !nmp->nm_servsec.count) {
1277				/* Some servers return an empty list to indicate RPCAUTH_SYS? */
1278				nmp->nm_servsec.count = 1;
1279				nmp->nm_servsec.flavors[0] = RPCAUTH_SYS;
1280			}
1281			if (nmp->nm_sec.count) {
1282				/* Choose the first flavor in our list that the server supports. */
1283				if (!nmp->nm_servsec.count) {
1284					/* we don't know what the server supports, just use our first choice */
1285					nmp->nm_auth = nmp->nm_sec.flavors[0];
1286					found = 1;
1287				}
1288				for (o=0; !found && (o < nmp->nm_sec.count); o++)
1289					for (s=0; !found && (s < nmp->nm_servsec.count); s++)
1290						if (nmp->nm_sec.flavors[o] == nmp->nm_servsec.flavors[s]) {
1291							nmp->nm_auth = nmp->nm_sec.flavors[o];
1292							found = 1;
1293						}
1294			} else {
1295				/* Choose the first one we support from the server's list. */
1296				if (!nmp->nm_servsec.count) {
1297					nmp->nm_auth = RPCAUTH_SYS;
1298					found = 1;
1299				}
1300				for (s=0; s < nmp->nm_servsec.count; s++)
1301					switch (nmp->nm_servsec.flavors[s]) {
1302					case RPCAUTH_SYS:
1303						/* prefer RPCAUTH_SYS to RPCAUTH_NONE */
1304						if (found && (nmp->nm_auth == RPCAUTH_NONE))
1305							found = 0;
1306					case RPCAUTH_NONE:
1307					case RPCAUTH_KRB5:
1308					case RPCAUTH_KRB5I:
1309					case RPCAUTH_KRB5P:
1310						if (!found) {
1311							nmp->nm_auth = nmp->nm_servsec.flavors[s];
1312							found = 1;
1313						}
1314						break;
1315					}
1316			}
1317			error = !found ? EAUTH : 0;
1318		}
1319		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
1320		path = NULL;
1321		if (error) {
1322			nfs_socket_search_update_error(&nss, error);
1323			FREE(fh, M_TEMP);
1324			fh = NULL;
1325			nfs_socket_destroy(nso);
1326			goto keepsearching;
1327		}
1328		if (nmp->nm_fh)
1329			FREE(nmp->nm_fh, M_TEMP);
1330		nmp->nm_fh = fh;
1331		fh = NULL;
1332		NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_CALLUMNT);
1333	}
1334
1335	/* put the real upcall in place */
1336	upcall = (nso->nso_sotype == SOCK_STREAM) ? nfs_tcp_rcv : nfs_udp_rcv;
1337	error = sock_setupcall(nso->nso_so, upcall, nmp);
1338	if (error) {
1339		nfs_socket_search_update_error(&nss, error);
1340		nfs_socket_destroy(nso);
1341		goto keepsearching;
1342	}
1343
1344	if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) {
1345		/* set mntfromname to this location */
1346		if (!NM_OMATTR_GIVEN(nmp, MNTFROM))
1347			nfs_location_mntfromname(&nmp->nm_locations, nso->nso_location,
1348				vfs_statfs(nmp->nm_mountp)->f_mntfromname,
1349				sizeof(vfs_statfs(nmp->nm_mountp)->f_mntfromname), 0);
1350		/* some negotiated values need to remain unchanged for the life of the mount */
1351		if (!nmp->nm_sotype)
1352			nmp->nm_sotype = nso->nso_sotype;
1353		if (!nmp->nm_vers) {
1354			nmp->nm_vers = nfsvers;
1355			/* If we negotiated NFSv4, set nm_nfsport if we ended up on the standard NFS port */
1356			if ((nfsvers >= NFS_VER4) && !NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT)) {
1357				if (nso->nso_saddr->sa_family == AF_INET)
1358					port = ((struct sockaddr_in*)nso->nso_saddr)->sin_port = htons(port);
1359				else if (nso->nso_saddr->sa_family == AF_INET6)
1360					port = ((struct sockaddr_in6*)nso->nso_saddr)->sin6_port = htons(port);
1361				else
1362					port = 0;
1363				if (port == NFS_PORT)
1364					nmp->nm_nfsport = NFS_PORT;
1365			}
1366		}
1367		/* do some version-specific pre-mount set up */
1368		if (nmp->nm_vers >= NFS_VER4) {
1369			microtime(&now);
1370			nmp->nm_mounttime = ((uint64_t)now.tv_sec << 32) | now.tv_usec;
1371			if (!NMFLAG(nmp, NOCALLBACK))
1372				nfs4_mount_callback_setup(nmp);
1373		}
1374	}
1375
1376	/* Initialize NFS socket state variables */
1377	lck_mtx_lock(&nmp->nm_lock);
1378	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
1379		nmp->nm_srtt[3] = (NFS_TIMEO << 3);
1380	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
1381		nmp->nm_sdrtt[3] = 0;
1382	if (nso->nso_sotype == SOCK_DGRAM) {
1383		nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
1384		nmp->nm_sent = 0;
1385	} else if (nso->nso_sotype == SOCK_STREAM) {
1386		nmp->nm_timeouts = 0;
1387	}
1388	nmp->nm_sockflags &= ~NMSOCK_CONNECTING;
1389	nmp->nm_sockflags |= NMSOCK_SETUP;
1390	/* move the socket to the mount structure */
1391	nmp->nm_nso = nso;
1392	oldsaddr = nmp->nm_saddr;
1393	nmp->nm_saddr = nso->nso_saddr;
1394	lck_mtx_unlock(&nmp->nm_lock);
1395	error = nfs_connect_setup(nmp);
1396	lck_mtx_lock(&nmp->nm_lock);
1397	nmp->nm_sockflags &= ~NMSOCK_SETUP;
1398	if (!error) {
1399		nmp->nm_sockflags |= NMSOCK_READY;
1400		wakeup(&nmp->nm_sockflags);
1401	}
1402	if (error) {
1403		NFS_SOCK_DBG(("nfs connect %s socket %p setup failed %d\n",
1404			vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error));
1405		nfs_socket_search_update_error(&nss, error);
1406		nmp->nm_saddr = oldsaddr;
1407		if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) {
1408			/* undo settings made prior to setup */
1409			if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_SOCKET_TYPE))
1410				nmp->nm_sotype = 0;
1411			if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_VERSION)) {
1412				if (nmp->nm_vers >= NFS_VER4) {
1413					if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT))
1414						nmp->nm_nfsport = 0;
1415					if (nmp->nm_cbid)
1416						nfs4_mount_callback_shutdown(nmp);
1417					if (IS_VALID_CRED(nmp->nm_mcred))
1418						kauth_cred_unref(&nmp->nm_mcred);
1419					bzero(&nmp->nm_un, sizeof(nmp->nm_un));
1420				}
1421				nmp->nm_vers = 0;
1422			}
1423		}
1424		lck_mtx_unlock(&nmp->nm_lock);
1425		nmp->nm_nso = NULL;
1426		nfs_socket_destroy(nso);
1427		goto keepsearching;
1428	}
1429
1430	/* update current location */
1431	if ((nmp->nm_locations.nl_current.nli_flags & NLI_VALID) &&
1432	    (nmp->nm_locations.nl_current.nli_serv != nso->nso_location.nli_serv)) {
1433		/* server has changed, we should initiate failover/recovery */
1434		// XXX
1435	}
1436	nmp->nm_locations.nl_current = nso->nso_location;
1437	nmp->nm_locations.nl_current.nli_flags |= NLI_VALID;
1438
1439	if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) {
1440		/* We have now successfully connected... make a note of it. */
1441		nmp->nm_sockflags |= NMSOCK_HASCONNECTED;
1442	}
1443
1444	lck_mtx_unlock(&nmp->nm_lock);
1445	if (oldsaddr)
1446		FREE(oldsaddr, M_SONAME);
1447
1448	if (nss.nss_flags & NSS_WARNED)
1449		log(LOG_INFO, "nfs_connect: socket connect completed for %s\n",
1450			vfs_statfs(nmp->nm_mountp)->f_mntfromname);
1451
1452	nmp->nm_nss = NULL;
1453	nfs_socket_search_cleanup(&nss);
1454	if (fh)
1455		FREE(fh, M_TEMP);
1456	if (path)
1457		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
1458	NFS_SOCK_DBG(("nfs connect %s success\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname));
1459	return (0);
1460}
1461
1462
1463/* setup & confirm socket connection is functional */
1464int
1465nfs_connect_setup(struct nfsmount *nmp)
1466{
1467	int error = 0;
1468
1469	if (nmp->nm_vers >= NFS_VER4) {
1470		if (nmp->nm_state & NFSSTA_CLIENTID) {
1471			/* first, try to renew our current state */
1472			error = nfs4_renew(nmp, R_SETUP);
1473			if ((error == NFSERR_ADMIN_REVOKED) ||
1474			    (error == NFSERR_CB_PATH_DOWN) ||
1475			    (error == NFSERR_EXPIRED) ||
1476			    (error == NFSERR_LEASE_MOVED) ||
1477			    (error == NFSERR_STALE_CLIENTID)) {
1478				lck_mtx_lock(&nmp->nm_lock);
1479				nfs_need_recover(nmp, error);
1480				lck_mtx_unlock(&nmp->nm_lock);
1481			}
1482		}
1483		error = nfs4_setclientid(nmp);
1484	}
1485	return (error);
1486}
1487
1488/*
1489 * NFS socket reconnect routine:
1490 * Called when a connection is broken.
1491 * - disconnect the old socket
1492 * - nfs_connect() again
1493 * - set R_MUSTRESEND for all outstanding requests on mount point
1494 * If this fails the mount point is DEAD!
1495 */
1496int
1497nfs_reconnect(struct nfsmount *nmp)
1498{
1499	struct nfsreq *rq;
1500	struct timeval now;
1501	thread_t thd = current_thread();
1502	int error, wentdown = 0, verbose = 1;
1503	time_t lastmsg;
1504	int timeo;
1505
1506	microuptime(&now);
1507	lastmsg = now.tv_sec - (nmp->nm_tprintf_delay - nmp->nm_tprintf_initial_delay);
1508
1509	nfs_disconnect(nmp);
1510
1511
1512	lck_mtx_lock(&nmp->nm_lock);
1513	timeo = nfs_is_squishy(nmp) ? 8 : 30;
1514	lck_mtx_unlock(&nmp->nm_lock);
1515
1516	while ((error = nfs_connect(nmp, verbose, timeo))) {
1517		verbose = 0;
1518		nfs_disconnect(nmp);
1519		if ((error == EINTR) || (error == ERESTART))
1520			return (EINTR);
1521		if (error == EIO)
1522			return (EIO);
1523		microuptime(&now);
1524		if ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec) {
1525			lastmsg = now.tv_sec;
1526			nfs_down(nmp, thd, error, NFSSTA_TIMEO, "can not connect");
1527			wentdown = 1;
1528		}
1529		lck_mtx_lock(&nmp->nm_lock);
1530		if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
1531			/* we're not yet completely mounted and */
1532			/* we can't reconnect, so we fail */
1533			lck_mtx_unlock(&nmp->nm_lock);
1534			return (error);
1535		}
1536		nfs_mount_check_dead_timeout(nmp);
1537		if ((error = nfs_sigintr(nmp, NULL, thd, 1))) {
1538			lck_mtx_unlock(&nmp->nm_lock);
1539			return (error);
1540		}
1541		lck_mtx_unlock(&nmp->nm_lock);
1542		tsleep(&lbolt, PSOCK, "nfs_reconnect_delay", 0);
1543		if ((error = nfs_sigintr(nmp, NULL, thd, 0)))
1544			return (error);
1545	}
1546
1547	if (wentdown)
1548		nfs_up(nmp, thd, NFSSTA_TIMEO, "connected");
1549
1550	/*
1551	 * Loop through outstanding request list and mark all requests
1552	 * as needing a resend.  (Though nfs_need_reconnect() probably
1553	 * marked them all already.)
1554	 */
1555	lck_mtx_lock(nfs_request_mutex);
1556	TAILQ_FOREACH(rq, &nfs_reqq, r_chain) {
1557		if (rq->r_nmp == nmp) {
1558			lck_mtx_lock(&rq->r_mtx);
1559			if (!rq->r_error && !rq->r_nmrep.nmc_mhead && !(rq->r_flags & R_MUSTRESEND)) {
1560				rq->r_flags |= R_MUSTRESEND;
1561				rq->r_rtt = -1;
1562				wakeup(rq);
1563				if ((rq->r_flags & (R_ASYNC|R_ASYNCWAIT|R_SENDING)) == R_ASYNC)
1564					nfs_asyncio_resend(rq);
1565			}
1566			lck_mtx_unlock(&rq->r_mtx);
1567		}
1568	}
1569	lck_mtx_unlock(nfs_request_mutex);
1570	return (0);
1571}
1572
1573/*
1574 * NFS disconnect. Clean up and unlink.
1575 */
1576void
1577nfs_disconnect(struct nfsmount *nmp)
1578{
1579	struct nfs_socket *nso;
1580
1581	lck_mtx_lock(&nmp->nm_lock);
1582tryagain:
1583	if (nmp->nm_nso) {
1584		struct timespec ts = { 1, 0 };
1585		if (nmp->nm_state & NFSSTA_SENDING) { /* wait for sending to complete */
1586			nmp->nm_state |= NFSSTA_WANTSND;
1587			msleep(&nmp->nm_state, &nmp->nm_lock, PZERO-1, "nfswaitsending", &ts);
1588			goto tryagain;
1589		}
1590		if (nmp->nm_sockflags & NMSOCK_POKE) { /* wait for poking to complete */
1591			msleep(&nmp->nm_sockflags, &nmp->nm_lock, PZERO-1, "nfswaitpoke", &ts);
1592			goto tryagain;
1593		}
1594		nmp->nm_sockflags |= NMSOCK_DISCONNECTING;
1595		nmp->nm_sockflags &= ~NMSOCK_READY;
1596		nso = nmp->nm_nso;
1597		nmp->nm_nso = NULL;
1598		if (nso->nso_saddr == nmp->nm_saddr)
1599			nso->nso_saddr = NULL;
1600		lck_mtx_unlock(&nmp->nm_lock);
1601		nfs_socket_destroy(nso);
1602		lck_mtx_lock(&nmp->nm_lock);
1603		nmp->nm_sockflags &= ~NMSOCK_DISCONNECTING;
1604		lck_mtx_unlock(&nmp->nm_lock);
1605	} else {
1606		lck_mtx_unlock(&nmp->nm_lock);
1607	}
1608}
1609
1610/*
1611 * mark an NFS mount as needing a reconnect/resends.
1612 */
1613void
1614nfs_need_reconnect(struct nfsmount *nmp)
1615{
1616	struct nfsreq *rq;
1617
1618	lck_mtx_lock(&nmp->nm_lock);
1619	nmp->nm_sockflags &= ~(NMSOCK_READY|NMSOCK_SETUP);
1620	lck_mtx_unlock(&nmp->nm_lock);
1621
1622	/*
1623	 * Loop through outstanding request list and
1624	 * mark all requests as needing a resend.
1625	 */
1626	lck_mtx_lock(nfs_request_mutex);
1627	TAILQ_FOREACH(rq, &nfs_reqq, r_chain) {
1628		if (rq->r_nmp == nmp) {
1629			lck_mtx_lock(&rq->r_mtx);
1630			if (!rq->r_error && !rq->r_nmrep.nmc_mhead && !(rq->r_flags & R_MUSTRESEND)) {
1631				rq->r_flags |= R_MUSTRESEND;
1632				rq->r_rtt = -1;
1633				wakeup(rq);
1634				if ((rq->r_flags & (R_ASYNC|R_ASYNCWAIT|R_SENDING)) == R_ASYNC)
1635					nfs_asyncio_resend(rq);
1636			}
1637			lck_mtx_unlock(&rq->r_mtx);
1638		}
1639	}
1640	lck_mtx_unlock(nfs_request_mutex);
1641}
1642
1643
1644/*
1645 * thread to handle miscellaneous async NFS socket work (reconnects/resends)
1646 */
1647void
1648nfs_mount_sock_thread(void *arg, __unused wait_result_t wr)
1649{
1650	struct nfsmount *nmp = arg;
1651	struct timespec ts = { 30, 0 };
1652	thread_t thd = current_thread();
1653	struct nfsreq *req;
1654	struct timeval now;
1655	int error, dofinish;
1656	nfsnode_t np;
1657
1658	lck_mtx_lock(&nmp->nm_lock);
1659
1660	while (!(nmp->nm_sockflags & NMSOCK_READY) ||
1661	       !TAILQ_EMPTY(&nmp->nm_resendq) ||
1662	       !LIST_EMPTY(&nmp->nm_monlist) ||
1663	       nmp->nm_deadto_start ||
1664	       (nmp->nm_state & NFSSTA_RECOVER) ||
1665	       ((nmp->nm_vers >= NFS_VER4) && !TAILQ_EMPTY(&nmp->nm_dreturnq)))
1666	{
1667		if (nmp->nm_sockflags & NMSOCK_UNMOUNT)
1668			break;
1669		/* do reconnect, if necessary */
1670		if (!(nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_FORCE)) {
1671			if (nmp->nm_reconnect_start <= 0) {
1672				microuptime(&now);
1673				nmp->nm_reconnect_start = now.tv_sec;
1674			}
1675			lck_mtx_unlock(&nmp->nm_lock);
1676			NFS_SOCK_DBG(("nfs reconnect %s\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname));
1677			if (nfs_reconnect(nmp) == 0)
1678				nmp->nm_reconnect_start = 0;
1679			lck_mtx_lock(&nmp->nm_lock);
1680		}
1681		if ((nmp->nm_sockflags & NMSOCK_READY) &&
1682		    (nmp->nm_state & NFSSTA_RECOVER) &&
1683		    !(nmp->nm_sockflags & NMSOCK_UNMOUNT) &&
1684		    !(nmp->nm_state & NFSSTA_FORCE)) {
1685			/* perform state recovery */
1686			lck_mtx_unlock(&nmp->nm_lock);
1687			nfs_recover(nmp);
1688			lck_mtx_lock(&nmp->nm_lock);
1689		}
1690		/* handle NFSv4 delegation returns */
1691		while ((nmp->nm_vers >= NFS_VER4) && !(nmp->nm_state & NFSSTA_FORCE) &&
1692		       (nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER) &&
1693		       ((np = TAILQ_FIRST(&nmp->nm_dreturnq)))) {
1694			lck_mtx_unlock(&nmp->nm_lock);
1695			nfs4_delegation_return(np, R_RECOVER, thd, nmp->nm_mcred);
1696			lck_mtx_lock(&nmp->nm_lock);
1697		}
1698		/* do resends, if necessary/possible */
1699		while ((((nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER)) || (nmp->nm_state & NFSSTA_FORCE)) &&
1700		       ((req = TAILQ_FIRST(&nmp->nm_resendq)))) {
1701			if (req->r_resendtime)
1702				microuptime(&now);
1703			while (req && !(nmp->nm_state & NFSSTA_FORCE) && req->r_resendtime && (now.tv_sec < req->r_resendtime))
1704				req = TAILQ_NEXT(req, r_rchain);
1705			if (!req)
1706				break;
1707			TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
1708			req->r_rchain.tqe_next = NFSREQNOLIST;
1709			lck_mtx_unlock(&nmp->nm_lock);
1710			lck_mtx_lock(&req->r_mtx);
1711			if (req->r_error || req->r_nmrep.nmc_mhead) {
1712				dofinish = req->r_callback.rcb_func && !(req->r_flags & R_WAITSENT);
1713				req->r_flags &= ~R_RESENDQ;
1714				wakeup(req);
1715				lck_mtx_unlock(&req->r_mtx);
1716				if (dofinish)
1717					nfs_asyncio_finish(req);
1718				lck_mtx_lock(&nmp->nm_lock);
1719				continue;
1720			}
1721			if ((req->r_flags & R_RESTART) || nfs_request_using_gss(req)) {
1722				req->r_flags &= ~R_RESTART;
1723				req->r_resendtime = 0;
1724				lck_mtx_unlock(&req->r_mtx);
1725				/* async RPCs on GSS mounts need to be rebuilt and resent. */
1726				nfs_reqdequeue(req);
1727				if (nfs_request_using_gss(req)) {
1728					nfs_gss_clnt_rpcdone(req);
1729					error = nfs_gss_clnt_args_restore(req);
1730					if (error == ENEEDAUTH)
1731						req->r_xid = 0;
1732				}
1733				NFS_SOCK_DBG(("nfs async%s restart: p %d x 0x%llx f 0x%x rtt %d\n",
1734					nfs_request_using_gss(req) ? " gss" : "", req->r_procnum, req->r_xid,
1735					req->r_flags, req->r_rtt));
1736				error = !req->r_nmp ? ENXIO : 0;	/* unmounted? */
1737				if (!error)
1738					error = nfs_sigintr(nmp, req, req->r_thread, 0);
1739				if (!error)
1740					error = nfs_request_add_header(req);
1741				if (!error)
1742					error = nfs_request_send(req, 0);
1743				lck_mtx_lock(&req->r_mtx);
1744				if (req->r_flags & R_RESENDQ)
1745					req->r_flags &= ~R_RESENDQ;
1746				if (error)
1747					req->r_error = error;
1748				wakeup(req);
1749				dofinish = error && req->r_callback.rcb_func && !(req->r_flags & R_WAITSENT);
1750				lck_mtx_unlock(&req->r_mtx);
1751				if (dofinish)
1752					nfs_asyncio_finish(req);
1753				lck_mtx_lock(&nmp->nm_lock);
1754				error = 0;
1755				continue;
1756			}
1757			NFS_SOCK_DBG(("nfs async resend: p %d x 0x%llx f 0x%x rtt %d\n",
1758				req->r_procnum, req->r_xid, req->r_flags, req->r_rtt));
1759			error = !req->r_nmp ? ENXIO : 0;	/* unmounted? */
1760			if (!error)
1761				error = nfs_sigintr(nmp, req, req->r_thread, 0);
1762			if (!error) {
1763				req->r_flags |= R_SENDING;
1764				lck_mtx_unlock(&req->r_mtx);
1765				error = nfs_send(req, 0);
1766				lck_mtx_lock(&req->r_mtx);
1767				if (!error) {
1768					if (req->r_flags & R_RESENDQ)
1769						req->r_flags &= ~R_RESENDQ;
1770					wakeup(req);
1771					lck_mtx_unlock(&req->r_mtx);
1772					lck_mtx_lock(&nmp->nm_lock);
1773					continue;
1774				}
1775			}
1776			req->r_error = error;
1777			if (req->r_flags & R_RESENDQ)
1778				req->r_flags &= ~R_RESENDQ;
1779			wakeup(req);
1780			dofinish = req->r_callback.rcb_func && !(req->r_flags & R_WAITSENT);
1781			lck_mtx_unlock(&req->r_mtx);
1782			if (dofinish)
1783				nfs_asyncio_finish(req);
1784			lck_mtx_lock(&nmp->nm_lock);
1785		}
1786		if (nmp->nm_deadto_start)
1787			nfs_mount_check_dead_timeout(nmp);
1788		if (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_DEAD))
1789			break;
1790		/* check monitored nodes, if necessary/possible */
1791		if (!LIST_EMPTY(&nmp->nm_monlist)) {
1792			nmp->nm_state |= NFSSTA_MONITOR_SCAN;
1793			LIST_FOREACH(np, &nmp->nm_monlist, n_monlink) {
1794				if (!(nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & (NFSSTA_RECOVER|NFSSTA_UNMOUNTING|NFSSTA_FORCE)))
1795					break;
1796				np->n_mflag |= NMMONSCANINPROG;
1797				lck_mtx_unlock(&nmp->nm_lock);
1798				error = nfs_getattr(np, NULL, vfs_context_kernel(), (NGA_UNCACHED|NGA_MONITOR));
1799				if (!error && ISSET(np->n_flag, NUPDATESIZE)) /* update quickly to avoid multiple events */
1800					nfs_data_update_size(np, 0);
1801				lck_mtx_lock(&nmp->nm_lock);
1802				np->n_mflag &= ~NMMONSCANINPROG;
1803				if (np->n_mflag & NMMONSCANWANT) {
1804					np->n_mflag &= ~NMMONSCANWANT;
1805					wakeup(&np->n_mflag);
1806				}
1807				if (error || !(nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & (NFSSTA_RECOVER|NFSSTA_UNMOUNTING|NFSSTA_FORCE)))
1808					break;
1809			}
1810			nmp->nm_state &= ~NFSSTA_MONITOR_SCAN;
1811			if (nmp->nm_state & NFSSTA_UNMOUNTING)
1812				wakeup(&nmp->nm_state); /* let unmounting thread know scan is done */
1813		}
1814		if ((nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & (NFSSTA_RECOVER|NFSSTA_UNMOUNTING))) {
1815			if (nmp->nm_deadto_start || !TAILQ_EMPTY(&nmp->nm_resendq) ||
1816			    (nmp->nm_state & NFSSTA_RECOVER))
1817				ts.tv_sec = 1;
1818			else
1819				ts.tv_sec = 5;
1820			msleep(&nmp->nm_sockthd, &nmp->nm_lock, PSOCK, "nfssockthread", &ts);
1821		}
1822	}
1823
1824	/* If we're unmounting, send the unmount RPC, if requested/appropriate. */
1825	if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) &&
1826	    (nmp->nm_state & NFSSTA_MOUNTED) && NMFLAG(nmp, CALLUMNT) &&
1827	    (nmp->nm_vers < NFS_VER4) && !(nmp->nm_state & (NFSSTA_FORCE|NFSSTA_DEAD))) {
1828		lck_mtx_unlock(&nmp->nm_lock);
1829		nfs3_umount_rpc(nmp, vfs_context_kernel(),
1830			(nmp->nm_sockflags & NMSOCK_READY) ? 6 : 2);
1831		lck_mtx_lock(&nmp->nm_lock);
1832	}
1833
1834	if (nmp->nm_sockthd == thd)
1835		nmp->nm_sockthd = NULL;
1836	lck_mtx_unlock(&nmp->nm_lock);
1837	wakeup(&nmp->nm_sockthd);
1838	thread_terminate(thd);
1839}
1840
1841/* start or wake a mount's socket thread */
1842void
1843nfs_mount_sock_thread_wake(struct nfsmount *nmp)
1844{
1845	if (nmp->nm_sockthd)
1846		wakeup(&nmp->nm_sockthd);
1847	else if (kernel_thread_start(nfs_mount_sock_thread, nmp, &nmp->nm_sockthd) == KERN_SUCCESS)
1848		thread_deallocate(nmp->nm_sockthd);
1849}
1850
1851/*
1852 * Check if we should mark the mount dead because the
1853 * unresponsive mount has reached the dead timeout.
1854 * (must be called with nmp locked)
1855 */
1856void
1857nfs_mount_check_dead_timeout(struct nfsmount *nmp)
1858{
1859	struct timeval now;
1860
1861	if (nmp->nm_deadto_start == 0)
1862		return;
1863	if (nmp->nm_state & NFSSTA_DEAD)
1864		return;
1865	nfs_is_squishy(nmp);
1866	if (nmp->nm_curdeadtimeout <= 0)
1867		return;
1868	microuptime(&now);
1869	if ((now.tv_sec - nmp->nm_deadto_start) < nmp->nm_curdeadtimeout)
1870		return;
1871	printf("nfs server %s: %sdead\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname,
1872	       (nmp->nm_curdeadtimeout != nmp->nm_deadtimeout) ? "squished " : "");
1873	nmp->nm_state |= NFSSTA_DEAD;
1874	vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_DEAD, 0);
1875}
1876
1877/*
1878 * NFS callback channel socket state
1879 */
struct nfs_callback_socket
{
	TAILQ_ENTRY(nfs_callback_socket) ncbs_link;	/* linkage on nfs4_cb_socks list */
	socket_t			ncbs_so;	/* the socket */
	struct sockaddr_storage		ncbs_saddr;	/* socket address */
	struct nfs_rpc_record_state	ncbs_rrs;	/* RPC record parsing state */
	time_t				ncbs_stamp;	/* last accessed at */
	uint32_t			ncbs_flags;	/* see below */
};
#define NCBSOCK_UPCALL		0x0001	/* socket upcall (request processing) in progress */
#define NCBSOCK_UPCALLWANT	0x0002	/* a thread is waiting for the upcall to finish */
#define NCBSOCK_DEAD		0x0004	/* unwanted/closed socket; reaped by the cleanup timer */
1892
1893/*
1894 * NFS callback channel state
1895 *
1896 * One listening socket for accepting socket connections from servers and
1897 * a list of connected sockets to handle callback requests on.
1898 * Mounts registered with the callback channel are assigned IDs and
1899 * put on a list so that the callback request handling code can match
1900 * the requests up with mounts.
1901 */
socket_t nfs4_cb_so = NULL;		/* IPv4 listening socket */
socket_t nfs4_cb_so6 = NULL;		/* IPv6 listening socket */
in_port_t nfs4_cb_port = 0;		/* port the IPv4 listening socket is bound to */
in_port_t nfs4_cb_port6 = 0;		/* port the IPv6 listening socket is bound to */
uint32_t nfs4_cb_id = 0;		/* next callback ID to assign (0 means "uninitialized") */
uint32_t nfs4_cb_so_usecount = 0;	/* number of mounts using the callback socket(s) */
TAILQ_HEAD(nfs4_cb_sock_list,nfs_callback_socket) nfs4_cb_socks;	/* accepted callback connections */
TAILQ_HEAD(nfs4_cb_mount_list,nfsmount) nfs4_cb_mounts;			/* mounts registered for callbacks */

int nfs4_cb_handler(struct nfs_callback_socket *, mbuf_t);
1912
1913/*
1914 * Set up the callback channel for the NFS mount.
1915 *
1916 * Initializes the callback channel socket state and
1917 * assigns a callback ID to the mount.
1918 */
void
nfs4_mount_callback_setup(struct nfsmount *nmp)
{
	struct sockaddr_in sin;
	struct sockaddr_in6 sin6;
	socket_t so = NULL;
	socket_t so6 = NULL;
	struct timeval timeo;
	int error, on = 1;
	in_port_t port;

	lck_mtx_lock(nfs_global_mutex);
	/* First mount through: initialize the global callback lists. */
	if (nfs4_cb_id == 0) {
		TAILQ_INIT(&nfs4_cb_mounts);
		TAILQ_INIT(&nfs4_cb_socks);
		nfs4_cb_id++;
	}
	/* assign the mount a nonzero callback ID (cbid 0 is reserved/cleared on shutdown) */
	nmp->nm_cbid = nfs4_cb_id++;
	if (nmp->nm_cbid == 0)
		nmp->nm_cbid = nfs4_cb_id++;
	nfs4_cb_so_usecount++;
	TAILQ_INSERT_HEAD(&nfs4_cb_mounts, nmp, nm_cblink);

	/* If the listening socket(s) already exist, there's nothing more to do. */
	if (nfs4_cb_so) {
		lck_mtx_unlock(nfs_global_mutex);
		return;
	}

	/* IPv4 */
	error = sock_socket(AF_INET, SOCK_STREAM, IPPROTO_TCP, nfs4_cb_accept, NULL, &nfs4_cb_so);
	if (error) {
		log(LOG_INFO, "nfs callback setup: error %d creating listening IPv4 socket\n", error);
		goto fail;
	}
	so = nfs4_cb_so;

	sock_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
	sin.sin_len = sizeof(struct sockaddr_in);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	sin.sin_port = htons(nfs_callback_port); /* try to use specified port */
	error = sock_bind(so, (struct sockaddr *)&sin);
	if (error) {
		log(LOG_INFO, "nfs callback setup: error %d binding listening IPv4 socket\n", error);
		goto fail;
	}
	/* find out which port we actually got bound to */
	error = sock_getsockname(so, (struct sockaddr *)&sin, sin.sin_len);
	if (error) {
		log(LOG_INFO, "nfs callback setup: error %d getting listening IPv4 socket port\n", error);
		goto fail;
	}
	nfs4_cb_port = ntohs(sin.sin_port);

	error = sock_listen(so, 32);
	if (error) {
		log(LOG_INFO, "nfs callback setup: error %d on IPv4 listen\n", error);
		goto fail;
	}

	/* receive timeout shouldn't matter.  If timeout on send, we'll want to drop the socket */
	timeo.tv_usec = 0;
	timeo.tv_sec = 60;
	error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	if (error)
		log(LOG_INFO, "nfs callback setup: error %d setting IPv4 socket rx timeout\n", error);
	error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
	if (error)
		log(LOG_INFO, "nfs callback setup: error %d setting IPv4 socket tx timeout\n", error);
	sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
	sock_setsockopt(so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on));
	sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on));
	/* the setsockopt failures above are non-fatal, so clear error before continuing */
	error = 0;

	/* IPv6 */
	error = sock_socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP, nfs4_cb_accept, NULL, &nfs4_cb_so6);
	if (error) {
		log(LOG_INFO, "nfs callback setup: error %d creating listening IPv6 socket\n", error);
		goto fail;
	}
	so6 = nfs4_cb_so6;

	sock_setsockopt(so6, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
	sock_setsockopt(so6, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));
	/* try to use specified port or same port as IPv4 */
	port = nfs_callback_port ? nfs_callback_port : nfs4_cb_port;
ipv6_bind_again:
	sin6.sin6_len = sizeof(struct sockaddr_in6);
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = in6addr_any;
	sin6.sin6_port = htons(port);
	error = sock_bind(so6, (struct sockaddr *)&sin6);
	if (error) {
		if (port != nfs_callback_port) {
			/* if we simply tried to match the IPv4 port, then try any port */
			port = 0;
			goto ipv6_bind_again;
		}
		log(LOG_INFO, "nfs callback setup: error %d binding listening IPv6 socket\n", error);
		goto fail;
	}
	/* find out which port we actually got bound to */
	error = sock_getsockname(so6, (struct sockaddr *)&sin6, sin6.sin6_len);
	if (error) {
		log(LOG_INFO, "nfs callback setup: error %d getting listening IPv6 socket port\n", error);
		goto fail;
	}
	nfs4_cb_port6 = ntohs(sin6.sin6_port);

	error = sock_listen(so6, 32);
	if (error) {
		log(LOG_INFO, "nfs callback setup: error %d on IPv6 listen\n", error);
		goto fail;
	}

	/* receive timeout shouldn't matter.  If timeout on send, we'll want to drop the socket */
	timeo.tv_usec = 0;
	timeo.tv_sec = 60;
	error = sock_setsockopt(so6, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	if (error)
		log(LOG_INFO, "nfs callback setup: error %d setting IPv6 socket rx timeout\n", error);
	error = sock_setsockopt(so6, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
	if (error)
		log(LOG_INFO, "nfs callback setup: error %d setting IPv6 socket tx timeout\n", error);
	sock_setsockopt(so6, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
	sock_setsockopt(so6, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on));
	sock_setsockopt(so6, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on));
	/* the setsockopt failures above are non-fatal, so clear error before continuing */
	error = 0;

fail:
	if (error) {
		/* clear the globals while still holding the lock, then close outside it */
		nfs4_cb_so = nfs4_cb_so6 = NULL;
		lck_mtx_unlock(nfs_global_mutex);
		if (so) {
			sock_shutdown(so, SHUT_RDWR);
			sock_close(so);
		}
		if (so6) {
			sock_shutdown(so6, SHUT_RDWR);
			sock_close(so6);
		}
	} else {
		lck_mtx_unlock(nfs_global_mutex);
	}
}
2062
2063/*
2064 * Shut down the callback channel for the NFS mount.
2065 *
 * Clears the mount's callback ID and releases the mount's
 * reference on the callback socket.  Last reference dropped
2068 * will also shut down the callback socket(s).
2069 */
void
nfs4_mount_callback_shutdown(struct nfsmount *nmp)
{
	struct nfs_callback_socket *ncbsp;
	socket_t so, so6;
	struct nfs4_cb_sock_list cb_socks;
	struct timespec ts = {1,0};

	lck_mtx_lock(nfs_global_mutex);
	TAILQ_REMOVE(&nfs4_cb_mounts, nmp, nm_cblink);
	/* wait for any callbacks in progress to complete */
	while (nmp->nm_cbrefs)
		msleep(&nmp->nm_cbrefs, nfs_global_mutex, PSOCK, "cbshutwait", &ts);
	nmp->nm_cbid = 0;	/* clear this mount's callback ID */
	/* if other mounts still use the callback socket(s), we're done */
	if (--nfs4_cb_so_usecount) {
		lck_mtx_unlock(nfs_global_mutex);
		return;
	}
	/*
	 * Last user: take ownership of the listening sockets and the list of
	 * connected callback sockets while holding the lock, then do the
	 * actual shutdown/close work with the lock dropped.
	 */
	so = nfs4_cb_so;
	so6 = nfs4_cb_so6;
	nfs4_cb_so = nfs4_cb_so6 = NULL;
	TAILQ_INIT(&cb_socks);
	TAILQ_CONCAT(&cb_socks, &nfs4_cb_socks, ncbs_link);
	lck_mtx_unlock(nfs_global_mutex);
	if (so) {
		sock_shutdown(so, SHUT_RDWR);
		sock_close(so);
	}
	if (so6) {
		sock_shutdown(so6, SHUT_RDWR);
		sock_close(so6);
	}
	/* tear down each connected callback socket */
	while ((ncbsp = TAILQ_FIRST(&cb_socks))) {
		TAILQ_REMOVE(&cb_socks, ncbsp, ncbs_link);
		sock_shutdown(ncbsp->ncbs_so, SHUT_RDWR);
		sock_close(ncbsp->ncbs_so);
		nfs_rpc_record_state_cleanup(&ncbsp->ncbs_rrs);
		FREE(ncbsp, M_TEMP);
	}
}
2110
2111/*
2112 * Check periodically for stale/unused nfs callback sockets
2113 */
#define NFS4_CB_TIMER_PERIOD	30	/* how often the reaper runs, in seconds */
#define NFS4_CB_IDLE_MAX	300	/* max idle time before a socket is reaped, in seconds */
void
nfs4_callback_timer(__unused void *param0, __unused void *param1)
{
	struct nfs_callback_socket *ncbsp, *nextncbsp;
	struct timeval now;

loop:
	lck_mtx_lock(nfs_global_mutex);
	/* no callback sockets left: let the timer stop */
	if (TAILQ_EMPTY(&nfs4_cb_socks)) {
		nfs4_callback_timer_on = 0;
		lck_mtx_unlock(nfs_global_mutex);
		return;
	}
	microuptime(&now);
	TAILQ_FOREACH_SAFE(ncbsp, &nfs4_cb_socks, ncbs_link, nextncbsp) {
		/* keep sockets that aren't dead and haven't been idle too long */
		if (!(ncbsp->ncbs_flags & NCBSOCK_DEAD) &&
		     (now.tv_sec < (ncbsp->ncbs_stamp + NFS4_CB_IDLE_MAX)))
			continue;
		/*
		 * Reap this socket.  We must drop the lock to close it,
		 * which invalidates the list iteration, so restart the
		 * scan from the top afterwards.
		 */
		TAILQ_REMOVE(&nfs4_cb_socks, ncbsp, ncbs_link);
		lck_mtx_unlock(nfs_global_mutex);
		sock_shutdown(ncbsp->ncbs_so, SHUT_RDWR);
		sock_close(ncbsp->ncbs_so);
		nfs_rpc_record_state_cleanup(&ncbsp->ncbs_rrs);
		FREE(ncbsp, M_TEMP);
		goto loop;
	}
	/* sockets remain - reschedule the timer */
	nfs4_callback_timer_on = 1;
	nfs_interval_timer_start(nfs4_callback_timer_call,
		NFS4_CB_TIMER_PERIOD * 1000);
	lck_mtx_unlock(nfs_global_mutex);
}
2147
2148/*
2149 * Accept a new callback socket.
2150 */
void
nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag)
{
	socket_t newso = NULL;
	struct nfs_callback_socket *ncbsp;
	struct nfsmount *nmp;
	struct timeval timeo, now;
	int error, on = 1, ip;

	/* determine which listening socket (IPv4 or IPv6) this upcall is for */
	if (so == nfs4_cb_so)
		ip = 4;
	else if (so == nfs4_cb_so6)
		ip = 6;
	else
		return;

	/* allocate/initialize a new nfs_callback_socket */
	MALLOC(ncbsp, struct nfs_callback_socket *, sizeof(struct nfs_callback_socket), M_TEMP, M_WAITOK);
	if (!ncbsp) {
		log(LOG_ERR, "nfs callback accept: no memory for new socket\n");
		return;
	}
	bzero(ncbsp, sizeof(*ncbsp));
	ncbsp->ncbs_saddr.ss_len = (ip == 4) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
	nfs_rpc_record_state_init(&ncbsp->ncbs_rrs);

	/* accept a new socket */
	error = sock_accept(so, (struct sockaddr*)&ncbsp->ncbs_saddr,
			ncbsp->ncbs_saddr.ss_len, MSG_DONTWAIT,
			nfs4_cb_rcv, ncbsp, &newso);
	if (error) {
		log(LOG_INFO, "nfs callback accept: error %d accepting IPv%d socket\n", error, ip);
		FREE(ncbsp, M_TEMP);
		return;
	}

	/* set up the new socket */
	/* receive timeout shouldn't matter.  If timeout on send, we'll want to drop the socket */
	timeo.tv_usec = 0;
	timeo.tv_sec = 60;
	error = sock_setsockopt(newso, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	if (error)
		log(LOG_INFO, "nfs callback socket: error %d setting IPv%d socket rx timeout\n", error, ip);
	error = sock_setsockopt(newso, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
	if (error)
		log(LOG_INFO, "nfs callback socket: error %d setting IPv%d socket tx timeout\n", error, ip);
	sock_setsockopt(newso, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
	sock_setsockopt(newso, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
	sock_setsockopt(newso, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on));
	sock_setsockopt(newso, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on));

	ncbsp->ncbs_so = newso;
	microuptime(&now);
	ncbsp->ncbs_stamp = now.tv_sec;	/* note last-accessed time for the idle-socket reaper */

	lck_mtx_lock(nfs_global_mutex);

	/* add it to the list */
	TAILQ_INSERT_HEAD(&nfs4_cb_socks, ncbsp, ncbs_link);

	/* verify it's from a host we have mounted */
	TAILQ_FOREACH(nmp, &nfs4_cb_mounts, nm_cblink) {
		/* check if socket's source address matches this mount's server address */
		if (!nmp->nm_saddr)
			continue;
		if (nfs_sockaddr_cmp((struct sockaddr*)&ncbsp->ncbs_saddr, nmp->nm_saddr) == 0)
			break;
	}
	if (!nmp) /* we don't want this socket, mark it dead */
		ncbsp->ncbs_flags |= NCBSOCK_DEAD;

	/* make sure the callback socket cleanup timer is running */
	/* (shorten the timer if we've got a socket we don't want) */
	if (!nfs4_callback_timer_on) {
		nfs4_callback_timer_on = 1;
		nfs_interval_timer_start(nfs4_callback_timer_call,
			!nmp ? 500 : (NFS4_CB_TIMER_PERIOD * 1000));
	} else if (!nmp && (nfs4_callback_timer_on < 2)) {
		/* timer already armed - reschedule it sooner to reap the unwanted socket */
		nfs4_callback_timer_on = 2;
		thread_call_cancel(nfs4_callback_timer_call);
		nfs_interval_timer_start(nfs4_callback_timer_call, 500);
	}

	lck_mtx_unlock(nfs_global_mutex);
}
2236
2237/*
2238 * Receive mbufs from callback sockets into RPC records and process each record.
2239 * Detect connection has been closed and shut down.
2240 */
void
nfs4_cb_rcv(socket_t so, void *arg, __unused int waitflag)
{
	struct nfs_callback_socket *ncbsp = arg;
	struct timespec ts = {1,0};
	struct timeval now;
	mbuf_t m;
	int error = 0, recv = 1;

	/* serialize upcalls on this socket */
	lck_mtx_lock(nfs_global_mutex);
	while (ncbsp->ncbs_flags & NCBSOCK_UPCALL) {
		/* wait if upcall is already in progress */
		ncbsp->ncbs_flags |= NCBSOCK_UPCALLWANT;
		msleep(ncbsp, nfs_global_mutex, PSOCK, "cbupcall", &ts);
	}
	ncbsp->ncbs_flags |= NCBSOCK_UPCALL;
	lck_mtx_unlock(nfs_global_mutex);

	/* loop while we make error-free progress */
	while (!error && recv) {
		error = nfs_rpc_record_read(so, &ncbsp->ncbs_rrs, MSG_DONTWAIT, &recv, &m);
		if (m) /* handle the request */
			error = nfs4_cb_handler(ncbsp, m);
	}

	/* note: no error and no data indicates server closed its end */
	if ((error != EWOULDBLOCK) && (error || !recv)) {
		/*
		 * Socket is either being closed or should be.
		 * We can't close the socket in the context of the upcall.
		 * So we mark it as dead and leave it for the cleanup timer to reap.
		 */
		ncbsp->ncbs_stamp = 0;	/* stamp 0 makes the timer reap it immediately */
		ncbsp->ncbs_flags |= NCBSOCK_DEAD;
	} else {
		microuptime(&now);
		ncbsp->ncbs_stamp = now.tv_sec;	/* refresh last-accessed time */
	}

	/* done - clear the upcall flag and wake any waiter */
	lck_mtx_lock(nfs_global_mutex);
	ncbsp->ncbs_flags &= ~NCBSOCK_UPCALL;
	lck_mtx_unlock(nfs_global_mutex);
	wakeup(ncbsp);
}
2285
2286/*
2287 * Handle an NFS callback channel request.
2288 */
2289int
2290nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq)
2291{
2292	socket_t so = ncbsp->ncbs_so;
2293	struct nfsm_chain nmreq, nmrep;
2294	mbuf_t mhead = NULL, mrest = NULL, m;
2295	struct msghdr msg;
2296	struct nfsmount *nmp;
2297	fhandle_t fh;
2298	nfsnode_t np;
2299	nfs_stateid stateid;
2300	uint32_t bitmap[NFS_ATTR_BITMAP_LEN], rbitmap[NFS_ATTR_BITMAP_LEN], bmlen, truncate, attrbytes;
2301	uint32_t val, xid, procnum, taglen, cbid, numops, op, status;
2302	uint32_t auth_type, auth_len;
2303	uint32_t numres, *pnumres;
2304	int error = 0, replen, len;
2305	size_t sentlen = 0;
2306
2307	xid = numops = op = status = procnum = taglen = cbid = 0;
2308
2309	nfsm_chain_dissect_init(error, &nmreq, mreq);
2310	nfsm_chain_get_32(error, &nmreq, xid);		// RPC XID
2311	nfsm_chain_get_32(error, &nmreq, val);		// RPC Call
2312	nfsm_assert(error, (val == RPC_CALL), EBADRPC);
2313	nfsm_chain_get_32(error, &nmreq, val);		// RPC Version
2314	nfsm_assert(error, (val == RPC_VER2), ERPCMISMATCH);
2315	nfsm_chain_get_32(error, &nmreq, val);		// RPC Program Number
2316	nfsm_assert(error, (val == NFS4_CALLBACK_PROG), EPROGUNAVAIL);
2317	nfsm_chain_get_32(error, &nmreq, val);		// NFS Callback Program Version Number
2318	nfsm_assert(error, (val == NFS4_CALLBACK_PROG_VERSION), EPROGMISMATCH);
2319	nfsm_chain_get_32(error, &nmreq, procnum);	// NFS Callback Procedure Number
2320	nfsm_assert(error, (procnum <= NFSPROC4_CB_COMPOUND), EPROCUNAVAIL);
2321
2322	/* Handle authentication */
2323	/* XXX just ignore auth for now - handling kerberos may be tricky */
2324	nfsm_chain_get_32(error, &nmreq, auth_type);	// RPC Auth Flavor
2325	nfsm_chain_get_32(error, &nmreq, auth_len);	// RPC Auth Length
2326	nfsm_assert(error, (auth_len <= RPCAUTH_MAXSIZ), EBADRPC);
2327	if (!error && (auth_len > 0))
2328		nfsm_chain_adv(error, &nmreq, nfsm_rndup(auth_len));
2329	nfsm_chain_adv(error, &nmreq, NFSX_UNSIGNED);	// verifier flavor (should be AUTH_NONE)
2330	nfsm_chain_get_32(error, &nmreq, auth_len);	// verifier length
2331	nfsm_assert(error, (auth_len <= RPCAUTH_MAXSIZ), EBADRPC);
2332	if (!error && (auth_len > 0))
2333		nfsm_chain_adv(error, &nmreq, nfsm_rndup(auth_len));
2334	if (error) {
2335		status = error;
2336		error = 0;
2337		goto nfsmout;
2338	}
2339
2340	switch (procnum) {
2341	case NFSPROC4_CB_NULL:
2342		status = NFSERR_RETVOID;
2343		break;
2344	case NFSPROC4_CB_COMPOUND:
2345		/* tag, minorversion, cb ident, numops, op array */
2346		nfsm_chain_get_32(error, &nmreq, taglen);	/* tag length */
2347		nfsm_assert(error, (val <= NFS4_OPAQUE_LIMIT), EBADRPC);
2348
2349		/* start building the body of the response */
2350		nfsm_mbuf_get(error, &mrest, nfsm_rndup(taglen) + 5*NFSX_UNSIGNED);
2351		nfsm_chain_init(&nmrep, mrest);
2352
2353		/* copy tag from request to response */
2354		nfsm_chain_add_32(error, &nmrep, taglen);	/* tag length */
2355		for (len = (int)taglen; !error && (len > 0); len -= NFSX_UNSIGNED) {
2356			nfsm_chain_get_32(error, &nmreq, val);
2357			nfsm_chain_add_32(error, &nmrep, val);
2358		}
2359
2360		/* insert number of results placeholder */
2361		numres = 0;
2362		nfsm_chain_add_32(error, &nmrep, numres);
2363		pnumres = (uint32_t*)(nmrep.nmc_ptr - NFSX_UNSIGNED);
2364
2365		nfsm_chain_get_32(error, &nmreq, val);		/* minorversion */
2366		nfsm_assert(error, (val == 0), NFSERR_MINOR_VERS_MISMATCH);
2367		nfsm_chain_get_32(error, &nmreq, cbid);		/* callback ID */
2368		nfsm_chain_get_32(error, &nmreq, numops);	/* number of operations */
2369		if (error) {
2370			if ((error == EBADRPC) || (error == NFSERR_MINOR_VERS_MISMATCH))
2371				status = error;
2372			else if ((error == ENOBUFS) || (error == ENOMEM))
2373				status = NFSERR_RESOURCE;
2374			else
2375				status = NFSERR_SERVERFAULT;
2376			error = 0;
2377			nfsm_chain_null(&nmrep);
2378			goto nfsmout;
2379		}
2380		/* match the callback ID to a registered mount */
2381		lck_mtx_lock(nfs_global_mutex);
2382		TAILQ_FOREACH(nmp, &nfs4_cb_mounts, nm_cblink) {
2383			if (nmp->nm_cbid != cbid)
2384				continue;
2385			/* verify socket's source address matches this mount's server address */
2386			if (!nmp->nm_saddr)
2387				continue;
2388			if (nfs_sockaddr_cmp((struct sockaddr*)&ncbsp->ncbs_saddr, nmp->nm_saddr) == 0)
2389				break;
2390		}
2391		/* mark the NFS mount as busy */
2392		if (nmp)
2393			nmp->nm_cbrefs++;
2394		lck_mtx_unlock(nfs_global_mutex);
2395		if (!nmp) {
2396			/* if no mount match, just drop socket. */
2397			error = EPERM;
2398			nfsm_chain_null(&nmrep);
2399			goto out;
2400		}
2401
2402		/* process ops, adding results to mrest */
2403		while (numops > 0) {
2404			numops--;
2405			nfsm_chain_get_32(error, &nmreq, op);
2406			if (error)
2407				break;
2408			switch (op) {
2409			case NFS_OP_CB_GETATTR:
2410				// (FH, BITMAP) -> (STATUS, BITMAP, ATTRS)
2411				np = NULL;
2412				nfsm_chain_get_fh(error, &nmreq, NFS_VER4, &fh);
2413				bmlen = NFS_ATTR_BITMAP_LEN;
2414				nfsm_chain_get_bitmap(error, &nmreq, bitmap, bmlen);
2415				if (error) {
2416					status = error;
2417					error = 0;
2418					numops = 0; /* don't process any more ops */
2419				} else {
2420					/* find the node for the file handle */
2421					error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, RPCAUTH_UNKNOWN, NG_NOCREATE, &np);
2422					if (error || !np) {
2423						status = NFSERR_BADHANDLE;
2424						error = 0;
2425						np = NULL;
2426						numops = 0; /* don't process any more ops */
2427					}
2428				}
2429				nfsm_chain_add_32(error, &nmrep, op);
2430				nfsm_chain_add_32(error, &nmrep, status);
2431				if (!error && (status == EBADRPC))
2432					error = status;
2433				if (np) {
2434					/* only allow returning size, change, and mtime attrs */
2435					NFS_CLEAR_ATTRIBUTES(&rbitmap);
2436					attrbytes = 0;
2437					if (NFS_BITMAP_ISSET(&bitmap, NFS_FATTR_CHANGE)) {
2438						NFS_BITMAP_SET(&rbitmap, NFS_FATTR_CHANGE);
2439						attrbytes += 2 * NFSX_UNSIGNED;
2440					}
2441					if (NFS_BITMAP_ISSET(&bitmap, NFS_FATTR_SIZE)) {
2442						NFS_BITMAP_SET(&rbitmap, NFS_FATTR_SIZE);
2443						attrbytes += 2 * NFSX_UNSIGNED;
2444					}
2445					if (NFS_BITMAP_ISSET(&bitmap, NFS_FATTR_TIME_MODIFY)) {
2446						NFS_BITMAP_SET(&rbitmap, NFS_FATTR_TIME_MODIFY);
2447						attrbytes += 3 * NFSX_UNSIGNED;
2448					}
2449					nfsm_chain_add_bitmap(error, &nmrep, rbitmap, NFS_ATTR_BITMAP_LEN);
2450					nfsm_chain_add_32(error, &nmrep, attrbytes);
2451					if (NFS_BITMAP_ISSET(&bitmap, NFS_FATTR_CHANGE))
2452						nfsm_chain_add_64(error, &nmrep,
2453							np->n_vattr.nva_change + ((np->n_flag & NMODIFIED) ? 1 : 0));
2454					if (NFS_BITMAP_ISSET(&bitmap, NFS_FATTR_SIZE))
2455						nfsm_chain_add_64(error, &nmrep, np->n_size);
2456					if (NFS_BITMAP_ISSET(&bitmap, NFS_FATTR_TIME_MODIFY)) {
2457						nfsm_chain_add_64(error, &nmrep, np->n_vattr.nva_timesec[NFSTIME_MODIFY]);
2458						nfsm_chain_add_32(error, &nmrep, np->n_vattr.nva_timensec[NFSTIME_MODIFY]);
2459					}
2460					nfs_node_unlock(np);
2461					vnode_put(NFSTOV(np));
2462					np = NULL;
2463				}
2464				/*
2465				 * If we hit an error building the reply, we can't easily back up.
2466				 * So we'll just update the status and hope the server ignores the
2467				 * extra garbage.
2468				 */
2469				break;
2470			case NFS_OP_CB_RECALL:
2471				// (STATEID, TRUNCATE, FH) -> (STATUS)
2472				np = NULL;
2473				nfsm_chain_get_stateid(error, &nmreq, &stateid);
2474				nfsm_chain_get_32(error, &nmreq, truncate);
2475				nfsm_chain_get_fh(error, &nmreq, NFS_VER4, &fh);
2476				if (error) {
2477					status = error;
2478					error = 0;
2479					numops = 0; /* don't process any more ops */
2480				} else {
2481					/* find the node for the file handle */
2482					error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, RPCAUTH_UNKNOWN, NG_NOCREATE, &np);
2483					if (error || !np) {
2484						status = NFSERR_BADHANDLE;
2485						error = 0;
2486						np = NULL;
2487						numops = 0; /* don't process any more ops */
2488					} else if (!(np->n_openflags & N_DELEG_MASK) ||
2489						    bcmp(&np->n_dstateid, &stateid, sizeof(stateid))) {
2490						/* delegation stateid state doesn't match */
2491						status = NFSERR_BAD_STATEID;
2492						numops = 0; /* don't process any more ops */
2493					}
2494					if (!status) /* add node to recall queue, and wake socket thread */
2495						nfs4_delegation_return_enqueue(np);
2496					if (np) {
2497						nfs_node_unlock(np);
2498						vnode_put(NFSTOV(np));
2499					}
2500				}
2501				nfsm_chain_add_32(error, &nmrep, op);
2502				nfsm_chain_add_32(error, &nmrep, status);
2503				if (!error && (status == EBADRPC))
2504					error = status;
2505				break;
2506			case NFS_OP_CB_ILLEGAL:
2507			default:
2508				nfsm_chain_add_32(error, &nmrep, NFS_OP_CB_ILLEGAL);
2509				status = NFSERR_OP_ILLEGAL;
2510				nfsm_chain_add_32(error, &nmrep, status);
2511				numops = 0; /* don't process any more ops */
2512				break;
2513			}
2514			numres++;
2515		}
2516
2517		if (!status && error) {
2518			if (error == EBADRPC)
2519				status = error;
2520			else if ((error == ENOBUFS) || (error == ENOMEM))
2521				status = NFSERR_RESOURCE;
2522			else
2523				status = NFSERR_SERVERFAULT;
2524			error = 0;
2525		}
2526
2527		/* Now, set the numres field */
2528		*pnumres = txdr_unsigned(numres);
2529		nfsm_chain_build_done(error, &nmrep);
2530		nfsm_chain_null(&nmrep);
2531
2532		/* drop the callback reference on the mount */
2533		lck_mtx_lock(nfs_global_mutex);
2534		nmp->nm_cbrefs--;
2535		if (!nmp->nm_cbid)
2536			wakeup(&nmp->nm_cbrefs);
2537		lck_mtx_unlock(nfs_global_mutex);
2538		break;
2539	}
2540
2541nfsmout:
2542	if (status == EBADRPC)
2543		OSAddAtomic64(1, &nfsstats.rpcinvalid);
2544
2545	/* build reply header */
2546	error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mhead);
2547	nfsm_chain_init(&nmrep, mhead);
2548	nfsm_chain_add_32(error, &nmrep, 0); /* insert space for an RPC record mark */
2549	nfsm_chain_add_32(error, &nmrep, xid);
2550	nfsm_chain_add_32(error, &nmrep, RPC_REPLY);
2551	if ((status == ERPCMISMATCH) || (status & NFSERR_AUTHERR)) {
2552		nfsm_chain_add_32(error, &nmrep, RPC_MSGDENIED);
2553		if (status & NFSERR_AUTHERR) {
2554			nfsm_chain_add_32(error, &nmrep, RPC_AUTHERR);
2555			nfsm_chain_add_32(error, &nmrep, (status & ~NFSERR_AUTHERR));
2556		} else {
2557			nfsm_chain_add_32(error, &nmrep, RPC_MISMATCH);
2558			nfsm_chain_add_32(error, &nmrep, RPC_VER2);
2559			nfsm_chain_add_32(error, &nmrep, RPC_VER2);
2560		}
2561	} else {
2562		/* reply status */
2563		nfsm_chain_add_32(error, &nmrep, RPC_MSGACCEPTED);
2564		/* XXX RPCAUTH_NULL verifier */
2565		nfsm_chain_add_32(error, &nmrep, RPCAUTH_NULL);
2566		nfsm_chain_add_32(error, &nmrep, 0);
2567		/* accepted status */
2568		switch (status) {
2569		case EPROGUNAVAIL:
2570			nfsm_chain_add_32(error, &nmrep, RPC_PROGUNAVAIL);
2571			break;
2572		case EPROGMISMATCH:
2573			nfsm_chain_add_32(error, &nmrep, RPC_PROGMISMATCH);
2574			nfsm_chain_add_32(error, &nmrep, NFS4_CALLBACK_PROG_VERSION);
2575			nfsm_chain_add_32(error, &nmrep, NFS4_CALLBACK_PROG_VERSION);
2576			break;
2577		case EPROCUNAVAIL:
2578			nfsm_chain_add_32(error, &nmrep, RPC_PROCUNAVAIL);
2579			break;
2580		case EBADRPC:
2581			nfsm_chain_add_32(error, &nmrep, RPC_GARBAGE);
2582			break;
2583		default:
2584			nfsm_chain_add_32(error, &nmrep, RPC_SUCCESS);
2585			if (status != NFSERR_RETVOID)
2586				nfsm_chain_add_32(error, &nmrep, status);
2587			break;
2588		}
2589	}
2590	nfsm_chain_build_done(error, &nmrep);
2591	if (error) {
2592		nfsm_chain_null(&nmrep);
2593		goto out;
2594	}
2595	error = mbuf_setnext(nmrep.nmc_mcur, mrest);
2596	if (error) {
2597		printf("nfs cb: mbuf_setnext failed %d\n", error);
2598		goto out;
2599	}
2600	mrest = NULL;
2601	/* Calculate the size of the reply */
2602	replen = 0;
2603	for (m = nmrep.nmc_mhead; m; m = mbuf_next(m))
2604		replen += mbuf_len(m);
2605	mbuf_pkthdr_setlen(mhead, replen);
2606	error = mbuf_pkthdr_setrcvif(mhead, NULL);
2607	nfsm_chain_set_recmark(error, &nmrep, (replen - NFSX_UNSIGNED) | 0x80000000);
2608	nfsm_chain_null(&nmrep);
2609
2610	/* send the reply */
2611	bzero(&msg, sizeof(msg));
2612	error = sock_sendmbuf(so, &msg, mhead, 0, &sentlen);
2613	mhead = NULL;
2614	if (!error && ((int)sentlen != replen))
2615		error = EWOULDBLOCK;
2616	if (error == EWOULDBLOCK) /* inability to send response is considered fatal */
2617		error = ETIMEDOUT;
2618out:
2619	if (error)
2620		nfsm_chain_cleanup(&nmrep);
2621	if (mhead)
2622		mbuf_freem(mhead);
2623	if (mrest)
2624		mbuf_freem(mrest);
2625	if (mreq)
2626		mbuf_freem(mreq);
2627	return (error);
2628}
2629
2630
2631/*
2632 * Initialize an nfs_rpc_record_state structure.
2633 */
void
nfs_rpc_record_state_init(struct nfs_rpc_record_state *nrrsp)
{
	bzero(nrrsp, sizeof(*nrrsp));
	/* start out expecting the (4 byte) TCP RPC record marker */
	nrrsp->nrrs_markerleft = sizeof(nrrsp->nrrs_fragleft);
}
2640
2641/*
2642 * Clean up an nfs_rpc_record_state structure.
2643 */
2644void
2645nfs_rpc_record_state_cleanup(struct nfs_rpc_record_state *nrrsp)
2646{
2647	if (nrrsp->nrrs_m) {
2648		mbuf_freem(nrrsp->nrrs_m);
2649		nrrsp->nrrs_m = nrrsp->nrrs_mlast = NULL;
2650	}
2651}
2652
2653/*
2654 * Read the next (marked) RPC record from the socket.
2655 *
2656 * *recvp returns if any data was received.
2657 * *mp returns the next complete RPC record
2658 */
int
nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int flags, int *recvp, mbuf_t *mp)
{
	struct iovec aio;
	struct msghdr msg;
	size_t rcvlen;
	int error = 0;
	mbuf_t m;

	*recvp = 0;
	*mp = NULL;

	/* read the TCP RPC record marker */
	while (!error && nrrsp->nrrs_markerleft) {
		/* receive into the not-yet-filled tail of the 4-byte marker buffer */
		aio.iov_base = ((char*)&nrrsp->nrrs_fragleft +
				sizeof(nrrsp->nrrs_fragleft) - nrrsp->nrrs_markerleft);
		aio.iov_len = nrrsp->nrrs_markerleft;
		bzero(&msg, sizeof(msg));
		msg.msg_iov = &aio;
		msg.msg_iovlen = 1;
		error = sock_receive(so, &msg, flags, &rcvlen);
		if (error || !rcvlen)
			break;
		*recvp = 1;
		nrrsp->nrrs_markerleft -= rcvlen;
		if (nrrsp->nrrs_markerleft)
			continue;
		/* record marker complete */
		nrrsp->nrrs_fragleft = ntohl(nrrsp->nrrs_fragleft);
		/* high bit of the marker flags the final fragment of the record */
		if (nrrsp->nrrs_fragleft & 0x80000000) {
			nrrsp->nrrs_lastfrag = 1;
			nrrsp->nrrs_fragleft &= ~0x80000000;
		}
		nrrsp->nrrs_reclen += nrrsp->nrrs_fragleft;
		if (nrrsp->nrrs_reclen > NFS_MAXPACKET) {
			/* This is SERIOUS! We are out of sync with the sender. */
			log(LOG_ERR, "impossible RPC record length (%d) on callback", nrrsp->nrrs_reclen);
			error = EFBIG;
		}
	}

	/* read the TCP RPC record fragment */
	while (!error && !nrrsp->nrrs_markerleft && nrrsp->nrrs_fragleft) {
		m = NULL;
		rcvlen = nrrsp->nrrs_fragleft;
		error = sock_receivembuf(so, NULL, &m, flags, &rcvlen);
		if (error || !rcvlen || !m)
			break;
		*recvp = 1;
		/* append mbufs to list */
		nrrsp->nrrs_fragleft -= rcvlen;
		if (!nrrsp->nrrs_m) {
			nrrsp->nrrs_m = m;
		} else {
			error = mbuf_setnext(nrrsp->nrrs_mlast, m);
			if (error) {
				printf("nfs tcp rcv: mbuf_setnext failed %d\n", error);
				mbuf_freem(m);
				break;
			}
		}
		/* advance the tail pointer to the end of the newly-appended chain */
		while (mbuf_next(m))
			m = mbuf_next(m);
		nrrsp->nrrs_mlast = m;
	}

	/* done reading fragment? */
	if (!error && !nrrsp->nrrs_markerleft && !nrrsp->nrrs_fragleft) {
		/* reset socket fragment parsing state */
		nrrsp->nrrs_markerleft = sizeof(nrrsp->nrrs_fragleft);
		if (nrrsp->nrrs_lastfrag) {
			/* RPC record complete */
			*mp = nrrsp->nrrs_m;
			/* reset socket record parsing state */
			nrrsp->nrrs_reclen = 0;
			nrrsp->nrrs_m = nrrsp->nrrs_mlast = NULL;
			nrrsp->nrrs_lastfrag = 0;
		}
	}

	return (error);
}
2741
2742
2743
2744/*
2745 * The NFS client send routine.
2746 *
2747 * Send the given NFS request out the mount's socket.
2748 * Holds nfs_sndlock() for the duration of this call.
2749 *
2750 * - check for request termination (sigintr)
2751 * - wait for reconnect, if necessary
2752 * - UDP: check the congestion window
2753 * - make a copy of the request to send
2754 * - UDP: update the congestion window
2755 * - send the request
2756 *
2757 * If sent successfully, R_MUSTRESEND and R_RESENDERR are cleared.
2758 * rexmit count is also updated if this isn't the first send.
2759 *
2760 * If the send is not successful, make sure R_MUSTRESEND is set.
2761 * If this wasn't the first transmit, set R_RESENDERR.
2762 * Also, undo any UDP congestion window changes made.
2763 *
2764 * If the error appears to indicate that the socket should
2765 * be reconnected, mark the socket for reconnection.
2766 *
2767 * Only return errors when the request should be aborted.
2768 */
int
nfs_send(struct nfsreq *req, int wait)
{
	struct nfsmount *nmp;
	struct nfs_socket *nso;
	int error, error2, sotype, rexmit, slpflag = 0, needrecon;
	struct msghdr msg;
	struct sockaddr *sendnam;
	mbuf_t mreqcopy;
	size_t sentlen = 0;
	struct timespec ts = { 2, 0 };

again:
	/* take the mount's send lock; on failure the request is aborted */
	error = nfs_sndlock(req);
	if (error) {
		lck_mtx_lock(&req->r_mtx);
		req->r_error = error;
		req->r_flags &= ~R_SENDING;
		lck_mtx_unlock(&req->r_mtx);
		return (error);
	}

	/* bail out if the request has been interrupted/terminated */
	error = nfs_sigintr(req->r_nmp, req, NULL, 0);
	if (error) {
		nfs_sndunlock(req);
		lck_mtx_lock(&req->r_mtx);
		req->r_error = error;
		req->r_flags &= ~R_SENDING;
		lck_mtx_unlock(&req->r_mtx);
		return (error);
	}
	nmp = req->r_nmp;
	sotype = nmp->nm_sotype;

	/*
	 * If it's a setup RPC but we're not in SETUP... must need reconnect.
	 * If it's a recovery RPC but the socket's not ready... must need reconnect.
	 */
	if (((req->r_flags & R_SETUP) && !(nmp->nm_sockflags & NMSOCK_SETUP)) ||
	    ((req->r_flags & R_RECOVER) && !(nmp->nm_sockflags & NMSOCK_READY))) {
		error = ETIMEDOUT;
		nfs_sndunlock(req);
		lck_mtx_lock(&req->r_mtx);
		req->r_error = error;
		req->r_flags &= ~R_SENDING;
		lck_mtx_unlock(&req->r_mtx);
		return (error);
	}

	/* If the socket needs reconnection, do that now. */
	/* wait until socket is ready - unless this request is part of setup */
	lck_mtx_lock(&nmp->nm_lock);
	if (!(nmp->nm_sockflags & NMSOCK_READY) &&
	    !((nmp->nm_sockflags & NMSOCK_SETUP) && (req->r_flags & R_SETUP))) {
		if (NMFLAG(nmp, INTR) && !(req->r_flags & R_NOINTR))
			slpflag |= PCATCH;
		lck_mtx_unlock(&nmp->nm_lock);
		nfs_sndunlock(req);
		if (!wait) {
			/* caller won't wait... just mark for resend and return */
			lck_mtx_lock(&req->r_mtx);
			req->r_flags &= ~R_SENDING;
			req->r_flags |= R_MUSTRESEND;
			req->r_rtt = 0;
			lck_mtx_unlock(&req->r_mtx);
			return (0);
		}
		NFS_SOCK_DBG(("nfs_send: 0x%llx wait reconnect\n", req->r_xid));
		lck_mtx_lock(&req->r_mtx);
		req->r_flags &= ~R_MUSTRESEND;
		req->r_rtt = 0;
		lck_mtx_unlock(&req->r_mtx);
		lck_mtx_lock(&nmp->nm_lock);
		while (!(nmp->nm_sockflags & NMSOCK_READY)) {
			/* don't bother waiting if the socket thread won't be reconnecting it */
			if (nmp->nm_state & NFSSTA_FORCE) {
				error = EIO;
				break;
			}
			if (NMFLAG(nmp, SOFT) && (nmp->nm_reconnect_start > 0)) {
				struct timeval now;
				microuptime(&now);
				if ((now.tv_sec - nmp->nm_reconnect_start) >= 8) {
					/* soft mount in reconnect for a while... terminate ASAP */
					OSAddAtomic64(1, &nfsstats.rpctimeouts);
					req->r_flags |= R_SOFTTERM;
					req->r_error = error = ETIMEDOUT;
					break;
				}
			}
			/* make sure socket thread is running, then wait */
			nfs_mount_sock_thread_wake(nmp);
			if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 1)))
				break;
			msleep(req, &nmp->nm_lock, slpflag|PSOCK, "nfsconnectwait", &ts);
			slpflag = 0;
		}
		lck_mtx_unlock(&nmp->nm_lock);
		if (error) {
			lck_mtx_lock(&req->r_mtx);
			req->r_error = error;
			req->r_flags &= ~R_SENDING;
			lck_mtx_unlock(&req->r_mtx);
			return (error);
		}
		/* socket is ready now... go back and retake the locks */
		goto again;
	}
	nso = nmp->nm_nso;
	/* note that we're using the mount's socket to do the send */
	nmp->nm_state |= NFSSTA_SENDING;  /* will be cleared by nfs_sndunlock() */
	lck_mtx_unlock(&nmp->nm_lock);
	if (!nso) {
		/* no socket... mark for resend and return */
		nfs_sndunlock(req);
		lck_mtx_lock(&req->r_mtx);
		req->r_flags &= ~R_SENDING;
		req->r_flags |= R_MUSTRESEND;
		req->r_rtt = 0;
		lck_mtx_unlock(&req->r_mtx);
		return (0);
	}

	lck_mtx_lock(&req->r_mtx);
	rexmit = (req->r_flags & R_SENT);

	if (sotype == SOCK_DGRAM) {
		lck_mtx_lock(&nmp->nm_lock);
		if (!(req->r_flags & R_CWND) && (nmp->nm_sent >= nmp->nm_cwnd)) {
			/* if we can't send this out yet, wait on the cwnd queue */
			slpflag = (NMFLAG(nmp, INTR) && req->r_thread) ? PCATCH : 0;
			lck_mtx_unlock(&nmp->nm_lock);
			nfs_sndunlock(req);
			req->r_flags &= ~R_SENDING;
			req->r_flags |= R_MUSTRESEND;
			lck_mtx_unlock(&req->r_mtx);
			if (!wait) {
				req->r_rtt = 0;
				return (0);
			}
			lck_mtx_lock(&nmp->nm_lock);
			while (nmp->nm_sent >= nmp->nm_cwnd) {
				if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 1)))
					break;
				TAILQ_INSERT_TAIL(&nmp->nm_cwndq, req, r_cchain);
				msleep(req, &nmp->nm_lock, slpflag | (PZERO - 1), "nfswaitcwnd", &ts);
				slpflag = 0;
				/* remove ourselves from the queue if we're still on it */
				if ((req->r_cchain.tqe_next != NFSREQNOLIST)) {
					TAILQ_REMOVE(&nmp->nm_cwndq, req, r_cchain);
					req->r_cchain.tqe_next = NFSREQNOLIST;
				}
			}
			lck_mtx_unlock(&nmp->nm_lock);
			goto again;
		}
		/*
		 * We update these *before* the send to avoid racing
		 * against others who may be looking to send requests.
		 */
		if (!rexmit) {
			/* first transmit */
			req->r_flags |= R_CWND;
			nmp->nm_sent += NFS_CWNDSCALE;
		} else {
			/*
			 * When retransmitting, turn timing off
			 * and divide congestion window by 2.
			 */
			req->r_flags &= ~R_TIMING;
			nmp->nm_cwnd >>= 1;
			if (nmp->nm_cwnd < NFS_CWNDSCALE)
				nmp->nm_cwnd = NFS_CWNDSCALE;
		}
		lck_mtx_unlock(&nmp->nm_lock);
	}

	req->r_flags &= ~R_MUSTRESEND;
	lck_mtx_unlock(&req->r_mtx);

	/* send a copy so the original request mbufs remain for any resend */
	error = mbuf_copym(req->r_mhead, 0, MBUF_COPYALL,
			wait ? MBUF_WAITOK : MBUF_DONTWAIT, &mreqcopy);
	if (error) {
		if (wait)
			log(LOG_INFO, "nfs_send: mbuf copy failed %d\n", error);
		nfs_sndunlock(req);
		lck_mtx_lock(&req->r_mtx);
		req->r_flags &= ~R_SENDING;
		req->r_flags |= R_MUSTRESEND;
		req->r_rtt = 0;
		lck_mtx_unlock(&req->r_mtx);
		return (0);
	}

	bzero(&msg, sizeof(msg));
	/* unconnected datagram sockets need an explicit destination address */
	if ((sotype != SOCK_STREAM) && !sock_isconnected(nso->nso_so) && ((sendnam = nmp->nm_saddr))) {
		msg.msg_name = (caddr_t)sendnam;
		msg.msg_namelen = sendnam->sa_len;
	}
	error = sock_sendmbuf(nso->nso_so, &msg, mreqcopy, 0, &sentlen);
#ifdef NFS_SOCKET_DEBUGGING
	if (error || (sentlen != req->r_mreqlen))
		NFS_SOCK_DBG(("nfs_send: 0x%llx sent %d/%d error %d\n",
			req->r_xid, (int)sentlen, (int)req->r_mreqlen, error));
#endif
	if (!error && (sentlen != req->r_mreqlen))
		error = EWOULDBLOCK;
	/* a partial send on a TCP stream leaves it out of sync: must reconnect */
	needrecon = ((sotype == SOCK_STREAM) && sentlen && (sentlen != req->r_mreqlen));

	lck_mtx_lock(&req->r_mtx);
	req->r_flags &= ~R_SENDING;
	req->r_rtt = 0;
	if (rexmit && (++req->r_rexmit > NFS_MAXREXMIT))
		req->r_rexmit = NFS_MAXREXMIT;

	if (!error) {
		/* SUCCESS */
		req->r_flags &= ~R_RESENDERR;
		if (rexmit)
			OSAddAtomic64(1, &nfsstats.rpcretries);
		req->r_flags |= R_SENT;
		if (req->r_flags & R_WAITSENT) {
			req->r_flags &= ~R_WAITSENT;
			wakeup(req);
		}
		nfs_sndunlock(req);
		lck_mtx_unlock(&req->r_mtx);
		return (0);
	}

	/* send failed */
	req->r_flags |= R_MUSTRESEND;
	if (rexmit)
		req->r_flags |= R_RESENDERR;
	if ((error == EINTR) || (error == ERESTART))
		req->r_error = error;
	lck_mtx_unlock(&req->r_mtx);

	if (sotype == SOCK_DGRAM) {
		/*
		 * Note: even though a first send may fail, we consider
		 * the request sent for congestion window purposes.
		 * So we don't need to undo any of the changes made above.
		 */
		/*
		 * Socket errors ignored for connectionless sockets??
		 * For now, ignore them all
		 */
		if ((error != EINTR) && (error != ERESTART) &&
		    (error != EWOULDBLOCK) && (error != EIO) && (nso == nmp->nm_nso)) {
			int clearerror = 0, optlen = sizeof(clearerror);
			/* fetching SO_ERROR also clears the pending socket error */
			sock_getsockopt(nso->nso_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
#ifdef NFS_SOCKET_DEBUGGING
			if (clearerror)
				NFS_SOCK_DBG(("nfs_send: ignoring UDP socket error %d so %d\n",
					error, clearerror));
#endif
		}
	}

	/* check if it appears we should reconnect the socket */
	switch (error) {
	case EWOULDBLOCK:
		/* if send timed out, reconnect if on TCP */
		if (sotype != SOCK_STREAM)
			break;
		/* FALLTHROUGH */
	case EPIPE:
	case EADDRNOTAVAIL:
	case ENETDOWN:
	case ENETUNREACH:
	case ENETRESET:
	case ECONNABORTED:
	case ECONNRESET:
	case ENOTCONN:
	case ESHUTDOWN:
	case ECONNREFUSED:
	case EHOSTDOWN:
	case EHOSTUNREACH:
		needrecon = 1;
		break;
	}
	if (needrecon && (nso == nmp->nm_nso)) { /* mark socket as needing reconnect */
		NFS_SOCK_DBG(("nfs_send: 0x%llx need reconnect %d\n", req->r_xid, error));
		nfs_need_reconnect(nmp);
	}

	nfs_sndunlock(req);

	/*
	 * Don't log some errors:
	 * EPIPE errors may be common with servers that drop idle connections.
	 * EADDRNOTAVAIL may occur on network transitions.
	 * ENOTCONN may occur under some network conditions.
	 */
	if ((error == EPIPE) || (error == EADDRNOTAVAIL) || (error == ENOTCONN))
		error = 0;
	if (error && (error != EINTR) && (error != ERESTART))
		log(LOG_INFO, "nfs send error %d for server %s\n", error,
			!req->r_nmp ? "<unmounted>" :
			vfs_statfs(req->r_nmp->nm_mountp)->f_mntfromname);

	if (nfs_is_dead(error, nmp))
		error = EIO;

	/* prefer request termination error over other errors */
	error2 = nfs_sigintr(req->r_nmp, req, req->r_thread, 0);
	if (error2)
		error = error2;

	/* only allow the following errors to be returned */
	if ((error != EINTR) && (error != ERESTART) && (error != EIO) &&
	    (error != ENXIO) && (error != ETIMEDOUT))
		error = 0;
	return (error);
}
3080
3081/*
3082 * NFS client socket upcalls
3083 *
3084 * Pull RPC replies out of an NFS mount's socket and match them
3085 * up with the pending request.
3086 *
3087 * The datagram code is simple because we always get whole
3088 * messages out of the socket.
3089 *
3090 * The stream code is more involved because we have to parse
3091 * the RPC records out of the stream.
3092 */
3093
3094/* NFS client UDP socket upcall */
3095void
3096nfs_udp_rcv(socket_t so, void *arg, __unused int waitflag)
3097{
3098	struct nfsmount *nmp = arg;
3099	struct nfs_socket *nso = nmp->nm_nso;
3100	size_t rcvlen;
3101	mbuf_t m;
3102	int error = 0;
3103
3104	if (nmp->nm_sockflags & NMSOCK_CONNECTING)
3105		return;
3106
3107	do {
3108		/* make sure we're on the current socket */
3109		if (!nso || (nso->nso_so != so))
3110			return;
3111
3112		m = NULL;
3113		rcvlen = 1000000;
3114		error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen);
3115		if (m)
3116			nfs_request_match_reply(nmp, m);
3117	} while (m && !error);
3118
3119	if (error && (error != EWOULDBLOCK)) {
3120		/* problems with the socket... mark for reconnection */
3121		NFS_SOCK_DBG(("nfs_udp_rcv: need reconnect %d\n", error));
3122		nfs_need_reconnect(nmp);
3123	}
3124}
3125
3126/* NFS client TCP socket upcall */
void
nfs_tcp_rcv(socket_t so, void *arg, __unused int waitflag)
{
	struct nfsmount *nmp = arg;
	struct nfs_socket *nso = nmp->nm_nso;
	struct nfs_rpc_record_state nrrs;
	mbuf_t m;
	int error = 0;
	int recv = 1;

	/* ignore upcalls that arrive while the socket is still connecting */
	if (nmp->nm_sockflags & NMSOCK_CONNECTING)
		return;

	/* make sure we're on the current socket */
	lck_mtx_lock(&nmp->nm_lock);
	nso = nmp->nm_nso;
	if (!nso || (nso->nso_so != so) || (nmp->nm_sockflags & (NMSOCK_DISCONNECTING))) {
		lck_mtx_unlock(&nmp->nm_lock);
		return;
	}
	lck_mtx_unlock(&nmp->nm_lock);

	/* make sure this upcall should be trying to do work */
	lck_mtx_lock(&nso->nso_lock);
	if (nso->nso_flags & (NSO_UPCALL|NSO_DISCONNECTING|NSO_DEAD)) {
		lck_mtx_unlock(&nso->nso_lock);
		return;
	}
	/* claim the upcall and work on a local copy of the record parse state */
	nso->nso_flags |= NSO_UPCALL;
	nrrs = nso->nso_rrs;
	lck_mtx_unlock(&nso->nso_lock);

	/* loop while we make error-free progress */
	while (!error && recv) {
		error = nfs_rpc_record_read(so, &nrrs, MSG_DONTWAIT, &recv, &m);
		if (m) /* match completed response with request */
			nfs_request_match_reply(nmp, m);
	}

	lck_mtx_lock(&nmp->nm_lock);
	if (nmp->nm_nso == nso) {
		/* still the same socket, so update socket's RPC parsing state */
		lck_mtx_unlock(&nmp->nm_lock);
		lck_mtx_lock(&nso->nso_lock);
		nso->nso_rrs = nrrs;
		nso->nso_flags &= ~NSO_UPCALL;
		lck_mtx_unlock(&nso->nso_lock);
		/* a disconnect may be waiting for this upcall to finish */
		if (nmp->nm_sockflags & NMSOCK_DISCONNECTING)
			wakeup(&nmp->nm_sockflags);
	} else {
		lck_mtx_unlock(&nmp->nm_lock);
	}
#ifdef NFS_SOCKET_DEBUGGING
	if (!recv && (error != EWOULDBLOCK))
		NFS_SOCK_DBG(("nfs_tcp_rcv: got nothing, error %d, got FIN?\n", error));
#endif
	/* note: no error and no data indicates server closed its end */
	if ((error != EWOULDBLOCK) && (error || !recv)) {
		/* problems with the socket... mark for reconnection */
		NFS_SOCK_DBG(("nfs_tcp_rcv: need reconnect %d\n", error));
		nfs_need_reconnect(nmp);
	}
}
3190
3191/*
3192 * "poke" a socket to try to provoke any pending errors
3193 */
3194void
3195nfs_sock_poke(struct nfsmount *nmp)
3196{
3197	struct iovec aio;
3198	struct msghdr msg;
3199	size_t len;
3200	int error = 0;
3201	int dummy;
3202
3203	lck_mtx_lock(&nmp->nm_lock);
3204	if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) ||
3205	    !(nmp->nm_sockflags & NMSOCK_READY) || !nmp->nm_nso || !nmp->nm_nso->nso_so) {
3206		lck_mtx_unlock(&nmp->nm_lock);
3207		return;
3208	}
3209	lck_mtx_unlock(&nmp->nm_lock);
3210	aio.iov_base = &dummy;
3211	aio.iov_len = 0;
3212	len = 0;
3213	bzero(&msg, sizeof(msg));
3214	msg.msg_iov = &aio;
3215	msg.msg_iovlen = 1;
3216	error = sock_send(nmp->nm_nso->nso_so, &msg, MSG_DONTWAIT, &len);
3217	NFS_SOCK_DBG(("nfs_sock_poke: error %d\n", error));
3218	nfs_is_dead(error, nmp);
3219}
3220
3221/*
3222 * Match an RPC reply with the corresponding request
3223 */
void
nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep)
{
	struct nfsreq *req;
	struct nfsm_chain nmrep;
	u_int32_t reply = 0, rxid = 0;
	int error = 0, asyncioq, t1;

	/* Get the xid and check that it is an rpc reply */
	nfsm_chain_dissect_init(error, &nmrep, mrep);
	nfsm_chain_get_32(error, &nmrep, rxid);
	nfsm_chain_get_32(error, &nmrep, reply);
	if (error || (reply != RPC_REPLY)) {
		OSAddAtomic64(1, &nfsstats.rpcinvalid);
		mbuf_freem(mrep);
		return;
	}

	/*
	 * Loop through the request list to match up the reply
	 * If no match, just drop it.
	 */
	lck_mtx_lock(nfs_request_mutex);
	TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
		/* skip requests that already have a reply or don't match the xid */
		if (req->r_nmrep.nmc_mhead || (rxid != R_XID32(req->r_xid)))
			continue;
		/* looks like we have it, grab lock and double check */
		lck_mtx_lock(&req->r_mtx);
		if (req->r_nmrep.nmc_mhead || (rxid != R_XID32(req->r_xid))) {
			lck_mtx_unlock(&req->r_mtx);
			continue;
		}
		/* Found it.. */
		req->r_nmrep = nmrep;
		lck_mtx_lock(&nmp->nm_lock);
		if (nmp->nm_sotype == SOCK_DGRAM) {
			/*
			 * Update congestion window.
			 * Do the additive increase of one rpc/rtt.
			 */
			FSDBG(530, R_XID32(req->r_xid), req, nmp->nm_sent, nmp->nm_cwnd);
			if (nmp->nm_cwnd <= nmp->nm_sent) {
				nmp->nm_cwnd +=
				   ((NFS_CWNDSCALE * NFS_CWNDSCALE) +
				    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
				if (nmp->nm_cwnd > NFS_MAXCWND)
					nmp->nm_cwnd = NFS_MAXCWND;
			}
			if (req->r_flags & R_CWND) {
				/* this request is no longer charged against the window */
				nmp->nm_sent -= NFS_CWNDSCALE;
				req->r_flags &= ~R_CWND;
			}
			if ((nmp->nm_sent < nmp->nm_cwnd) && !TAILQ_EMPTY(&nmp->nm_cwndq)) {
				/* congestion window is open, poke the cwnd queue */
				struct nfsreq *req2 = TAILQ_FIRST(&nmp->nm_cwndq);
				TAILQ_REMOVE(&nmp->nm_cwndq, req2, r_cchain);
				req2->r_cchain.tqe_next = NFSREQNOLIST;
				wakeup(req2);
			}
		}
		/*
		 * Update rtt using a gain of 0.125 on the mean
		 * and a gain of 0.25 on the deviation.
		 */
		if (req->r_flags & R_TIMING) {
			/*
			 * Since the timer resolution of
			 * NFS_HZ is so coarse, it can often
			 * result in r_rtt == 0. Since
			 * r_rtt == N means that the actual
			 * rtt is between N+dt and N+2-dt ticks,
			 * add 1.
			 */
			if (proct[req->r_procnum] == 0)
				panic("nfs_request_match_reply: proct[%d] is zero", req->r_procnum);
			t1 = req->r_rtt + 1;
			t1 -= (NFS_SRTT(req) >> 3);
			NFS_SRTT(req) += t1;
			if (t1 < 0)
				t1 = -t1;
			t1 -= (NFS_SDRTT(req) >> 2);
			NFS_SDRTT(req) += t1;
		}
		nmp->nm_timeouts = 0;
		lck_mtx_unlock(&nmp->nm_lock);
		/* signal anyone waiting on this request */
		wakeup(req);
		asyncioq = (req->r_callback.rcb_func != NULL);
		if (nfs_request_using_gss(req))
			nfs_gss_clnt_rpcdone(req);
		lck_mtx_unlock(&req->r_mtx);
		lck_mtx_unlock(nfs_request_mutex);
		/* if it's an async RPC with a callback, queue it up */
		if (asyncioq)
			nfs_asyncio_finish(req);
		break;
	}

	if (!req) {
		/* not matched to a request, so drop it. */
		lck_mtx_unlock(nfs_request_mutex);
		OSAddAtomic64(1, &nfsstats.rpcunexpected);
		mbuf_freem(mrep);
	}
}
3329
3330/*
3331 * Wait for the reply for a given request...
3332 * ...potentially resending the request if necessary.
3333 */
int
nfs_wait_reply(struct nfsreq *req)
{
	struct timespec ts = { 2, 0 };
	int error = 0, slpflag, first = 1;

	/* sleep interruptibly only for intr mounts when the request allows it */
	if (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR))
		slpflag = PCATCH;
	else
		slpflag = 0;

	lck_mtx_lock(&req->r_mtx);
	while (!req->r_nmrep.nmc_mhead) {
		if ((error = nfs_sigintr(req->r_nmp, req, first ? NULL : req->r_thread, 0)))
			break;
		if (((error = req->r_error)) || req->r_nmrep.nmc_mhead)
			break;
		/* check if we need to resend */
		if (req->r_flags & R_MUSTRESEND) {
			NFS_SOCK_DBG(("nfs wait resend: p %d x 0x%llx f 0x%x rtt %d\n",
				req->r_procnum, req->r_xid, req->r_flags, req->r_rtt));
			req->r_flags |= R_SENDING;
			lck_mtx_unlock(&req->r_mtx);
			if (nfs_request_using_gss(req)) {
				/*
				 * It's an RPCSEC_GSS request.
				 * Can't just resend the original request
				 * without bumping the cred sequence number.
				 * Go back and re-build the request.
				 */
				lck_mtx_lock(&req->r_mtx);
				req->r_flags &= ~R_SENDING;
				lck_mtx_unlock(&req->r_mtx);
				return (EAGAIN);
			}
			error = nfs_send(req, 1);
			lck_mtx_lock(&req->r_mtx);
			NFS_SOCK_DBG(("nfs wait resend: p %d x 0x%llx f 0x%x rtt %d err %d\n",
				req->r_procnum, req->r_xid, req->r_flags, req->r_rtt, error));
			if (error)
				break;
			/* the reply may have arrived while the lock was dropped */
			if (((error = req->r_error)) || req->r_nmrep.nmc_mhead)
				break;
		}
		/* need to poll if we're P_NOREMOTEHANG */
		if (nfs_noremotehang(req->r_thread))
			ts.tv_sec = 1;
		msleep(req, &req->r_mtx, slpflag | (PZERO - 1), "nfswaitreply", &ts);
		first = slpflag = 0;
	}
	lck_mtx_unlock(&req->r_mtx);

	return (error);
}
3388
3389/*
3390 * An NFS request goes something like this:
3391 * (nb: always frees up mreq mbuf list)
3392 * nfs_request_create()
3393 *	- allocates a request struct if one is not provided
3394 *	- initial fill-in of the request struct
3395 * nfs_request_add_header()
3396 *	- add the RPC header
3397 * nfs_request_send()
3398 *	- link it into list
3399 *	- call nfs_send() for first transmit
3400 * nfs_request_wait()
3401 *	- call nfs_wait_reply() to wait for the reply
3402 * nfs_request_finish()
3403 *	- break down rpc header and return with error or nfs reply
3404 *	  pointed to by nmrep.
3405 * nfs_request_rele()
3406 * nfs_request_destroy()
3407 *      - clean up the request struct
3408 *      - free the request struct if it was allocated by nfs_request_create()
3409 */
3410
3411/*
3412 * Set up an NFS request struct (allocating if no request passed in).
3413 */
int
nfs_request_create(
	nfsnode_t np,
	mount_t mp,	/* used only if !np */
	struct nfsm_chain *nmrest,
	int procnum,
	thread_t thd,
	kauth_cred_t cred,
	struct nfsreq **reqp)
{
	struct nfsreq *req, *newreq = NULL;
	struct nfsmount *nmp;

	req = *reqp;
	if (!req) {
		/* allocate a new NFS request structure */
		MALLOC_ZONE(newreq, struct nfsreq*, sizeof(*newreq), M_NFSREQ, M_WAITOK);
		if (!newreq) {
			/* request mbufs are always consumed, even on failure */
			mbuf_freem(nmrest->nmc_mhead);
			nmrest->nmc_mhead = NULL;
			return (ENOMEM);
		}
		req = newreq;
	}

	bzero(req, sizeof(*req));
	if (req == newreq)
		req->r_flags = R_ALLOCATED;

	nmp = VFSTONFS(np ? NFSTOMP(np) : mp);
	if (!nmp) {
		if (newreq)
			FREE_ZONE(newreq, sizeof(*newreq), M_NFSREQ);
		return (ENXIO);
	}
	lck_mtx_lock(&nmp->nm_lock);
	/* refuse new requests on a mount that's both forced and timed out */
	if ((nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
	    (NFSSTA_FORCE|NFSSTA_TIMEO)) {
		lck_mtx_unlock(&nmp->nm_lock);
		mbuf_freem(nmrest->nmc_mhead);
		nmrest->nmc_mhead = NULL;
		if (newreq)
			FREE_ZONE(newreq, sizeof(*newreq), M_NFSREQ);
		return (ENXIO);
	}

	if ((nmp->nm_vers != NFS_VER4) && (procnum >= 0) && (procnum < NFS_NPROCS))
		OSAddAtomic64(1, &nfsstats.rpccnt[procnum]);
	if ((nmp->nm_vers == NFS_VER4) && (procnum != NFSPROC4_COMPOUND) && (procnum != NFSPROC4_NULL))
		panic("nfs_request: invalid NFSv4 RPC request %d\n", procnum);

	lck_mtx_init(&req->r_mtx, nfs_request_grp, LCK_ATTR_NULL);
	req->r_nmp = nmp;
	req->r_np = np;
	req->r_thread = thd;
	if (!thd)
		req->r_flags |= R_NOINTR;	/* no thread context => not interruptible */
	if (IS_VALID_CRED(cred)) {
		kauth_cred_ref(cred);
		req->r_cred = cred;
	}
	req->r_procnum = procnum;
	if (proct[procnum] > 0)
		req->r_flags |= R_TIMING;	/* this procedure gets RTT timing */
	req->r_nmrep.nmc_mhead = NULL;
	SLIST_INIT(&req->r_gss_seqlist);
	req->r_achain.tqe_next = NFSREQNOLIST;
	req->r_rchain.tqe_next = NFSREQNOLIST;
	req->r_cchain.tqe_next = NFSREQNOLIST;

	/* set auth flavor to use for request */
	if (!req->r_cred)
		req->r_auth = RPCAUTH_NONE;
	else if (req->r_np && (req->r_np->n_auth != RPCAUTH_INVALID))
		req->r_auth = req->r_np->n_auth;
	else
		req->r_auth = nmp->nm_auth;

	lck_mtx_unlock(&nmp->nm_lock);

	/* move the request mbuf chain to the nfsreq */
	req->r_mrest = nmrest->nmc_mhead;
	nmrest->nmc_mhead = NULL;

	req->r_flags |= R_INITTED;
	req->r_refs = 1;
	if (newreq)
		*reqp = req;
	return (0);
}
3504
3505/*
3506 * Clean up and free an NFS request structure.
3507 */
3508void
3509nfs_request_destroy(struct nfsreq *req)
3510{
3511	struct nfsmount *nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;
3512	struct gss_seq *gsp, *ngsp;
3513	struct timespec ts = { 1, 0 };
3514	int clearjbtimeo = 0;
3515
3516	if (!req || !(req->r_flags & R_INITTED))
3517		return;
3518	req->r_flags &= ~R_INITTED;
3519	if (req->r_lflags & RL_QUEUED)
3520		nfs_reqdequeue(req);
3521	if (req->r_achain.tqe_next != NFSREQNOLIST) {
3522		/* still on an async I/O queue? */
3523		lck_mtx_lock(nfsiod_mutex);
3524		if (nmp && (req->r_achain.tqe_next != NFSREQNOLIST)) {
3525			TAILQ_REMOVE(&nmp->nm_iodq, req, r_achain);
3526			req->r_achain.tqe_next = NFSREQNOLIST;
3527		}
3528		lck_mtx_unlock(nfsiod_mutex);
3529	}
3530	lck_mtx_lock(&req->r_mtx);
3531	if (nmp) {
3532		lck_mtx_lock(&nmp->nm_lock);
3533		if (req->r_flags & R_CWND) {
3534			/* Decrement the outstanding request count.  */
3535			req->r_flags &= ~R_CWND;
3536			nmp->nm_sent -= NFS_CWNDSCALE;
3537			if ((nmp->nm_sent < nmp->nm_cwnd) && !TAILQ_EMPTY(&nmp->nm_cwndq)) {
3538				/* congestion window is open, poke the cwnd queue */
3539				struct nfsreq *req2 = TAILQ_FIRST(&nmp->nm_cwndq);
3540				TAILQ_REMOVE(&nmp->nm_cwndq, req2, r_cchain);
3541				req2->r_cchain.tqe_next = NFSREQNOLIST;
3542				wakeup(req2);
3543			}
3544		}
3545		if (req->r_rchain.tqe_next != NFSREQNOLIST) {
3546			TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
3547			req->r_rchain.tqe_next = NFSREQNOLIST;
3548			if (req->r_flags & R_RESENDQ)
3549				req->r_flags &= ~R_RESENDQ;
3550		}
3551		if (req->r_cchain.tqe_next != NFSREQNOLIST) {
3552			TAILQ_REMOVE(&nmp->nm_cwndq, req, r_cchain);
3553			req->r_cchain.tqe_next = NFSREQNOLIST;
3554		}
3555		if (req->r_flags & R_JBTPRINTFMSG) {
3556			req->r_flags &= ~R_JBTPRINTFMSG;
3557			nmp->nm_jbreqs--;
3558			clearjbtimeo = (nmp->nm_jbreqs == 0) ? NFSSTA_JUKEBOXTIMEO : 0;
3559		}
3560		lck_mtx_unlock(&nmp->nm_lock);
3561	}
3562	while (req->r_flags & R_RESENDQ)
3563		msleep(req, &req->r_mtx, (PZERO - 1), "nfsresendqwait", &ts);
3564	lck_mtx_unlock(&req->r_mtx);
3565	if (clearjbtimeo)
3566		nfs_up(nmp, req->r_thread, clearjbtimeo, NULL);
3567	if (req->r_mhead)
3568		mbuf_freem(req->r_mhead);
3569	else if (req->r_mrest)
3570		mbuf_freem(req->r_mrest);
3571	if (req->r_nmrep.nmc_mhead)
3572		mbuf_freem(req->r_nmrep.nmc_mhead);
3573	if (IS_VALID_CRED(req->r_cred))
3574		kauth_cred_unref(&req->r_cred);
3575	if (nfs_request_using_gss(req))
3576		nfs_gss_clnt_rpcdone(req);
3577	SLIST_FOREACH_SAFE(gsp, &req->r_gss_seqlist, gss_seqnext, ngsp)
3578		FREE(gsp, M_TEMP);
3579	if (req->r_gss_ctx)
3580		nfs_gss_clnt_ctx_unref(req);
3581	if (req->r_wrongsec)
3582		FREE(req->r_wrongsec, M_TEMP);
3583
3584	lck_mtx_destroy(&req->r_mtx, nfs_request_grp);
3585	if (req->r_flags & R_ALLOCATED)
3586		FREE_ZONE(req, sizeof(*req), M_NFSREQ);
3587}
3588
3589void
3590nfs_request_ref(struct nfsreq *req, int locked)
3591{
3592	if (!locked)
3593		lck_mtx_lock(&req->r_mtx);
3594	if (req->r_refs <= 0)
3595		panic("nfsreq reference error");
3596	req->r_refs++;
3597	if (!locked)
3598		lck_mtx_unlock(&req->r_mtx);
3599}
3600
3601void
3602nfs_request_rele(struct nfsreq *req)
3603{
3604	int destroy;
3605
3606	lck_mtx_lock(&req->r_mtx);
3607	if (req->r_refs <= 0)
3608		panic("nfsreq reference underflow");
3609	req->r_refs--;
3610	destroy = (req->r_refs == 0);
3611	lck_mtx_unlock(&req->r_mtx);
3612	if (destroy)
3613		nfs_request_destroy(req);
3614}
3615
3616
3617/*
3618 * Add an (updated) RPC header with authorization to an NFS request.
3619 */
3620int
3621nfs_request_add_header(struct nfsreq *req)
3622{
3623	struct nfsmount *nmp;
3624	int error = 0;
3625	mbuf_t m;
3626
3627	/* free up any previous header */
3628	if ((m = req->r_mhead)) {
3629		while (m && (m != req->r_mrest))
3630			m = mbuf_free(m);
3631		req->r_mhead = NULL;
3632	}
3633
3634	nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;
3635	if (!nmp)
3636		return (ENXIO);
3637
3638	error = nfsm_rpchead(req, req->r_mrest, &req->r_xid, &req->r_mhead);
3639	if (error)
3640		return (error);
3641
3642	req->r_mreqlen = mbuf_pkthdr_len(req->r_mhead);
3643	nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;
3644	if (!nmp)
3645		return (ENXIO);
3646	lck_mtx_lock(&nmp->nm_lock);
3647	if (NMFLAG(nmp, SOFT))
3648		req->r_retry = nmp->nm_retry;
3649	else
3650		req->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
3651	lck_mtx_unlock(&nmp->nm_lock);
3652
3653	return (error);
3654}
3655
3656
3657/*
3658 * Queue an NFS request up and send it out.
3659 */
int
nfs_request_send(struct nfsreq *req, int wait)
{
	struct nfsmount *nmp;
	struct timeval now;

	/* Mark the request as actively being sent (checked by the resend machinery). */
	lck_mtx_lock(&req->r_mtx);
	req->r_flags |= R_SENDING;
	lck_mtx_unlock(&req->r_mtx);

	lck_mtx_lock(nfs_request_mutex);

	/* The mount may have gone away; bail before queueing if so. */
	nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;
	if (!nmp) {
		lck_mtx_unlock(nfs_request_mutex);
		return (ENXIO);
	}

	microuptime(&now);
	if (!req->r_start) {
		req->r_start = now.tv_sec;
		/* Backdate r_lastmsg so the first "not responding" tprintf uses the initial delay. */
		req->r_lastmsg = now.tv_sec -
		    ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	}

	OSAddAtomic64(1, &nfsstats.rpcrequests);

	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 * Make sure that the request queue timer is running
	 * to check for possible request timeout.
	 */
	TAILQ_INSERT_TAIL(&nfs_reqq, req, r_chain);
	req->r_lflags |= RL_QUEUED;
	if (!nfs_request_timer_on) {
		nfs_request_timer_on = 1;
		nfs_interval_timer_start(nfs_request_timer_call,
			NFS_REQUESTDELAY);
	}
	lck_mtx_unlock(nfs_request_mutex);

	/* Send the request... */
	return (nfs_send(req, wait));
}
3705
3706/*
3707 * Call nfs_wait_reply() to wait for the reply.
3708 */
3709void
3710nfs_request_wait(struct nfsreq *req)
3711{
3712	req->r_error = nfs_wait_reply(req);
3713}
3714
3715/*
3716 * Finish up an NFS request by dequeueing it and
3717 * doing the initial NFS request reply processing.
3718 */
3719int
3720nfs_request_finish(
3721	struct nfsreq *req,
3722	struct nfsm_chain *nmrepp,
3723	int *status)
3724{
3725	struct nfsmount *nmp;
3726	mbuf_t mrep;
3727	int verf_type = 0;
3728	uint32_t verf_len = 0;
3729	uint32_t reply_status = 0;
3730	uint32_t rejected_status = 0;
3731	uint32_t auth_status = 0;
3732	uint32_t accepted_status = 0;
3733	struct nfsm_chain nmrep;
3734	int error, clearjbtimeo;
3735
3736	error = req->r_error;
3737
3738	if (nmrepp)
3739		nmrepp->nmc_mhead = NULL;
3740
3741	/* RPC done, unlink the request. */
3742	nfs_reqdequeue(req);
3743
3744	mrep = req->r_nmrep.nmc_mhead;
3745
3746	nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp;
3747
3748	if ((req->r_flags & R_CWND) && nmp) {
3749		/*
3750		 * Decrement the outstanding request count.
3751		 */
3752		req->r_flags &= ~R_CWND;
3753		lck_mtx_lock(&nmp->nm_lock);
3754		FSDBG(273, R_XID32(req->r_xid), req, nmp->nm_sent, nmp->nm_cwnd);
3755		nmp->nm_sent -= NFS_CWNDSCALE;
3756		if ((nmp->nm_sent < nmp->nm_cwnd) && !TAILQ_EMPTY(&nmp->nm_cwndq)) {
3757			/* congestion window is open, poke the cwnd queue */
3758			struct nfsreq *req2 = TAILQ_FIRST(&nmp->nm_cwndq);
3759			TAILQ_REMOVE(&nmp->nm_cwndq, req2, r_cchain);
3760			req2->r_cchain.tqe_next = NFSREQNOLIST;
3761			wakeup(req2);
3762		}
3763		lck_mtx_unlock(&nmp->nm_lock);
3764	}
3765
3766	if (nfs_request_using_gss(req)) {
3767		/*
3768		 * If the request used an RPCSEC_GSS credential
3769		 * then reset its sequence number bit in the
3770		 * request window.
3771		 */
3772		nfs_gss_clnt_rpcdone(req);
3773
3774		/*
3775		 * If we need to re-send, go back and re-build the
3776		 * request based on a new sequence number.
3777		 * Note that we're using the original XID.
3778		 */
3779		if (error == EAGAIN) {
3780			req->r_error = 0;
3781			if (mrep)
3782				mbuf_freem(mrep);
3783			error = nfs_gss_clnt_args_restore(req);	// remove any trailer mbufs
3784			req->r_nmrep.nmc_mhead = NULL;
3785			req->r_flags |= R_RESTART;
3786			if (error == ENEEDAUTH) {
3787				req->r_xid = 0;		// get a new XID
3788				error = 0;
3789			}
3790			goto nfsmout;
3791		}
3792	}
3793
3794	/*
3795	 * If there was a successful reply, make sure to mark the mount as up.
3796	 * If a tprintf message was given (or if this is a timed-out soft mount)
3797	 * then post a tprintf message indicating the server is alive again.
3798	 */
3799	if (!error) {
3800		if ((req->r_flags & R_TPRINTFMSG) ||
3801		    (nmp && NMFLAG(nmp, SOFT) &&
3802		     ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_FORCE)) == NFSSTA_TIMEO)))
3803			nfs_up(nmp, req->r_thread, NFSSTA_TIMEO, "is alive again");
3804		else
3805			nfs_up(nmp, req->r_thread, NFSSTA_TIMEO, NULL);
3806	}
3807	if (!error && !nmp)
3808		error = ENXIO;
3809	nfsmout_if(error);
3810
3811	/*
3812	 * break down the RPC header and check if ok
3813	 */
3814	nmrep = req->r_nmrep;
3815	nfsm_chain_get_32(error, &nmrep, reply_status);
3816	nfsmout_if(error);
3817	if (reply_status == RPC_MSGDENIED) {
3818		nfsm_chain_get_32(error, &nmrep, rejected_status);
3819		nfsmout_if(error);
3820		if (rejected_status == RPC_MISMATCH) {
3821			error = ENOTSUP;
3822			goto nfsmout;
3823		}
3824		nfsm_chain_get_32(error, &nmrep, auth_status);
3825		nfsmout_if(error);
3826		switch (auth_status) {
3827		case RPCSEC_GSS_CREDPROBLEM:
3828		case RPCSEC_GSS_CTXPROBLEM:
3829			/*
3830			 * An RPCSEC_GSS cred or context problem.
3831			 * We can't use it anymore.
3832			 * Restore the args, renew the context
3833			 * and set up for a resend.
3834			 */
3835			error = nfs_gss_clnt_args_restore(req);
3836			if (error && error != ENEEDAUTH)
3837				break;
3838
3839			if (!error) {
3840				error = nfs_gss_clnt_ctx_renew(req);
3841				if (error)
3842					break;
3843			}
3844			mbuf_freem(mrep);
3845			req->r_nmrep.nmc_mhead = NULL;
3846			req->r_xid = 0;		// get a new XID
3847			req->r_flags |= R_RESTART;
3848			goto nfsmout;
3849		default:
3850			error = EACCES;
3851			break;
3852		}
3853		goto nfsmout;
3854	}
3855
3856	/* Now check the verifier */
3857	nfsm_chain_get_32(error, &nmrep, verf_type); // verifier flavor
3858	nfsm_chain_get_32(error, &nmrep, verf_len);  // verifier length
3859	nfsmout_if(error);
3860
3861	switch (req->r_auth) {
3862	case RPCAUTH_NONE:
3863	case RPCAUTH_SYS:
3864		/* Any AUTH_SYS verifier is ignored */
3865		if (verf_len > 0)
3866			nfsm_chain_adv(error, &nmrep, nfsm_rndup(verf_len));
3867		nfsm_chain_get_32(error, &nmrep, accepted_status);
3868		break;
3869	case RPCAUTH_KRB5:
3870	case RPCAUTH_KRB5I:
3871	case RPCAUTH_KRB5P:
3872		error = nfs_gss_clnt_verf_get(req, &nmrep,
3873			verf_type, verf_len, &accepted_status);
3874		break;
3875	}
3876	nfsmout_if(error);
3877
3878	switch (accepted_status) {
3879	case RPC_SUCCESS:
3880		if (req->r_procnum == NFSPROC_NULL) {
3881			/*
3882			 * The NFS null procedure is unique,
3883			 * in not returning an NFS status.
3884			 */
3885			*status = NFS_OK;
3886		} else {
3887			nfsm_chain_get_32(error, &nmrep, *status);
3888			nfsmout_if(error);
3889		}
3890
3891		if ((nmp->nm_vers != NFS_VER2) && (*status == NFSERR_TRYLATER)) {
3892			/*
3893			 * It's a JUKEBOX error - delay and try again
3894			 */
3895			int delay, slpflag = (NMFLAG(nmp, INTR) && !(req->r_flags & R_NOINTR)) ? PCATCH : 0;
3896
3897			mbuf_freem(mrep);
3898			req->r_nmrep.nmc_mhead = NULL;
3899			if ((req->r_delay >= 30) && !(nmp->nm_state & NFSSTA_MOUNTED)) {
3900				/* we're not yet completely mounted and */
3901				/* we can't complete an RPC, so we fail */
3902				OSAddAtomic64(1, &nfsstats.rpctimeouts);
3903				nfs_softterm(req);
3904				error = req->r_error;
3905				goto nfsmout;
3906			}
3907			req->r_delay = !req->r_delay ? NFS_TRYLATERDEL : (req->r_delay * 2);
3908			if (req->r_delay > 30)
3909				req->r_delay = 30;
3910			if (nmp->nm_tprintf_initial_delay && (req->r_delay >= nmp->nm_tprintf_initial_delay)) {
3911				if (!(req->r_flags & R_JBTPRINTFMSG)) {
3912					req->r_flags |= R_JBTPRINTFMSG;
3913					lck_mtx_lock(&nmp->nm_lock);
3914					nmp->nm_jbreqs++;
3915					lck_mtx_unlock(&nmp->nm_lock);
3916				}
3917				nfs_down(req->r_nmp, req->r_thread, 0, NFSSTA_JUKEBOXTIMEO,
3918					"resource temporarily unavailable (jukebox)");
3919			}
3920			if (NMFLAG(nmp, SOFT) && (req->r_delay == 30) && !(req->r_flags & R_NOINTR)) {
3921				/* for soft mounts, just give up after a short while */
3922				OSAddAtomic64(1, &nfsstats.rpctimeouts);
3923				nfs_softterm(req);
3924				error = req->r_error;
3925				goto nfsmout;
3926			}
3927			delay = req->r_delay;
3928			if (req->r_callback.rcb_func) {
3929				struct timeval now;
3930				microuptime(&now);
3931				req->r_resendtime = now.tv_sec + delay;
3932			} else {
3933				do {
3934					if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0)))
3935						goto nfsmout;
3936					tsleep(&lbolt, PSOCK|slpflag, "nfs_jukebox_trylater", 0);
3937					slpflag = 0;
3938				} while (--delay > 0);
3939			}
3940			req->r_xid = 0;			// get a new XID
3941			req->r_flags |= R_RESTART;
3942			req->r_start = 0;
3943			FSDBG(273, R_XID32(req->r_xid), nmp, req, NFSERR_TRYLATER);
3944			return (0);
3945		}
3946
3947		if (req->r_flags & R_JBTPRINTFMSG) {
3948			req->r_flags &= ~R_JBTPRINTFMSG;
3949			lck_mtx_lock(&nmp->nm_lock);
3950			nmp->nm_jbreqs--;
3951			clearjbtimeo = (nmp->nm_jbreqs == 0) ? NFSSTA_JUKEBOXTIMEO : 0;
3952			lck_mtx_unlock(&nmp->nm_lock);
3953			nfs_up(nmp, req->r_thread, clearjbtimeo, "resource available again");
3954		}
3955
3956		if ((nmp->nm_vers >= NFS_VER4) && (*status == NFSERR_WRONGSEC)) {
3957			/*
3958			 * Hmmm... we need to try a different security flavor.
3959			 * The first time a request hits this, we will allocate an array
3960			 * to track flavors to try.  We fill the array with the mount's
3961			 * preferred flavors or the server's preferred flavors or just the
3962			 * flavors we support.
3963			 */
3964			uint32_t srvflavors[NX_MAX_SEC_FLAVORS];
3965			int srvcount, i, j;
3966
3967			/* Call SECINFO to try to get list of flavors from server. */
3968			srvcount = NX_MAX_SEC_FLAVORS;
3969			nfs4_secinfo_rpc(nmp, &req->r_secinfo, req->r_cred, srvflavors, &srvcount);
3970
3971			if (!req->r_wrongsec) {
3972				/* first time... set up flavor array */
3973				MALLOC(req->r_wrongsec, uint32_t*, NX_MAX_SEC_FLAVORS*sizeof(uint32_t), M_TEMP, M_WAITOK);
3974				if (!req->r_wrongsec) {
3975					error = EACCES;
3976					goto nfsmout;
3977				}
3978				i=0;
3979				if (nmp->nm_sec.count) { /* use the mount's preferred list of flavors */
3980					for(; i < nmp->nm_sec.count; i++)
3981						req->r_wrongsec[i] = nmp->nm_sec.flavors[i];
3982				} else if (srvcount) { /* otherwise use the server's list of flavors */
3983					for(; i < srvcount; i++)
3984						req->r_wrongsec[i] = srvflavors[i];
3985				} else { /* otherwise, just try the flavors we support. */
3986					req->r_wrongsec[i++] = RPCAUTH_KRB5P;
3987					req->r_wrongsec[i++] = RPCAUTH_KRB5I;
3988					req->r_wrongsec[i++] = RPCAUTH_KRB5;
3989					req->r_wrongsec[i++] = RPCAUTH_SYS;
3990					req->r_wrongsec[i++] = RPCAUTH_NONE;
3991				}
3992				for(; i < NX_MAX_SEC_FLAVORS; i++) /* invalidate any remaining slots */
3993					req->r_wrongsec[i] = RPCAUTH_INVALID;
3994			}
3995
3996			/* clear the current flavor from the list */
3997			for(i=0; i < NX_MAX_SEC_FLAVORS; i++)
3998				if (req->r_wrongsec[i] == req->r_auth)
3999					req->r_wrongsec[i] = RPCAUTH_INVALID;
4000
4001			/* find the next flavor to try */
4002			for(i=0; i < NX_MAX_SEC_FLAVORS; i++)
4003				if (req->r_wrongsec[i] != RPCAUTH_INVALID) {
4004					if (((req->r_wrongsec[i] == RPCAUTH_KRB5P) ||
4005					     (req->r_wrongsec[i] == RPCAUTH_KRB5I) ||
4006					     (req->r_wrongsec[i] == RPCAUTH_KRB5)) && (req->r_gss_ctx &&
4007					    (req->r_gss_ctx->gss_clnt_service == RPCSEC_GSS_SVC_SYS))) {
4008						/* don't bother trying Kerberos if we've already got a fallback context */
4009						req->r_wrongsec[i] = RPCAUTH_INVALID;
4010						continue;
4011					}
4012					if (!srvcount) /* no server list, just try it */
4013						break;
4014					/* check that it's in the server's list */
4015					for(j=0; j < srvcount; j++)
4016						if (req->r_wrongsec[i] == srvflavors[j])
4017							break;
4018					if (j < srvcount) /* found */
4019						break;
4020					/* not found in server list */
4021					req->r_wrongsec[i] = RPCAUTH_INVALID;
4022				}
4023			if (i == NX_MAX_SEC_FLAVORS) {
4024				/* nothing left to try! */
4025				error = EACCES;
4026				goto nfsmout;
4027			}
4028
4029			/* retry with the next auth flavor */
4030			req->r_auth = req->r_wrongsec[i];
4031			req->r_xid = 0;			// get a new XID
4032			req->r_flags |= R_RESTART;
4033			req->r_start = 0;
4034			FSDBG(273, R_XID32(req->r_xid), nmp, req, NFSERR_WRONGSEC);
4035			return (0);
4036		}
4037		if ((nmp->nm_vers >= NFS_VER4) && req->r_wrongsec) {
4038			/*
4039			 * We renegotiated security for this request; so update the
4040			 * default security flavor for the associated node.
4041			 */
4042			if (req->r_np)
4043				req->r_np->n_auth = req->r_auth;
4044		}
4045
4046		if (*status == NFS_OK) {
4047			/*
4048			 * Successful NFS request
4049			 */
4050			*nmrepp = nmrep;
4051			req->r_nmrep.nmc_mhead = NULL;
4052			break;
4053		}
4054		/* Got an NFS error of some kind */
4055
4056		/*
4057		 * If the File Handle was stale, invalidate the
4058		 * lookup cache, just in case.
4059		 */
4060		if ((*status == ESTALE) && req->r_np) {
4061			cache_purge(NFSTOV(req->r_np));
4062			/* if monitored, also send delete event */
4063			if (vnode_ismonitored(NFSTOV(req->r_np)))
4064				nfs_vnode_notify(req->r_np, (VNODE_EVENT_ATTRIB|VNODE_EVENT_DELETE));
4065		}
4066		if (nmp->nm_vers == NFS_VER2)
4067			mbuf_freem(mrep);
4068		else
4069			*nmrepp = nmrep;
4070		req->r_nmrep.nmc_mhead = NULL;
4071		error = 0;
4072		break;
4073	case RPC_PROGUNAVAIL:
4074		error = EPROGUNAVAIL;
4075		break;
4076	case RPC_PROGMISMATCH:
4077		error = ERPCMISMATCH;
4078		break;
4079	case RPC_PROCUNAVAIL:
4080		error = EPROCUNAVAIL;
4081		break;
4082	case RPC_GARBAGE:
4083		error = EBADRPC;
4084		break;
4085	case RPC_SYSTEM_ERR:
4086	default:
4087		error = EIO;
4088		break;
4089	}
4090nfsmout:
4091	if (req->r_flags & R_JBTPRINTFMSG) {
4092		req->r_flags &= ~R_JBTPRINTFMSG;
4093		lck_mtx_lock(&nmp->nm_lock);
4094		nmp->nm_jbreqs--;
4095		clearjbtimeo = (nmp->nm_jbreqs == 0) ? NFSSTA_JUKEBOXTIMEO : 0;
4096		lck_mtx_unlock(&nmp->nm_lock);
4097		if (clearjbtimeo)
4098			nfs_up(nmp, req->r_thread, clearjbtimeo, NULL);
4099	}
4100	FSDBG(273, R_XID32(req->r_xid), nmp, req,
4101		(!error && (*status == NFS_OK)) ? 0xf0f0f0f0 : error);
4102	return (error);
4103}
4104
4105/*
4106 * NFS request using a GSS/Kerberos security flavor?
4107 */
4108int
4109nfs_request_using_gss(struct nfsreq *req)
4110{
4111	if (!req->r_gss_ctx)
4112		return (0);
4113	switch (req->r_auth) {
4114		case RPCAUTH_KRB5:
4115		case RPCAUTH_KRB5I:
4116		case RPCAUTH_KRB5P:
4117			return (1);
4118	}
4119	return (0);
4120}
4121
4122/*
4123 * Perform an NFS request synchronously.
4124 */
4125
4126int
4127nfs_request(
4128	nfsnode_t np,
4129	mount_t mp,	/* used only if !np */
4130	struct nfsm_chain *nmrest,
4131	int procnum,
4132	vfs_context_t ctx,
4133	struct nfsreq_secinfo_args *si,
4134	struct nfsm_chain *nmrepp,
4135	u_int64_t *xidp,
4136	int *status)
4137{
4138	return nfs_request2(np, mp, nmrest, procnum,
4139		vfs_context_thread(ctx), vfs_context_ucred(ctx),
4140		si, 0, nmrepp, xidp, status);
4141}
4142
int
nfs_request2(
	nfsnode_t np,
	mount_t mp,	/* used only if !np */
	struct nfsm_chain *nmrest,
	int procnum,
	thread_t thd,
	kauth_cred_t cred,
	struct nfsreq_secinfo_args *si,
	int flags,
	struct nfsm_chain *nmrepp,
	u_int64_t *xidp,
	int *status)
{
	struct nfsreq rq, *req = &rq;
	int error;

	/* Set up the request on the stack; nfs_request_rele() below tears it down. */
	if ((error = nfs_request_create(np, mp, nmrest, procnum, thd, cred, &req)))
		return (error);
	req->r_flags |= (flags & R_OPTMASK);
	if (si)
		req->r_secinfo = *si;

	FSDBG_TOP(273, R_XID32(req->r_xid), np, procnum, 0);
	/*
	 * Header/send/wait/finish, looping while the request needs a restart
	 * (e.g. jukebox retry, GSS resend, or security flavor renegotiation).
	 */
	do {
		req->r_error = 0;
		req->r_flags &= ~R_RESTART;
		if ((error = nfs_request_add_header(req)))
			break;
		if (xidp)
			*xidp = req->r_xid;
		if ((error = nfs_request_send(req, 1)))
			break;
		nfs_request_wait(req);
		if ((error = nfs_request_finish(req, nmrepp, status)))
			break;
	} while (req->r_flags & R_RESTART);

	FSDBG_BOT(273, R_XID32(req->r_xid), np, procnum, error);
	nfs_request_rele(req);
	return (error);
}
4185
4186
4187/*
4188 * Set up a new null proc request to exchange GSS context tokens with the
4189 * server. Associate the context that we are setting up with the request that we
4190 * are sending.
4191 */
4192
int
nfs_request_gss(
		mount_t mp,
		struct nfsm_chain *nmrest,
		thread_t thd,
		kauth_cred_t cred,
		int flags,
		struct nfs_gss_clnt_ctx *cp,   /* Set to gss context to renew or setup */
		struct nfsm_chain *nmrepp,
		int *status)
{
	struct nfsreq rq, *req = &rq;
	int error;

	/* Build a NULL-proc request used purely to carry GSS context tokens. */
	if ((error = nfs_request_create(NULL, mp, nmrest, NFSPROC_NULL, thd, cred, &req)))
		return (error);
	req->r_flags |= (flags & R_OPTMASK);

	if (cp == NULL) {
		printf("nfs_request_gss request has no context\n");
		nfs_request_rele(req);
		return (NFSERR_EAUTH);
	}
	/* Associate the supplied GSS context with this request (takes a ref). */
	nfs_gss_clnt_ctx_ref(req, cp);

	FSDBG_TOP(273, R_XID32(req->r_xid), NULL, NFSPROC_NULL, 0);
	/* Same send/wait/finish loop as nfs_request2(), honoring restarts. */
	do {
		req->r_error = 0;
		req->r_flags &= ~R_RESTART;
		if ((error = nfs_request_add_header(req)))
			break;

		if ((error = nfs_request_send(req, 1)))
			break;
		nfs_request_wait(req);
		if ((error = nfs_request_finish(req, nmrepp, status)))
			break;
	} while (req->r_flags & R_RESTART);

	FSDBG_BOT(273, R_XID32(req->r_xid), NULL, NFSPROC_NULL, error);
	nfs_request_rele(req);
	return (error);
}
4236
4237/*
4238 * Create and start an asynchronous NFS request.
4239 */
int
nfs_request_async(
	nfsnode_t np,
	mount_t mp,	/* used only if !np */
	struct nfsm_chain *nmrest,
	int procnum,
	thread_t thd,
	kauth_cred_t cred,
	struct nfsreq_secinfo_args *si,
	int flags,
	struct nfsreq_cbinfo *cb,
	struct nfsreq **reqp)
{
	struct nfsreq *req;
	struct nfsmount *nmp;
	int error, sent;

	error = nfs_request_create(np, mp, nmrest, procnum, thd, cred, reqp);
	req = *reqp;
	FSDBG(274, (req ? R_XID32(req->r_xid) : 0), np, procnum, error);
	if (error)
		return (error);
	req->r_flags |= (flags & R_OPTMASK);
	req->r_flags |= R_ASYNC;
	if (si)
		req->r_secinfo = *si;
	if (cb)
		req->r_callback = *cb;
	error = nfs_request_add_header(req);
	if (!error) {
		req->r_flags |= R_WAITSENT;
		/* Callback-style requests carry an extra reference until the callback fires. */
		if (req->r_callback.rcb_func)
			nfs_request_ref(req, 0);
		error = nfs_request_send(req, 1);
		lck_mtx_lock(&req->r_mtx);
		if (!error && !(req->r_flags & R_SENT) && req->r_callback.rcb_func) {
			/* make sure to wait until this async I/O request gets sent */
			int slpflag = (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) ? PCATCH : 0;
			struct timespec ts = { 2, 0 };
			while (!(req->r_flags & R_SENT)) {
				if ((req->r_flags & R_RESENDQ) && ((nmp = req->r_nmp))) {
					lck_mtx_lock(&nmp->nm_lock);
					if ((nmp->nm_state & NFSSTA_RECOVER) && (req->r_rchain.tqe_next != NFSREQNOLIST)) {
						/*
						 * It's not going to get off the resend queue if we're in recovery.
						 * So, just take it off ourselves.  We could be holding mount state
						 * busy and thus holding up the start of recovery.
						 */
						TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
						req->r_rchain.tqe_next = NFSREQNOLIST;
						if (req->r_flags & R_RESENDQ)
							req->r_flags &= ~R_RESENDQ;
						lck_mtx_unlock(&nmp->nm_lock);
						req->r_flags |= R_SENDING;
						/* Drop r_mtx across the send; nfs_send() takes it itself. */
						lck_mtx_unlock(&req->r_mtx);
						error = nfs_send(req, 1);
						lck_mtx_lock(&req->r_mtx);
						if (error)
							break;
						continue;
					}
					lck_mtx_unlock(&nmp->nm_lock);
				}
				if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0)))
					break;
				msleep(req, &req->r_mtx, slpflag | (PZERO - 1), "nfswaitsent", &ts);
				slpflag = 0;
			}
		}
		sent = req->r_flags & R_SENT;
		lck_mtx_unlock(&req->r_mtx);
		/* On a failed send, drop the extra callback reference taken above. */
		if (error && req->r_callback.rcb_func && !sent)
			nfs_request_rele(req);
	}
	FSDBG(274, R_XID32(req->r_xid), np, procnum, error);
	/* Callers of callback-style requests don't keep a ref; drop it here.
	 * On error the caller's ref is dropped too (the request is dead). */
	if (error || req->r_callback.rcb_func)
		nfs_request_rele(req);
	return (error);
}
4319
4320/*
4321 * Wait for and finish an asynchronous NFS request.
4322 */
int
nfs_request_async_finish(
	struct nfsreq *req,
	struct nfsm_chain *nmrepp,
	u_int64_t *xidp,
	int *status)
{
	int error = 0, asyncio = req->r_callback.rcb_func ? 1 : 0;
	struct nfsmount *nmp;

	lck_mtx_lock(&req->r_mtx);
	/* Mark that someone is now waiting synchronously on this async request. */
	if (!asyncio)
		req->r_flags |= R_ASYNCWAIT;
	while (req->r_flags & R_RESENDQ) {  /* wait until the request is off the resend queue */
		struct timespec ts = { 2, 0 };
		if ((nmp = req->r_nmp)) {
			lck_mtx_lock(&nmp->nm_lock);
			if ((nmp->nm_state & NFSSTA_RECOVER) && (req->r_rchain.tqe_next != NFSREQNOLIST)) {
				/*
				 * It's not going to get off the resend queue if we're in recovery.
				 * So, just take it off ourselves.  We could be holding mount state
				 * busy and thus holding up the start of recovery.
				 */
				TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
				req->r_rchain.tqe_next = NFSREQNOLIST;
				if (req->r_flags & R_RESENDQ)
					req->r_flags &= ~R_RESENDQ;
				lck_mtx_unlock(&nmp->nm_lock);
				break;
			}
			lck_mtx_unlock(&nmp->nm_lock);
		}
		if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0)))
			break;
		msleep(req, &req->r_mtx, PZERO-1, "nfsresendqwait", &ts);
	}
	lck_mtx_unlock(&req->r_mtx);

	if (!error) {
		nfs_request_wait(req);
		error = nfs_request_finish(req, nmrepp, status);
	}

	/* Handle restarts; callback-style requests go back to async (EINPROGRESS). */
	while (!error && (req->r_flags & R_RESTART)) {
		if (asyncio && req->r_resendtime) {  /* send later */
			lck_mtx_lock(&req->r_mtx);
			nfs_asyncio_resend(req);
			lck_mtx_unlock(&req->r_mtx);
			return (EINPROGRESS);
		}
		req->r_error = 0;
		req->r_flags &= ~R_RESTART;
		if ((error = nfs_request_add_header(req)))
			break;
		if ((error = nfs_request_send(req, !asyncio)))
			break;
		if (asyncio)
			return (EINPROGRESS);
		nfs_request_wait(req);
		if ((error = nfs_request_finish(req, nmrepp, status)))
			break;
	}
	if (xidp)
		*xidp = req->r_xid;

	FSDBG(275, R_XID32(req->r_xid), req->r_np, req->r_procnum, error);
	nfs_request_rele(req);
	return (error);
}
4392
4393/*
4394 * Cancel a pending asynchronous NFS request.
4395 */
void
nfs_request_async_cancel(struct nfsreq *req)
{
	/* Pull it off the outstanding request queue, then drop the caller's reference. */
	nfs_reqdequeue(req);
	FSDBG(275, R_XID32(req->r_xid), req->r_np, req->r_procnum, 0xD1ED1E);
	nfs_request_rele(req);
}
4403
4404/*
4405 * Flag a request as being terminated.
4406 */
void
nfs_softterm(struct nfsreq *req)
{
	struct nfsmount *nmp = req->r_nmp;
	/* Terminate the request with a timeout error. */
	req->r_flags |= R_SOFTTERM;
	req->r_error = ETIMEDOUT;
	/* If it wasn't counted in the congestion window (or mount is gone), we're done. */
	if (!(req->r_flags & R_CWND) || !nmp)
		return;
	/* update congestion window */
	req->r_flags &= ~R_CWND;
	lck_mtx_lock(&nmp->nm_lock);
	FSDBG(532, R_XID32(req->r_xid), req, nmp->nm_sent, nmp->nm_cwnd);
	nmp->nm_sent -= NFS_CWNDSCALE;
	if ((nmp->nm_sent < nmp->nm_cwnd) && !TAILQ_EMPTY(&nmp->nm_cwndq)) {
		/* congestion window is open, poke the cwnd queue */
		struct nfsreq *req2 = TAILQ_FIRST(&nmp->nm_cwndq);
		TAILQ_REMOVE(&nmp->nm_cwndq, req2, r_cchain);
		req2->r_cchain.tqe_next = NFSREQNOLIST;
		wakeup(req2);
	}
	lck_mtx_unlock(&nmp->nm_lock);
}
4429
4430/*
4431 * Ensure req isn't in use by the timer, then dequeue it.
4432 */
void
nfs_reqdequeue(struct nfsreq *req)
{
	lck_mtx_lock(nfs_request_mutex);
	/* If the timer has the request busied, wait for it to finish with it. */
	while (req->r_lflags & RL_BUSY) {
		req->r_lflags |= RL_WAITING;
		msleep(&req->r_lflags, nfs_request_mutex, PSOCK, "reqdeq", NULL);
	}
	/* Now it's safe to unlink it from the outstanding request queue. */
	if (req->r_lflags & RL_QUEUED) {
		TAILQ_REMOVE(&nfs_reqq, req, r_chain);
		req->r_lflags &= ~RL_QUEUED;
	}
	lck_mtx_unlock(nfs_request_mutex);
}
4447
4448/*
4449 * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
4450 * free()'d out from under it.
4451 */
4452void
4453nfs_reqbusy(struct nfsreq *req)
4454{
4455	if (req->r_lflags & RL_BUSY)
4456		panic("req locked");
4457	req->r_lflags |= RL_BUSY;
4458}
4459
4460/*
4461 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
4462 */
struct nfsreq *
nfs_reqnext(struct nfsreq *req)
{
	struct nfsreq * nextreq;

	if (req == NULL)
		return (NULL);
	/*
	 * We need to get and busy the next req before signalling the
	 * current one, otherwise wakeup() may block us and we'll race to
	 * grab the next req.
	 */
	nextreq = TAILQ_NEXT(req, r_chain);
	if (nextreq != NULL)
		nfs_reqbusy(nextreq);
	/* unbusy and signal. */
	req->r_lflags &= ~RL_BUSY;
	/* Wake anyone in nfs_reqdequeue() waiting for this request to be unbusied. */
	if (req->r_lflags & RL_WAITING) {
		req->r_lflags &= ~RL_WAITING;
		wakeup(&req->r_lflags);
	}
	return (nextreq);
}
4486
4487/*
4488 * NFS request queue timer routine
4489 *
4490 * Scan the NFS request queue for any requests that have timed out.
4491 *
4492 * Alert the system of unresponsive servers.
4493 * Mark expired requests on soft mounts as terminated.
4494 * For UDP, mark/signal requests for retransmission.
4495 */
void
nfs_request_timer(__unused void *param0, __unused void *param1)
{
	struct nfsreq *req;
	struct nfsmount *nmp;
	int timeo, maxtime, finish_asyncio, error;
	struct timeval now;
	/* Mounts whose (TCP) sockets look wedged get queued here for a poke after the scan. */
	TAILQ_HEAD(nfs_mount_pokeq, nfsmount) nfs_mount_poke_queue;

	lck_mtx_lock(nfs_request_mutex);
	req = TAILQ_FIRST(&nfs_reqq);
	if (req == NULL) {	/* no requests - turn timer off */
		nfs_request_timer_on = 0;
		lck_mtx_unlock(nfs_request_mutex);
		return;
	}

	nfs_reqbusy(req);
	TAILQ_INIT(&nfs_mount_poke_queue);

	microuptime(&now);
	/* Walk the queue oldest-first; nfs_reqnext() unbusies req and busies its successor. */
	for ( ; req != NULL ; req = nfs_reqnext(req)) {
		nmp = req->r_nmp;
		if (!nmp) /* unmounted */
			continue;
		/* Skip requests that already failed or already have a reply. */
		if (req->r_error || req->r_nmrep.nmc_mhead)
			continue;
		if ((error = nfs_sigintr(nmp, req, req->r_thread, 0))) {
			if (req->r_callback.rcb_func != NULL) {
				/* async I/O RPC needs to be finished */
				lck_mtx_lock(&req->r_mtx);
				req->r_error = error;
				finish_asyncio = !(req->r_flags & R_WAITSENT);
				wakeup(req);
				lck_mtx_unlock(&req->r_mtx);
				if (finish_asyncio)
					nfs_asyncio_finish(req);
			}
			continue;
		}

		lck_mtx_lock(&req->r_mtx);

		/* Post a "not responding" message if it's been quiet too long. */
		if (nmp->nm_tprintf_initial_delay &&
		    ((req->r_rexmit > 2) || (req->r_flags & R_RESENDERR)) &&
		    ((req->r_lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
			req->r_lastmsg = now.tv_sec;
			nfs_down(req->r_nmp, req->r_thread, 0, NFSSTA_TIMEO,
				"not responding");
			req->r_flags |= R_TPRINTFMSG;
			lck_mtx_lock(&nmp->nm_lock);
			if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
				lck_mtx_unlock(&nmp->nm_lock);
				/* we're not yet completely mounted and */
				/* we can't complete an RPC, so we fail */
				OSAddAtomic64(1, &nfsstats.rpctimeouts);
				nfs_softterm(req);
				finish_asyncio = ((req->r_callback.rcb_func != NULL) && !(req->r_flags & R_WAITSENT));
				wakeup(req);
				lck_mtx_unlock(&req->r_mtx);
				if (finish_asyncio)
					nfs_asyncio_finish(req);
				continue;
			}
			lck_mtx_unlock(&nmp->nm_lock);
		}

		/*
		 * Put a reasonable limit on the maximum timeout,
		 * and reduce that limit when soft mounts get timeouts or are in reconnect.
		 */
		if (!NMFLAG(nmp, SOFT) && !nfs_can_squish(nmp))
			maxtime = NFS_MAXTIMEO;
		else if ((req->r_flags & (R_SETUP|R_RECOVER)) ||
			 ((nmp->nm_reconnect_start <= 0) || ((now.tv_sec - nmp->nm_reconnect_start) < 8)))
			maxtime = (NFS_MAXTIMEO / (nmp->nm_timeouts+1))/2;
		else
			maxtime = NFS_MINTIMEO/4;

		/*
		 * Check for request timeout.
		 */
		if (req->r_rtt >= 0) {
			req->r_rtt++;
			lck_mtx_lock(&nmp->nm_lock);
			if (req->r_flags & R_RESENDERR) {
				/* with resend errors, retry every few seconds */
				timeo = 4*hz;
			} else {
				if (req->r_procnum == NFSPROC_NULL && req->r_gss_ctx != NULL)
					timeo = NFS_MINIDEMTIMEO; // gss context setup
				else if (NMFLAG(nmp, DUMBTIMER))
					timeo = nmp->nm_timeo;
				else
					timeo = NFS_RTO(nmp, proct[req->r_procnum]);

				/* ensure 62.5 ms floor */
				while (16 * timeo < hz)
					timeo *= 2;
				if (nmp->nm_timeouts > 0)
					timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			}
			/* limit timeout to max */
			if (timeo > maxtime)
				timeo = maxtime;
			if (req->r_rtt <= timeo) {
				lck_mtx_unlock(&nmp->nm_lock);
				lck_mtx_unlock(&req->r_mtx);
				continue;
			}
			/* The request has timed out */
			NFS_SOCK_DBG(("nfs timeout: proc %d %d xid %llx rtt %d to %d # %d, t %ld/%d\n",
				req->r_procnum, proct[req->r_procnum],
				req->r_xid, req->r_rtt, timeo, nmp->nm_timeouts,
				(now.tv_sec - req->r_start)*NFS_HZ, maxtime));
			if (nmp->nm_timeouts < 8)
				nmp->nm_timeouts++;
			nfs_mount_check_dead_timeout(nmp);
			/* if it's been a few seconds, try poking the socket */
			if ((nmp->nm_sotype == SOCK_STREAM) &&
			    ((now.tv_sec - req->r_start) >= 3) &&
			    !(nmp->nm_sockflags & (NMSOCK_POKE|NMSOCK_UNMOUNT)) &&
			    (nmp->nm_sockflags & NMSOCK_READY)) {
				nmp->nm_sockflags |= NMSOCK_POKE;
				TAILQ_INSERT_TAIL(&nfs_mount_poke_queue, nmp, nm_pokeq);
			}
			lck_mtx_unlock(&nmp->nm_lock);
		}

		/* For soft mounts (& SETUPs/RECOVERs), check for too many retransmits/timeout. */
		if ((NMFLAG(nmp, SOFT) ||  (req->r_flags & (R_SETUP|R_RECOVER))) &&
		    ((req->r_rexmit >= req->r_retry) || /* too many */
		     ((now.tv_sec - req->r_start)*NFS_HZ > maxtime))) { /* too long */
			OSAddAtomic64(1, &nfsstats.rpctimeouts);
			lck_mtx_lock(&nmp->nm_lock);
			if (!(nmp->nm_state & NFSSTA_TIMEO)) {
				lck_mtx_unlock(&nmp->nm_lock);
				/* make sure we note the unresponsive server */
				/* (maxtime may be less than tprintf delay) */
				nfs_down(req->r_nmp, req->r_thread, 0, NFSSTA_TIMEO,
					"not responding");
				req->r_lastmsg = now.tv_sec;
				req->r_flags |= R_TPRINTFMSG;
			} else {
				lck_mtx_unlock(&nmp->nm_lock);
			}
			if (req->r_flags & R_NOINTR) {
				/* don't terminate nointr requests on timeout */
				lck_mtx_unlock(&req->r_mtx);
				continue;
			}
			NFS_SOCK_DBG(("nfs timer TERMINATE: p %d x 0x%llx f 0x%x rtt %d t %ld\n",
				req->r_procnum, req->r_xid, req->r_flags, req->r_rtt,
				now.tv_sec - req->r_start));
			nfs_softterm(req);
			finish_asyncio = ((req->r_callback.rcb_func != NULL) && !(req->r_flags & R_WAITSENT));
			wakeup(req);
			lck_mtx_unlock(&req->r_mtx);
			if (finish_asyncio)
				nfs_asyncio_finish(req);
			continue;
		}

		/* for TCP, only resend if explicitly requested */
		if ((nmp->nm_sotype == SOCK_STREAM) && !(req->r_flags & R_MUSTRESEND)) {
			if (++req->r_rexmit > NFS_MAXREXMIT)
				req->r_rexmit = NFS_MAXREXMIT;
			req->r_rtt = 0;
			lck_mtx_unlock(&req->r_mtx);
			continue;
		}

		/*
		 * The request needs to be (re)sent.  Kick the requester to resend it.
		 * (unless it's already marked as needing a resend)
		 */
		if ((req->r_flags & R_MUSTRESEND) && (req->r_rtt == -1)) {
			lck_mtx_unlock(&req->r_mtx);
			continue;
		}
		NFS_SOCK_DBG(("nfs timer mark resend: p %d x 0x%llx f 0x%x rtt %d\n",
			req->r_procnum, req->r_xid, req->r_flags, req->r_rtt));
		/* r_rtt == -1 marks "resend pending" so we don't re-kick it next tick. */
		req->r_flags |= R_MUSTRESEND;
		req->r_rtt = -1;
		wakeup(req);
		if ((req->r_flags & (R_ASYNC|R_ASYNCWAIT|R_SENDING)) == R_ASYNC)
			nfs_asyncio_resend(req);
		lck_mtx_unlock(&req->r_mtx);
	}

	lck_mtx_unlock(nfs_request_mutex);

	/* poke any sockets */
	/* (done after dropping nfs_request_mutex since nfs_sock_poke() can block) */
	while ((nmp = TAILQ_FIRST(&nfs_mount_poke_queue))) {
		TAILQ_REMOVE(&nfs_mount_poke_queue, nmp, nm_pokeq);
		nfs_sock_poke(nmp);
		lck_mtx_lock(&nmp->nm_lock);
		nmp->nm_sockflags &= ~NMSOCK_POKE;
		wakeup(&nmp->nm_sockflags);
		lck_mtx_unlock(&nmp->nm_lock);
	}

	nfs_interval_timer_start(nfs_request_timer_call, NFS_REQUESTDELAY);
}
4700
4701/*
4702 * check a thread's proc for the "noremotehang" flag.
4703 */
4704int
4705nfs_noremotehang(thread_t thd)
4706{
4707	proc_t p = thd ? get_bsdthreadtask_info(thd) : NULL;
4708	return (p && proc_noremotehang(p));
4709}
4710
4711/*
4712 * Test for a termination condition pending on the process.
4713 * This is used to determine if we need to bail on a mount.
4714 * ETIMEDOUT is returned if there has been a soft timeout.
4715 * EINTR is returned if there is a signal pending that is not being ignored
4716 * and the mount is interruptable, or if we are a thread that is in the process
4717 * of cancellation (also SIGKILL posted).
4718 */
4719extern int sigprop[NSIG+1];
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocked)
{
	proc_t p;
	int error = 0;

	if (nmp == NULL)
		return (ENXIO);

	/* A request that has already been soft-terminated reports a timeout. */
	if (req && (req->r_flags & R_SOFTTERM))
		return (ETIMEDOUT); /* request has been terminated. */
	/* R_NOINTR requests skip the signal checks below (thd == NULL bypasses them). */
	if (req && (req->r_flags & R_NOINTR))
		thd = NULL; /* don't check for signal on R_NOINTR */

	if (!nmplocked)
		lck_mtx_lock(&nmp->nm_lock);
	if (nmp->nm_state & NFSSTA_FORCE) {
		/* If a force unmount is in progress then fail. */
		error = EIO;
	} else if (nmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
		/* Someone is unmounting us, go soft and mark it. */
		NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_SOFT);
		nmp->nm_state |= NFSSTA_FORCE;
	}

	/* Check if the mount is marked dead. */
	if (!error && (nmp->nm_state & NFSSTA_DEAD))
		error = ENXIO;

	/*
	 * If the mount is hung and we've requested not to hang
	 * on remote filesystems, then bail now.
	 */
	if (!error && (nmp->nm_state & NFSSTA_TIMEO) && nfs_noremotehang(thd))
		error = EIO;

	if (!nmplocked)
		lck_mtx_unlock(&nmp->nm_lock);
	if (error)
		return (error);

	/* may not have a thread for async I/O */
	if (thd == NULL)
		return (0);

	/*
	 * Check if the process is aborted, but don't interrupt if we
	 * were killed by a signal and this is the exiting thread which
	 * is attempting to dump core.
	 */
	if (((p = current_proc()) != kernproc) && current_thread_aborted() &&
	    (!(p->p_acflag & AXSIG) || (p->exit_thread != current_thread()) ||
	     (p->p_sigacts == NULL) ||
	     (p->p_sigacts->ps_sig < 1) || (p->p_sigacts->ps_sig > NSIG) ||
	     !(sigprop[p->p_sigacts->ps_sig] & SA_CORE)))
		return (EINTR);

	/* mask off thread and process blocked signals. */
	/* Only interruptible (INTR) mounts are interrupted by pending signals. */
	if (NMFLAG(nmp, INTR) && ((p = get_bsdthreadtask_info(thd))) &&
	    proc_pendingsignals(p, NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}
4783
4784/*
4785 * Lock a socket against others.
4786 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
4787 * and also to avoid race conditions between the processes with nfs requests
4788 * in progress when a reconnect is necessary.
4789 */
int
nfs_sndlock(struct nfsreq *req)
{
	struct nfsmount *nmp = req->r_nmp;
	int *statep;
	int error = 0, slpflag = 0;
	struct timespec ts = { 0, 0 };

	if (nmp == NULL)
		return (ENXIO);

	lck_mtx_lock(&nmp->nm_lock);
	statep = &nmp->nm_state;

	/* Only interruptible mounts (with a thread and not R_NOINTR) catch signals. */
	if (NMFLAG(nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR))
		slpflag = PCATCH;
	/* Wait until the send lock is free, re-checking for termination each pass. */
	while (*statep & NFSSTA_SNDLOCK) {
		if ((error = nfs_sigintr(nmp, req, req->r_thread, 1)))
			break;
		*statep |= NFSSTA_WANTSND;
		/* "noremotehang" threads sleep with a 1s timeout so they can bail out */
		if (nfs_noremotehang(req->r_thread))
			ts.tv_sec = 1;
		msleep(statep, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsndlck", &ts);
		if (slpflag == PCATCH) {
			/* after one interruptible sleep, fall back to a 2s timed sleep */
			slpflag = 0;
			ts.tv_sec = 2;
		}
	}
	if (!error)
		*statep |= NFSSTA_SNDLOCK;
	lck_mtx_unlock(&nmp->nm_lock);
	return (error);
}
4823
4824/*
4825 * Unlock the stream socket for others.
4826 */
4827void
4828nfs_sndunlock(struct nfsreq *req)
4829{
4830	struct nfsmount *nmp = req->r_nmp;
4831	int *statep, wake = 0;
4832
4833	if (nmp == NULL)
4834		return;
4835	lck_mtx_lock(&nmp->nm_lock);
4836	statep = &nmp->nm_state;
4837	if ((*statep & NFSSTA_SNDLOCK) == 0)
4838		panic("nfs sndunlock");
4839	*statep &= ~(NFSSTA_SNDLOCK|NFSSTA_SENDING);
4840	if (*statep & NFSSTA_WANTSND) {
4841		*statep &= ~NFSSTA_WANTSND;
4842		wake = 1;
4843	}
4844	lck_mtx_unlock(&nmp->nm_lock);
4845	if (wake)
4846		wakeup(statep);
4847}
4848
/*
 * Send a simple, synchronous auxiliary RPC (e.g. a portmap/rpcbind query)
 * and collect the reply.  If "so" is NULL a temporary socket is created
 * (optionally bound to a reserved port when "bindresv" is set) and torn
 * down on return; otherwise the caller's socket is used with its timeouts
 * temporarily set to one second and restored afterward.  For UDP the
 * request is retransmitted with capped exponential backoff; for TCP it is
 * sent only once.  Always consumes "mreq".  On success, "nmrep" is left
 * positioned just past the RPC accepted status.
 */
int
nfs_aux_request(
	struct nfsmount *nmp,
	thread_t thd,
	struct sockaddr *saddr,
	socket_t so,
	int sotype,
	mbuf_t mreq,
	uint32_t xid,
	int bindresv,
	int timeo,
	struct nfsm_chain *nmrep)
{
	int error = 0, on = 1, try, sendat = 2, soproto, recv, optlen, restoreto = 0;
	socket_t newso = NULL;
	struct sockaddr_storage ss;
	struct timeval orig_rcvto, orig_sndto, tv = { 1, 0 };
	mbuf_t m, mrep = NULL;
	struct msghdr msg;
	uint32_t rxid = 0, reply = 0, reply_status, rejected_status;
	uint32_t verf_type, verf_len, accepted_status;
	size_t readlen, sentlen;
	struct nfs_rpc_record_state nrrs;

	if (!so) {
		/* create socket and set options */
		soproto = (sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP;
		if ((error = sock_socket(saddr->sa_family, sotype, soproto, NULL, NULL, &newso)))
			goto nfsmout;

		if (bindresv) {
			/* bind to a low (reserved) port, letting the kernel pick the port number */
			int level = (saddr->sa_family == AF_INET) ? IPPROTO_IP : IPPROTO_IPV6;
			int optname = (saddr->sa_family == AF_INET) ? IP_PORTRANGE : IPV6_PORTRANGE;
			int portrange = IP_PORTRANGE_LOW;
			error = sock_setsockopt(newso, level, optname, &portrange, sizeof(portrange));
			nfsmout_if(error);
			ss.ss_len = saddr->sa_len;
			ss.ss_family = saddr->sa_family;
			if (ss.ss_family == AF_INET) {
				((struct sockaddr_in*)&ss)->sin_addr.s_addr = INADDR_ANY;
				((struct sockaddr_in*)&ss)->sin_port = htons(0);
			} else if (ss.ss_family == AF_INET6) {
				((struct sockaddr_in6*)&ss)->sin6_addr = in6addr_any;
				((struct sockaddr_in6*)&ss)->sin6_port = htons(0);
			} else {
				error = EINVAL;
			}
			if (!error)
				error = sock_bind(newso, (struct sockaddr *)&ss);
			nfsmout_if(error);
		}

		if (sotype == SOCK_STREAM) {
			on = 4; /* don't wait too long for the socket to connect */
			sock_setsockopt(newso, IPPROTO_TCP, TCP_CONNECTIONTIMEOUT, &on, sizeof(on));
			error = sock_connect(newso, saddr, 0);
			nfsmout_if(error);
		}
		/* one-second send/receive timeouts so the retry loop below stays responsive */
		if (((error = sock_setsockopt(newso, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))) ||
		    ((error = sock_setsockopt(newso, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)))) ||
		    ((error = sock_setsockopt(newso, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)))))
			goto nfsmout;
		so = newso;
	} else {
		/* make sure socket is using a one second timeout in this function */
		optlen = sizeof(orig_rcvto);
		error = sock_getsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &orig_rcvto, &optlen);
		if (!error) {
			optlen = sizeof(orig_sndto);
			error = sock_getsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &orig_sndto, &optlen);
		}
		if (!error) {
			sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
			sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
			restoreto = 1;
		}
	}

	if (sotype == SOCK_STREAM) {
		sendat = 0; /* we only resend the request for UDP */
		nfs_rpc_record_state_init(&nrrs);
	}

	/* Each iteration is roughly one second (the socket timeouts above). */
	for (try=0; try < timeo; try++) {
		if ((error = nfs_sigintr(nmp, NULL, !try ? NULL : thd, 0)))
			break;
		if (!try || (try == sendat)) {
			/* send the request (resending periodically for UDP) */
			if ((error = mbuf_copym(mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m)))
				goto nfsmout;
			bzero(&msg, sizeof(msg));
			if ((sotype == SOCK_DGRAM) && !sock_isconnected(so)) {
				msg.msg_name = saddr;
				msg.msg_namelen = saddr->sa_len;
			}
			if ((error = sock_sendmbuf(so, &msg, m, 0, &sentlen)))
				goto nfsmout;
			/* exponential backoff on the resend interval, capped at 30 */
			sendat *= 2;
			if (sendat > 30)
				sendat = 30;
		}
		/* wait for the response */
		if (sotype == SOCK_STREAM) {
			/* try to read (more of) record */
			error = nfs_rpc_record_read(so, &nrrs, 0, &recv, &mrep);
			/* if we don't have the whole record yet, we'll keep trying */
		} else {
			readlen = 1<<18;
			bzero(&msg, sizeof(msg));
			error = sock_receivembuf(so, &msg, &mrep, 0, &readlen);
		}
		if (error == EWOULDBLOCK)
			continue;
		nfsmout_if(error);
		/* parse the response */
		nfsm_chain_dissect_init(error, nmrep, mrep);
		nfsm_chain_get_32(error, nmrep, rxid);
		nfsm_chain_get_32(error, nmrep, reply);
		nfsmout_if(error);
		if ((rxid != xid) || (reply != RPC_REPLY))
			error = EBADRPC;
		nfsm_chain_get_32(error, nmrep, reply_status);
		nfsmout_if(error);
		if (reply_status == RPC_MSGDENIED) {
			nfsm_chain_get_32(error, nmrep, rejected_status);
			nfsmout_if(error);
			error = (rejected_status == RPC_MISMATCH) ? ERPCMISMATCH : EACCES;
			goto nfsmout;
		}
		nfsm_chain_get_32(error, nmrep, verf_type); /* verifier flavor */
		nfsm_chain_get_32(error, nmrep, verf_len); /* verifier length */
		nfsmout_if(error);
		if (verf_len)
			nfsm_chain_adv(error, nmrep, nfsm_rndup(verf_len));
		nfsm_chain_get_32(error, nmrep, accepted_status);
		nfsmout_if(error);
		/* map the RPC accepted status to an errno */
		switch (accepted_status) {
		case RPC_SUCCESS:
			error = 0;
			break;
		case RPC_PROGUNAVAIL:
			error = EPROGUNAVAIL;
			break;
		case RPC_PROGMISMATCH:
			error = EPROGMISMATCH;
			break;
		case RPC_PROCUNAVAIL:
			error = EPROCUNAVAIL;
			break;
		case RPC_GARBAGE:
			error = EBADRPC;
			break;
		case RPC_SYSTEM_ERR:
		default:
			error = EIO;
			break;
		}
		break;
	}
nfsmout:
	/* restore the caller's original socket timeouts */
	if (restoreto) {
		sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &orig_rcvto, sizeof(tv));
		sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &orig_sndto, sizeof(tv));
	}
	if (newso) {
		sock_shutdown(newso, SHUT_RDWR);
		sock_close(newso);
	}
	mbuf_freem(mreq);
	return (error);
}
5020
/*
 * Look up the port for an RPC (program, version, protocol) triple using
 * the portmapper (IPv4: PMAP GETPORT) or rpcbind (IPv6: RPCB GETVERSADDR,
 * falling back to v3 GETADDR if the server doesn't support v4).  On
 * success the port/address is written back into "sa".
 */
int
nfs_portmap_lookup(
	struct nfsmount *nmp,
	vfs_context_t ctx,
	struct sockaddr *sa,
	socket_t so,
	uint32_t protocol,
	uint32_t vers,
	uint32_t ipproto,
	int timeo)
{
	thread_t thd = vfs_context_thread(ctx);
	kauth_cred_t cred = vfs_context_ucred(ctx);
	struct sockaddr_storage ss;
	struct sockaddr *saddr = (struct sockaddr*)&ss;
	struct nfsm_chain nmreq, nmrep;
	mbuf_t mreq;
	int error = 0, ip, pmprog, pmvers, pmproc, ualen = 0;
	uint32_t port;
	uint64_t xid = 0;
	char uaddr[MAX_IPv6_STR_LEN+16];

	/* work on a local copy of the address so "sa" is untouched on failure */
	bcopy(sa, saddr, min(sizeof(ss), sa->sa_len));
	if (saddr->sa_family == AF_INET) {
		ip = 4;
		pmprog = PMAPPROG;
		pmvers = PMAPVERS;
		pmproc = PMAPPROC_GETPORT;
	} else if (saddr->sa_family == AF_INET6) {
		ip = 6;
		pmprog = RPCBPROG;
		pmvers = RPCBVERS4;
		pmproc = RPCBPROC_GETVERSADDR;
	} else {
		return (EINVAL);
	}
	nfsm_chain_null(&nmreq);
	nfsm_chain_null(&nmrep);

tryagain:
	/* send portmapper request to get port/uaddr */
	if (ip == 4)
		((struct sockaddr_in*)saddr)->sin_port = htons(PMAPPORT);
	else
		((struct sockaddr_in6*)saddr)->sin6_port = htons(PMAPPORT);
	nfsm_chain_build_alloc_init(error, &nmreq, 8*NFSX_UNSIGNED);
	nfsm_chain_add_32(error, &nmreq, protocol);
	nfsm_chain_add_32(error, &nmreq, vers);
	if (ip == 4) {
		/* PMAP GETPORT args: prog, vers, proto, port (ignored) */
		nfsm_chain_add_32(error, &nmreq, ipproto);
		nfsm_chain_add_32(error, &nmreq, 0);
	} else {
		/* RPCB args: prog, vers, netid, addr, owner */
		if (ipproto == IPPROTO_TCP)
			nfsm_chain_add_string(error, &nmreq, "tcp6", 4);
		else
			nfsm_chain_add_string(error, &nmreq, "udp6", 4);
		nfsm_chain_add_string(error, &nmreq, "", 0); /* uaddr */
		nfsm_chain_add_string(error, &nmreq, "", 0); /* owner */
	}
	nfsm_chain_build_done(error, &nmreq);
	nfsmout_if(error);
	error = nfsm_rpchead2(nmp, (ipproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM,
			pmprog, pmvers, pmproc, RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead,
			&xid, &mreq);
	nfsmout_if(error);
	nmreq.nmc_mhead = NULL;
	/* nfs_aux_request() consumes mreq */
	error = nfs_aux_request(nmp, thd, saddr, so, (ipproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM,
			mreq, R_XID32(xid), 0, timeo, &nmrep);

	/* grab port from portmap response */
	if (ip == 4) {
		nfsm_chain_get_32(error, &nmrep, port);
		if (!error)
			((struct sockaddr_in*)sa)->sin_port = htons(port);
	} else {
		/* get uaddr string and convert to sockaddr */
		nfsm_chain_get_32(error, &nmrep, ualen);
		if (!error) {
			if (ualen > ((int)sizeof(uaddr)-1))
				error = EIO;
			if (ualen < 1) {
				/* program is not available, just return a zero port */
				bcopy(sa, saddr, min(sizeof(ss), sa->sa_len));
				((struct sockaddr_in6*)saddr)->sin6_port = htons(0);
			} else {
				nfsm_chain_get_opaque(error, &nmrep, ualen, uaddr);
				if (!error) {
					uaddr[ualen] = '\0';
					if (!nfs_uaddr2sockaddr(uaddr, saddr))
						error = EIO;
				}
			}
		}
		if ((error == EPROGMISMATCH) || (error == EPROCUNAVAIL) || (error == EIO) || (error == EBADRPC)) {
			/* remote doesn't support rpcbind version or proc (or we couldn't parse uaddr) */
			if (pmvers == RPCBVERS4) {
				/* fall back to v3 and GETADDR */
				pmvers = RPCBVERS3;
				pmproc = RPCBPROC_GETADDR;
				nfsm_chain_cleanup(&nmreq);
				nfsm_chain_cleanup(&nmrep);
				bcopy(sa, saddr, min(sizeof(ss), sa->sa_len));
				xid = 0;
				error = 0;
				goto tryagain;
			}
		}
		if (!error)
			bcopy(saddr, sa, min(saddr->sa_len, sa->sa_len));
	}
nfsmout:
	nfsm_chain_cleanup(&nmreq);
	nfsm_chain_cleanup(&nmrep);
	return (error);
}
5136
5137int
5138nfs_msg(thread_t thd,
5139	const char *server,
5140	const char *msg,
5141	int error)
5142{
5143	proc_t p = thd ? get_bsdthreadtask_info(thd) : NULL;
5144	tpr_t tpr;
5145
5146	if (p)
5147		tpr = tprintf_open(p);
5148	else
5149		tpr = NULL;
5150	if (error)
5151		tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg, error);
5152	else
5153		tprintf(tpr, "nfs server %s: %s\n", server, msg);
5154	tprintf_close(tpr);
5155	return (0);
5156}
5157
#define	NFS_SQUISH_MOBILE_ONLY		0x0001		/* Squish mounts only on mobile machines */
#define NFS_SQUISH_AUTOMOUNTED_ONLY	0x0002		/* Squish mounts only if they are automounted */
#define NFS_SQUISH_SOFT			0x0004		/* Treat all soft mounts as though they were on a mobile machine */
#define NFS_SQUISH_QUICK		0x0008		/* Try to squish mounts more quickly. */
#define NFS_SQUISH_SHUTDOWN		0x1000		/* Squish all mounts on shutdown. Currently not implemented */

uint32_t nfs_squishy_flags = NFS_SQUISH_MOBILE_ONLY | NFS_SQUISH_AUTOMOUNTED_ONLY | NFS_SQUISH_QUICK;
int32_t nfs_is_mobile;	/* nonzero on mobile machines; NOTE(review): presumably set via sysctl -- confirm */

#define	NFS_SQUISHY_DEADTIMEOUT		8	/* Dead time out for squishy mounts */
#define NFS_SQUISHY_QUICKTIMEOUT	4	/* Quicker dead time out when the nfs_squishy_flags NFS_SQUISH_QUICK bit is set*/
5169
5170/*
5171 * Could this mount be squished?
5172 */
5173int
5174nfs_can_squish(struct nfsmount *nmp)
5175{
5176	uint64_t flags = vfs_flags(nmp->nm_mountp);
5177	int softsquish = ((nfs_squishy_flags & NFS_SQUISH_SOFT) & NMFLAG(nmp, SOFT));
5178
5179	if (!softsquish && (nfs_squishy_flags & NFS_SQUISH_MOBILE_ONLY) && nfs_is_mobile == 0)
5180		return (0);
5181
5182	if ((nfs_squishy_flags & NFS_SQUISH_AUTOMOUNTED_ONLY) && (flags & MNT_AUTOMOUNTED) == 0)
5183		return (0);
5184
5185	return (1);
5186}
5187
5188/*
5189 * NFS mounts default to "rw,hard" - but frequently on mobile clients
5190 * the mount may become "not responding".  It's desirable to be able
5191 * to unmount these dead mounts, but only if there is no risk of
5192 * losing data or crashing applications.  A "squishy" NFS mount is one
5193 * that can be force unmounted with little risk of harm.
5194 *
5195 * nfs_is_squishy checks if a mount is in a squishy state.  A mount is
5196 * in a squishy state iff it is allowed to be squishy and there are no
5197 * dirty pages and there are no mmapped files and there are no files
 * open for write.  Whether mounts are allowed to be squishy is
 * controlled by the nfs_squishy_flags settings and the machine's
 * mobility state.  These
5200 * flags can be set by sysctls.
5201 *
5202 * If nfs_is_squishy determines that we are in a squishy state we will
5203 * update the current dead timeout to at least NFS_SQUISHY_DEADTIMEOUT
5204 * (or NFS_SQUISHY_QUICKTIMEOUT if NFS_SQUISH_QUICK is set) (see
5205 * above) or 1/8th of the mount's nm_deadtimeout value, otherwise we just
5206 * update the current dead timeout with the mount's nm_deadtimeout
5207 * value set at mount time.
5208 *
5209 * Assumes that nm_lock is held.
5210 *
 * Note this routine is racy, but its effects on setting the
5212 * dead timeout only have effects when we're in trouble and are likely
5213 * to stay that way. Since by default its only for automounted
5214 * volumes on mobile machines; this is a reasonable trade off between
5215 * data integrity and user experience. It can be disabled or set via
5216 * nfs.conf file.
5217 */
5218
int
nfs_is_squishy(struct nfsmount *nmp)
{
	mount_t mp = nmp->nm_mountp;
	int squishy = 0;
	/* base squishy timeout: shorter when NFS_SQUISH_QUICK is configured */
	int timeo = (nfs_squishy_flags & NFS_SQUISH_QUICK) ? NFS_SQUISHY_QUICKTIMEOUT : NFS_SQUISHY_DEADTIMEOUT;

	NFS_SOCK_DBG(("nfs_is_squishy: %s: nm_curdeadtiemout = %d, nfs_is_mobile = %d\n",
		      vfs_statfs(mp)->f_mntfromname, nmp->nm_curdeadtimeout,  nfs_is_mobile));

	if (!nfs_can_squish(nmp))
		goto out;

	/* use at least 1/8th of the mount's configured dead timeout, if larger */
	timeo =  (nmp->nm_deadtimeout > timeo) ? max(nmp->nm_deadtimeout/8, timeo) : timeo;
	NFS_SOCK_DBG(("nfs_is_squishy:  nm_writers = %d  nm_mappers = %d timeo = %d\n", nmp->nm_writers, nmp->nm_mappers, timeo));

	/* squishy only if no files open for write and no mmapped files */
	if (nmp->nm_writers == 0 && nmp->nm_mappers == 0) {
		uint64_t flags = mp ? vfs_flags(mp) : 0;
		squishy = 1;

		/*
		 * Walk the nfs nodes and check for dirty buffers if we're not
		 * RDONLY and we've not already been declared as squishy since
		 * this can be a bit expensive.
		 */
		if (!(flags & MNT_RDONLY) && !(nmp->nm_state & NFSSTA_SQUISHY))
			squishy = !nfs_mount_is_dirty(mp);
	}

out:
	/* record the result and update the current dead timeout to match */
	if (squishy)
		nmp->nm_state |= NFSSTA_SQUISHY;
	else
		nmp->nm_state &= ~NFSSTA_SQUISHY;

	nmp->nm_curdeadtimeout = squishy ? timeo : nmp->nm_deadtimeout;

	NFS_SOCK_DBG(("nfs_is_squishy: nm_curdeadtimeout = %d\n", nmp->nm_curdeadtimeout));

	return (squishy);
}
5260
5261/*
5262 * On a send operation, if we can't reach the server and we've got only one server to talk to
5263 * and NFS_SQUISH_QUICK flag is set and we are in a squishy state then mark the mount as dead
5264 * and ask to be forcibly unmounted. Return 1 if we're dead and 0 otherwise.
5265 */
5266static int
5267nfs_is_dead_lock(int error, struct nfsmount *nmp)
5268{
5269	if (nmp->nm_state & NFSSTA_DEAD)
5270		return (1);
5271
5272	if ((error != ENETUNREACH && error != EHOSTUNREACH) ||
5273	    !(nmp->nm_locations.nl_numlocs == 1 && nmp->nm_locations.nl_locations[0]->nl_servcount == 1))
5274		return (0);
5275	if ((nfs_squishy_flags & NFS_SQUISH_QUICK) && nfs_is_squishy(nmp)) {
5276		printf("nfs_is_dead: nfs server %s: unreachable. Squished dead\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname);
5277		nmp->nm_state |= NFSSTA_DEAD;
5278		vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_DEAD, 0);
5279		return (1);
5280	}
5281	return (0);
5282}
5283
5284int
5285nfs_is_dead(int error, struct nfsmount *nmp)
5286{
5287	int is_dead;
5288
5289	lck_mtx_lock(&nmp->nm_lock);
5290	is_dead = nfs_is_dead_lock(error, nmp);
5291	lck_mtx_unlock(&nmp->nm_lock);
5292
5293	return (is_dead);
5294}
5295
5296void
5297nfs_down(struct nfsmount *nmp, thread_t thd, int error, int flags, const char *msg)
5298{
5299	int timeoutmask, wasunresponsive, unresponsive, softnobrowse;
5300	uint32_t do_vfs_signal;
5301	struct timeval now;
5302
5303	if (nmp == NULL)
5304		return;
5305
5306	lck_mtx_lock(&nmp->nm_lock);
5307
5308	timeoutmask = NFSSTA_TIMEO | NFSSTA_LOCKTIMEO | NFSSTA_JUKEBOXTIMEO;
5309	if (NMFLAG(nmp, MUTEJUKEBOX)) /* jukebox timeouts don't count as unresponsive if muted */
5310		   timeoutmask &= ~NFSSTA_JUKEBOXTIMEO;
5311	wasunresponsive = (nmp->nm_state & timeoutmask);
5312
5313	/* XXX don't allow users to know about/disconnect unresponsive, soft, nobrowse mounts */
5314	softnobrowse = (NMFLAG(nmp, SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE));
5315
5316	if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO))
5317		nmp->nm_state |= NFSSTA_TIMEO;
5318	if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO))
5319		nmp->nm_state |= NFSSTA_LOCKTIMEO;
5320	if ((flags & NFSSTA_JUKEBOXTIMEO) && !(nmp->nm_state & NFSSTA_JUKEBOXTIMEO))
5321		nmp->nm_state |= NFSSTA_JUKEBOXTIMEO;
5322
5323	unresponsive = (nmp->nm_state & timeoutmask);
5324
5325	nfs_is_squishy(nmp);
5326
5327	if (unresponsive && (nmp->nm_curdeadtimeout > 0)) {
5328		microuptime(&now);
5329		if (!wasunresponsive) {
5330			nmp->nm_deadto_start = now.tv_sec;
5331			nfs_mount_sock_thread_wake(nmp);
5332		} else if ((now.tv_sec - nmp->nm_deadto_start) > nmp->nm_curdeadtimeout) {
5333			if (!(nmp->nm_state & NFSSTA_DEAD))
5334				printf("nfs server %s: %sdead\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname,
5335				       (nmp->nm_curdeadtimeout != nmp->nm_deadtimeout) ? "squished " : "");
5336			nmp->nm_state |= NFSSTA_DEAD;
5337		}
5338	}
5339	lck_mtx_unlock(&nmp->nm_lock);
5340
5341	if (nmp->nm_state & NFSSTA_DEAD)
5342		do_vfs_signal = VQ_DEAD;
5343	else if (softnobrowse || wasunresponsive || !unresponsive)
5344		do_vfs_signal = 0;
5345	else
5346		do_vfs_signal = VQ_NOTRESP;
5347	if (do_vfs_signal)
5348		vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, do_vfs_signal, 0);
5349
5350	nfs_msg(thd, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
5351}
5352
/*
 * Mark the mount as responsive again: clear the requested timeout flags
 * (per "flags"), reset dead-mount detection state, and signal VQ_NOTRESP
 * (up) if the mount just transitioned from unresponsive to responsive.
 * Optionally logs "msg" for the user first.
 */
void
nfs_up(struct nfsmount *nmp, thread_t thd, int flags, const char *msg)
{
	int timeoutmask, wasunresponsive, unresponsive, softnobrowse;
	int do_vfs_signal;

	if (nmp == NULL)
		return;

	if (msg)
		nfs_msg(thd, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);

	lck_mtx_lock(&nmp->nm_lock);

	timeoutmask = NFSSTA_TIMEO | NFSSTA_LOCKTIMEO | NFSSTA_JUKEBOXTIMEO;
	if (NMFLAG(nmp, MUTEJUKEBOX)) /* jukebox timeouts don't count as unresponsive if muted */
		   timeoutmask &= ~NFSSTA_JUKEBOXTIMEO;
	wasunresponsive = (nmp->nm_state & timeoutmask);

	/* XXX don't allow users to know about/disconnect unresponsive, soft, nobrowse mounts */
	softnobrowse = (NMFLAG(nmp, SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE));

	/* clear the requested timeout flags that are currently set */
	if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO))
		nmp->nm_state &= ~NFSSTA_TIMEO;
	if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO))
		nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
	if ((flags & NFSSTA_JUKEBOXTIMEO) && (nmp->nm_state & NFSSTA_JUKEBOXTIMEO))
		nmp->nm_state &= ~NFSSTA_JUKEBOXTIMEO;

	unresponsive = (nmp->nm_state & timeoutmask);

	/* reset dead-mount detection state */
	nmp->nm_deadto_start = 0;
	nmp->nm_curdeadtimeout = nmp->nm_deadtimeout;
	nmp->nm_state &= ~NFSSTA_SQUISHY;
	lck_mtx_unlock(&nmp->nm_lock);

	if (softnobrowse)
		do_vfs_signal = 0;
	else
		do_vfs_signal = (wasunresponsive && !unresponsive);
	if (do_vfs_signal)
		vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
}
5396
5397
5398#endif /* NFSCLIENT */
5399
5400#if NFSSERVER
5401
5402/*
5403 * Generate the rpc reply header
5404 * siz arg. is used to decide if adding a cluster is worthwhile
5405 */
int
nfsrv_rephead(
	struct nfsrv_descript *nd,
	__unused struct nfsrv_sock *slp,
	struct nfsm_chain *nmrepp,
	size_t siz)
{
	mbuf_t mrep;
	u_int32_t *tl;
	struct nfsm_chain nmrep;
	int err, error;

	err = nd->nd_repstat;
	/* NFSv2 error replies carry no body beyond the status */
	if (err && (nd->nd_vers == NFS_VER2))
		siz = 0;

	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= nfs_mbuf_minclsize) {
		error = mbuf_getpacket(MBUF_WAITOK, &mrep);
	} else {
		error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mrep);
	}
	if (error) {
		/* unable to allocate packet */
		/* XXX should we keep statistics for these errors? */
		return (error);
	}
	if (siz < nfs_mbuf_minclsize) {
		/* leave space for lower level headers */
		tl = mbuf_data(mrep);
		tl += 80/sizeof(*tl);  /* XXX max_hdr? XXX */
		mbuf_setdata(mrep, tl, 6 * NFSX_UNSIGNED);
	}
	nfsm_chain_init(&nmrep, mrep);
	nfsm_chain_add_32(error, &nmrep, nd->nd_retxid);
	nfsm_chain_add_32(error, &nmrep, RPC_REPLY);
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		/* RPC-level rejection: auth error or RPC version mismatch */
		nfsm_chain_add_32(error, &nmrep, RPC_MSGDENIED);
		if (err & NFSERR_AUTHERR) {
			nfsm_chain_add_32(error, &nmrep, RPC_AUTHERR);
			nfsm_chain_add_32(error, &nmrep, (err & ~NFSERR_AUTHERR));
		} else {
			nfsm_chain_add_32(error, &nmrep, RPC_MISMATCH);
			nfsm_chain_add_32(error, &nmrep, RPC_VER2);
			nfsm_chain_add_32(error, &nmrep, RPC_VER2);
		}
	} else {
		/* reply status */
		nfsm_chain_add_32(error, &nmrep, RPC_MSGACCEPTED);
		if (nd->nd_gss_context != NULL) {
			/* RPCSEC_GSS verifier */
			error = nfs_gss_svc_verf_put(nd, &nmrep);
			if (error) {
				nfsm_chain_add_32(error, &nmrep, RPC_SYSTEM_ERR);
				goto done;
			}
		} else {
			/* RPCAUTH_NULL verifier */
			nfsm_chain_add_32(error, &nmrep, RPCAUTH_NULL);
			nfsm_chain_add_32(error, &nmrep, 0);
		}
		/* accepted status */
		switch (err) {
		case EPROGUNAVAIL:
			nfsm_chain_add_32(error, &nmrep, RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			nfsm_chain_add_32(error, &nmrep, RPC_PROGMISMATCH);
			/* XXX hard coded versions? */
			nfsm_chain_add_32(error, &nmrep, NFS_VER2);
			nfsm_chain_add_32(error, &nmrep, NFS_VER3);
			break;
		case EPROCUNAVAIL:
			nfsm_chain_add_32(error, &nmrep, RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			nfsm_chain_add_32(error, &nmrep, RPC_GARBAGE);
			break;
		default:
			/* RPC_SUCCESS, with the NFS-level status (if any) appended */
			nfsm_chain_add_32(error, &nmrep, RPC_SUCCESS);
			if (nd->nd_gss_context != NULL)
				error = nfs_gss_svc_prepare_reply(nd, &nmrep);
			if (err != NFSERR_RETVOID)
				nfsm_chain_add_32(error, &nmrep,
					(err ? nfsrv_errmap(nd, err) : 0));
			break;
		}
	}

done:
	nfsm_chain_build_done(error, &nmrep);
	if (error) {
		/* error composing reply header */
		/* XXX should we keep statistics for these errors? */
		mbuf_freem(mrep);
		return (error);
	}

	/* hand the completed reply chain back to the caller */
	*nmrepp = nmrep;
	if ((err != 0) && (err != NFSERR_RETVOID))
		OSAddAtomic64(1, &nfsstats.srvrpc_errs);
	return (0);
}
5513
5514/*
5515 * The nfs server send routine.
5516 *
5517 * - return EINTR or ERESTART if interrupted by a signal
5518 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
5519 * - do any cleanup required by recoverable socket errors (???)
5520 */
5521int
5522nfsrv_send(struct nfsrv_sock *slp, mbuf_t nam, mbuf_t top)
5523{
5524	int error;
5525	socket_t so = slp->ns_so;
5526	struct sockaddr *sendnam;
5527	struct msghdr msg;
5528
5529	bzero(&msg, sizeof(msg));
5530	if (nam && !sock_isconnected(so) && (slp->ns_sotype != SOCK_STREAM)) {
5531		if ((sendnam = mbuf_data(nam))) {
5532			msg.msg_name = (caddr_t)sendnam;
5533			msg.msg_namelen = sendnam->sa_len;
5534		}
5535	}
5536	error = sock_sendmbuf(so, &msg, top, 0, NULL);
5537	if (!error)
5538		return (0);
5539	log(LOG_INFO, "nfsd send error %d\n", error);
5540
5541	if ((error == EWOULDBLOCK) && (slp->ns_sotype == SOCK_STREAM))
5542		error = EPIPE;  /* zap TCP sockets if they time out on send */
5543
5544	/* Handle any recoverable (soft) socket errors here. (???) */
5545	if (error != EINTR && error != ERESTART && error != EIO &&
5546		error != EWOULDBLOCK && error != EPIPE)
5547		error = 0;
5548
5549	return (error);
5550}
5551
5552/*
5553 * Socket upcall routine for the nfsd sockets.
5554 * The caddr_t arg is a pointer to the "struct nfsrv_sock".
5555 * Essentially do as much as possible non-blocking, else punt and it will
5556 * be called with MBUF_WAITOK from an nfsd.
5557 */
5558void
5559nfsrv_rcv(socket_t so, void *arg, int waitflag)
5560{
5561	struct nfsrv_sock *slp = arg;
5562
5563	if (!nfsd_thread_count || !(slp->ns_flag & SLP_VALID))
5564		return;
5565
5566	lck_rw_lock_exclusive(&slp->ns_rwlock);
5567	nfsrv_rcv_locked(so, slp, waitflag);
5568	/* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
5569}
/*
 * Receive data on an nfsd socket (called with ns_rwlock held exclusive).
 * For stream sockets, appends raw data and parses RPC records out of it;
 * for datagram sockets, queues each received packet (with its sender
 * address) as a complete record.  When called with MBUF_DONTWAIT the
 * ns_rwlock is dropped before returning and idle nfsds may be woken.
 */
void
nfsrv_rcv_locked(socket_t so, struct nfsrv_sock *slp, int waitflag)
{
	mbuf_t m, mp, mhck, m2;
	int ns_flag=0, error;
	struct msghdr	msg;
	size_t bytes_read;

	if ((slp->ns_flag & SLP_VALID) == 0) {
		if (waitflag == MBUF_DONTWAIT)
			lck_rw_done(&slp->ns_rwlock);
		return;
	}

#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == MBUF_DONTWAIT) {
		ns_flag = SLP_NEEDQ;
		goto dorecs;
	}
#endif
	if (slp->ns_sotype == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an(other) nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec) {
			ns_flag = SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().
		 */
		bytes_read = 1000000000;
		error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				ns_flag = (waitflag == MBUF_DONTWAIT) ? SLP_NEEDQ : 0;
			else
				ns_flag = SLP_DISCONN;
			goto dorecs;
		}
		/* append the new data to the raw-stream chain and update the byte count */
		m = mp;
		if (slp->ns_rawend) {
			if ((error = mbuf_setnext(slp->ns_rawend, m)))
				panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
			slp->ns_cc += bytes_read;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = bytes_read;
		}
		/* advance ns_rawend to the last mbuf in the chain */
		while ((m2 = mbuf_next(m)))
			m = m2;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				ns_flag = SLP_DISCONN;
			else
				ns_flag = SLP_NEEDQ;
		}
	} else {
		struct sockaddr_storage	nam;

		if (slp->ns_reccnt >= nfsrv_sock_max_rec_queue_length) {
			/* already have max # RPC records queued on this socket */
			ns_flag = SLP_NEEDQ;
			goto dorecs;
		}

		bzero(&msg, sizeof(msg));
		msg.msg_name = (caddr_t)&nam;
		msg.msg_namelen = sizeof(nam);

		/* drain all pending datagrams, queueing each as its own record */
		do {
			bytes_read = 1000000000;
			error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
			if (mp) {
				/* prepend the sender's address (in an MBUF_TYPE_SONAME mbuf) if we got one */
				if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
					mbuf_setlen(mhck, nam.ss_len);
					bcopy(&nam, mbuf_data(mhck), nam.ss_len);
					m = mhck;
					if (mbuf_setnext(m, mp)) {
						/* trouble... just drop it */
						printf("nfsrv_rcv: mbuf_setnext failed\n");
						mbuf_free(mhck);
						m = mp;
					}
				} else {
					m = mp;
				}
				/* append the record to the socket's record queue */
				if (slp->ns_recend)
					mbuf_setnextpkt(slp->ns_recend, m);
				else {
					slp->ns_rec = m;
					slp->ns_flag |= SLP_DOREC;
				}
				slp->ns_recend = m;
				mbuf_setnextpkt(m, NULL);
				slp->ns_reccnt++;
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (ns_flag)
		slp->ns_flag |= ns_flag;
	if (waitflag == MBUF_DONTWAIT) {
		int wake = (slp->ns_flag & SLP_WORKTODO);
		lck_rw_done(&slp->ns_rwlock);
		if (wake && nfsd_thread_count) {
			lck_mtx_lock(nfsd_mutex);
			nfsrv_wakenfsd(slp);
			lck_mtx_unlock(nfsd_mutex);
		}
	}
}
5698
5699/*
5700 * Try and extract an RPC request from the mbuf data list received on a
5701 * stream socket. The "waitflag" argument indicates whether or not it
5702 * can sleep.
5703 */
int
nfsrv_getstream(struct nfsrv_sock *slp, int waitflag)
{
	mbuf_t m;
	char *cp1, *cp2, *mdata;
	int len, mlen, error;
	mbuf_t om, m2, recm;
	u_int32_t recmark;

	/*
	 * SLP_GETSTREAM serializes stream parsing on this socket;
	 * only one thread may be in here at a time.
	 */
	if (slp->ns_flag & SLP_GETSTREAM)
		panic("nfs getstream");
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
	    if (slp->ns_reclen == 0) {
		/* Need a new record mark; wait until all 4 bytes have arrived. */
		if (slp->ns_cc < NFSX_UNSIGNED) {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}
		m = slp->ns_raw;
		mdata = mbuf_data(m);
		mlen = mbuf_len(m);
		if (mlen >= NFSX_UNSIGNED) {
			/* record mark is contiguous in the first mbuf */
			bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
			mdata += NFSX_UNSIGNED;
			mlen -= NFSX_UNSIGNED;
			mbuf_setdata(m, mdata, mlen);
		} else {
			/* record mark straddles mbufs; gather it a byte at a time */
			cp1 = (caddr_t)&recmark;
			cp2 = mdata;
			while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
				while (mlen == 0) {
					m = mbuf_next(m);
					cp2 = mbuf_data(m);
					mlen = mbuf_len(m);
				}
				*cp1++ = *cp2++;
				mlen--;
				mbuf_setdata(m, cp2, mlen);
			}
		}
		slp->ns_cc -= NFSX_UNSIGNED;
		recmark = ntohl(recmark);
		/* low 31 bits = fragment length, high bit = last-fragment flag */
		slp->ns_reclen = recmark & ~0x80000000;
		if (recmark & 0x80000000)
			slp->ns_flag |= SLP_LASTFRAG;
		else
			slp->ns_flag &= ~SLP_LASTFRAG;
		/* bogus fragment length is unrecoverable; EPERM makes the caller disconnect */
		if (slp->ns_reclen <= 0 || slp->ns_reclen > NFS_MAXPACKET) {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (EPERM);
		}
	    }

	    /*
	     * Now get the record part.
	     *
	     * Note that slp->ns_reclen may be 0.  Linux sometimes
	     * generates 0-length RPCs
	     */
	    recm = NULL;
	    if (slp->ns_cc == slp->ns_reclen) {
		/* buffered data is exactly one fragment: take the whole raw chain */
		recm = slp->ns_raw;
		slp->ns_raw = slp->ns_rawend = NULL;
		slp->ns_cc = slp->ns_reclen = 0;
	    } else if (slp->ns_cc > slp->ns_reclen) {
		/*
		 * More than one fragment's worth is buffered: walk the raw
		 * chain and split it after ns_reclen bytes.
		 */
		len = 0;
		m = slp->ns_raw;
		mlen = mbuf_len(m);
		mdata = mbuf_data(m);
		om = NULL;
		while (len < slp->ns_reclen) {
			if ((len + mlen) > slp->ns_reclen) {
				/* fragment ends mid-mbuf: copy the head, keep the tail in ns_raw */
				if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
					slp->ns_flag &= ~SLP_GETSTREAM;
					return (EWOULDBLOCK);
				}
				if (om) {
					if (mbuf_setnext(om, m2)) {
						/* trouble... just drop it */
						printf("nfsrv_getstream: mbuf_setnext failed\n");
						mbuf_freem(m2);
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
					recm = slp->ns_raw;
				} else {
					recm = m2;
				}
				/* advance past the copied bytes in the remaining mbuf */
				mdata += slp->ns_reclen - len;
				mlen -= slp->ns_reclen - len;
				mbuf_setdata(m, mdata, mlen);
				len = slp->ns_reclen;
			} else if ((len + mlen) == slp->ns_reclen) {
				/* fragment ends exactly on an mbuf boundary: break the chain here */
				om = m;
				len += mlen;
				m = mbuf_next(m);
				recm = slp->ns_raw;
				if (mbuf_setnext(om, NULL)) {
					printf("nfsrv_getstream: mbuf_setnext failed 2\n");
					slp->ns_flag &= ~SLP_GETSTREAM;
					return (EWOULDBLOCK);
				}
				mlen = mbuf_len(m);
				mdata = mbuf_data(m);
			} else {
				/* whole mbuf belongs to this fragment; keep walking */
				om = m;
				len += mlen;
				m = mbuf_next(m);
				mlen = mbuf_len(m);
				mdata = mbuf_data(m);
			}
		}
		/* m now heads the leftover (unparsed) raw data */
		slp->ns_raw = m;
		slp->ns_cc -= len;
		slp->ns_reclen = 0;
	    } else {
		/* not enough data buffered for a full fragment yet */
		slp->ns_flag &= ~SLP_GETSTREAM;
		return (0);
	    }

	    /*
	     * Accumulate the fragments into a record.
	     */
	    if (slp->ns_frag == NULL) {
		slp->ns_frag = recm;
	    } else {
		/* append this fragment to the record under construction */
		m = slp->ns_frag;
		while ((m2 = mbuf_next(m)))
		    m = m2;
		if ((error = mbuf_setnext(m, recm)))
		    panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
	    }
	    if (slp->ns_flag & SLP_LASTFRAG) {
		/* record complete: move it onto the receive queue for the nfsds */
		if (slp->ns_recend)
		    mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
		else {
		    slp->ns_rec = slp->ns_frag;
		    slp->ns_flag |= SLP_DOREC;
		}
		slp->ns_recend = slp->ns_frag;
		slp->ns_frag = NULL;
	    }
	}
}
5848
5849/*
5850 * Parse an RPC header.
5851 */
5852int
5853nfsrv_dorec(
5854	struct nfsrv_sock *slp,
5855	struct nfsd *nfsd,
5856	struct nfsrv_descript **ndp)
5857{
5858	mbuf_t m;
5859	mbuf_t nam;
5860	struct nfsrv_descript *nd;
5861	int error = 0;
5862
5863	*ndp = NULL;
5864	if (!(slp->ns_flag & (SLP_VALID|SLP_DOREC)) || (slp->ns_rec == NULL))
5865		return (ENOBUFS);
5866	MALLOC_ZONE(nd, struct nfsrv_descript *,
5867			sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
5868	if (!nd)
5869		return (ENOMEM);
5870	m = slp->ns_rec;
5871	slp->ns_rec = mbuf_nextpkt(m);
5872	if (slp->ns_rec)
5873		mbuf_setnextpkt(m, NULL);
5874	else {
5875		slp->ns_flag &= ~SLP_DOREC;
5876		slp->ns_recend = NULL;
5877	}
5878	slp->ns_reccnt--;
5879	if (mbuf_type(m) == MBUF_TYPE_SONAME) {
5880		nam = m;
5881		m = mbuf_next(m);
5882		if ((error = mbuf_setnext(nam, NULL)))
5883			panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
5884	} else
5885		nam = NULL;
5886	nd->nd_nam2 = nam;
5887	nfsm_chain_dissect_init(error, &nd->nd_nmreq, m);
5888	if (!error)
5889		error = nfsrv_getreq(nd);
5890	if (error) {
5891		if (nam)
5892			mbuf_freem(nam);
5893		if (nd->nd_gss_context)
5894			nfs_gss_svc_ctx_deref(nd->nd_gss_context);
5895		FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
5896		return (error);
5897	}
5898	nd->nd_mrep = NULL;
5899	*ndp = nd;
5900	nfsd->nfsd_nd = nd;
5901	return (0);
5902}
5903
5904/*
5905 * Parse an RPC request
5906 * - verify it
5907 * - fill in the cred struct.
5908 */
int
nfsrv_getreq(struct nfsrv_descript *nd)
{
	struct nfsm_chain *nmreq;
	int len, i;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	uid_t user_id;
	gid_t group_id;
	int ngroups;
	uint32_t val;

	nd->nd_cr = NULL;
	nd->nd_gss_context = NULL;
	nd->nd_gss_seqnum = 0;
	nd->nd_gss_mb = NULL;

	/* default IDs of -2 (the traditional "nobody") until a credential is parsed */
	user_id = group_id = -2;
	val = auth_type = len = 0;

	nmreq = &nd->nd_nmreq;
	/* NB: the nfsm_chain_* macros accumulate any failure into "error" */
	nfsm_chain_get_32(error, nmreq, nd->nd_retxid);	// XID
	nfsm_chain_get_32(error, nmreq, val);		// RPC Call
	if (!error && (val != RPC_CALL))
		error = EBADRPC;
	nfsmout_if(error);
	nd->nd_repstat = 0;
	nfsm_chain_get_32(error, nmreq, val);	// RPC Version
	nfsmout_if(error);
	if (val != RPC_VER2) {
		/* wrong RPC version: reply ERPCMISMATCH via the NOOP procedure */
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	nfsm_chain_get_32(error, nmreq, val);	// RPC Program Number
	nfsmout_if(error);
	if (val != NFS_PROG) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	nfsm_chain_get_32(error, nmreq, nfsvers);// NFS Version Number
	nfsmout_if(error);
	if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
		/* only NFS v2 and v3 are served here */
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	nd->nd_vers = nfsvers;
	nfsm_chain_get_32(error, nmreq, nd->nd_procnum);// NFS Procedure Number
	nfsmout_if(error);
	if ((nd->nd_procnum >= NFS_NPROCS) ||
		((nd->nd_vers == NFS_VER2) && (nd->nd_procnum > NFSV2PROC_STATFS))) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* map v2 procedure numbers through the nfsv3_procid table */
	if (nfsvers != NFS_VER3)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	nfsm_chain_get_32(error, nmreq, auth_type);	// Auth Flavor
	nfsm_chain_get_32(error, nmreq, len);		// Auth Length
	if (!error && (len < 0 || len > RPCAUTH_MAXSIZ))
		error = EBADRPC;
	nfsmout_if(error);

	/* Handle authentication */
	if (auth_type == RPCAUTH_SYS) {
		struct posix_cred temp_pcred;
		/* NULL procedure needs no credential */
		if (nd->nd_procnum == NFSPROC_NULL)
			return (0);
		nd->nd_sec = RPCAUTH_SYS;
		nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED);	// skip stamp
		nfsm_chain_get_32(error, nmreq, len);		// hostname length
		if (len < 0 || len > NFS_MAXNAMLEN)
			error = EBADRPC;
		nfsm_chain_adv(error, nmreq, nfsm_rndup(len));	// skip hostname
		nfsmout_if(error);

		/* create a temporary credential using the bits from the wire */
		bzero(&temp_pcred, sizeof(temp_pcred));
		nfsm_chain_get_32(error, nmreq, user_id);
		nfsm_chain_get_32(error, nmreq, group_id);
		temp_pcred.cr_groups[0] = group_id;
		nfsm_chain_get_32(error, nmreq, len);		// extra GID count
		if ((len < 0) || (len > RPCAUTH_UNIXGIDS))
			error = EBADRPC;
		nfsmout_if(error);
		/* keep at most NGROUPS-1 extra GIDs; skip over the remainder */
		for (i = 1; i <= len; i++)
			if (i < NGROUPS)
				nfsm_chain_get_32(error, nmreq, temp_pcred.cr_groups[i]);
			else
				nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED);
		nfsmout_if(error);
		/* total group count = primary GID plus extras kept, capped at NGROUPS */
		ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
		if (ngroups > 1)
			nfsrv_group_sort(&temp_pcred.cr_groups[0], ngroups);
		nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED);	// verifier flavor (should be AUTH_NONE)
		nfsm_chain_get_32(error, nmreq, len);		// verifier length
		if (len < 0 || len > RPCAUTH_MAXSIZ)
			error = EBADRPC;
		if (len > 0)
			nfsm_chain_adv(error, nmreq, nfsm_rndup(len));

		/* request creation of a real credential */
		temp_pcred.cr_uid = user_id;
		temp_pcred.cr_ngroups = ngroups;
		nd->nd_cr = posix_cred_create(&temp_pcred);
		if (nd->nd_cr == NULL) {
			nd->nd_repstat = ENOMEM;
			nd->nd_procnum = NFSPROC_NOOP;
			return (0);
		}
	} else if (auth_type == RPCSEC_GSS) {
		/* delegate RPCSEC_GSS credential parsing to the GSS server code */
		error = nfs_gss_svc_cred_get(nd, nmreq);
		if (error) {
			if (error == EINVAL)
				goto nfsmout;	// drop the request
			nd->nd_repstat = error;
			nd->nd_procnum = NFSPROC_NOOP;
			return (0);
		}
	} else {
		if (nd->nd_procnum == NFSPROC_NULL)	// assume it's AUTH_NONE
			return (0);
		/* any other auth flavor is rejected */
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	return (0);
nfsmout:
	/* parse error: release any credential reference and the request chain */
	if (IS_VALID_CRED(nd->nd_cr))
		kauth_cred_unref(&nd->nd_cr);
	nfsm_chain_cleanup(nmreq);
	return (error);
}
6044
6045/*
6046 * Search for a sleeping nfsd and wake it up.
6047 * SIDE EFFECT: If none found, make sure the socket is queued up so that one
6048 * of the running nfsds will go look for the work in the nfsrv_sockwait list.
6049 * Note: Must be called with nfsd_mutex held.
6050 */
6051void
6052nfsrv_wakenfsd(struct nfsrv_sock *slp)
6053{
6054	struct nfsd *nd;
6055
6056	if ((slp->ns_flag & SLP_VALID) == 0)
6057		return;
6058
6059	lck_rw_lock_exclusive(&slp->ns_rwlock);
6060	/* if there's work to do on this socket, make sure it's queued up */
6061	if ((slp->ns_flag & SLP_WORKTODO) && !(slp->ns_flag & SLP_QUEUED)) {
6062		TAILQ_INSERT_TAIL(&nfsrv_sockwait, slp, ns_svcq);
6063		slp->ns_flag |= SLP_WAITQ;
6064	}
6065	lck_rw_done(&slp->ns_rwlock);
6066
6067	/* wake up a waiting nfsd, if possible */
6068	nd = TAILQ_FIRST(&nfsd_queue);
6069	if (!nd)
6070		return;
6071
6072	TAILQ_REMOVE(&nfsd_queue, nd, nfsd_queue);
6073	nd->nfsd_flag &= ~NFSD_WAITING;
6074	wakeup(nd);
6075}
6076
6077#endif /* NFSSERVER */
6078