1/*
2 * Copyright (c) 2000-2011 Apple Inc.  All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1989, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 *    must display the following acknowledgement:
46 *	This product includes software developed by the University of
47 *	California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 *    may be used to endorse or promote products derived from this software
50 *    without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 *	@(#)nfs_syscalls.c	8.5 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_syscalls.c,v 1.32 1997/11/07 08:53:25 phk Exp $
66 */
67/*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections.  This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/kernel.h>
77#include <sys/file_internal.h>
78#include <sys/filedesc.h>
79#include <sys/stat.h>
80#include <sys/vnode_internal.h>
81#include <sys/mount_internal.h>
82#include <sys/proc_internal.h> /* for fdflags */
83#include <sys/kauth.h>
84#include <sys/sysctl.h>
85#include <sys/ubc.h>
86#include <sys/uio.h>
87#include <sys/malloc.h>
88#include <sys/kpi_mbuf.h>
89#include <sys/socket.h>
90#include <sys/socketvar.h>
91#include <sys/domain.h>
92#include <sys/protosw.h>
93#include <sys/fcntl.h>
94#include <sys/lockf.h>
95#include <sys/syslog.h>
96#include <sys/user.h>
97#include <sys/sysproto.h>
98#include <sys/kpi_socket.h>
99#include <sys/fsevents.h>
100#include <libkern/OSAtomic.h>
101#include <kern/thread_call.h>
102#include <kern/task.h>
103
104#include <security/audit/audit.h>
105
106#include <netinet/in.h>
107#include <netinet/tcp.h>
108#include <nfs/xdr_subs.h>
109#include <nfs/rpcv2.h>
110#include <nfs/nfsproto.h>
111#include <nfs/nfs.h>
112#include <nfs/nfsm_subs.h>
113#include <nfs/nfsrvcache.h>
114#include <nfs/nfs_gss.h>
115#include <nfs/nfsmount.h>
116#include <nfs/nfsnode.h>
117#include <nfs/nfs_lock.h>
118#if CONFIG_MACF
119#include <security/mac_framework.h>
120#endif
121
122kern_return_t	thread_terminate(thread_t); /* XXX */
123
124#if NFSSERVER
125
126extern int (*nfsrv_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
127					    struct nfsrv_sock *slp,
128					    vfs_context_t ctx,
129					    mbuf_t *mrepp);
130extern int nfsrv_wg_delay;
131extern int nfsrv_wg_delay_v3;
132
133static int nfsrv_require_resv_port = 0;
134static int nfsrv_deadsock_timer_on = 0;
135
136int	nfssvc_export(user_addr_t argp);
137int	nfssvc_nfsd(void);
138int	nfssvc_addsock(socket_t, mbuf_t);
139void	nfsrv_zapsock(struct nfsrv_sock *);
140void	nfsrv_slpderef(struct nfsrv_sock *);
141void	nfsrv_slpfree(struct nfsrv_sock *);
142
143#endif /* NFSSERVER */
144
145/*
146 * sysctl stuff
147 */
148SYSCTL_DECL(_vfs_generic);
149SYSCTL_NODE(_vfs_generic, OID_AUTO, nfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs hinge");
150
151#if NFSCLIENT
152SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, client, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs client hinge");
153SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, initialdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_initial_delay, 0, "");
154SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nextdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_delay, 0, "");
155SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_iosize, 0, "");
156SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_cache_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_cache_timeout, 0, "");
157SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, allow_async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_allow_async, 0, "");
158SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, statfs_rate_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_statfs_rate_limit, 0, "");
159SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsiod_thread_max, 0, "");
160SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsiod_thread_count, 0, "");
161SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD | CTLFLAG_LOCKED, &nfs_lockd_mounts, 0, "");
162SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_max_async_writes, 0, "");
163SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_single_des, 0, "");
164SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_delete, 0, "");
165SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_dotzfs, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_dotzfs, 0, "");
166SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_for_getattr, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_for_getattr, 0, "");
167SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, idmap_ctrl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_idmap_ctrl, 0, "");
168SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, callback_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_callback_port, 0, "");
169SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, is_mobile, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_is_mobile, 0, "");
170SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, squishy_flags, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_squishy_flags, 0, "");
171SYSCTL_UINT(_vfs_generic_nfs_client, OID_AUTO, debug_ctl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_debug_ctl, 0, "");
172
173
174#endif /* NFSCLIENT */
175
176#if NFSSERVER
177SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, server, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs server hinge");
178SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay, 0, "");
179SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay_v3, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay_v3, 0, "");
180SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, require_resv_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_require_resv_port, 0, "");
181SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_async, 0, "");
182SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, export_hash_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_export_hash_size, 0, "");
183SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, reqcache_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_reqcache_size, 0, "");
184SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, request_queue_length, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_sock_max_rec_queue_length, 0, "");
185SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, user_stats, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_user_stat_enabled, 0, "");
186SYSCTL_UINT(_vfs_generic_nfs_server, OID_AUTO, gss_context_ttl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_gss_context_ttl, 0, "");
187#if CONFIG_FSE
188SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_fsevents_enabled, 0, "");
189#endif
190SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsd_thread_max, 0, "");
191SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsd_thread_count, 0, "");
192#ifdef NFS_UC_Q_DEBUG
193SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, use_upcall_svc, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_use_proxy, 0, "");
194SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_limit, 0, "");
195SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_max_seen, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_max_seen, 0, "");
196SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)&nfsrv_uc_queue_count, 0, "");
197#endif
198#endif /* NFSSERVER */
199
200
201#if NFSCLIENT
202
203int
204nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval)
205{
206	struct lockd_ans la;
207	int error;
208
209	switch (uap->flag) {
210	case NFSCLNT_LOCKDANS:
211		error = copyin(uap->argp, &la, sizeof(la));
212		if (!error)
213			error = nfslockdans(p, &la);
214		break;
215	case NFSCLNT_LOCKDNOTIFY:
216		error = nfslockdnotify(p, uap->argp);
217		break;
218	default:
219		error = EINVAL;
220	}
221	return (error);
222}
223
224/*
225 * Asynchronous I/O threads for client NFS.
226 * They do read-ahead and write-behind operations on the block I/O cache.
227 *
228 * The pool of up to nfsiod_thread_max threads is launched on demand and exit
229 * when unused for a while.  There are as many nfsiod structs as there are
230 * nfsiod threads; however there's no strict tie between a thread and a struct.
231 * Each thread puts an nfsiod on the free list and sleeps on it.  When it wakes
232 * up, it removes the next struct nfsiod from the queue and services it.  Then
233 * it will put the struct at the head of free list and sleep on it.
234 * Async requests will pull the next struct nfsiod from the head of the free list,
235 * put it on the work queue, and wake whatever thread is waiting on that struct.
236 */
237
238/*
239 * nfsiod thread exit routine
240 *
241 * Must be called with nfsiod_mutex held so that the
242 * decision to terminate is atomic with the termination.
243 */
244void
245nfsiod_terminate(struct nfsiod *niod)
246{
247	nfsiod_thread_count--;
248	lck_mtx_unlock(nfsiod_mutex);
249	if (niod)
250		FREE(niod, M_TEMP);
251	else
252		printf("nfsiod: terminating without niod\n");
253	thread_terminate(current_thread());
254	/*NOTREACHED*/
255}
256
/*
 * nfsiod thread startup routine
 *
 * Entry point for a newly spawned nfsiod thread (started by nfsiod_start()).
 * Allocates this thread's nfsiod struct, puts it on the free list, wakes
 * the thread in nfsiod_start() that is msleep()ing on our thread_t, and
 * then parks in msleep0() with nfsiod_continue as the continuation that
 * services async I/O work.  The thread only falls out of msleep0() on
 * error/timeout, in which case it reclaims an nfsiod struct and exits.
 */
void
nfsiod_thread(void)
{
	struct nfsiod *niod;
	int error;

	MALLOC(niod, struct nfsiod *, sizeof(struct nfsiod), M_TEMP, M_WAITOK);
	if (!niod) {
		/* allocation failed: undo the count bump, release our starter, and exit */
		lck_mtx_lock(nfsiod_mutex);
		nfsiod_thread_count--;
		wakeup(current_thread());
		lck_mtx_unlock(nfsiod_mutex);
		thread_terminate(current_thread());
		/*NOTREACHED*/
	}
	bzero(niod, sizeof(*niod));
	lck_mtx_lock(nfsiod_mutex);
	TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link);
	/* unblock nfsiod_start(), which sleeps on our thread pointer */
	wakeup(current_thread());
	/* PDROP releases nfsiod_mutex; nfsiod_continue runs when we're woken */
	error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE*hz, nfsiod_continue);
	/* shouldn't return... so we have an error */
	/* remove an old nfsiod struct and terminate */
	lck_mtx_lock(nfsiod_mutex);
	if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist)))
		TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
	nfsiod_terminate(niod);
	/*NOTREACHED*/
}
286
287/*
288 * Start up another nfsiod thread.
289 * (unless we're already maxed out and there are nfsiods running)
290 */
291int
292nfsiod_start(void)
293{
294	thread_t thd = THREAD_NULL;
295
296	lck_mtx_lock(nfsiod_mutex);
297	if ((nfsiod_thread_count >= NFSIOD_MAX) && (nfsiod_thread_count > 0)) {
298		lck_mtx_unlock(nfsiod_mutex);
299		return (EBUSY);
300	}
301	nfsiod_thread_count++;
302	if (kernel_thread_start((thread_continue_t)nfsiod_thread, NULL, &thd) != KERN_SUCCESS) {
303		lck_mtx_unlock(nfsiod_mutex);
304		return (EBUSY);
305	}
306	/* wait for the thread to complete startup */
307	msleep(thd, nfsiod_mutex, PWAIT | PDROP, "nfsiodw", NULL);
308	thread_deallocate(thd);
309	return (0);
310}
311
312/*
313 * Continuation for Asynchronous I/O threads for NFS client.
314 *
315 * Grab an nfsiod struct to work on, do some work, then drop it
316 */
317int
318nfsiod_continue(int error)
319{
320	struct nfsiod *niod;
321	struct nfsmount *nmp;
322	struct nfsreq *req, *treq;
323	struct nfs_reqqhead iodq;
324	int morework;
325
326	lck_mtx_lock(nfsiod_mutex);
327	niod = TAILQ_FIRST(&nfsiodwork);
328	if (!niod) {
329		/* there's no work queued up */
330		/* remove an old nfsiod struct and terminate */
331		if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist)))
332			TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
333		nfsiod_terminate(niod);
334		/*NOTREACHED*/
335	}
336	TAILQ_REMOVE(&nfsiodwork, niod, niod_link);
337
338worktodo:
339	while ((nmp = niod->niod_nmp)) {
340		/*
341		 * Service this mount's async I/O queue.
342		 *
343		 * In order to ensure some level of fairness between mounts,
344		 * we grab all the work up front before processing it so any
345		 * new work that arrives will be serviced on a subsequent
346		 * iteration - and we have a chance to see if other work needs
347		 * to be done (e.g. the delayed write queue needs to be pushed
348		 * or other mounts are waiting for an nfsiod).
349		 */
350		/* grab the current contents of the queue */
351		TAILQ_INIT(&iodq);
352		TAILQ_CONCAT(&iodq, &nmp->nm_iodq, r_achain);
353		lck_mtx_unlock(nfsiod_mutex);
354
355		/* process the queue */
356		TAILQ_FOREACH_SAFE(req, &iodq, r_achain, treq) {
357			TAILQ_REMOVE(&iodq, req, r_achain);
358			req->r_achain.tqe_next = NFSREQNOLIST;
359			req->r_callback.rcb_func(req);
360		}
361
362		/* now check if there's more/other work to be done */
363		lck_mtx_lock(nfsiod_mutex);
364		morework = !TAILQ_EMPTY(&nmp->nm_iodq);
365		if (!morework || !TAILQ_EMPTY(&nfsiodmounts)) {
366			/* we're going to stop working on this mount */
367			if (morework) /* mount still needs more work so queue it up */
368				TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
369			nmp->nm_niod = NULL;
370			niod->niod_nmp = NULL;
371		}
372	}
373
374	/* loop if there's still a mount to work on */
375	if (!niod->niod_nmp && !TAILQ_EMPTY(&nfsiodmounts)) {
376		niod->niod_nmp = TAILQ_FIRST(&nfsiodmounts);
377		TAILQ_REMOVE(&nfsiodmounts, niod->niod_nmp, nm_iodlink);
378	}
379	if (niod->niod_nmp)
380		goto worktodo;
381
382	/* queue ourselves back up - if there aren't too many threads running */
383	if (nfsiod_thread_count <= NFSIOD_MAX) {
384		TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link);
385		error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE*hz, nfsiod_continue);
386		/* shouldn't return... so we have an error */
387		/* remove an old nfsiod struct and terminate */
388		lck_mtx_lock(nfsiod_mutex);
389		if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist)))
390			TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
391	}
392	nfsiod_terminate(niod);
393	/*NOTREACHED*/
394	return (0);
395}
396
397#endif /* NFSCLIENT */
398
399
400#if NFSSERVER
401
402/*
403 * NFS server system calls
404 * getfh() lives here too, but maybe should move to kern/vfs_syscalls.c
405 */
406
407/*
408 * Get file handle system call
409 */
410int
411getfh(proc_t p, struct getfh_args *uap, __unused int *retval)
412{
413	vnode_t vp;
414	struct nfs_filehandle nfh;
415	int error, fhlen, fidlen;
416	struct nameidata nd;
417	char path[MAXPATHLEN], *ptr;
418	size_t pathlen;
419	struct nfs_exportfs *nxfs;
420	struct nfs_export *nx;
421
422	/*
423	 * Must be super user
424	 */
425	error = proc_suser(p);
426	if (error)
427		return (error);
428
429	error = copyinstr(uap->fname, path, MAXPATHLEN, &pathlen);
430	if (!error)
431		error = copyin(uap->fhp, &fhlen, sizeof(fhlen));
432	if (error)
433		return (error);
434	/* limit fh size to length specified (or v3 size by default) */
435	if ((fhlen != NFSV2_MAX_FH_SIZE) && (fhlen != NFSV3_MAX_FH_SIZE))
436		fhlen = NFSV3_MAX_FH_SIZE;
437	fidlen = fhlen - sizeof(struct nfs_exphandle);
438
439	if (!nfsrv_is_initialized())
440		return (EINVAL);
441
442	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1,
443			UIO_SYSSPACE, CAST_USER_ADDR_T(path), vfs_context_current());
444	error = namei(&nd);
445	if (error)
446		return (error);
447	nameidone(&nd);
448
449	vp = nd.ni_vp;
450
451	// find exportfs that matches f_mntonname
452	lck_rw_lock_shared(&nfsrv_export_rwlock);
453	ptr = vnode_mount(vp)->mnt_vfsstat.f_mntonname;
454	LIST_FOREACH(nxfs, &nfsrv_exports, nxfs_next) {
455		if (!strncmp(nxfs->nxfs_path, ptr, MAXPATHLEN))
456			break;
457	}
458	if (!nxfs || strncmp(nxfs->nxfs_path, path, strlen(nxfs->nxfs_path))) {
459		error = EINVAL;
460		goto out;
461	}
462	// find export that best matches remainder of path
463	ptr = path + strlen(nxfs->nxfs_path);
464	while (*ptr && (*ptr == '/'))
465		ptr++;
466	LIST_FOREACH(nx, &nxfs->nxfs_exports, nx_next) {
467		int len = strlen(nx->nx_path);
468		if (len == 0)  // we've hit the export entry for the root directory
469			break;
470		if (!strncmp(nx->nx_path, ptr, len))
471			break;
472	}
473	if (!nx) {
474		error = EINVAL;
475		goto out;
476	}
477
478	bzero(&nfh, sizeof(nfh));
479	nfh.nfh_xh.nxh_version = htonl(NFS_FH_VERSION);
480	nfh.nfh_xh.nxh_fsid = htonl(nxfs->nxfs_id);
481	nfh.nfh_xh.nxh_expid = htonl(nx->nx_id);
482	nfh.nfh_xh.nxh_flags = 0;
483	nfh.nfh_xh.nxh_reserved = 0;
484	nfh.nfh_len = fidlen;
485	error = VFS_VPTOFH(vp, (int*)&nfh.nfh_len, &nfh.nfh_fid[0], NULL);
486	if (nfh.nfh_len > (uint32_t)fidlen)
487		error = EOVERFLOW;
488	nfh.nfh_xh.nxh_fidlen = nfh.nfh_len;
489	nfh.nfh_len += sizeof(nfh.nfh_xh);
490	nfh.nfh_fhp = (u_char*)&nfh.nfh_xh;
491
492out:
493	lck_rw_done(&nfsrv_export_rwlock);
494	vnode_put(vp);
495	if (error)
496		return (error);
497	error = copyout((caddr_t)&nfh, uap->fhp, sizeof(fhandle_t));
498	return (error);
499}
500
501extern const struct fileops vnops;
502
503/*
504 * syscall for the rpc.lockd to use to translate a NFS file handle into
505 * an open descriptor.
506 *
507 * warning: do not remove the suser() call or this becomes one giant
508 * security hole.
509 */
510int
511fhopen( proc_t p,
512	struct fhopen_args *uap,
513	int32_t *retval)
514{
515	vnode_t vp;
516	struct nfs_filehandle nfh;
517	struct nfs_export *nx;
518	struct nfs_export_options *nxo;
519	struct flock lf;
520	struct fileproc *fp, *nfp;
521	int fmode, error, type;
522	int indx;
523	vfs_context_t ctx = vfs_context_current();
524	kauth_action_t action;
525
526	/*
527	 * Must be super user
528	 */
529	error = suser(vfs_context_ucred(ctx), 0);
530	if (error) {
531		return (error);
532	}
533
534	if (!nfsrv_is_initialized()) {
535		return (EINVAL);
536	}
537
538	fmode = FFLAGS(uap->flags);
539	/* why not allow a non-read/write open for our lockd? */
540	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
541		return (EINVAL);
542
543	error = copyin(uap->u_fhp, &nfh.nfh_len, sizeof(nfh.nfh_len));
544	if (error)
545		return (error);
546	if ((nfh.nfh_len < (int)sizeof(struct nfs_exphandle)) ||
547	    (nfh.nfh_len > (int)NFSV3_MAX_FH_SIZE))
548		return (EINVAL);
549	error = copyin(uap->u_fhp, &nfh, sizeof(nfh.nfh_len) + nfh.nfh_len);
550	if (error)
551		return (error);
552	nfh.nfh_fhp = (u_char*)&nfh.nfh_xh;
553
554	lck_rw_lock_shared(&nfsrv_export_rwlock);
555	/* now give me my vnode, it gets returned to me with a reference */
556	error = nfsrv_fhtovp(&nfh, NULL, &vp, &nx, &nxo);
557	lck_rw_done(&nfsrv_export_rwlock);
558	if (error) {
559		if (error == NFSERR_TRYLATER)
560			error = EAGAIN; // XXX EBUSY? Or just leave as TRYLATER?
561		return (error);
562	}
563
564	/*
565	 * From now on we have to make sure not
566	 * to forget about the vnode.
567	 * Any error that causes an abort must vnode_put(vp).
568	 * Just set error = err and 'goto bad;'.
569	 */
570
571	/*
572	 * from vn_open
573	 */
574	if (vnode_vtype(vp) == VSOCK) {
575		error = EOPNOTSUPP;
576		goto bad;
577	}
578
579	/* disallow write operations on directories */
580	if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) {
581		error = EISDIR;
582		goto bad;
583	}
584
585	/* compute action to be authorized */
586	action = 0;
587	if (fmode & FREAD)
588		action |= KAUTH_VNODE_READ_DATA;
589	if (fmode & (FWRITE | O_TRUNC))
590		action |= KAUTH_VNODE_WRITE_DATA;
591	if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
592		goto bad;
593
594	if ((error = VNOP_OPEN(vp, fmode, ctx)))
595		goto bad;
596	if ((error = vnode_ref_ext(vp, fmode, 0)))
597		goto bad;
598
599	/*
600	 * end of vn_open code
601	 */
602
603	// starting here... error paths should call vn_close/vnode_put
604	if ((error = falloc(p, &nfp, &indx, ctx)) != 0) {
605		vn_close(vp, fmode & FMASK, ctx);
606		goto bad;
607	}
608	fp = nfp;
609
610	fp->f_fglob->fg_flag = fmode & FMASK;
611	fp->f_fglob->fg_ops = &vnops;
612	fp->f_fglob->fg_data = (caddr_t)vp;
613
614	// XXX do we really need to support this with fhopen()?
615	if (fmode & (O_EXLOCK | O_SHLOCK)) {
616		lf.l_whence = SEEK_SET;
617		lf.l_start = 0;
618		lf.l_len = 0;
619		if (fmode & O_EXLOCK)
620			lf.l_type = F_WRLCK;
621		else
622			lf.l_type = F_RDLCK;
623		type = F_FLOCK;
624		if ((fmode & FNONBLOCK) == 0)
625			type |= F_WAIT;
626		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
627			struct vfs_context context = *vfs_context_current();
628			/* Modify local copy (to not damage thread copy) */
629			context.vc_ucred = fp->f_fglob->fg_cred;
630
631			vn_close(vp, fp->f_fglob->fg_flag, &context);
632			fp_free(p, indx, fp);
633			return (error);
634		}
635		fp->f_fglob->fg_flag |= FHASLOCK;
636	}
637
638	vnode_put(vp);
639
640	proc_fdlock(p);
641	procfdtbl_releasefd(p, indx, NULL);
642	fp_drop(p, indx, fp, 1);
643	proc_fdunlock(p);
644
645	*retval = indx;
646	return (0);
647
648bad:
649	vnode_put(vp);
650	return (error);
651}
652
653/*
654 * NFS server pseudo system call
655 */
656int
657nfssvc(proc_t p, struct nfssvc_args *uap, __unused int *retval)
658{
659	mbuf_t nam;
660	struct user_nfsd_args user_nfsdarg;
661	socket_t so;
662	int error;
663
664	AUDIT_ARG(cmd, uap->flag);
665
666	/*
667	 * Must be super user for most operations (export ops checked later).
668	 */
669	if ((uap->flag != NFSSVC_EXPORT) && ((error = proc_suser(p))))
670		return (error);
671#if CONFIG_MACF
672	error = mac_system_check_nfsd(kauth_cred_get());
673	if (error)
674		return (error);
675#endif
676
677	/* make sure NFS server data structures have been initialized */
678	nfsrv_init();
679
680	if (uap->flag & NFSSVC_ADDSOCK) {
681		if (IS_64BIT_PROCESS(p)) {
682			error = copyin(uap->argp, (caddr_t)&user_nfsdarg, sizeof(user_nfsdarg));
683		} else {
684			struct nfsd_args    tmp_args;
685			error = copyin(uap->argp, (caddr_t)&tmp_args, sizeof(tmp_args));
686			if (error == 0) {
687				user_nfsdarg.sock = tmp_args.sock;
688				user_nfsdarg.name = CAST_USER_ADDR_T(tmp_args.name);
689				user_nfsdarg.namelen = tmp_args.namelen;
690			}
691		}
692		if (error)
693			return (error);
694		/* get the socket */
695		error = file_socket(user_nfsdarg.sock, &so);
696		if (error)
697			return (error);
698		/* Get the client address for connected sockets. */
699		if (user_nfsdarg.name == USER_ADDR_NULL || user_nfsdarg.namelen == 0) {
700			nam = NULL;
701		} else {
702			error = sockargs(&nam, user_nfsdarg.name, user_nfsdarg.namelen, MBUF_TYPE_SONAME);
703			if (error) {
704				/* drop the iocount file_socket() grabbed on the file descriptor */
705				file_drop(user_nfsdarg.sock);
706				return (error);
707			}
708		}
709		/*
710		 * nfssvc_addsock() will grab a retain count on the socket
711		 * to keep the socket from being closed when nfsd closes its
712		 * file descriptor for it.
713		 */
714		error = nfssvc_addsock(so, nam);
715		/* drop the iocount file_socket() grabbed on the file descriptor */
716		file_drop(user_nfsdarg.sock);
717	} else if (uap->flag & NFSSVC_NFSD) {
718		error = nfssvc_nfsd();
719	} else if (uap->flag & NFSSVC_EXPORT) {
720		error = nfssvc_export(uap->argp);
721	} else {
722		error = EINVAL;
723	}
724	if (error == EINTR || error == ERESTART)
725		error = 0;
726	return (error);
727}
728
729/*
730 * Adds a socket to the list for servicing by nfsds.
731 */
732int
733nfssvc_addsock(socket_t so, mbuf_t mynam)
734{
735	struct nfsrv_sock *slp;
736	int error = 0, sodomain, sotype, soprotocol, on = 1;
737	int first;
738	struct timeval timeo;
739
740	/* make sure mbuf constants are set up */
741	if (!nfs_mbuf_mhlen)
742		nfs_mbuf_init();
743
744	sock_gettype(so, &sodomain, &sotype, &soprotocol);
745
746	/* There should be only one UDP socket for each of IPv4 and IPv6 */
747	if ((sodomain == AF_INET) && (soprotocol == IPPROTO_UDP) && nfsrv_udpsock) {
748		mbuf_freem(mynam);
749		return (EEXIST);
750	}
751	if ((sodomain == AF_INET6) && (soprotocol == IPPROTO_UDP) && nfsrv_udp6sock) {
752		mbuf_freem(mynam);
753		return (EEXIST);
754	}
755
756	/* Set protocol options and reserve some space (for UDP). */
757	if (sotype == SOCK_STREAM)
758		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
759	if ((sodomain == AF_INET) && (soprotocol == IPPROTO_TCP))
760		sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
761	if (sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */
762		int reserve = NFS_UDPSOCKBUF;
763		error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve));
764		error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve));
765		if (error) {
766			log(LOG_INFO, "nfssvc_addsock: UDP socket buffer setting error(s) %d\n", error);
767			error = 0;
768		}
769	}
770	sock_nointerrupt(so, 0);
771
772	/*
773	 * Set socket send/receive timeouts.
774	 * Receive timeout shouldn't matter, but setting the send timeout
775	 * will make sure that an unresponsive client can't hang the server.
776	 */
777	timeo.tv_usec = 0;
778	timeo.tv_sec = 1;
779	error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
780	timeo.tv_sec = 30;
781	error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
782	if (error) {
783		log(LOG_INFO, "nfssvc_addsock: socket timeout setting error(s) %d\n", error);
784		error = 0;
785	}
786
787	MALLOC(slp, struct nfsrv_sock *, sizeof(struct nfsrv_sock), M_NFSSVC, M_WAITOK);
788	if (!slp) {
789		mbuf_freem(mynam);
790		return (ENOMEM);
791	}
792	bzero((caddr_t)slp, sizeof (struct nfsrv_sock));
793	lck_rw_init(&slp->ns_rwlock, nfsrv_slp_rwlock_group, LCK_ATTR_NULL);
794	lck_mtx_init(&slp->ns_wgmutex, nfsrv_slp_mutex_group, LCK_ATTR_NULL);
795
796	lck_mtx_lock(nfsd_mutex);
797
798	if (soprotocol == IPPROTO_UDP) {
799		if (sodomain == AF_INET) {
800			/* There should be only one UDP/IPv4 socket */
801			if (nfsrv_udpsock) {
802				lck_mtx_unlock(nfsd_mutex);
803				nfsrv_slpfree(slp);
804				mbuf_freem(mynam);
805				return (EEXIST);
806			}
807			nfsrv_udpsock = slp;
808		}
809		if (sodomain == AF_INET6) {
810			/* There should be only one UDP/IPv6 socket */
811			if (nfsrv_udp6sock) {
812				lck_mtx_unlock(nfsd_mutex);
813				nfsrv_slpfree(slp);
814				mbuf_freem(mynam);
815				return (EEXIST);
816			}
817			nfsrv_udp6sock = slp;
818		}
819	}
820
821	/* add the socket to the list */
822	first = TAILQ_EMPTY(&nfsrv_socklist);
823	TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain);
824
825	sock_retain(so); /* grab a retain count on the socket */
826	slp->ns_so = so;
827	slp->ns_sotype = sotype;
828	slp->ns_nam = mynam;
829
830	/* set up the socket up-call */
831	nfsrv_uc_addsock(slp, first);
832
833	/* mark that the socket is not in the nfsrv_sockwg list */
834	slp->ns_wgq.tqe_next = SLPNOLIST;
835
836	slp->ns_flag = SLP_VALID | SLP_NEEDQ;
837
838	nfsrv_wakenfsd(slp);
839	lck_mtx_unlock(nfsd_mutex);
840
841	return (0);
842}
843
844/*
845 * nfssvc_nfsd()
846 *
847 * nfsd theory of operation:
848 *
849 * The first nfsd thread stays in user mode accepting new TCP connections
850 * which are then added via the "addsock" call.  The rest of the nfsd threads
851 * simply call into the kernel and remain there in a loop handling NFS
852 * requests until killed by a signal.
853 *
854 * There's a list of nfsd threads (nfsd_head).
855 * There's an nfsd queue that contains only those nfsds that are
856 *   waiting for work to do (nfsd_queue).
857 *
858 * There's a list of all NFS sockets (nfsrv_socklist) and two queues for
859 *   managing the work on the sockets:
860 *   nfsrv_sockwait - sockets w/new data waiting to be worked on
861 *   nfsrv_sockwork - sockets being worked on which may have more work to do
862 *   nfsrv_sockwg -- sockets which have pending write gather data
863 * When a socket receives data, if it is not currently queued, it
864 *   will be placed at the end of the "wait" queue.
865 * Whenever a socket needs servicing we make sure it is queued and
866 *   wake up a waiting nfsd (if there is one).
867 *
868 * nfsds will service at most 8 requests from the same socket before
869 *   defecting to work on another socket.
870 * nfsds will defect immediately if there are any sockets in the "wait" queue
871 * nfsds looking for a socket to work on check the "wait" queue first and
872 *   then check the "work" queue.
873 * When an nfsd starts working on a socket, it removes it from the head of
874 *   the queue it's currently on and moves it to the end of the "work" queue.
875 * When nfsds are checking the queues for work, any sockets found not to
876 *   have any work are simply dropped from the queue.
877 *
878 */
/* nfsd worker thread body: service NFS RPC requests until shut down or excess. */
int
nfssvc_nfsd(void)
{
	mbuf_t m, mrep;
	struct nfsrv_sock *slp;
	struct nfsd *nfsd;
	struct nfsrv_descript *nd = NULL;
	int error = 0, cacherep, writes_todo;
	int siz, procrastinate, opcnt = 0;
	u_quad_t cur_usec;
	struct timeval now;
	struct vfs_context context;
	struct timespec to;

#ifndef nolint
	/* quiet "may be used uninitialized" warnings; both are set before use */
	cacherep = RC_DOIT;
	writes_todo = 0;
#endif

	/* allocate and register this thread's nfsd record */
	MALLOC(nfsd, struct nfsd *, sizeof(struct nfsd), M_NFSD, M_WAITOK);
	if (!nfsd)
		return (ENOMEM);
	bzero(nfsd, sizeof(struct nfsd));
	lck_mtx_lock(nfsd_mutex);
	if (nfsd_thread_count++ == 0)
		nfsrv_initcache();		/* Init the server request cache */

	TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain);
	lck_mtx_unlock(nfsd_mutex);

	context.vc_thread = current_thread();

	/* Set time out so that nfsd threads can wake up and see if they are still needed. */
	to.tv_sec = 5;
	to.tv_nsec = 0;

	/*
	 * Loop getting rpc requests until SIGKILL.
	 */
	for (;;) {
		if (nfsd_thread_max <= 0) {
			/* NFS server shutting down, get out ASAP */
			error = EINTR;
			slp = nfsd->nfsd_slp;
		} else if (nfsd->nfsd_flag & NFSD_REQINPROG) {
			/* already have some work to do */
			error = 0;
			slp = nfsd->nfsd_slp;
		} else {
			/* need to find work to do */
			error = 0;
			lck_mtx_lock(nfsd_mutex);
			while (!nfsd->nfsd_slp && TAILQ_EMPTY(&nfsrv_sockwait) && TAILQ_EMPTY(&nfsrv_sockwork)) {
				if (nfsd_thread_count > nfsd_thread_max) {
					/*
					 * If we have no socket and there are more
					 * nfsd threads than configured, let's exit.
					 */
					error = 0;
					goto done;
				}
				nfsd->nfsd_flag |= NFSD_WAITING;
				TAILQ_INSERT_HEAD(&nfsd_queue, nfsd, nfsd_queue);
				/* timed wait (5s) so idle threads can re-check the thread count */
				error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", &to);
				if (error) {
					if (nfsd->nfsd_flag & NFSD_WAITING) {
						TAILQ_REMOVE(&nfsd_queue, nfsd, nfsd_queue);
						nfsd->nfsd_flag &= ~NFSD_WAITING;
					}
					/* EWOULDBLOCK is just the 5s timeout — go re-check */
					if (error == EWOULDBLOCK)
						continue;
					goto done;
				}
			}
			slp = nfsd->nfsd_slp;
			if (!slp && !TAILQ_EMPTY(&nfsrv_sockwait)) {
				/* look for a socket to work on in the wait queue */
				while ((slp = TAILQ_FIRST(&nfsrv_sockwait))) {
					lck_rw_lock_exclusive(&slp->ns_rwlock);
					/* remove from the head of the queue */
					TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
					slp->ns_flag &= ~SLP_WAITQ;
					if ((slp->ns_flag & SLP_VALID) && (slp->ns_flag & SLP_WORKTODO))
						break;
					/* nothing to do, so skip this socket */
					lck_rw_done(&slp->ns_rwlock);
				}
			}
			if (!slp && !TAILQ_EMPTY(&nfsrv_sockwork)) {
				/* look for a socket to work on in the work queue */
				while ((slp = TAILQ_FIRST(&nfsrv_sockwork))) {
					lck_rw_lock_exclusive(&slp->ns_rwlock);
					/* remove from the head of the queue */
					TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
					slp->ns_flag &= ~SLP_WORKQ;
					if ((slp->ns_flag & SLP_VALID) && (slp->ns_flag & SLP_WORKTODO))
						break;
					/* nothing to do, so skip this socket */
					lck_rw_done(&slp->ns_rwlock);
				}
			}
			if (!nfsd->nfsd_slp && slp) {
				/* we found a socket to work on, grab a reference */
				slp->ns_sref++;
				nfsd->nfsd_slp = slp;
				opcnt = 0;
				/* and put it at the back of the work queue */
				TAILQ_INSERT_TAIL(&nfsrv_sockwork, slp, ns_svcq);
				slp->ns_flag |= SLP_WORKQ;
				lck_rw_done(&slp->ns_rwlock);
			}
			lck_mtx_unlock(nfsd_mutex);
			if (!slp)
				continue;
			lck_rw_lock_exclusive(&slp->ns_rwlock);
			if (slp->ns_flag & SLP_VALID) {
				if ((slp->ns_flag & (SLP_NEEDQ|SLP_DISCONN)) == SLP_NEEDQ) {
					/* pull any pending data off the socket first */
					slp->ns_flag &= ~SLP_NEEDQ;
					nfsrv_rcv_locked(slp->ns_so, slp, MBUF_WAITOK);
				}
				if (slp->ns_flag & SLP_DISCONN)
					nfsrv_zapsock(slp);
				/* try to dequeue one complete request record */
				error = nfsrv_dorec(slp, nfsd, &nd);
				if (error == EINVAL) {	// RPCSEC_GSS drop
					if (slp->ns_sotype == SOCK_STREAM)
						nfsrv_zapsock(slp); // drop connection
				}
				writes_todo = 0;
				if (error && (slp->ns_wgtime || (slp->ns_flag & SLP_DOWRITES))) {
					/* no new request, but gathered writes may be due */
					microuptime(&now);
					cur_usec = (u_quad_t)now.tv_sec * 1000000 +
						(u_quad_t)now.tv_usec;
					if (slp->ns_wgtime <= cur_usec) {
						error = 0;
						cacherep = RC_DOIT;
						writes_todo = 1;
					}
					slp->ns_flag &= ~SLP_DOWRITES;
				}
				nfsd->nfsd_flag |= NFSD_REQINPROG;
			}
			lck_rw_done(&slp->ns_rwlock);
		}
		if (error || (slp && !(slp->ns_flag & SLP_VALID))) {
			/* no request obtained or socket died — discard and drop our ref */
			if (nd) {
				nfsm_chain_cleanup(&nd->nd_nmreq);
				if (nd->nd_nam2)
					mbuf_freem(nd->nd_nam2);
				if (IS_VALID_CRED(nd->nd_cr))
					kauth_cred_unref(&nd->nd_cr);
				if (nd->nd_gss_context)
					nfs_gss_svc_ctx_deref(nd->nd_gss_context);
				FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
				nd = NULL;
			}
			nfsd->nfsd_slp = NULL;
			nfsd->nfsd_flag &= ~NFSD_REQINPROG;
			if (slp)
				nfsrv_slpderef(slp);
			if (nfsd_thread_max <= 0)
				break;
			continue;
		}
		if (nd) {
		    microuptime(&nd->nd_starttime);
		    if (nd->nd_nam2)
			nd->nd_nam = nd->nd_nam2;
		    else
			nd->nd_nam = slp->ns_nam;

		    /* check the duplicate request cache; may hand back a cached reply */
		    cacherep = nfsrv_getcache(nd, slp, &mrep);

		    if (nfsrv_require_resv_port) {
			/* Check if source port is a reserved port */
			in_port_t port = 0;
			struct sockaddr *saddr = mbuf_data(nd->nd_nam);

			if (saddr->sa_family == AF_INET)
				port = ntohs(((struct sockaddr_in*)saddr)->sin_port);
			else if (saddr->sa_family == AF_INET6)
				port = ntohs(((struct sockaddr_in6*)saddr)->sin6_port);
			/* reject non-reserved source ports (except NULL proc) as too weak */
			if ((port >= IPPORT_RESERVED) && (nd->nd_procnum != NFSPROC_NULL)) {
			    nd->nd_procnum = NFSPROC_NOOP;
			    nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
			    cacherep = RC_DOIT;
			}
		    }

		}

		/*
		 * Loop to get all the write RPC replies that have been
		 * gathered together.
		 */
		do {
		    switch (cacherep) {
		    case RC_DOIT:
			if (nd && (nd->nd_vers == NFS_VER3))
			    procrastinate = nfsrv_wg_delay_v3;
			else
			    procrastinate = nfsrv_wg_delay;
			lck_rw_lock_shared(&nfsrv_export_rwlock);
			context.vc_ucred = NULL;
			/* writes may be deferred for gathering; everything else dispatches now */
			if (writes_todo || ((nd->nd_procnum == NFSPROC_WRITE) && (procrastinate > 0)))
				error = nfsrv_writegather(&nd, slp, &context, &mrep);
			else
				error = (*(nfsrv_procs[nd->nd_procnum]))(nd, slp, &context, &mrep);
			lck_rw_done(&nfsrv_export_rwlock);
			if (mrep == NULL) {
				/*
				 * If this is a stream socket and we are not going
				 * to send a reply we better close the connection
				 * so the client doesn't hang.
				 */
				if (error && slp->ns_sotype == SOCK_STREAM) {
					lck_rw_lock_exclusive(&slp->ns_rwlock);
					nfsrv_zapsock(slp);
					lck_rw_done(&slp->ns_rwlock);
					printf("NFS server: NULL reply from proc = %d error = %d\n",
						nd->nd_procnum, error);
				}
				break;

			}
			if (error) {
				OSAddAtomic64(1, &nfsstats.srv_errs);
				nfsrv_updatecache(nd, FALSE, mrep);
				if (nd->nd_nam2) {
					mbuf_freem(nd->nd_nam2);
					nd->nd_nam2 = NULL;
				}
				break;
			}
			OSAddAtomic64(1, &nfsstats.srvrpccnt[nd->nd_procnum]);
			nfsrv_updatecache(nd, TRUE, mrep);
			/* FALLTHRU */

		    case RC_REPLY:
			if (nd->nd_gss_mb != NULL) {	// It's RPCSEC_GSS
				/*
				 * Need to checksum or encrypt the reply
				 */
				error = nfs_gss_svc_protect_reply(nd, mrep);
				if (error) {
				    	mbuf_freem(mrep);
					break;
				}
			}

			/*
			 * Get the total size of the reply
			 */
			m = mrep;
			siz = 0;
			while (m) {
				siz += mbuf_len(m);
				m = mbuf_next(m);
			}
			if (siz <= 0 || siz > NFS_MAXPACKET) {
				printf("mbuf siz=%d\n",siz);
				panic("Bad nfs svc reply");
			}
			m = mrep;
			mbuf_pkthdr_setlen(m, siz);
			error = mbuf_pkthdr_setrcvif(m, NULL);
			if (error)
				panic("nfsd setrcvif failed: %d", error);
			/*
			 * For stream protocols, prepend a Sun RPC
			 * Record Mark.
			 */
			if (slp->ns_sotype == SOCK_STREAM) {
				error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
				if (!error)
					*(u_int32_t*)mbuf_data(m) = htonl(0x80000000 | siz);
			}
			if (!error) {
				if (slp->ns_flag & SLP_VALID) {
				    error = nfsrv_send(slp, nd->nd_nam2, m);
				} else {
				    error = EPIPE;
				    mbuf_freem(m);
				}
			} else {
				mbuf_freem(m);
			}
			mrep = NULL;
			if (nd->nd_nam2) {
				mbuf_freem(nd->nd_nam2);
				nd->nd_nam2 = NULL;
			}
			if (error == EPIPE) {
				lck_rw_lock_exclusive(&slp->ns_rwlock);
				nfsrv_zapsock(slp);
				lck_rw_done(&slp->ns_rwlock);
			}
			if (error == EINTR || error == ERESTART) {
				/* thread interrupted — free the request and exit */
				nfsm_chain_cleanup(&nd->nd_nmreq);
				if (IS_VALID_CRED(nd->nd_cr))
					kauth_cred_unref(&nd->nd_cr);
				if (nd->nd_gss_context)
					nfs_gss_svc_ctx_deref(nd->nd_gss_context);
				FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
				nfsrv_slpderef(slp);
				lck_mtx_lock(nfsd_mutex);
				goto done;
			}
			break;
		    case RC_DROPIT:
			/* duplicate request still in progress — drop it silently */
			mbuf_freem(nd->nd_nam2);
			nd->nd_nam2 = NULL;
			break;
		    };
		    opcnt++;
		    if (nd) {
			nfsm_chain_cleanup(&nd->nd_nmreq);
			if (nd->nd_nam2)
				mbuf_freem(nd->nd_nam2);
			if (IS_VALID_CRED(nd->nd_cr))
				kauth_cred_unref(&nd->nd_cr);
			if (nd->nd_gss_context)
				nfs_gss_svc_ctx_deref(nd->nd_gss_context);
			FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
			nd = NULL;
		    }

		    /*
		     * Check to see if there are outstanding writes that
		     * need to be serviced.
		     */
		    writes_todo = 0;
		    if (slp->ns_wgtime) {
			microuptime(&now);
			cur_usec = (u_quad_t)now.tv_sec * 1000000 +
				(u_quad_t)now.tv_usec;
			if (slp->ns_wgtime <= cur_usec) {
			    cacherep = RC_DOIT;
			    writes_todo = 1;
			}
		    }
		} while (writes_todo);

		nd = NULL;
		/*
		 * Stay on this socket only while no other socket is waiting and
		 * we've done fewer than 8 requests here (see overview comment).
		 */
		if (TAILQ_EMPTY(&nfsrv_sockwait) && (opcnt < 8)) {
			lck_rw_lock_exclusive(&slp->ns_rwlock);
			error = nfsrv_dorec(slp, nfsd, &nd);
			if (error == EINVAL) {	// RPCSEC_GSS drop
				if (slp->ns_sotype == SOCK_STREAM)
					nfsrv_zapsock(slp); // drop connection
			}
			lck_rw_done(&slp->ns_rwlock);
		}
		if (!nd) {
			/* drop our reference on the socket */
			nfsd->nfsd_flag &= ~NFSD_REQINPROG;
			nfsd->nfsd_slp = NULL;
			nfsrv_slpderef(slp);
		}
	}
	lck_mtx_lock(nfsd_mutex);
done:
	/* unregister this thread; last one out tears down the server state */
	TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain);
	FREE(nfsd, M_NFSD);
	if (--nfsd_thread_count == 0)
		nfsrv_cleanup();
	lck_mtx_unlock(nfsd_mutex);
	return (error);
}
1247
1248int
1249nfssvc_export(user_addr_t argp)
1250{
1251	int error = 0, is_64bit;
1252	struct user_nfs_export_args unxa;
1253	vfs_context_t ctx = vfs_context_current();
1254
1255	is_64bit = IS_64BIT_PROCESS(vfs_context_proc(ctx));
1256
1257	/* copy in pointers to path and export args */
1258	if (is_64bit) {
1259		error = copyin(argp, (caddr_t)&unxa, sizeof(unxa));
1260	} else {
1261		struct nfs_export_args tnxa;
1262		error = copyin(argp, (caddr_t)&tnxa, sizeof(tnxa));
1263		if (error == 0) {
1264			/* munge into LP64 version of nfs_export_args structure */
1265			unxa.nxa_fsid = tnxa.nxa_fsid;
1266			unxa.nxa_expid = tnxa.nxa_expid;
1267			unxa.nxa_fspath = CAST_USER_ADDR_T(tnxa.nxa_fspath);
1268			unxa.nxa_exppath = CAST_USER_ADDR_T(tnxa.nxa_exppath);
1269			unxa.nxa_flags = tnxa.nxa_flags;
1270			unxa.nxa_netcount = tnxa.nxa_netcount;
1271			unxa.nxa_nets = CAST_USER_ADDR_T(tnxa.nxa_nets);
1272		}
1273	}
1274	if (error)
1275		return (error);
1276
1277	error = nfsrv_export(&unxa, ctx);
1278
1279	return (error);
1280}
1281
1282/*
1283 * Shut down a socket associated with an nfsrv_sock structure.
1284 * Should be called with the send lock set, if required.
1285 * The trick here is to increment the sref at the start, so that the nfsds
1286 * will stop using it and clear ns_flag at the end so that it will not be
1287 * reassigned during cleanup.
1288 */
1289void
1290nfsrv_zapsock(struct nfsrv_sock *slp)
1291{
1292	socket_t so;
1293
1294	if ((slp->ns_flag & SLP_VALID) == 0)
1295		return;
1296	slp->ns_flag &= ~SLP_ALLFLAGS;
1297
1298	so = slp->ns_so;
1299	if (so == NULL)
1300		return;
1301
1302	/*
1303	 * Attempt to deter future up-calls, but leave the
1304	 * up-call info in place to avoid a race with the
1305	 * networking code.
1306	 */
1307	socket_lock(so, 1);
1308	so->so_rcv.sb_flags &= ~SB_UPCALL;
1309	socket_unlock(so, 1);
1310
1311	sock_shutdown(so, SHUT_RDWR);
1312
1313	/*
1314	 * Remove from the up-call queue
1315	 */
1316	nfsrv_uc_dequeue(slp);
1317}
1318
1319/*
1320 * cleanup and release a server socket structure.
1321 */
1322void
1323nfsrv_slpfree(struct nfsrv_sock *slp)
1324{
1325	struct nfsrv_descript *nwp, *nnwp;
1326
1327	if (slp->ns_so) {
1328		sock_release(slp->ns_so);
1329		slp->ns_so = NULL;
1330	}
1331	if (slp->ns_nam)
1332		mbuf_free(slp->ns_nam);
1333	if (slp->ns_raw)
1334		mbuf_freem(slp->ns_raw);
1335	if (slp->ns_rec)
1336		mbuf_freem(slp->ns_rec);
1337	if (slp->ns_frag)
1338		mbuf_freem(slp->ns_frag);
1339	slp->ns_nam = slp->ns_raw = slp->ns_rec = slp->ns_frag = NULL;
1340	slp->ns_reccnt = 0;
1341
1342	if (slp->ns_ua)
1343		FREE(slp->ns_ua, M_NFSSVC);
1344
1345	for (nwp = slp->ns_tq.lh_first; nwp; nwp = nnwp) {
1346		nnwp = nwp->nd_tq.le_next;
1347		LIST_REMOVE(nwp, nd_tq);
1348		nfsm_chain_cleanup(&nwp->nd_nmreq);
1349		if (nwp->nd_mrep)
1350			mbuf_freem(nwp->nd_mrep);
1351		if (nwp->nd_nam2)
1352			mbuf_freem(nwp->nd_nam2);
1353		if (IS_VALID_CRED(nwp->nd_cr))
1354			kauth_cred_unref(&nwp->nd_cr);
1355		if (nwp->nd_gss_context)
1356			nfs_gss_svc_ctx_deref(nwp->nd_gss_context);
1357		FREE_ZONE(nwp, sizeof(*nwp), M_NFSRVDESC);
1358	}
1359	LIST_INIT(&slp->ns_tq);
1360
1361	lck_rw_destroy(&slp->ns_rwlock, nfsrv_slp_rwlock_group);
1362	lck_mtx_destroy(&slp->ns_wgmutex, nfsrv_slp_mutex_group);
1363	FREE(slp, M_NFSSVC);
1364}
1365
1366/*
1367 * Derefence a server socket structure. If it has no more references and
1368 * is no longer valid, you can throw it away.
1369 */
void
nfsrv_slpderef(struct nfsrv_sock *slp)
{
	struct timeval now;

	/* nfsd_mutex protects the socket queues; ns_rwlock protects the socket */
	lck_mtx_lock(nfsd_mutex);
	lck_rw_lock_exclusive(&slp->ns_rwlock);
	slp->ns_sref--;

	if (slp->ns_sref || (slp->ns_flag & SLP_VALID)) {
		/* still referenced or still valid — just tidy the queues */
		if ((slp->ns_flag & SLP_QUEUED) && !(slp->ns_flag & SLP_WORKTODO)) {
			/* remove socket from queue since there's no work */
			if (slp->ns_flag & SLP_WAITQ)
				TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
			else
				TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
			slp->ns_flag &= ~SLP_QUEUED;
		}
		lck_rw_done(&slp->ns_rwlock);
		lck_mtx_unlock(nfsd_mutex);
		return;
	}

	/* This socket is no longer valid, so we'll get rid of it */

	if (slp->ns_flag & SLP_QUEUED) {
		if (slp->ns_flag & SLP_WAITQ)
			TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
		else
			TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
		slp->ns_flag &= ~SLP_QUEUED;
	}

	/*
	 * Queue the socket up for deletion
	 * and start the timer to delete it
	 * after it has been in limbo for
	 * a while.
	 */
	microuptime(&now);
	slp->ns_timestamp = now.tv_sec;
	TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain);
	TAILQ_INSERT_TAIL(&nfsrv_deadsocklist, slp, ns_chain);
	if (!nfsrv_deadsock_timer_on) {
		nfsrv_deadsock_timer_on = 1;
		nfs_interval_timer_start(nfsrv_deadsock_timer_call,
				NFSRV_DEADSOCKDELAY * 1000);
	}

	lck_rw_done(&slp->ns_rwlock);
	/* now remove from the write gather socket list */
	/* (still under nfsd_mutex, which protects nfsrv_sockwg) */
	if (slp->ns_wgq.tqe_next != SLPNOLIST) {
		TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq);
		slp->ns_wgq.tqe_next = SLPNOLIST;
	}
	lck_mtx_unlock(nfsd_mutex);
}
1427
1428/*
1429 * Check periodically for dead sockets pending delete.
1430 * If a socket has been dead for more than NFSRV_DEADSOCKDELAY
1431 * seconds then we assume it's safe to free.
1432 */
1433void
1434nfsrv_deadsock_timer(__unused void *param0, __unused void *param1)
1435{
1436	struct nfsrv_sock *slp;
1437	struct timeval now;
1438	time_t time_to_wait;
1439
1440	microuptime(&now);
1441	lck_mtx_lock(nfsd_mutex);
1442
1443	while ((slp = TAILQ_FIRST(&nfsrv_deadsocklist))) {
1444		if ((slp->ns_timestamp + NFSRV_DEADSOCKDELAY) > now.tv_sec)
1445			break;
1446		TAILQ_REMOVE(&nfsrv_deadsocklist, slp, ns_chain);
1447		nfsrv_slpfree(slp);
1448	}
1449	if (TAILQ_EMPTY(&nfsrv_deadsocklist)) {
1450		nfsrv_deadsock_timer_on = 0;
1451		lck_mtx_unlock(nfsd_mutex);
1452		return;
1453	}
1454	time_to_wait = (slp->ns_timestamp + NFSRV_DEADSOCKDELAY) - now.tv_sec;
1455	if (time_to_wait < 1)
1456		time_to_wait = 1;
1457
1458	lck_mtx_unlock(nfsd_mutex);
1459
1460	nfs_interval_timer_start(nfsrv_deadsock_timer_call,
1461		time_to_wait * 1000);
1462}
1463
1464/*
1465 * Clean up the data structures for the server.
1466 */
void
nfsrv_cleanup(void)
{
	struct nfsrv_sock *slp, *nslp;
	struct timeval now;
#if CONFIG_FSE
	struct nfsrv_fmod *fp, *nfp;
	int i;
#endif

	/* called with nfsd_mutex held (from the last exiting nfsd thread) */
	microuptime(&now);
	for (slp = TAILQ_FIRST(&nfsrv_socklist); slp != 0; slp = nslp) {
		nslp = TAILQ_NEXT(slp, ns_chain);
		if (slp->ns_flag & SLP_VALID) {
			lck_rw_lock_exclusive(&slp->ns_rwlock);
			nfsrv_zapsock(slp);
			lck_rw_done(&slp->ns_rwlock);
		}
		/* pull the socket off whichever service queue it's on */
		if (slp->ns_flag & SLP_QUEUED) {
			if (slp->ns_flag & SLP_WAITQ)
				TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
			else
				TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
			slp->ns_flag &= ~SLP_QUEUED;
		}
		/* and off the write-gather list */
		if (slp->ns_wgq.tqe_next != SLPNOLIST) {
			TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq);
			slp->ns_wgq.tqe_next = SLPNOLIST;
		}
		/* queue the socket up for deletion */
		slp->ns_timestamp = now.tv_sec;
		TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain);
		TAILQ_INSERT_TAIL(&nfsrv_deadsocklist, slp, ns_chain);
		if (!nfsrv_deadsock_timer_on) {
			nfsrv_deadsock_timer_on = 1;
			nfs_interval_timer_start(nfsrv_deadsock_timer_call,
				NFSRV_DEADSOCKDELAY * 1000);
		}
	}

#if CONFIG_FSE
	/*
	 * Flush pending file write fsevents
	 */
	lck_mtx_lock(nfsrv_fmod_mutex);
	for (i = 0; i < NFSRVFMODHASHSZ; i++) {
		for (fp = LIST_FIRST(&nfsrv_fmod_hashtbl[i]); fp; fp = nfp) {
			/*
			 * Fire off the content modified fsevent for each
			 * entry, remove it from the list, and free it.
			 */
			if (nfsrv_fsevents_enabled) {
				fp->fm_context.vc_thread = current_thread();
				add_fsevent(FSE_CONTENT_MODIFIED, &fp->fm_context,
						FSE_ARG_VNODE, fp->fm_vp,
						FSE_ARG_DONE);
			}
			vnode_put(fp->fm_vp);
			kauth_cred_unref(&fp->fm_context.vc_ucred);
			nfp = LIST_NEXT(fp, fm_link);
			LIST_REMOVE(fp, fm_link);
			FREE(fp, M_TEMP);
		}
	}
	nfsrv_fmod_pending = 0;
	lck_mtx_unlock(nfsrv_fmod_mutex);
#endif

	nfsrv_uc_cleanup();     /* Stop nfs socket up-call threads */

	nfs_gss_svc_cleanup();	/* Remove any RPCSEC_GSS contexts */

	nfsrv_cleancache();	/* And clear out server cache */

	nfsrv_udpsock = NULL;
	nfsrv_udp6sock = NULL;
}
1544
1545#endif /* NFS_NOSERVER */
1546