1/*
2 * Copyright (c) 2000-2011 Apple Inc.  All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1989, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 *    must display the following acknowledgement:
46 *	This product includes software developed by the University of
47 *	California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 *    may be used to endorse or promote products derived from this software
50 *    without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 *	@(#)nfs_syscalls.c	8.5 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_syscalls.c,v 1.32 1997/11/07 08:53:25 phk Exp $
66 */
67/*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections.  This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/kernel.h>
77#include <sys/file_internal.h>
78#include <sys/filedesc.h>
79#include <sys/stat.h>
80#include <sys/vnode_internal.h>
81#include <sys/mount_internal.h>
82#include <sys/proc_internal.h> /* for fdflags */
83#include <sys/kauth.h>
84#include <sys/sysctl.h>
85#include <sys/ubc.h>
86#include <sys/uio.h>
87#include <sys/malloc.h>
88#include <sys/kpi_mbuf.h>
89#include <sys/socket.h>
90#include <sys/socketvar.h>
91#include <sys/domain.h>
92#include <sys/protosw.h>
93#include <sys/fcntl.h>
94#include <sys/lockf.h>
95#include <sys/syslog.h>
96#include <sys/user.h>
97#include <sys/sysproto.h>
98#include <sys/kpi_socket.h>
99#include <sys/fsevents.h>
100#include <libkern/OSAtomic.h>
101#include <kern/thread_call.h>
102#include <kern/task.h>
103
104#include <security/audit/audit.h>
105
106#include <netinet/in.h>
107#include <netinet/tcp.h>
108#include <nfs/xdr_subs.h>
109#include <nfs/rpcv2.h>
110#include <nfs/nfsproto.h>
111#include <nfs/nfs.h>
112#include <nfs/nfsm_subs.h>
113#include <nfs/nfsrvcache.h>
114#include <nfs/nfs_gss.h>
115#include <nfs/nfsmount.h>
116#include <nfs/nfsnode.h>
117#include <nfs/nfs_lock.h>
118#if CONFIG_MACF
119#include <security/mac_framework.h>
120#endif
121
/* private Mach thread-termination entry point; no public prototype here */
kern_return_t	thread_terminate(thread_t); /* XXX */

#if NFSSERVER

/* table of NFS RPC service routines, indexed by RPC procedure number */
extern int (*nfsrv_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
					    struct nfsrv_sock *slp,
					    vfs_context_t ctx,
					    mbuf_t *mrepp);
/* write-gather delay tunables (exposed as sysctls below) */
extern int nfsrv_wg_delay;
extern int nfsrv_wg_delay_v3;

/* sysctl: if nonzero, require client requests from a reserved port */
static int nfsrv_require_resv_port = 0;
/* presumably nonzero while the dead-socket timer is armed — confirm in the timer code */
static int nfsrv_deadsock_timer_on = 0;

/* forward declarations for the server-side routines below */
int	nfssvc_export(user_addr_t argp);
int	nfssvc_nfsd(void);
int	nfssvc_addsock(socket_t, mbuf_t);
void	nfsrv_zapsock(struct nfsrv_sock *);
void	nfsrv_slpderef(struct nfsrv_sock *);
void	nfsrv_slpfree(struct nfsrv_sock *);

#endif /* NFSSERVER */
144
145/*
146 * sysctl stuff
147 */
148SYSCTL_DECL(_vfs_generic);
149SYSCTL_NODE(_vfs_generic, OID_AUTO, nfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs hinge");
150
151#if NFSCLIENT
152SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, client, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs client hinge");
153SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, initialdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_initial_delay, 0, "");
154SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nextdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_delay, 0, "");
155SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_iosize, 0, "");
156SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_cache_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_cache_timeout, 0, "");
157SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, allow_async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_allow_async, 0, "");
158SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, statfs_rate_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_statfs_rate_limit, 0, "");
159SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsiod_thread_max, 0, "");
160SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsiod_thread_count, 0, "");
161SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD | CTLFLAG_LOCKED, &nfs_lockd_mounts, 0, "");
162SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_max_async_writes, 0, "");
163SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_single_des, 0, "");
164SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_delete, 0, "");
165SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_dotzfs, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_dotzfs, 0, "");
166SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_for_getattr, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_for_getattr, 0, "");
167SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, idmap_ctrl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_idmap_ctrl, 0, "");
168SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, callback_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_callback_port, 0, "");
169SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, is_mobile, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_is_mobile, 0, "");
170SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, squishy_flags, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_squishy_flags, 0, "");
171
172#endif /* NFSCLIENT */
173
174#if NFSSERVER
175SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, server, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs server hinge");
176SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay, 0, "");
177SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay_v3, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay_v3, 0, "");
178SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, require_resv_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_require_resv_port, 0, "");
179SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_async, 0, "");
180SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, export_hash_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_export_hash_size, 0, "");
181SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, reqcache_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_reqcache_size, 0, "");
182SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, request_queue_length, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_sock_max_rec_queue_length, 0, "");
183SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, user_stats, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_user_stat_enabled, 0, "");
184SYSCTL_UINT(_vfs_generic_nfs_server, OID_AUTO, gss_context_ttl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_gss_context_ttl, 0, "");
185#if CONFIG_FSE
186SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_fsevents_enabled, 0, "");
187#endif
188SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsd_thread_max, 0, "");
189SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsd_thread_count, 0, "");
190#ifdef NFS_UC_Q_DEBUG
191SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, use_upcall_svc, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_use_proxy, 0, "");
192SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_limit, 0, "");
193SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_max_seen, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_max_seen, 0, "");
194SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)&nfsrv_uc_queue_count, 0, "");
195#endif
196#endif /* NFSSERVER */
197
198
199#if NFSCLIENT
200
201int
202nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval)
203{
204	struct lockd_ans la;
205	int error;
206
207	switch (uap->flag) {
208	case NFSCLNT_LOCKDANS:
209		error = copyin(uap->argp, &la, sizeof(la));
210		if (!error)
211			error = nfslockdans(p, &la);
212		break;
213	case NFSCLNT_LOCKDNOTIFY:
214		error = nfslockdnotify(p, uap->argp);
215		break;
216	default:
217		error = EINVAL;
218	}
219	return (error);
220}
221
222/*
223 * Asynchronous I/O threads for client NFS.
224 * They do read-ahead and write-behind operations on the block I/O cache.
225 *
226 * The pool of up to nfsiod_thread_max threads is launched on demand and exit
227 * when unused for a while.  There are as many nfsiod structs as there are
228 * nfsiod threads; however there's no strict tie between a thread and a struct.
229 * Each thread puts an nfsiod on the free list and sleeps on it.  When it wakes
230 * up, it removes the next struct nfsiod from the queue and services it.  Then
231 * it will put the struct at the head of free list and sleep on it.
232 * Async requests will pull the next struct nfsiod from the head of the free list,
233 * put it on the work queue, and wake whatever thread is waiting on that struct.
234 */
235
236/*
237 * nfsiod thread exit routine
238 *
239 * Must be called with nfsiod_mutex held so that the
240 * decision to terminate is atomic with the termination.
241 */
242void
243nfsiod_terminate(struct nfsiod *niod)
244{
245	nfsiod_thread_count--;
246	lck_mtx_unlock(nfsiod_mutex);
247	if (niod)
248		FREE(niod, M_TEMP);
249	else
250		printf("nfsiod: terminating without niod\n");
251	thread_terminate(current_thread());
252	/*NOTREACHED*/
253}
254
255/* nfsiod thread startup routine */
256void
257nfsiod_thread(void)
258{
259	struct nfsiod *niod;
260	int error;
261
262	MALLOC(niod, struct nfsiod *, sizeof(struct nfsiod), M_TEMP, M_WAITOK);
263	if (!niod) {
264		lck_mtx_lock(nfsiod_mutex);
265		nfsiod_thread_count--;
266		wakeup(current_thread());
267		lck_mtx_unlock(nfsiod_mutex);
268		thread_terminate(current_thread());
269		/*NOTREACHED*/
270	}
271	bzero(niod, sizeof(*niod));
272	lck_mtx_lock(nfsiod_mutex);
273	TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link);
274	wakeup(current_thread());
275	error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE*hz, nfsiod_continue);
276	/* shouldn't return... so we have an error */
277	/* remove an old nfsiod struct and terminate */
278	lck_mtx_lock(nfsiod_mutex);
279	if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist)))
280		TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
281	nfsiod_terminate(niod);
282	/*NOTREACHED*/
283}
284
285/*
286 * Start up another nfsiod thread.
287 * (unless we're already maxed out and there are nfsiods running)
288 */
289int
290nfsiod_start(void)
291{
292	thread_t thd = THREAD_NULL;
293
294	lck_mtx_lock(nfsiod_mutex);
295	if ((nfsiod_thread_count >= NFSIOD_MAX) && (nfsiod_thread_count > 0)) {
296		lck_mtx_unlock(nfsiod_mutex);
297		return (EBUSY);
298	}
299	nfsiod_thread_count++;
300	if (kernel_thread_start((thread_continue_t)nfsiod_thread, NULL, &thd) != KERN_SUCCESS) {
301		lck_mtx_unlock(nfsiod_mutex);
302		return (EBUSY);
303	}
304	/* wait for the thread to complete startup */
305	msleep(thd, nfsiod_mutex, PWAIT | PDROP, "nfsiodw", NULL);
306	thread_deallocate(thd);
307	return (0);
308}
309
310/*
311 * Continuation for Asynchronous I/O threads for NFS client.
312 *
313 * Grab an nfsiod struct to work on, do some work, then drop it
314 */
315int
316nfsiod_continue(int error)
317{
318	struct nfsiod *niod;
319	struct nfsmount *nmp;
320	struct nfsreq *req, *treq;
321	struct nfs_reqqhead iodq;
322	int morework;
323
324	lck_mtx_lock(nfsiod_mutex);
325	niod = TAILQ_FIRST(&nfsiodwork);
326	if (!niod) {
327		/* there's no work queued up */
328		/* remove an old nfsiod struct and terminate */
329		if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist)))
330			TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
331		nfsiod_terminate(niod);
332		/*NOTREACHED*/
333	}
334	TAILQ_REMOVE(&nfsiodwork, niod, niod_link);
335
336worktodo:
337	while ((nmp = niod->niod_nmp)) {
338		/*
339		 * Service this mount's async I/O queue.
340		 *
341		 * In order to ensure some level of fairness between mounts,
342		 * we grab all the work up front before processing it so any
343		 * new work that arrives will be serviced on a subsequent
344		 * iteration - and we have a chance to see if other work needs
345		 * to be done (e.g. the delayed write queue needs to be pushed
346		 * or other mounts are waiting for an nfsiod).
347		 */
348		/* grab the current contents of the queue */
349		TAILQ_INIT(&iodq);
350		TAILQ_CONCAT(&iodq, &nmp->nm_iodq, r_achain);
351		lck_mtx_unlock(nfsiod_mutex);
352
353		/* process the queue */
354		TAILQ_FOREACH_SAFE(req, &iodq, r_achain, treq) {
355			TAILQ_REMOVE(&iodq, req, r_achain);
356			req->r_achain.tqe_next = NFSREQNOLIST;
357			req->r_callback.rcb_func(req);
358		}
359
360		/* now check if there's more/other work to be done */
361		lck_mtx_lock(nfsiod_mutex);
362		morework = !TAILQ_EMPTY(&nmp->nm_iodq);
363		if (!morework || !TAILQ_EMPTY(&nfsiodmounts)) {
364			/* we're going to stop working on this mount */
365			if (morework) /* mount still needs more work so queue it up */
366				TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
367			nmp->nm_niod = NULL;
368			niod->niod_nmp = NULL;
369		}
370	}
371
372	/* loop if there's still a mount to work on */
373	if (!niod->niod_nmp && !TAILQ_EMPTY(&nfsiodmounts)) {
374		niod->niod_nmp = TAILQ_FIRST(&nfsiodmounts);
375		TAILQ_REMOVE(&nfsiodmounts, niod->niod_nmp, nm_iodlink);
376	}
377	if (niod->niod_nmp)
378		goto worktodo;
379
380	/* queue ourselves back up - if there aren't too many threads running */
381	if (nfsiod_thread_count <= NFSIOD_MAX) {
382		TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link);
383		error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE*hz, nfsiod_continue);
384		/* shouldn't return... so we have an error */
385		/* remove an old nfsiod struct and terminate */
386		lck_mtx_lock(nfsiod_mutex);
387		if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist)))
388			TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
389	}
390	nfsiod_terminate(niod);
391	/*NOTREACHED*/
392	return (0);
393}
394
395#endif /* NFSCLIENT */
396
397
398#if NFSSERVER
399
400/*
401 * NFS server system calls
402 * getfh() lives here too, but maybe should move to kern/vfs_syscalls.c
403 */
404
405/*
406 * Get file handle system call
407 */
408int
409getfh(proc_t p, struct getfh_args *uap, __unused int *retval)
410{
411	vnode_t vp;
412	struct nfs_filehandle nfh;
413	int error, fhlen, fidlen;
414	struct nameidata nd;
415	char path[MAXPATHLEN], *ptr;
416	size_t pathlen;
417	struct nfs_exportfs *nxfs;
418	struct nfs_export *nx;
419
420	/*
421	 * Must be super user
422	 */
423	error = proc_suser(p);
424	if (error)
425		return (error);
426
427	error = copyinstr(uap->fname, path, MAXPATHLEN, &pathlen);
428	if (!error)
429		error = copyin(uap->fhp, &fhlen, sizeof(fhlen));
430	if (error)
431		return (error);
432	/* limit fh size to length specified (or v3 size by default) */
433	if ((fhlen != NFSV2_MAX_FH_SIZE) && (fhlen != NFSV3_MAX_FH_SIZE))
434		fhlen = NFSV3_MAX_FH_SIZE;
435	fidlen = fhlen - sizeof(struct nfs_exphandle);
436
437	if (!nfsrv_is_initialized())
438		return (EINVAL);
439
440	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1,
441			UIO_SYSSPACE, CAST_USER_ADDR_T(path), vfs_context_current());
442	error = namei(&nd);
443	if (error)
444		return (error);
445	nameidone(&nd);
446
447	vp = nd.ni_vp;
448
449	// find exportfs that matches f_mntonname
450	lck_rw_lock_shared(&nfsrv_export_rwlock);
451	ptr = vnode_mount(vp)->mnt_vfsstat.f_mntonname;
452	LIST_FOREACH(nxfs, &nfsrv_exports, nxfs_next) {
453		if (!strncmp(nxfs->nxfs_path, ptr, MAXPATHLEN))
454			break;
455	}
456	if (!nxfs || strncmp(nxfs->nxfs_path, path, strlen(nxfs->nxfs_path))) {
457		error = EINVAL;
458		goto out;
459	}
460	// find export that best matches remainder of path
461	ptr = path + strlen(nxfs->nxfs_path);
462	while (*ptr && (*ptr == '/'))
463		ptr++;
464	LIST_FOREACH(nx, &nxfs->nxfs_exports, nx_next) {
465		int len = strlen(nx->nx_path);
466		if (len == 0)  // we've hit the export entry for the root directory
467			break;
468		if (!strncmp(nx->nx_path, ptr, len))
469			break;
470	}
471	if (!nx) {
472		error = EINVAL;
473		goto out;
474	}
475
476	bzero(&nfh, sizeof(nfh));
477	nfh.nfh_xh.nxh_version = htonl(NFS_FH_VERSION);
478	nfh.nfh_xh.nxh_fsid = htonl(nxfs->nxfs_id);
479	nfh.nfh_xh.nxh_expid = htonl(nx->nx_id);
480	nfh.nfh_xh.nxh_flags = 0;
481	nfh.nfh_xh.nxh_reserved = 0;
482	nfh.nfh_len = fidlen;
483	error = VFS_VPTOFH(vp, (int*)&nfh.nfh_len, &nfh.nfh_fid[0], NULL);
484	if (nfh.nfh_len > (uint32_t)fidlen)
485		error = EOVERFLOW;
486	nfh.nfh_xh.nxh_fidlen = nfh.nfh_len;
487	nfh.nfh_len += sizeof(nfh.nfh_xh);
488	nfh.nfh_fhp = (u_char*)&nfh.nfh_xh;
489
490out:
491	lck_rw_done(&nfsrv_export_rwlock);
492	vnode_put(vp);
493	if (error)
494		return (error);
495	error = copyout((caddr_t)&nfh, uap->fhp, sizeof(fhandle_t));
496	return (error);
497}
498
extern struct fileops vnops;

/*
 * syscall for the rpc.lockd to use to translate a NFS file handle into
 * an open descriptor.
 *
 * warning: do not remove the suser() call or this becomes one giant
 * security hole.
 *
 * Copies the file handle from uap->u_fhp, maps it to a vnode via the
 * export list, performs a vn_open-style open on it, allocates a file
 * descriptor, and returns the descriptor in *retval.
 */
int
fhopen( proc_t p,
	struct fhopen_args *uap,
	int32_t *retval)
{
	vnode_t vp;
	struct nfs_filehandle nfh;
	struct nfs_export *nx;
	struct nfs_export_options *nxo;
	struct flock lf;
	struct fileproc *fp, *nfp;
	int fmode, error, type;
	int indx;
	vfs_context_t ctx = vfs_context_current();
	kauth_action_t action;

	/*
	 * Must be super user
	 */
	error = suser(vfs_context_ucred(ctx), 0);
	if (error) {
		return (error);
	}

	if (!nfsrv_is_initialized()) {
		return (EINVAL);
	}

	fmode = FFLAGS(uap->flags);
	/* why not allow a non-read/write open for our lockd? */
	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
		return (EINVAL);

	/*
	 * Copy the handle in two steps: first just the length so it can
	 * be range-checked, then the whole (length-prefixed) handle.
	 */
	error = copyin(uap->u_fhp, &nfh.nfh_len, sizeof(nfh.nfh_len));
	if (error)
		return (error);
	if ((nfh.nfh_len < (int)sizeof(struct nfs_exphandle)) ||
	    (nfh.nfh_len > (int)NFSV3_MAX_FH_SIZE))
		return (EINVAL);
	error = copyin(uap->u_fhp, &nfh, sizeof(nfh.nfh_len) + nfh.nfh_len);
	if (error)
		return (error);
	nfh.nfh_fhp = (u_char*)&nfh.nfh_xh;

	lck_rw_lock_shared(&nfsrv_export_rwlock);
	/* now give me my vnode, it gets returned to me with a reference */
	error = nfsrv_fhtovp(&nfh, NULL, &vp, &nx, &nxo);
	lck_rw_done(&nfsrv_export_rwlock);
	if (error) {
		if (error == NFSERR_TRYLATER)
			error = EAGAIN; // XXX EBUSY? Or just leave as TRYLATER?
		return (error);
	}

	/*
	 * From now on we have to make sure not
	 * to forget about the vnode.
	 * Any error that causes an abort must vnode_put(vp).
	 * Just set error = err and 'goto bad;'.
	 */

	/*
	 * from vn_open
	 */
	if (vnode_vtype(vp) == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}

	/* disallow write operations on directories */
	if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) {
		error = EISDIR;
		goto bad;
	}

	/* compute action to be authorized */
	action = 0;
	if (fmode & FREAD)
		action |= KAUTH_VNODE_READ_DATA;
	if (fmode & (FWRITE | O_TRUNC))
		action |= KAUTH_VNODE_WRITE_DATA;
	if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
		goto bad;

	if ((error = VNOP_OPEN(vp, fmode, ctx)))
		goto bad;
	if ((error = vnode_ref_ext(vp, fmode, 0)))
		goto bad;

	/*
	 * end of vn_open code
	 */

	// starting here... error paths should call vn_close/vnode_put
	if ((error = falloc(p, &nfp, &indx, ctx)) != 0) {
		vn_close(vp, fmode & FMASK, ctx);
		goto bad;
	}
	fp = nfp;

	/* wire the new file descriptor up to the vnode */
	fp->f_fglob->fg_flag = fmode & FMASK;
	fp->f_fglob->fg_type = DTYPE_VNODE;
	fp->f_fglob->fg_ops = &vnops;
	fp->f_fglob->fg_data = (caddr_t)vp;

	// XXX do we really need to support this with fhopen()?
	if (fmode & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (fmode & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		type = F_FLOCK;
		if ((fmode & FNONBLOCK) == 0)
			type |= F_WAIT;
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx))) {
			struct vfs_context context = *vfs_context_current();
			/* Modify local copy (to not damage thread copy) */
			context.vc_ucred = fp->f_fglob->fg_cred;

			/* lock failed: close with the fglob's cred and free the fd */
			vn_close(vp, fp->f_fglob->fg_flag, &context);
			fp_free(p, indx, fp);
			return (error);
		}
		fp->f_fglob->fg_flag |= FHASLOCK;
	}

	vnode_put(vp);

	/* publish the descriptor in the process's fd table */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);
	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;
	return (0);

bad:
	vnode_put(vp);
	return (error);
}
651
652/*
653 * NFS server pseudo system call
654 */
655int
656nfssvc(proc_t p, struct nfssvc_args *uap, __unused int *retval)
657{
658	mbuf_t nam;
659	struct user_nfsd_args user_nfsdarg;
660	socket_t so;
661	int error;
662
663	AUDIT_ARG(cmd, uap->flag);
664
665	/*
666	 * Must be super user for most operations (export ops checked later).
667	 */
668	if ((uap->flag != NFSSVC_EXPORT) && ((error = proc_suser(p))))
669		return (error);
670#if CONFIG_MACF
671	error = mac_system_check_nfsd(kauth_cred_get());
672	if (error)
673		return (error);
674#endif
675
676	/* make sure NFS server data structures have been initialized */
677	nfsrv_init();
678
679	if (uap->flag & NFSSVC_ADDSOCK) {
680		if (IS_64BIT_PROCESS(p)) {
681			error = copyin(uap->argp, (caddr_t)&user_nfsdarg, sizeof(user_nfsdarg));
682		} else {
683			struct nfsd_args    tmp_args;
684			error = copyin(uap->argp, (caddr_t)&tmp_args, sizeof(tmp_args));
685			if (error == 0) {
686				user_nfsdarg.sock = tmp_args.sock;
687				user_nfsdarg.name = CAST_USER_ADDR_T(tmp_args.name);
688				user_nfsdarg.namelen = tmp_args.namelen;
689			}
690		}
691		if (error)
692			return (error);
693		/* get the socket */
694		error = file_socket(user_nfsdarg.sock, &so);
695		if (error)
696			return (error);
697		/* Get the client address for connected sockets. */
698		if (user_nfsdarg.name == USER_ADDR_NULL || user_nfsdarg.namelen == 0) {
699			nam = NULL;
700		} else {
701			error = sockargs(&nam, user_nfsdarg.name, user_nfsdarg.namelen, MBUF_TYPE_SONAME);
702			if (error) {
703				/* drop the iocount file_socket() grabbed on the file descriptor */
704				file_drop(user_nfsdarg.sock);
705				return (error);
706			}
707		}
708		/*
709		 * nfssvc_addsock() will grab a retain count on the socket
710		 * to keep the socket from being closed when nfsd closes its
711		 * file descriptor for it.
712		 */
713		error = nfssvc_addsock(so, nam);
714		/* drop the iocount file_socket() grabbed on the file descriptor */
715		file_drop(user_nfsdarg.sock);
716	} else if (uap->flag & NFSSVC_NFSD) {
717		error = nfssvc_nfsd();
718	} else if (uap->flag & NFSSVC_EXPORT) {
719		error = nfssvc_export(uap->argp);
720	} else {
721		error = EINVAL;
722	}
723	if (error == EINTR || error == ERESTART)
724		error = 0;
725	return (error);
726}
727
728/*
729 * Adds a socket to the list for servicing by nfsds.
730 */
731int
732nfssvc_addsock(socket_t so, mbuf_t mynam)
733{
734	struct nfsrv_sock *slp;
735	int error = 0, sodomain, sotype, soprotocol, on = 1;
736	int first;
737	struct timeval timeo;
738
739	/* make sure mbuf constants are set up */
740	if (!nfs_mbuf_mhlen)
741		nfs_mbuf_init();
742
743	sock_gettype(so, &sodomain, &sotype, &soprotocol);
744
745	/* There should be only one UDP socket for each of IPv4 and IPv6 */
746	if ((sodomain == AF_INET) && (soprotocol == IPPROTO_UDP) && nfsrv_udpsock) {
747		mbuf_freem(mynam);
748		return (EEXIST);
749	}
750	if ((sodomain == AF_INET6) && (soprotocol == IPPROTO_UDP) && nfsrv_udp6sock) {
751		mbuf_freem(mynam);
752		return (EEXIST);
753	}
754
755	/* Set protocol options and reserve some space (for UDP). */
756	if (sotype == SOCK_STREAM)
757		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
758	if ((sodomain == AF_INET) && (soprotocol == IPPROTO_TCP))
759		sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
760	if (sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */
761		int reserve = NFS_UDPSOCKBUF;
762		error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve));
763		error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve));
764		if (error) {
765			log(LOG_INFO, "nfssvc_addsock: UDP socket buffer setting error(s) %d\n", error);
766			error = 0;
767		}
768	}
769	sock_nointerrupt(so, 0);
770
771	/*
772	 * Set socket send/receive timeouts.
773	 * Receive timeout shouldn't matter, but setting the send timeout
774	 * will make sure that an unresponsive client can't hang the server.
775	 */
776	timeo.tv_usec = 0;
777	timeo.tv_sec = 1;
778	error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
779	timeo.tv_sec = 30;
780	error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
781	if (error) {
782		log(LOG_INFO, "nfssvc_addsock: socket timeout setting error(s) %d\n", error);
783		error = 0;
784	}
785
786	MALLOC(slp, struct nfsrv_sock *, sizeof(struct nfsrv_sock), M_NFSSVC, M_WAITOK);
787	if (!slp) {
788		mbuf_freem(mynam);
789		return (ENOMEM);
790	}
791	bzero((caddr_t)slp, sizeof (struct nfsrv_sock));
792	lck_rw_init(&slp->ns_rwlock, nfsrv_slp_rwlock_group, LCK_ATTR_NULL);
793	lck_mtx_init(&slp->ns_wgmutex, nfsrv_slp_mutex_group, LCK_ATTR_NULL);
794
795	lck_mtx_lock(nfsd_mutex);
796
797	if (soprotocol == IPPROTO_UDP) {
798		if (sodomain == AF_INET) {
799			/* There should be only one UDP/IPv4 socket */
800			if (nfsrv_udpsock) {
801				lck_mtx_unlock(nfsd_mutex);
802				nfsrv_slpfree(slp);
803				mbuf_freem(mynam);
804				return (EEXIST);
805			}
806			nfsrv_udpsock = slp;
807		}
808		if (sodomain == AF_INET6) {
809			/* There should be only one UDP/IPv6 socket */
810			if (nfsrv_udp6sock) {
811				lck_mtx_unlock(nfsd_mutex);
812				nfsrv_slpfree(slp);
813				mbuf_freem(mynam);
814				return (EEXIST);
815			}
816			nfsrv_udp6sock = slp;
817		}
818	}
819
820	/* add the socket to the list */
821	first = TAILQ_EMPTY(&nfsrv_socklist);
822	TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain);
823
824	sock_retain(so); /* grab a retain count on the socket */
825	slp->ns_so = so;
826	slp->ns_sotype = sotype;
827	slp->ns_nam = mynam;
828
829	/* set up the socket up-call */
830	nfsrv_uc_addsock(slp, first);
831
832	/* mark that the socket is not in the nfsrv_sockwg list */
833	slp->ns_wgq.tqe_next = SLPNOLIST;
834
835	slp->ns_flag = SLP_VALID | SLP_NEEDQ;
836
837	nfsrv_wakenfsd(slp);
838	lck_mtx_unlock(nfsd_mutex);
839
840	return (0);
841}
842
843/*
844 * nfssvc_nfsd()
845 *
846 * nfsd theory of operation:
847 *
848 * The first nfsd thread stays in user mode accepting new TCP connections
849 * which are then added via the "addsock" call.  The rest of the nfsd threads
850 * simply call into the kernel and remain there in a loop handling NFS
851 * requests until killed by a signal.
852 *
853 * There's a list of nfsd threads (nfsd_head).
854 * There's an nfsd queue that contains only those nfsds that are
855 *   waiting for work to do (nfsd_queue).
856 *
857 * There's a list of all NFS sockets (nfsrv_socklist) and two queues for
858 *   managing the work on the sockets:
859 *   nfsrv_sockwait - sockets w/new data waiting to be worked on
860 *   nfsrv_sockwork - sockets being worked on which may have more work to do
861 *   nfsrv_sockwg -- sockets which have pending write gather data
862 * When a socket receives data, if it is not currently queued, it
863 *   will be placed at the end of the "wait" queue.
864 * Whenever a socket needs servicing we make sure it is queued and
865 *   wake up a waiting nfsd (if there is one).
866 *
867 * nfsds will service at most 8 requests from the same socket before
868 *   defecting to work on another socket.
869 * nfsds will defect immediately if there are any sockets in the "wait" queue
870 * nfsds looking for a socket to work on check the "wait" queue first and
871 *   then check the "work" queue.
872 * When an nfsd starts working on a socket, it removes it from the head of
873 *   the queue it's currently on and moves it to the end of the "work" queue.
874 * When nfsds are checking the queues for work, any sockets found not to
875 *   have any work are simply dropped from the queue.
876 *
877 */
/*
 * nfssvc_nfsd()
 *
 * Body of a single nfsd server thread.  Loops finding sockets with work
 * to do, dequeueing RPC requests, dispatching them (or gathering writes),
 * and sending replies, until the server shuts down or this thread is
 * found to be surplus.
 *
 * Returns: 0 on a normal exit, ENOMEM if the nfsd struct can't be
 * allocated, or an errno from msleep() (e.g. on signal).
 */
int
nfssvc_nfsd(void)
{
	mbuf_t m, mrep;
	struct nfsrv_sock *slp;
	struct nfsd *nfsd;
	struct nfsrv_descript *nd = NULL;
	int error = 0, cacherep, writes_todo;
	int siz, procrastinate, opcnt = 0;
	u_quad_t cur_usec;
	struct timeval now;
	struct vfs_context context;
	struct timespec to;

#ifndef nolint
	cacherep = RC_DOIT;
	writes_todo = 0;
#endif

	MALLOC(nfsd, struct nfsd *, sizeof(struct nfsd), M_NFSD, M_WAITOK);
	if (!nfsd)
		return (ENOMEM);
	bzero(nfsd, sizeof(struct nfsd));
	lck_mtx_lock(nfsd_mutex);
	if (nfsd_thread_count++ == 0)
		nfsrv_initcache();		/* Init the server request cache */

	TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain);
	lck_mtx_unlock(nfsd_mutex);

	context.vc_thread = current_thread();

	/* Set time out so that nfsd threads can wake up and see if they are still needed. */
	to.tv_sec = 5;
	to.tv_nsec = 0;

	/*
	 * Loop getting rpc requests until SIGKILL.
	 */
	for (;;) {
		if (nfsd_thread_max <= 0) {
			/* NFS server shutting down, get out ASAP */
			error = EINTR;
			slp = nfsd->nfsd_slp;
		} else if (nfsd->nfsd_flag & NFSD_REQINPROG) {
			/* already have some work to do */
			error = 0;
			slp = nfsd->nfsd_slp;
		} else {
			/* need to find work to do */
			error = 0;
			lck_mtx_lock(nfsd_mutex);
			while (!nfsd->nfsd_slp && TAILQ_EMPTY(&nfsrv_sockwait) && TAILQ_EMPTY(&nfsrv_sockwork)) {
				if (nfsd_thread_count > nfsd_thread_max) {
					/*
					 * If we have no socket and there are more
					 * nfsd threads than configured, let's exit.
					 */
					error = 0;
					goto done;
				}
				nfsd->nfsd_flag |= NFSD_WAITING;
				TAILQ_INSERT_HEAD(&nfsd_queue, nfsd, nfsd_queue);
				error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", &to);
				if (error) {
					if (nfsd->nfsd_flag & NFSD_WAITING) {
						TAILQ_REMOVE(&nfsd_queue, nfsd, nfsd_queue);
						nfsd->nfsd_flag &= ~NFSD_WAITING;
					}
					/* EWOULDBLOCK is just our periodic wakeup; go recheck conditions */
					if (error == EWOULDBLOCK)
						continue;
					goto done;
				}
			}
			slp = nfsd->nfsd_slp;
			if (!slp && !TAILQ_EMPTY(&nfsrv_sockwait)) {
				/* look for a socket to work on in the wait queue */
				while ((slp = TAILQ_FIRST(&nfsrv_sockwait))) {
					lck_rw_lock_exclusive(&slp->ns_rwlock);
					/* remove from the head of the queue */
					TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
					slp->ns_flag &= ~SLP_WAITQ;
					if ((slp->ns_flag & SLP_VALID) && (slp->ns_flag & SLP_WORKTODO))
						break;
					/* nothing to do, so skip this socket */
					lck_rw_done(&slp->ns_rwlock);
				}
			}
			if (!slp && !TAILQ_EMPTY(&nfsrv_sockwork)) {
				/* look for a socket to work on in the work queue */
				while ((slp = TAILQ_FIRST(&nfsrv_sockwork))) {
					lck_rw_lock_exclusive(&slp->ns_rwlock);
					/* remove from the head of the queue */
					TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
					slp->ns_flag &= ~SLP_WORKQ;
					if ((slp->ns_flag & SLP_VALID) && (slp->ns_flag & SLP_WORKTODO))
						break;
					/* nothing to do, so skip this socket */
					lck_rw_done(&slp->ns_rwlock);
				}
			}
			if (!nfsd->nfsd_slp && slp) {
				/* we found a socket to work on, grab a reference */
				slp->ns_sref++;
				nfsd->nfsd_slp = slp;
				opcnt = 0;
				/* and put it at the back of the work queue */
				TAILQ_INSERT_TAIL(&nfsrv_sockwork, slp, ns_svcq);
				slp->ns_flag |= SLP_WORKQ;
				lck_rw_done(&slp->ns_rwlock);
			}
			lck_mtx_unlock(nfsd_mutex);
			if (!slp)
				continue;
			lck_rw_lock_exclusive(&slp->ns_rwlock);
			if (slp->ns_flag & SLP_VALID) {
				if ((slp->ns_flag & (SLP_NEEDQ|SLP_DISCONN)) == SLP_NEEDQ) {
					slp->ns_flag &= ~SLP_NEEDQ;
					nfsrv_rcv_locked(slp->ns_so, slp, MBUF_WAITOK);
				}
				if (slp->ns_flag & SLP_DISCONN)
					nfsrv_zapsock(slp);
				/* try to dequeue a request record from this socket */
				error = nfsrv_dorec(slp, nfsd, &nd);
				if (error == EINVAL) {	// RPCSEC_GSS drop
					if (slp->ns_sotype == SOCK_STREAM)
						nfsrv_zapsock(slp); // drop connection
				}
				writes_todo = 0;
				if (error && (slp->ns_wgtime || (slp->ns_flag & SLP_DOWRITES))) {
					/* no record, but gathered writes may be due */
					microuptime(&now);
					cur_usec = (u_quad_t)now.tv_sec * 1000000 +
						(u_quad_t)now.tv_usec;
					if (slp->ns_wgtime <= cur_usec) {
						error = 0;
						cacherep = RC_DOIT;
						writes_todo = 1;
					}
					slp->ns_flag &= ~SLP_DOWRITES;
				}
				nfsd->nfsd_flag |= NFSD_REQINPROG;
			}
			lck_rw_done(&slp->ns_rwlock);
		}
		/* error or socket no longer valid: discard any request and drop the socket */
		if (error || (slp && !(slp->ns_flag & SLP_VALID))) {
			if (nd) {
				nfsm_chain_cleanup(&nd->nd_nmreq);
				if (nd->nd_nam2)
					mbuf_freem(nd->nd_nam2);
				if (IS_VALID_CRED(nd->nd_cr))
					kauth_cred_unref(&nd->nd_cr);
				if (nd->nd_gss_context)
					nfs_gss_svc_ctx_deref(nd->nd_gss_context);
				FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
				nd = NULL;
			}
			nfsd->nfsd_slp = NULL;
			nfsd->nfsd_flag &= ~NFSD_REQINPROG;
			if (slp)
				nfsrv_slpderef(slp);
			if (nfsd_thread_max <= 0)
				break;
			continue;
		}
		/* got a request: record start time, resolve its address, and consult the dup cache */
		if (nd) {
		    microuptime(&nd->nd_starttime);
		    if (nd->nd_nam2)
			nd->nd_nam = nd->nd_nam2;
		    else
			nd->nd_nam = slp->ns_nam;

		    cacherep = nfsrv_getcache(nd, slp, &mrep);

		    if (nfsrv_require_resv_port) {
			/* Check if source port is a reserved port */
			in_port_t port = 0;
			struct sockaddr *saddr = mbuf_data(nd->nd_nam);

			if (saddr->sa_family == AF_INET)
				port = ntohs(((struct sockaddr_in*)saddr)->sin_port);
			else if (saddr->sa_family == AF_INET6)
				port = ntohs(((struct sockaddr_in6*)saddr)->sin6_port);
			if ((port >= IPPORT_RESERVED) && (nd->nd_procnum != NFSPROC_NULL)) {
			    /* non-reserved source port: reject with a weak-auth error */
			    nd->nd_procnum = NFSPROC_NOOP;
			    nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
			    cacherep = RC_DOIT;
			}
		    }

		}

		/*
		 * Loop to get all the write RPC replies that have been
		 * gathered together.
		 */
		do {
		    switch (cacherep) {
		    case RC_DOIT:
			if (nd && (nd->nd_vers == NFS_VER3))
			    procrastinate = nfsrv_wg_delay_v3;
			else
			    procrastinate = nfsrv_wg_delay;
			lck_rw_lock_shared(&nfsrv_export_rwlock);
			context.vc_ucred = NULL;
			if (writes_todo || ((nd->nd_procnum == NFSPROC_WRITE) && (procrastinate > 0)))
				error = nfsrv_writegather(&nd, slp, &context, &mrep);
			else
				error = (*(nfsrv_procs[nd->nd_procnum]))(nd, slp, &context, &mrep);
			lck_rw_done(&nfsrv_export_rwlock);
			if (mrep == NULL) {
				/*
				 * If this is a stream socket and we are not going
				 * to send a reply we better close the connection
				 * so the client doesn't hang.
				 */
				if (error && slp->ns_sotype == SOCK_STREAM) {
					lck_rw_lock_exclusive(&slp->ns_rwlock);
					nfsrv_zapsock(slp);
					lck_rw_done(&slp->ns_rwlock);
					printf("NFS server: NULL reply from proc = %d error = %d\n",
						nd->nd_procnum, error);
				}
				break;

			}
			if (error) {
				OSAddAtomic64(1, &nfsstats.srv_errs);
				nfsrv_updatecache(nd, FALSE, mrep);
				if (nd->nd_nam2) {
					mbuf_freem(nd->nd_nam2);
					nd->nd_nam2 = NULL;
				}
				break;
			}
			OSAddAtomic64(1, &nfsstats.srvrpccnt[nd->nd_procnum]);
			nfsrv_updatecache(nd, TRUE, mrep);
			/* FALLTHRU */

		    case RC_REPLY:
			if (nd->nd_gss_mb != NULL) {	// It's RPCSEC_GSS
				/*
				 * Need to checksum or encrypt the reply
				 */
				error = nfs_gss_svc_protect_reply(nd, mrep);
				if (error) {
				    	mbuf_freem(mrep);
					break;
				}
			}

			/*
			 * Get the total size of the reply
			 */
			m = mrep;
			siz = 0;
			while (m) {
				siz += mbuf_len(m);
				m = mbuf_next(m);
			}
			if (siz <= 0 || siz > NFS_MAXPACKET) {
				printf("mbuf siz=%d\n",siz);
				panic("Bad nfs svc reply");
			}
			m = mrep;
			mbuf_pkthdr_setlen(m, siz);
			error = mbuf_pkthdr_setrcvif(m, NULL);
			if (error)
				panic("nfsd setrcvif failed: %d", error);
			/*
			 * For stream protocols, prepend a Sun RPC
			 * Record Mark.
			 */
			if (slp->ns_sotype == SOCK_STREAM) {
				error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
				if (!error)
					*(u_int32_t*)mbuf_data(m) = htonl(0x80000000 | siz);
			}
			if (!error) {
				if (slp->ns_flag & SLP_VALID) {
				    error = nfsrv_send(slp, nd->nd_nam2, m);
				} else {
				    error = EPIPE;
				    mbuf_freem(m);
				}
			} else {
				mbuf_freem(m);
			}
			mrep = NULL;
			if (nd->nd_nam2) {
				mbuf_freem(nd->nd_nam2);
				nd->nd_nam2 = NULL;
			}
			if (error == EPIPE) {
				/* send failed on a dead connection: tear it down */
				lck_rw_lock_exclusive(&slp->ns_rwlock);
				nfsrv_zapsock(slp);
				lck_rw_done(&slp->ns_rwlock);
			}
			if (error == EINTR || error == ERESTART) {
				/* interrupted: clean up this request and exit the thread */
				nfsm_chain_cleanup(&nd->nd_nmreq);
				if (IS_VALID_CRED(nd->nd_cr))
					kauth_cred_unref(&nd->nd_cr);
				if (nd->nd_gss_context)
					nfs_gss_svc_ctx_deref(nd->nd_gss_context);
				FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
				nfsrv_slpderef(slp);
				lck_mtx_lock(nfsd_mutex);
				goto done;
			}
			break;
		    case RC_DROPIT:
			/* duplicate request still in progress: silently drop it */
			mbuf_freem(nd->nd_nam2);
			nd->nd_nam2 = NULL;
			break;
		    };
		    opcnt++;
		    /* done with the request descriptor; release its resources */
		    if (nd) {
			nfsm_chain_cleanup(&nd->nd_nmreq);
			if (nd->nd_nam2)
				mbuf_freem(nd->nd_nam2);
			if (IS_VALID_CRED(nd->nd_cr))
				kauth_cred_unref(&nd->nd_cr);
			if (nd->nd_gss_context)
				nfs_gss_svc_ctx_deref(nd->nd_gss_context);
			FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
			nd = NULL;
		    }

		    /*
		     * Check to see if there are outstanding writes that
		     * need to be serviced.
		     */
		    writes_todo = 0;
		    if (slp->ns_wgtime) {
			microuptime(&now);
			cur_usec = (u_quad_t)now.tv_sec * 1000000 +
				(u_quad_t)now.tv_usec;
			if (slp->ns_wgtime <= cur_usec) {
			    cacherep = RC_DOIT;
			    writes_todo = 1;
			}
		    }
		} while (writes_todo);

		/* keep working this socket only if nothing's waiting and we're under the per-socket cap */
		nd = NULL;
		if (TAILQ_EMPTY(&nfsrv_sockwait) && (opcnt < 8)) {
			lck_rw_lock_exclusive(&slp->ns_rwlock);
			error = nfsrv_dorec(slp, nfsd, &nd);
			if (error == EINVAL) {	// RPCSEC_GSS drop
				if (slp->ns_sotype == SOCK_STREAM)
					nfsrv_zapsock(slp); // drop connection
			}
			lck_rw_done(&slp->ns_rwlock);
		}
		if (!nd) {
			/* drop our reference on the socket */
			nfsd->nfsd_flag &= ~NFSD_REQINPROG;
			nfsd->nfsd_slp = NULL;
			nfsrv_slpderef(slp);
		}
	}
	lck_mtx_lock(nfsd_mutex);
done:
	TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain);
	FREE(nfsd, M_NFSD);
	/* last nfsd out cleans up the server state */
	if (--nfsd_thread_count == 0)
		nfsrv_cleanup();
	lck_mtx_unlock(nfsd_mutex);
	return (error);
}
1246
1247int
1248nfssvc_export(user_addr_t argp)
1249{
1250	int error = 0, is_64bit;
1251	struct user_nfs_export_args unxa;
1252	vfs_context_t ctx = vfs_context_current();
1253
1254	is_64bit = IS_64BIT_PROCESS(vfs_context_proc(ctx));
1255
1256	/* copy in pointers to path and export args */
1257	if (is_64bit) {
1258		error = copyin(argp, (caddr_t)&unxa, sizeof(unxa));
1259	} else {
1260		struct nfs_export_args tnxa;
1261		error = copyin(argp, (caddr_t)&tnxa, sizeof(tnxa));
1262		if (error == 0) {
1263			/* munge into LP64 version of nfs_export_args structure */
1264			unxa.nxa_fsid = tnxa.nxa_fsid;
1265			unxa.nxa_expid = tnxa.nxa_expid;
1266			unxa.nxa_fspath = CAST_USER_ADDR_T(tnxa.nxa_fspath);
1267			unxa.nxa_exppath = CAST_USER_ADDR_T(tnxa.nxa_exppath);
1268			unxa.nxa_flags = tnxa.nxa_flags;
1269			unxa.nxa_netcount = tnxa.nxa_netcount;
1270			unxa.nxa_nets = CAST_USER_ADDR_T(tnxa.nxa_nets);
1271		}
1272	}
1273	if (error)
1274		return (error);
1275
1276	error = nfsrv_export(&unxa, ctx);
1277
1278	return (error);
1279}
1280
1281/*
1282 * Shut down a socket associated with an nfsrv_sock structure.
1283 * Should be called with the send lock set, if required.
1284 * The trick here is to increment the sref at the start, so that the nfsds
1285 * will stop using it and clear ns_flag at the end so that it will not be
1286 * reassigned during cleanup.
1287 */
1288void
1289nfsrv_zapsock(struct nfsrv_sock *slp)
1290{
1291	socket_t so;
1292
1293	if ((slp->ns_flag & SLP_VALID) == 0)
1294		return;
1295	slp->ns_flag &= ~SLP_ALLFLAGS;
1296
1297	so = slp->ns_so;
1298	if (so == NULL)
1299		return;
1300
1301	/*
1302	 * Attempt to deter future up-calls, but leave the
1303	 * up-call info in place to avoid a race with the
1304	 * networking code.
1305	 */
1306	socket_lock(so, 1);
1307	so->so_rcv.sb_flags &= ~SB_UPCALL;
1308	socket_unlock(so, 1);
1309
1310	sock_shutdown(so, SHUT_RDWR);
1311
1312	/*
1313	 * Remove from the up-call queue
1314	 */
1315	nfsrv_uc_dequeue(slp);
1316}
1317
1318/*
1319 * cleanup and release a server socket structure.
1320 */
1321void
1322nfsrv_slpfree(struct nfsrv_sock *slp)
1323{
1324	struct nfsrv_descript *nwp, *nnwp;
1325
1326	if (slp->ns_so) {
1327		sock_release(slp->ns_so);
1328		slp->ns_so = NULL;
1329	}
1330	if (slp->ns_nam)
1331		mbuf_free(slp->ns_nam);
1332	if (slp->ns_raw)
1333		mbuf_freem(slp->ns_raw);
1334	if (slp->ns_rec)
1335		mbuf_freem(slp->ns_rec);
1336	if (slp->ns_frag)
1337		mbuf_freem(slp->ns_frag);
1338	slp->ns_nam = slp->ns_raw = slp->ns_rec = slp->ns_frag = NULL;
1339	slp->ns_reccnt = 0;
1340
1341	if (slp->ns_ua)
1342		FREE(slp->ns_ua, M_NFSSVC);
1343
1344	for (nwp = slp->ns_tq.lh_first; nwp; nwp = nnwp) {
1345		nnwp = nwp->nd_tq.le_next;
1346		LIST_REMOVE(nwp, nd_tq);
1347		nfsm_chain_cleanup(&nwp->nd_nmreq);
1348		if (nwp->nd_mrep)
1349			mbuf_freem(nwp->nd_mrep);
1350		if (nwp->nd_nam2)
1351			mbuf_freem(nwp->nd_nam2);
1352		if (IS_VALID_CRED(nwp->nd_cr))
1353			kauth_cred_unref(&nwp->nd_cr);
1354		if (nwp->nd_gss_context)
1355			nfs_gss_svc_ctx_deref(nwp->nd_gss_context);
1356		FREE_ZONE(nwp, sizeof(*nwp), M_NFSRVDESC);
1357	}
1358	LIST_INIT(&slp->ns_tq);
1359
1360	lck_rw_destroy(&slp->ns_rwlock, nfsrv_slp_rwlock_group);
1361	lck_mtx_destroy(&slp->ns_wgmutex, nfsrv_slp_mutex_group);
1362	FREE(slp, M_NFSSVC);
1363}
1364
1365/*
 * Dereference a server socket structure. If it has no more references and
1367 * is no longer valid, you can throw it away.
1368 */
void
nfsrv_slpderef(struct nfsrv_sock *slp)
{
	struct timeval now;

	/* nfsd_mutex protects the socket queues; ns_rwlock protects the socket itself */
	lck_mtx_lock(nfsd_mutex);
	lck_rw_lock_exclusive(&slp->ns_rwlock);
	slp->ns_sref--;

	if (slp->ns_sref || (slp->ns_flag & SLP_VALID)) {
		/* still referenced or still valid: keep the socket around */
		if ((slp->ns_flag & SLP_QUEUED) && !(slp->ns_flag & SLP_WORKTODO)) {
			/* remove socket from queue since there's no work */
			if (slp->ns_flag & SLP_WAITQ)
				TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
			else
				TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
			slp->ns_flag &= ~SLP_QUEUED;
		}
		lck_rw_done(&slp->ns_rwlock);
		lck_mtx_unlock(nfsd_mutex);
		return;
	}

	/* This socket is no longer valid, so we'll get rid of it */

	if (slp->ns_flag & SLP_QUEUED) {
		if (slp->ns_flag & SLP_WAITQ)
			TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
		else
			TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
		slp->ns_flag &= ~SLP_QUEUED;
	}

	/*
	 * Queue the socket up for deletion
	 * and start the timer to delete it
	 * after it has been in limbo for
	 * a while.
	 */
	microuptime(&now);
	slp->ns_timestamp = now.tv_sec;
	TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain);
	TAILQ_INSERT_TAIL(&nfsrv_deadsocklist, slp, ns_chain);
	if (!nfsrv_deadsock_timer_on) {
		nfsrv_deadsock_timer_on = 1;
		nfs_interval_timer_start(nfsrv_deadsock_timer_call,
				NFSRV_DEADSOCKDELAY * 1000);
	}

	lck_rw_done(&slp->ns_rwlock);
	/* now remove from the write gather socket list */
	/* (ns_wgq is still protected here because we hold nfsd_mutex) */
	if (slp->ns_wgq.tqe_next != SLPNOLIST) {
		TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq);
		slp->ns_wgq.tqe_next = SLPNOLIST;
	}
	lck_mtx_unlock(nfsd_mutex);
}
1426
1427/*
1428 * Check periodically for dead sockets pending delete.
1429 * If a socket has been dead for more than NFSRV_DEADSOCKDELAY
1430 * seconds then we assume it's safe to free.
1431 */
1432void
1433nfsrv_deadsock_timer(__unused void *param0, __unused void *param1)
1434{
1435	struct nfsrv_sock *slp;
1436	struct timeval now;
1437	time_t time_to_wait;
1438
1439	microuptime(&now);
1440	lck_mtx_lock(nfsd_mutex);
1441
1442	while ((slp = TAILQ_FIRST(&nfsrv_deadsocklist))) {
1443		if ((slp->ns_timestamp + NFSRV_DEADSOCKDELAY) > now.tv_sec)
1444			break;
1445		TAILQ_REMOVE(&nfsrv_deadsocklist, slp, ns_chain);
1446		nfsrv_slpfree(slp);
1447	}
1448	if (TAILQ_EMPTY(&nfsrv_deadsocklist)) {
1449		nfsrv_deadsock_timer_on = 0;
1450		lck_mtx_unlock(nfsd_mutex);
1451		return;
1452	}
1453	time_to_wait = (slp->ns_timestamp + NFSRV_DEADSOCKDELAY) - now.tv_sec;
1454	if (time_to_wait < 1)
1455		time_to_wait = 1;
1456
1457	lck_mtx_unlock(nfsd_mutex);
1458
1459	nfs_interval_timer_start(nfsrv_deadsock_timer_call,
1460		time_to_wait * 1000);
1461}
1462
1463/*
1464 * Clean up the data structures for the server.
1465 */
void
nfsrv_cleanup(void)
{
	struct nfsrv_sock *slp, *nslp;
	struct timeval now;
#if CONFIG_FSE
	struct nfsrv_fmod *fp, *nfp;
	int i;
#endif

	/* NOTE(review): appears to assume nfsd_mutex is held by the caller
	 * (it walks the socket queues without taking it) — confirm against callers */
	microuptime(&now);
	for (slp = TAILQ_FIRST(&nfsrv_socklist); slp != 0; slp = nslp) {
		nslp = TAILQ_NEXT(slp, ns_chain);
		if (slp->ns_flag & SLP_VALID) {
			lck_rw_lock_exclusive(&slp->ns_rwlock);
			nfsrv_zapsock(slp);
			lck_rw_done(&slp->ns_rwlock);
		}
		/* pull the socket off whichever service queue it's on */
		if (slp->ns_flag & SLP_QUEUED) {
			if (slp->ns_flag & SLP_WAITQ)
				TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
			else
				TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
			slp->ns_flag &= ~SLP_QUEUED;
		}
		/* and off the write gather list, if it's there */
		if (slp->ns_wgq.tqe_next != SLPNOLIST) {
			TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq);
			slp->ns_wgq.tqe_next = SLPNOLIST;
		}
		/* queue the socket up for deletion */
		slp->ns_timestamp = now.tv_sec;
		TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain);
		TAILQ_INSERT_TAIL(&nfsrv_deadsocklist, slp, ns_chain);
		if (!nfsrv_deadsock_timer_on) {
			nfsrv_deadsock_timer_on = 1;
			nfs_interval_timer_start(nfsrv_deadsock_timer_call,
				NFSRV_DEADSOCKDELAY * 1000);
		}
	}

#if CONFIG_FSE
	/*
	 * Flush pending file write fsevents
	 */
	lck_mtx_lock(nfsrv_fmod_mutex);
	for (i = 0; i < NFSRVFMODHASHSZ; i++) {
		for (fp = LIST_FIRST(&nfsrv_fmod_hashtbl[i]); fp; fp = nfp) {
			/*
			 * Fire off the content modified fsevent for each
			 * entry, remove it from the list, and free it.
			 */
			if (nfsrv_fsevents_enabled) {
				fp->fm_context.vc_thread = current_thread();
				add_fsevent(FSE_CONTENT_MODIFIED, &fp->fm_context,
						FSE_ARG_VNODE, fp->fm_vp,
						FSE_ARG_DONE);
			}
			vnode_put(fp->fm_vp);
			kauth_cred_unref(&fp->fm_context.vc_ucred);
			nfp = LIST_NEXT(fp, fm_link);
			LIST_REMOVE(fp, fm_link);
			FREE(fp, M_TEMP);
		}
	}
	nfsrv_fmod_pending = 0;
	lck_mtx_unlock(nfsrv_fmod_mutex);
#endif

	nfsrv_uc_cleanup();     /* Stop nfs socket up-call threads */

	nfs_gss_svc_cleanup();	/* Remove any RPCSEC_GSS contexts */

	nfsrv_cleancache();	/* And clear out server cache */

	nfsrv_udpsock = NULL;
	nfsrv_udp6sock = NULL;
}
1543
1544#endif /* NFS_NOSERVER */
1545