1/*
2 * Copyright (c) 2000-2014 Apple Inc.  All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1989, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 *    must display the following acknowledgement:
46 *	This product includes software developed by the University of
47 *	California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 *    may be used to endorse or promote products derived from this software
50 *    without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 *	@(#)nfs_syscalls.c	8.5 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_syscalls.c,v 1.32 1997/11/07 08:53:25 phk Exp $
66 */
67/*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections.  This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/kernel.h>
77#include <sys/file_internal.h>
78#include <sys/filedesc.h>
79#include <sys/stat.h>
80#include <sys/vnode_internal.h>
81#include <sys/mount_internal.h>
82#include <sys/proc_internal.h> /* for fdflags */
83#include <sys/kauth.h>
84#include <sys/sysctl.h>
85#include <sys/ubc.h>
86#include <sys/uio.h>
87#include <sys/malloc.h>
88#include <sys/kpi_mbuf.h>
89#include <sys/socket.h>
90#include <sys/socketvar.h>
91#include <sys/domain.h>
92#include <sys/protosw.h>
93#include <sys/fcntl.h>
94#include <sys/lockf.h>
95#include <sys/syslog.h>
96#include <sys/user.h>
97#include <sys/sysproto.h>
98#include <sys/kpi_socket.h>
99#include <sys/fsevents.h>
100#include <libkern/OSAtomic.h>
101#include <kern/thread_call.h>
102#include <kern/task.h>
103
104#include <security/audit/audit.h>
105
106#include <netinet/in.h>
107#include <netinet/tcp.h>
108#include <nfs/xdr_subs.h>
109#include <nfs/rpcv2.h>
110#include <nfs/nfsproto.h>
111#include <nfs/nfs.h>
112#include <nfs/nfsm_subs.h>
113#include <nfs/nfsrvcache.h>
114#include <nfs/nfs_gss.h>
115#include <nfs/nfsmount.h>
116#include <nfs/nfsnode.h>
117#include <nfs/nfs_lock.h>
118#if CONFIG_MACF
119#include <security/mac_framework.h>
120#endif
121
/*
 * XXX: local prototype.  thread_terminate() is called by the nfsiod
 * threads in this file to end themselves; presumably none of the headers
 * included above declares it (TODO confirm).
 */
kern_return_t	thread_terminate(thread_t);
123
#if NFSSERVER

/*
 * Table of NFS server procedure handlers with NFS_NPROCS entries
 * (presumably indexed by procedure number -- confirm at the call site).
 * Defined outside this file.
 */
extern int (*nfsrv_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
					    struct nfsrv_sock *slp,
					    vfs_context_t ctx,
					    mbuf_t *mrepp);
/* Defined elsewhere; exported in this file as the wg_delay and wg_delay_v3 server sysctls. */
extern int nfsrv_wg_delay;
extern int nfsrv_wg_delay_v3;

/* Exported in this file as the require_resv_port server sysctl; initially 0. */
static int nfsrv_require_resv_port = 0;
/* NOTE(review): presumably nonzero while a dead-socket timer is pending -- confirm at its users. */
static int nfsrv_deadsock_timer_on = 0;

/* Prototypes for the server-side routines used by the system calls in this file. */
int	nfssvc_export(user_addr_t argp);
int	nfssvc_nfsd(void);
int	nfssvc_addsock(socket_t, mbuf_t);
void	nfsrv_zapsock(struct nfsrv_sock *);
void	nfsrv_slpderef(struct nfsrv_sock *);
void	nfsrv_slpfree(struct nfsrv_sock *);

#endif /* NFSSERVER */
144
/*
 * sysctl stuff
 *
 * Declares the root of the NFS sysctl tree: a node named "nfs" under
 * _vfs_generic.  The client and server nodes in this file are created
 * beneath it.
 */
SYSCTL_DECL(_vfs_generic);
SYSCTL_NODE(_vfs_generic, OID_AUTO, nfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs hinge");
150
#if NFSCLIENT
/*
 * NFS client sysctls: a "client" node under the nfs node, plus one entry
 * per client variable.  Each entry binds the named OID to the address of
 * the variable given as the fifth argument.  Entries flagged CTLFLAG_RW are
 * writable; those flagged CTLFLAG_RD (nfsiod_thread_count, lockd_mounts)
 * are read-only.
 */
SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, client, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs client hinge");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, initialdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_initial_delay, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nextdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_delay, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_iosize, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_cache_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_cache_timeout, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, allow_async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_allow_async, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, statfs_rate_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_statfs_rate_limit, 0, "");
/* nfsiod thread pool: configurable maximum and the current (read-only) count */
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsiod_thread_max, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsiod_thread_count, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD | CTLFLAG_LOCKED, &nfs_lockd_mounts, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_max_async_writes, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_single_des, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_delete, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_dotzfs, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_dotzfs, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_for_getattr, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_for_getattr, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, idmap_ctrl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_idmap_ctrl, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, callback_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_callback_port, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, is_mobile, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_is_mobile, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, squishy_flags, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_squishy_flags, 0, "");
/* the only unsigned entry in the client group */
SYSCTL_UINT(_vfs_generic_nfs_client, OID_AUTO, debug_ctl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_debug_ctl, 0, "");
SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, readlink_nocache, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_readlink_nocache, 0, "");

#endif /* NFSCLIENT */
175
#if NFSSERVER
/*
 * NFS server sysctls: a "server" node under the nfs node, plus one entry
 * per server variable.  Each entry binds the named OID to the address of
 * the variable given as the fifth argument.  Entries flagged CTLFLAG_RD
 * are read-only; the rest are writable.
 */
SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, server, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs server hinge");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay_v3, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay_v3, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, require_resv_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_require_resv_port, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_async, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, export_hash_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_export_hash_size, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, reqcache_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_reqcache_size, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, request_queue_length, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_sock_max_rec_queue_length, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, user_stats, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_user_stat_enabled, 0, "");
SYSCTL_UINT(_vfs_generic_nfs_server, OID_AUTO, gss_context_ttl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_gss_context_ttl, 0, "");
/* only present when the kernel is built with CONFIG_FSE */
#if CONFIG_FSE
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_fsevents_enabled, 0, "");
#endif
/* nfsd thread pool: configurable maximum and the current (read-only) count */
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsd_thread_max, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsd_thread_count, 0, "");
/* upcall queue knobs, compiled in only when NFS_UC_Q_DEBUG is defined */
#ifdef NFS_UC_Q_DEBUG
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, use_upcall_svc, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_use_proxy, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_limit, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_max_seen, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_max_seen, 0, "");
SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)&nfsrv_uc_queue_count, 0, "");
#endif
#endif /* NFSSERVER */
199
200
201#if NFSCLIENT
202
203static int
204mapname2id(struct nfs_testmapid *map)
205{
206	int error;
207
208	error = nfs4_id2guid(map->ntm_name, &map->ntm_guid, map->ntm_grpflag);
209	if (error)
210		return (error);
211
212	if (map->ntm_grpflag)
213		error = kauth_cred_guid2gid(&map->ntm_guid, (gid_t *)&map->ntm_id);
214	else
215		error = kauth_cred_guid2uid(&map->ntm_guid, (uid_t *)&map->ntm_id);
216
217	return (error);
218}
219
220static int
221mapid2name(struct nfs_testmapid *map)
222{
223	int error;
224	int len = sizeof(map->ntm_name);
225
226	if (map->ntm_grpflag)
227		error = kauth_cred_gid2guid((gid_t)map->ntm_id, &map->ntm_guid);
228	else
229		error = kauth_cred_uid2guid((uid_t)map->ntm_id, &map->ntm_guid);
230
231	if (error)
232		return (error);
233
234	error = nfs4_guid2id(&map->ntm_guid, map->ntm_name, &len, map->ntm_grpflag);
235
236	return (error);
237
238}
239
240
241static int
242nfsclnt_testidmap(proc_t p, user_addr_t argp)
243{
244	struct nfs_testmapid mapid;
245	int error, coerror;
246
247        /* Let root make this call. */
248	error = proc_suser(p);
249        if (error)
250                return (error);
251
252	error = copyin(argp, &mapid, sizeof(mapid));
253	if (error)
254		return (error);
255	if (mapid.ntm_name2id)
256		error = mapname2id(&mapid);
257	else
258		error = mapid2name(&mapid);
259
260	coerror = copyout(&mapid, argp, sizeof(mapid));
261
262	return (error ? error : coerror);
263}
264
265int
266nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval)
267{
268	struct lockd_ans la;
269	int error;
270
271	switch (uap->flag) {
272	case NFSCLNT_LOCKDANS:
273		error = copyin(uap->argp, &la, sizeof(la));
274		if (!error)
275			error = nfslockdans(p, &la);
276		break;
277	case NFSCLNT_LOCKDNOTIFY:
278		error = nfslockdnotify(p, uap->argp);
279		break;
280	case NFSCLNT_TESTIDMAP:
281		error = nfsclnt_testidmap(p, uap->argp);
282		break;
283	default:
284		error = EINVAL;
285	}
286	return (error);
287}
288
289
/*
 * Asynchronous I/O threads for client NFS.
 * They do read-ahead and write-behind operations on the block I/O cache.
 *
 * The pool of up to nfsiod_thread_max threads is launched on demand, and
 * threads exit when unused for a while.  There are as many nfsiod structs
 * as there are nfsiod threads; however, there's no strict tie between a
 * thread and a struct.  Each thread puts an nfsiod on the free list and
 * sleeps on it.  When it wakes up, it removes the next struct nfsiod from
 * the work queue and services it.  Then it will put the struct at the head
 * of the free list and sleep on it.  Async requests will pull the next
 * struct nfsiod from the head of the free list, put it on the work queue,
 * and wake whatever thread is waiting on that struct.
 */
303
304/*
305 * nfsiod thread exit routine
306 *
307 * Must be called with nfsiod_mutex held so that the
308 * decision to terminate is atomic with the termination.
309 */
310void
311nfsiod_terminate(struct nfsiod *niod)
312{
313	nfsiod_thread_count--;
314	lck_mtx_unlock(nfsiod_mutex);
315	if (niod)
316		FREE(niod, M_TEMP);
317	else
318		printf("nfsiod: terminating without niod\n");
319	thread_terminate(current_thread());
320	/*NOTREACHED*/
321}
322
323/* nfsiod thread startup routine */
324void
325nfsiod_thread(void)
326{
327	struct nfsiod *niod;
328	int error;
329
330	MALLOC(niod, struct nfsiod *, sizeof(struct nfsiod), M_TEMP, M_WAITOK);
331	if (!niod) {
332		lck_mtx_lock(nfsiod_mutex);
333		nfsiod_thread_count--;
334		wakeup(current_thread());
335		lck_mtx_unlock(nfsiod_mutex);
336		thread_terminate(current_thread());
337		/*NOTREACHED*/
338	}
339	bzero(niod, sizeof(*niod));
340	lck_mtx_lock(nfsiod_mutex);
341	TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link);
342	wakeup(current_thread());
343	error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE*hz, nfsiod_continue);
344	/* shouldn't return... so we have an error */
345	/* remove an old nfsiod struct and terminate */
346	lck_mtx_lock(nfsiod_mutex);
347	if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist)))
348		TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
349	nfsiod_terminate(niod);
350	/*NOTREACHED*/
351}
352
353/*
354 * Start up another nfsiod thread.
355 * (unless we're already maxed out and there are nfsiods running)
356 */
357int
358nfsiod_start(void)
359{
360	thread_t thd = THREAD_NULL;
361
362	lck_mtx_lock(nfsiod_mutex);
363	if ((nfsiod_thread_count >= NFSIOD_MAX) && (nfsiod_thread_count > 0)) {
364		lck_mtx_unlock(nfsiod_mutex);
365		return (EBUSY);
366	}
367	nfsiod_thread_count++;
368	if (kernel_thread_start((thread_continue_t)nfsiod_thread, NULL, &thd) != KERN_SUCCESS) {
369		lck_mtx_unlock(nfsiod_mutex);
370		return (EBUSY);
371	}
372	/* wait for the thread to complete startup */
373	msleep(thd, nfsiod_mutex, PWAIT | PDROP, "nfsiodw", NULL);
374	thread_deallocate(thd);
375	return (0);
376}
377
378/*
379 * Continuation for Asynchronous I/O threads for NFS client.
380 *
381 * Grab an nfsiod struct to work on, do some work, then drop it
382 */
383int
384nfsiod_continue(int error)
385{
386	struct nfsiod *niod;
387	struct nfsmount *nmp;
388	struct nfsreq *req, *treq;
389	struct nfs_reqqhead iodq;
390	int morework;
391
392	lck_mtx_lock(nfsiod_mutex);
393	niod = TAILQ_FIRST(&nfsiodwork);
394	if (!niod) {
395		/* there's no work queued up */
396		/* remove an old nfsiod struct and terminate */
397		if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist)))
398			TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
399		nfsiod_terminate(niod);
400		/*NOTREACHED*/
401	}
402	TAILQ_REMOVE(&nfsiodwork, niod, niod_link);
403
404worktodo:
405	while ((nmp = niod->niod_nmp)) {
406		if (nmp == NULL){
407			niod->niod_nmp = NULL;
408			break;
409		}
410
411		/*
412		 * Service this mount's async I/O queue.
413		 *
414		 * In order to ensure some level of fairness between mounts,
415		 * we grab all the work up front before processing it so any
416		 * new work that arrives will be serviced on a subsequent
417		 * iteration - and we have a chance to see if other work needs
418		 * to be done (e.g. the delayed write queue needs to be pushed
419		 * or other mounts are waiting for an nfsiod).
420		 */
421		/* grab the current contents of the queue */
422		TAILQ_INIT(&iodq);
423		TAILQ_CONCAT(&iodq, &nmp->nm_iodq, r_achain);
424		lck_mtx_unlock(nfsiod_mutex);
425
426		/* process the queue */
427		TAILQ_FOREACH_SAFE(req, &iodq, r_achain, treq) {
428			TAILQ_REMOVE(&iodq, req, r_achain);
429			lck_mtx_lock(nfsiod_mutex);
430			req->r_achain.tqe_next = NFSIODCOMPLETING;
431			lck_mtx_unlock(nfsiod_mutex);
432			req->r_callback.rcb_func(req);
433		}
434
435		/* now check if there's more/other work to be done */
436		lck_mtx_lock(nfsiod_mutex);
437		morework = !TAILQ_EMPTY(&nmp->nm_iodq);
438		if (!morework || !TAILQ_EMPTY(&nfsiodmounts)) {
439			/*
440			 * we're going to stop working on this mount but if the
441			 * mount still needs more work so queue it up
442			 */
443			if (morework && nmp->nm_iodlink.tqe_next == NFSNOLIST)
444				TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
445			nmp->nm_niod = NULL;
446			niod->niod_nmp = NULL;
447		}
448	}
449
450	/* loop if there's still a mount to work on */
451	if (!niod->niod_nmp && !TAILQ_EMPTY(&nfsiodmounts)) {
452		niod->niod_nmp = TAILQ_FIRST(&nfsiodmounts);
453		TAILQ_REMOVE(&nfsiodmounts, niod->niod_nmp, nm_iodlink);
454		niod->niod_nmp->nm_iodlink.tqe_next = NFSNOLIST;
455	}
456	if (niod->niod_nmp)
457		goto worktodo;
458
459	/* queue ourselves back up - if there aren't too many threads running */
460	if (nfsiod_thread_count <= NFSIOD_MAX) {
461		TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link);
462		error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE*hz, nfsiod_continue);
463		/* shouldn't return... so we have an error */
464		/* remove an old nfsiod struct and terminate */
465		lck_mtx_lock(nfsiod_mutex);
466		if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist)))
467			TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
468	}
469	nfsiod_terminate(niod);
470	/*NOTREACHED*/
471	return (0);
472}
473
474#endif /* NFSCLIENT */
475
476
477#if NFSSERVER
478
479/*
480 * NFS server system calls
481 * getfh() lives here too, but maybe should move to kern/vfs_syscalls.c
482 */
483
484/*
485 * Get file handle system call
486 */
487int
488getfh(proc_t p, struct getfh_args *uap, __unused int *retval)
489{
490	vnode_t vp;
491	struct nfs_filehandle nfh;
492	int error, fhlen, fidlen;
493	struct nameidata nd;
494	char path[MAXPATHLEN], *ptr;
495	size_t pathlen;
496	struct nfs_exportfs *nxfs;
497	struct nfs_export *nx;
498
499	/*
500	 * Must be super user
501	 */
502	error = proc_suser(p);
503	if (error)
504		return (error);
505
506	error = copyinstr(uap->fname, path, MAXPATHLEN, &pathlen);
507	if (!error)
508		error = copyin(uap->fhp, &fhlen, sizeof(fhlen));
509	if (error)
510		return (error);
511	/* limit fh size to length specified (or v3 size by default) */
512	if ((fhlen != NFSV2_MAX_FH_SIZE) && (fhlen != NFSV3_MAX_FH_SIZE))
513		fhlen = NFSV3_MAX_FH_SIZE;
514	fidlen = fhlen - sizeof(struct nfs_exphandle);
515
516	if (!nfsrv_is_initialized())
517		return (EINVAL);
518
519	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1,
520			UIO_SYSSPACE, CAST_USER_ADDR_T(path), vfs_context_current());
521	error = namei(&nd);
522	if (error)
523		return (error);
524	nameidone(&nd);
525
526	vp = nd.ni_vp;
527
528	// find exportfs that matches f_mntonname
529	lck_rw_lock_shared(&nfsrv_export_rwlock);
530	ptr = vnode_mount(vp)->mnt_vfsstat.f_mntonname;
531	LIST_FOREACH(nxfs, &nfsrv_exports, nxfs_next) {
532		if (!strncmp(nxfs->nxfs_path, ptr, MAXPATHLEN))
533			break;
534	}
535	if (!nxfs || strncmp(nxfs->nxfs_path, path, strlen(nxfs->nxfs_path))) {
536		error = EINVAL;
537		goto out;
538	}
539	// find export that best matches remainder of path
540	ptr = path + strlen(nxfs->nxfs_path);
541	while (*ptr && (*ptr == '/'))
542		ptr++;
543	LIST_FOREACH(nx, &nxfs->nxfs_exports, nx_next) {
544		int len = strlen(nx->nx_path);
545		if (len == 0)  // we've hit the export entry for the root directory
546			break;
547		if (!strncmp(nx->nx_path, ptr, len))
548			break;
549	}
550	if (!nx) {
551		error = EINVAL;
552		goto out;
553	}
554
555	bzero(&nfh, sizeof(nfh));
556	nfh.nfh_xh.nxh_version = htonl(NFS_FH_VERSION);
557	nfh.nfh_xh.nxh_fsid = htonl(nxfs->nxfs_id);
558	nfh.nfh_xh.nxh_expid = htonl(nx->nx_id);
559	nfh.nfh_xh.nxh_flags = 0;
560	nfh.nfh_xh.nxh_reserved = 0;
561	nfh.nfh_len = fidlen;
562	error = VFS_VPTOFH(vp, (int*)&nfh.nfh_len, &nfh.nfh_fid[0], NULL);
563	if (nfh.nfh_len > (uint32_t)fidlen)
564		error = EOVERFLOW;
565	nfh.nfh_xh.nxh_fidlen = nfh.nfh_len;
566	nfh.nfh_len += sizeof(nfh.nfh_xh);
567	nfh.nfh_fhp = (u_char*)&nfh.nfh_xh;
568
569out:
570	lck_rw_done(&nfsrv_export_rwlock);
571	vnode_put(vp);
572	if (error)
573		return (error);
574	error = copyout((caddr_t)&nfh, uap->fhp, sizeof(fhandle_t));
575	return (error);
576}
577
578extern const struct fileops vnops;
579
580/*
581 * syscall for the rpc.lockd to use to translate a NFS file handle into
582 * an open descriptor.
583 *
584 * warning: do not remove the suser() call or this becomes one giant
585 * security hole.
586 */
587int
588fhopen( proc_t p,
589	struct fhopen_args *uap,
590	int32_t *retval)
591{
592	vnode_t vp;
593	struct nfs_filehandle nfh;
594	struct nfs_export *nx;
595	struct nfs_export_options *nxo;
596	struct flock lf;
597	struct fileproc *fp, *nfp;
598	int fmode, error, type;
599	int indx;
600	vfs_context_t ctx = vfs_context_current();
601	kauth_action_t action;
602
603	/*
604	 * Must be super user
605	 */
606	error = suser(vfs_context_ucred(ctx), 0);
607	if (error) {
608		return (error);
609	}
610
611	if (!nfsrv_is_initialized()) {
612		return (EINVAL);
613	}
614
615	fmode = FFLAGS(uap->flags);
616	/* why not allow a non-read/write open for our lockd? */
617	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
618		return (EINVAL);
619
620	error = copyin(uap->u_fhp, &nfh.nfh_len, sizeof(nfh.nfh_len));
621	if (error)
622		return (error);
623	if ((nfh.nfh_len < (int)sizeof(struct nfs_exphandle)) ||
624	    (nfh.nfh_len > (int)NFSV3_MAX_FH_SIZE))
625		return (EINVAL);
626	error = copyin(uap->u_fhp, &nfh, sizeof(nfh.nfh_len) + nfh.nfh_len);
627	if (error)
628		return (error);
629	nfh.nfh_fhp = (u_char*)&nfh.nfh_xh;
630
631	lck_rw_lock_shared(&nfsrv_export_rwlock);
632	/* now give me my vnode, it gets returned to me with a reference */
633	error = nfsrv_fhtovp(&nfh, NULL, &vp, &nx, &nxo);
634	lck_rw_done(&nfsrv_export_rwlock);
635	if (error) {
636		if (error == NFSERR_TRYLATER)
637			error = EAGAIN; // XXX EBUSY? Or just leave as TRYLATER?
638		return (error);
639	}
640
641	/*
642	 * From now on we have to make sure not
643	 * to forget about the vnode.
644	 * Any error that causes an abort must vnode_put(vp).
645	 * Just set error = err and 'goto bad;'.
646	 */
647
648	/*
649	 * from vn_open
650	 */
651	if (vnode_vtype(vp) == VSOCK) {
652		error = EOPNOTSUPP;
653		goto bad;
654	}
655
656	/* disallow write operations on directories */
657	if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) {
658		error = EISDIR;
659		goto bad;
660	}
661
662	/* compute action to be authorized */
663	action = 0;
664	if (fmode & FREAD)
665		action |= KAUTH_VNODE_READ_DATA;
666	if (fmode & (FWRITE | O_TRUNC))
667		action |= KAUTH_VNODE_WRITE_DATA;
668	if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
669		goto bad;
670
671	if ((error = VNOP_OPEN(vp, fmode, ctx)))
672		goto bad;
673	if ((error = vnode_ref_ext(vp, fmode, 0)))
674		goto bad;
675
676	/*
677	 * end of vn_open code
678	 */
679
680	// starting here... error paths should call vn_close/vnode_put
681	if ((error = falloc(p, &nfp, &indx, ctx)) != 0) {
682		vn_close(vp, fmode & FMASK, ctx);
683		goto bad;
684	}
685	fp = nfp;
686
687	fp->f_fglob->fg_flag = fmode & FMASK;
688	fp->f_fglob->fg_ops = &vnops;
689	fp->f_fglob->fg_data = (caddr_t)vp;
690
691	// XXX do we really need to support this with fhopen()?
692	if (fmode & (O_EXLOCK | O_SHLOCK)) {
693		lf.l_whence = SEEK_SET;
694		lf.l_start = 0;
695		lf.l_len = 0;
696		if (fmode & O_EXLOCK)
697			lf.l_type = F_WRLCK;
698		else
699			lf.l_type = F_RDLCK;
700		type = F_FLOCK;
701		if ((fmode & FNONBLOCK) == 0)
702			type |= F_WAIT;
703		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
704			struct vfs_context context = *vfs_context_current();
705			/* Modify local copy (to not damage thread copy) */
706			context.vc_ucred = fp->f_fglob->fg_cred;
707
708			vn_close(vp, fp->f_fglob->fg_flag, &context);
709			fp_free(p, indx, fp);
710			return (error);
711		}
712		fp->f_fglob->fg_flag |= FHASLOCK;
713	}
714
715	vnode_put(vp);
716
717	proc_fdlock(p);
718	procfdtbl_releasefd(p, indx, NULL);
719	fp_drop(p, indx, fp, 1);
720	proc_fdunlock(p);
721
722	*retval = indx;
723	return (0);
724
725bad:
726	vnode_put(vp);
727	return (error);
728}
729
730/*
731 * NFS server pseudo system call
732 */
733int
734nfssvc(proc_t p, struct nfssvc_args *uap, __unused int *retval)
735{
736	mbuf_t nam;
737	struct user_nfsd_args user_nfsdarg;
738	socket_t so;
739	int error;
740
741	AUDIT_ARG(cmd, uap->flag);
742
743	/*
744	 * Must be super user for most operations (export ops checked later).
745	 */
746	if ((uap->flag != NFSSVC_EXPORT) && ((error = proc_suser(p))))
747		return (error);
748#if CONFIG_MACF
749	error = mac_system_check_nfsd(kauth_cred_get());
750	if (error)
751		return (error);
752#endif
753
754	/* make sure NFS server data structures have been initialized */
755	nfsrv_init();
756
757	if (uap->flag & NFSSVC_ADDSOCK) {
758		if (IS_64BIT_PROCESS(p)) {
759			error = copyin(uap->argp, (caddr_t)&user_nfsdarg, sizeof(user_nfsdarg));
760		} else {
761			struct nfsd_args    tmp_args;
762			error = copyin(uap->argp, (caddr_t)&tmp_args, sizeof(tmp_args));
763			if (error == 0) {
764				user_nfsdarg.sock = tmp_args.sock;
765				user_nfsdarg.name = CAST_USER_ADDR_T(tmp_args.name);
766				user_nfsdarg.namelen = tmp_args.namelen;
767			}
768		}
769		if (error)
770			return (error);
771		/* get the socket */
772		error = file_socket(user_nfsdarg.sock, &so);
773		if (error)
774			return (error);
775		/* Get the client address for connected sockets. */
776		if (user_nfsdarg.name == USER_ADDR_NULL || user_nfsdarg.namelen == 0) {
777			nam = NULL;
778		} else {
779			error = sockargs(&nam, user_nfsdarg.name, user_nfsdarg.namelen, MBUF_TYPE_SONAME);
780			if (error) {
781				/* drop the iocount file_socket() grabbed on the file descriptor */
782				file_drop(user_nfsdarg.sock);
783				return (error);
784			}
785		}
786		/*
787		 * nfssvc_addsock() will grab a retain count on the socket
788		 * to keep the socket from being closed when nfsd closes its
789		 * file descriptor for it.
790		 */
791		error = nfssvc_addsock(so, nam);
792		/* drop the iocount file_socket() grabbed on the file descriptor */
793		file_drop(user_nfsdarg.sock);
794	} else if (uap->flag & NFSSVC_NFSD) {
795		error = nfssvc_nfsd();
796	} else if (uap->flag & NFSSVC_EXPORT) {
797		error = nfssvc_export(uap->argp);
798	} else {
799		error = EINVAL;
800	}
801	if (error == EINTR || error == ERESTART)
802		error = 0;
803	return (error);
804}
805
806/*
807 * Adds a socket to the list for servicing by nfsds.
808 */
809int
810nfssvc_addsock(socket_t so, mbuf_t mynam)
811{
812	struct nfsrv_sock *slp;
813	int error = 0, sodomain, sotype, soprotocol, on = 1;
814	int first;
815	struct timeval timeo;
816
817	/* make sure mbuf constants are set up */
818	if (!nfs_mbuf_mhlen)
819		nfs_mbuf_init();
820
821	sock_gettype(so, &sodomain, &sotype, &soprotocol);
822
823	/* There should be only one UDP socket for each of IPv4 and IPv6 */
824	if ((sodomain == AF_INET) && (soprotocol == IPPROTO_UDP) && nfsrv_udpsock) {
825		mbuf_freem(mynam);
826		return (EEXIST);
827	}
828	if ((sodomain == AF_INET6) && (soprotocol == IPPROTO_UDP) && nfsrv_udp6sock) {
829		mbuf_freem(mynam);
830		return (EEXIST);
831	}
832
833	/* Set protocol options and reserve some space (for UDP). */
834	if (sotype == SOCK_STREAM)
835		sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
836	if ((sodomain == AF_INET) && (soprotocol == IPPROTO_TCP))
837		sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
838	if (sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */
839		int reserve = NFS_UDPSOCKBUF;
840		error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve));
841		error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve));
842		if (error) {
843			log(LOG_INFO, "nfssvc_addsock: UDP socket buffer setting error(s) %d\n", error);
844			error = 0;
845		}
846	}
847	sock_nointerrupt(so, 0);
848
849	/*
850	 * Set socket send/receive timeouts.
851	 * Receive timeout shouldn't matter, but setting the send timeout
852	 * will make sure that an unresponsive client can't hang the server.
853	 */
854	timeo.tv_usec = 0;
855	timeo.tv_sec = 1;
856	error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
857	timeo.tv_sec = 30;
858	error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
859	if (error) {
860		log(LOG_INFO, "nfssvc_addsock: socket timeout setting error(s) %d\n", error);
861		error = 0;
862	}
863
864	MALLOC(slp, struct nfsrv_sock *, sizeof(struct nfsrv_sock), M_NFSSVC, M_WAITOK);
865	if (!slp) {
866		mbuf_freem(mynam);
867		return (ENOMEM);
868	}
869	bzero((caddr_t)slp, sizeof (struct nfsrv_sock));
870	lck_rw_init(&slp->ns_rwlock, nfsrv_slp_rwlock_group, LCK_ATTR_NULL);
871	lck_mtx_init(&slp->ns_wgmutex, nfsrv_slp_mutex_group, LCK_ATTR_NULL);
872
873	lck_mtx_lock(nfsd_mutex);
874
875	if (soprotocol == IPPROTO_UDP) {
876		if (sodomain == AF_INET) {
877			/* There should be only one UDP/IPv4 socket */
878			if (nfsrv_udpsock) {
879				lck_mtx_unlock(nfsd_mutex);
880				nfsrv_slpfree(slp);
881				mbuf_freem(mynam);
882				return (EEXIST);
883			}
884			nfsrv_udpsock = slp;
885		}
886		if (sodomain == AF_INET6) {
887			/* There should be only one UDP/IPv6 socket */
888			if (nfsrv_udp6sock) {
889				lck_mtx_unlock(nfsd_mutex);
890				nfsrv_slpfree(slp);
891				mbuf_freem(mynam);
892				return (EEXIST);
893			}
894			nfsrv_udp6sock = slp;
895		}
896	}
897
898	/* add the socket to the list */
899	first = TAILQ_EMPTY(&nfsrv_socklist);
900	TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain);
901
902	sock_retain(so); /* grab a retain count on the socket */
903	slp->ns_so = so;
904	slp->ns_sotype = sotype;
905	slp->ns_nam = mynam;
906
907	/* set up the socket up-call */
908	nfsrv_uc_addsock(slp, first);
909
910	/* mark that the socket is not in the nfsrv_sockwg list */
911	slp->ns_wgq.tqe_next = SLPNOLIST;
912
913	slp->ns_flag = SLP_VALID | SLP_NEEDQ;
914
915	nfsrv_wakenfsd(slp);
916	lck_mtx_unlock(nfsd_mutex);
917
918	return (0);
919}
920
/*
 * nfssvc_nfsd()
 *
 * Main loop of an nfsd service thread: registers the calling thread as an
 * nfsd, then repeatedly picks a server socket with pending work, pulls
 * requests off it, runs them, and sends the replies.
 *
 * Returns ENOMEM if the nfsd structure cannot be allocated; otherwise the
 * value of "error" when the loop is left (0 when an excess thread retires,
 * EINTR on server shutdown, or the error that interrupted a sleep or send).
 *
 * nfsd theory of operation:
 *
 * The first nfsd thread stays in user mode accepting new TCP connections
 * which are then added via the "addsock" call.  The rest of the nfsd threads
 * simply call into the kernel and remain there in a loop handling NFS
 * requests until killed by a signal.
 *
 * There's a list of nfsd threads (nfsd_head).
 * There's an nfsd queue that contains only those nfsds that are
 *   waiting for work to do (nfsd_queue).
 *
 * There's a list of all NFS sockets (nfsrv_socklist) and two queues for
 *   managing the work on the sockets:
 *   nfsrv_sockwait - sockets w/new data waiting to be worked on
 *   nfsrv_sockwork - sockets being worked on which may have more work to do
 *   nfsrv_sockwg -- sockets which have pending write gather data
 * When a socket receives data, if it is not currently queued, it
 *   will be placed at the end of the "wait" queue.
 * Whenever a socket needs servicing we make sure it is queued and
 *   wake up a waiting nfsd (if there is one).
 *
 * nfsds will service at most 8 requests from the same socket before
 *   defecting to work on another socket.
 * nfsds will defect immediately if there are any sockets in the "wait" queue
 * nfsds looking for a socket to work on check the "wait" queue first and
 *   then check the "work" queue.
 * When an nfsd starts working on a socket, it removes it from the head of
 *   the queue it's currently on and moves it to the end of the "work" queue.
 * When nfsds are checking the queues for work, any sockets found not to
 *   have any work are simply dropped from the queue.
 *
 */
int
nfssvc_nfsd(void)
{
	mbuf_t m, mrep;
	struct nfsrv_sock *slp;
	struct nfsd *nfsd;
	struct nfsrv_descript *nd = NULL;
	int error = 0, cacherep, writes_todo;
	int siz, procrastinate, opcnt = 0;
	u_quad_t cur_usec;
	struct timeval now;
	struct vfs_context context;
	struct timespec to;

#ifndef nolint
	cacherep = RC_DOIT;
	writes_todo = 0;
#endif

	/* Allocate and zero this thread's nfsd bookkeeping structure */
	MALLOC(nfsd, struct nfsd *, sizeof(struct nfsd), M_NFSD, M_WAITOK);
	if (!nfsd)
		return (ENOMEM);
	bzero(nfsd, sizeof(struct nfsd));
	lck_mtx_lock(nfsd_mutex);
	if (nfsd_thread_count++ == 0)
		nfsrv_initcache();		/* Init the server request cache */

	TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain);
	lck_mtx_unlock(nfsd_mutex);

	context.vc_thread = current_thread();

	/* Set time out so that nfsd threads can wake up and see if they are still needed. */
	to.tv_sec = 5;
	to.tv_nsec = 0;

	/*
	 * Loop getting rpc requests until SIGKILL.
	 */
	for (;;) {
		if (nfsd_thread_max <= 0) {
			/* NFS server shutting down, get out ASAP */
			error = EINTR;
			slp = nfsd->nfsd_slp;
		} else if (nfsd->nfsd_flag & NFSD_REQINPROG) {
			/* already have some work to do */
			error = 0;
			slp = nfsd->nfsd_slp;
		} else {
			/* need to find work to do */
			error = 0;
			lck_mtx_lock(nfsd_mutex);
			/* no socket assigned and both service queues empty: wait for work */
			while (!nfsd->nfsd_slp && TAILQ_EMPTY(&nfsrv_sockwait) && TAILQ_EMPTY(&nfsrv_sockwork)) {
				if (nfsd_thread_count > nfsd_thread_max) {
					/*
					 * If we have no socket and there are more
					 * nfsd threads than configured, let's exit.
					 */
					error = 0;
					goto done;
				}
				/* advertise ourselves on nfsd_queue, then sleep with the timeout set above */
				nfsd->nfsd_flag |= NFSD_WAITING;
				TAILQ_INSERT_HEAD(&nfsd_queue, nfsd, nfsd_queue);
				error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", &to);
				if (error) {
					/*
					 * Still flagged as waiting, so we are still on
					 * nfsd_queue (presumably a waker would have
					 * cleared the flag): take ourselves off.
					 */
					if (nfsd->nfsd_flag & NFSD_WAITING) {
						TAILQ_REMOVE(&nfsd_queue, nfsd, nfsd_queue);
						nfsd->nfsd_flag &= ~NFSD_WAITING;
					}
					/* EWOULDBLOCK (presumably the timeout expiring): re-check the loop conditions */
					if (error == EWOULDBLOCK)
						continue;
					/* any other error ends this thread; nfsd_mutex is held for "done" */
					goto done;
				}
			}
			slp = nfsd->nfsd_slp;
			if (!slp && !TAILQ_EMPTY(&nfsrv_sockwait)) {
				/* look for a socket to work on in the wait queue */
				while ((slp = TAILQ_FIRST(&nfsrv_sockwait))) {
					lck_rw_lock_exclusive(&slp->ns_rwlock);
					/* remove from the head of the queue */
					TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
					slp->ns_flag &= ~SLP_WAITQ;
					/* leave the loop with ns_rwlock still held on the chosen socket */
					if ((slp->ns_flag & SLP_VALID) && (slp->ns_flag & SLP_WORKTODO))
						break;
					/* nothing to do, so skip this socket */
					lck_rw_done(&slp->ns_rwlock);
				}
			}
			if (!slp && !TAILQ_EMPTY(&nfsrv_sockwork)) {
				/* look for a socket to work on in the work queue */
				while ((slp = TAILQ_FIRST(&nfsrv_sockwork))) {
					lck_rw_lock_exclusive(&slp->ns_rwlock);
					/* remove from the head of the queue */
					TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
					slp->ns_flag &= ~SLP_WORKQ;
					/* leave the loop with ns_rwlock still held on the chosen socket */
					if ((slp->ns_flag & SLP_VALID) && (slp->ns_flag & SLP_WORKTODO))
						break;
					/* nothing to do, so skip this socket */
					lck_rw_done(&slp->ns_rwlock);
				}
			}
			/*
			 * nfsd_slp still being NULL means slp came from one of the
			 * scans above (and is still rw-locked), not from a socket
			 * that was already assigned to this nfsd.
			 */
			if (!nfsd->nfsd_slp && slp) {
				/* we found a socket to work on, grab a reference */
				slp->ns_sref++;
				nfsd->nfsd_slp = slp;
				opcnt = 0;
				/* and put it at the back of the work queue */
				TAILQ_INSERT_TAIL(&nfsrv_sockwork, slp, ns_svcq);
				slp->ns_flag |= SLP_WORKQ;
				lck_rw_done(&slp->ns_rwlock);
			}
			lck_mtx_unlock(nfsd_mutex);
			/* every queued socket turned out to have no work: start over */
			if (!slp)
				continue;
			lck_rw_lock_exclusive(&slp->ns_rwlock);
			if (slp->ns_flag & SLP_VALID) {
				/* SLP_NEEDQ set without SLP_DISCONN: receive pending socket data first */
				if ((slp->ns_flag & (SLP_NEEDQ|SLP_DISCONN)) == SLP_NEEDQ) {
					slp->ns_flag &= ~SLP_NEEDQ;
					nfsrv_rcv_locked(slp->ns_so, slp, MBUF_WAITOK);
				}
				if (slp->ns_flag & SLP_DISCONN)
					nfsrv_zapsock(slp);
				/* ask for the next request; the descriptor comes back through nd */
				error = nfsrv_dorec(slp, nfsd, &nd);
				if (error == EINVAL) {	// RPCSEC_GSS drop
					if (slp->ns_sotype == SOCK_STREAM)
						nfsrv_zapsock(slp); // drop connection
				}
				writes_todo = 0;
				/*
				 * No request was obtained, but write-gather state is
				 * pending: if ns_wgtime has been reached, clear the
				 * error and service the gathered writes instead.
				 */
				if (error && (slp->ns_wgtime || (slp->ns_flag & SLP_DOWRITES))) {
					microuptime(&now);
					cur_usec = (u_quad_t)now.tv_sec * 1000000 +
						(u_quad_t)now.tv_usec;
					if (slp->ns_wgtime <= cur_usec) {
						error = 0;
						cacherep = RC_DOIT;
						writes_todo = 1;
					}
					slp->ns_flag &= ~SLP_DOWRITES;
				}
				nfsd->nfsd_flag |= NFSD_REQINPROG;
			}
			lck_rw_done(&slp->ns_rwlock);
		}
		/*
		 * Nothing to do, or the socket is no longer valid: discard any
		 * descriptor, give up the socket reference, and either look for
		 * more work or leave the loop if the server is shutting down.
		 */
		if (error || (slp && !(slp->ns_flag & SLP_VALID))) {
			if (nd) {
				nfsm_chain_cleanup(&nd->nd_nmreq);
				if (nd->nd_nam2)
					mbuf_freem(nd->nd_nam2);
				if (IS_VALID_CRED(nd->nd_cr))
					kauth_cred_unref(&nd->nd_cr);
				if (nd->nd_gss_context)
					nfs_gss_svc_ctx_deref(nd->nd_gss_context);
				FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
				nd = NULL;
			}
			nfsd->nfsd_slp = NULL;
			nfsd->nfsd_flag &= ~NFSD_REQINPROG;
			if (slp)
				nfsrv_slpderef(slp);
			if (nfsd_thread_max <= 0)
				break;
			continue;
		}
		if (nd) {
		    microuptime(&nd->nd_starttime);
		    /* reply address: the per-request name if there is one, else the socket's */
		    if (nd->nd_nam2)
			nd->nd_nam = nd->nd_nam2;
		    else
			nd->nd_nam = slp->ns_nam;

		    /* consult the request cache; the result selects the switch case below */
		    cacherep = nfsrv_getcache(nd, slp, &mrep);

		    if (nfsrv_require_resv_port) {
			/* Check if source port is a reserved port */
			in_port_t port = 0;
			struct sockaddr *saddr = mbuf_data(nd->nd_nam);

			if (saddr->sa_family == AF_INET)
				port = ntohs(((struct sockaddr_in*)saddr)->sin_port);
			else if (saddr->sa_family == AF_INET6)
				port = ntohs(((struct sockaddr_in6*)saddr)->sin6_port);
			/* non-reserved port and not a NULL procedure: answer with AUTH_TOOWEAK via NFSPROC_NOOP */
			if ((port >= IPPORT_RESERVED) && (nd->nd_procnum != NFSPROC_NULL)) {
			    nd->nd_procnum = NFSPROC_NOOP;
			    nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
			    cacherep = RC_DOIT;
			}
		    }

		}

		/*
		 * Loop to get all the write RPC replies that have been
		 * gathered together.
		 */
		do {
		    switch (cacherep) {
		    case RC_DOIT:
			if (nd && (nd->nd_vers == NFS_VER3))
			    procrastinate = nfsrv_wg_delay_v3;
			else
			    procrastinate = nfsrv_wg_delay;
			lck_rw_lock_shared(&nfsrv_export_rwlock);
			context.vc_ucred = NULL;
			/*
			 * NOTE(review): when writes_todo is set the || short-circuits
			 * before nd is dereferenced; nd looks like it can be NULL on
			 * that path - confirm.
			 */
			if (writes_todo || ((nd->nd_procnum == NFSPROC_WRITE) && (procrastinate > 0)))
				error = nfsrv_writegather(&nd, slp, &context, &mrep);
			else
				error = (*(nfsrv_procs[nd->nd_procnum]))(nd, slp, &context, &mrep);
			lck_rw_done(&nfsrv_export_rwlock);
			if (mrep == NULL) {
				/*
				 * If this is a stream socket and we are not going
				 * to send a reply we better close the connection
				 * so the client doesn't hang.
				 */
				if (error && slp->ns_sotype == SOCK_STREAM) {
					lck_rw_lock_exclusive(&slp->ns_rwlock);
					nfsrv_zapsock(slp);
					lck_rw_done(&slp->ns_rwlock);
					printf("NFS server: NULL reply from proc = %d error = %d\n",
						nd->nd_procnum, error);
				}
				break;

			}
			if (error) {
				/* count the failure and update the cache entry with FALSE; no reply is sent */
				OSAddAtomic64(1, &nfsstats.srv_errs);
				nfsrv_updatecache(nd, FALSE, mrep);
				if (nd->nd_nam2) {
					mbuf_freem(nd->nd_nam2);
					nd->nd_nam2 = NULL;
				}
				break;
			}
			OSAddAtomic64(1, &nfsstats.srvrpccnt[nd->nd_procnum]);
			nfsrv_updatecache(nd, TRUE, mrep);
			/* FALLTHRU */

		    case RC_REPLY:
			if (nd->nd_gss_mb != NULL) {	// It's RPCSEC_GSS
				/*
				 * Need to checksum or encrypt the reply
				 */
				error = nfs_gss_svc_protect_reply(nd, mrep);
				if (error) {
				    	mbuf_freem(mrep);
					break;
				}
			}

			/*
			 * Get the total size of the reply
			 */
			m = mrep;
			siz = 0;
			while (m) {
				siz += mbuf_len(m);
				m = mbuf_next(m);
			}
			if (siz <= 0 || siz > NFS_MAXPACKET) {
				printf("mbuf siz=%d\n",siz);
				panic("Bad nfs svc reply");
			}
			m = mrep;
			mbuf_pkthdr_setlen(m, siz);
			error = mbuf_pkthdr_setrcvif(m, NULL);
			if (error)
				panic("nfsd setrcvif failed: %d", error);
			/*
			 * For stream protocols, prepend a Sun RPC
			 * Record Mark.
			 */
			if (slp->ns_sotype == SOCK_STREAM) {
				error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
				/* mark word: high bit set, reply length in the remaining bits */
				if (!error)
					*(u_int32_t*)mbuf_data(m) = htonl(0x80000000 | siz);
			}
			/* on every path below the reply chain is either handed to nfsrv_send() or freed here */
			if (!error) {
				if (slp->ns_flag & SLP_VALID) {
				    error = nfsrv_send(slp, nd->nd_nam2, m);
				} else {
				    error = EPIPE;
				    mbuf_freem(m);
				}
			} else {
				mbuf_freem(m);
			}
			mrep = NULL;
			if (nd->nd_nam2) {
				mbuf_freem(nd->nd_nam2);
				nd->nd_nam2 = NULL;
			}
			if (error == EPIPE) {
				lck_rw_lock_exclusive(&slp->ns_rwlock);
				nfsrv_zapsock(slp);
				lck_rw_done(&slp->ns_rwlock);
			}
			/*
			 * Interrupted while sending: release the descriptor and the
			 * socket reference, then exit the thread.  "done" expects
			 * nfsd_mutex to be held, so take it before jumping.
			 */
			if (error == EINTR || error == ERESTART) {
				nfsm_chain_cleanup(&nd->nd_nmreq);
				if (IS_VALID_CRED(nd->nd_cr))
					kauth_cred_unref(&nd->nd_cr);
				if (nd->nd_gss_context)
					nfs_gss_svc_ctx_deref(nd->nd_gss_context);
				FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
				nfsrv_slpderef(slp);
				lck_mtx_lock(nfsd_mutex);
				goto done;
			}
			break;
		    case RC_DROPIT:
			/* no reply for this request; just release the per-request name */
			mbuf_freem(nd->nd_nam2);
			nd->nd_nam2 = NULL;
			break;
		    };
		    opcnt++;
		    /* this request is finished: release everything hanging off the descriptor */
		    if (nd) {
			nfsm_chain_cleanup(&nd->nd_nmreq);
			if (nd->nd_nam2)
				mbuf_freem(nd->nd_nam2);
			if (IS_VALID_CRED(nd->nd_cr))
				kauth_cred_unref(&nd->nd_cr);
			if (nd->nd_gss_context)
				nfs_gss_svc_ctx_deref(nd->nd_gss_context);
			FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC);
			nd = NULL;
		    }

		    /*
		     * Check to see if there are outstanding writes that
		     * need to be serviced.
		     */
		    writes_todo = 0;
		    if (slp->ns_wgtime) {
			microuptime(&now);
			cur_usec = (u_quad_t)now.tv_sec * 1000000 +
				(u_quad_t)now.tv_usec;
			if (slp->ns_wgtime <= cur_usec) {
			    cacherep = RC_DOIT;
			    writes_todo = 1;
			}
		    }
		} while (writes_todo);

		nd = NULL;
		/*
		 * Stay on this socket only while no other socket is waiting
		 * and fewer than 8 operations have been done on it in a row.
		 */
		if (TAILQ_EMPTY(&nfsrv_sockwait) && (opcnt < 8)) {
			lck_rw_lock_exclusive(&slp->ns_rwlock);
			error = nfsrv_dorec(slp, nfsd, &nd);
			if (error == EINVAL) {	// RPCSEC_GSS drop
				if (slp->ns_sotype == SOCK_STREAM)
					nfsrv_zapsock(slp); // drop connection
			}
			lck_rw_done(&slp->ns_rwlock);
		}
		if (!nd) {
			/* drop our reference on the socket */
			nfsd->nfsd_flag &= ~NFSD_REQINPROG;
			nfsd->nfsd_slp = NULL;
			nfsrv_slpderef(slp);
		}
	}
	lck_mtx_lock(nfsd_mutex);
done:
	/* nfsd_mutex is held on every path that reaches this label */
	TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain);
	FREE(nfsd, M_NFSD);
	/* the last nfsd thread to leave tears down the server state */
	if (--nfsd_thread_count == 0)
		nfsrv_cleanup();
	lck_mtx_unlock(nfsd_mutex);
	return (error);
}
1324
1325int
1326nfssvc_export(user_addr_t argp)
1327{
1328	int error = 0, is_64bit;
1329	struct user_nfs_export_args unxa;
1330	vfs_context_t ctx = vfs_context_current();
1331
1332	is_64bit = IS_64BIT_PROCESS(vfs_context_proc(ctx));
1333
1334	/* copy in pointers to path and export args */
1335	if (is_64bit) {
1336		error = copyin(argp, (caddr_t)&unxa, sizeof(unxa));
1337	} else {
1338		struct nfs_export_args tnxa;
1339		error = copyin(argp, (caddr_t)&tnxa, sizeof(tnxa));
1340		if (error == 0) {
1341			/* munge into LP64 version of nfs_export_args structure */
1342			unxa.nxa_fsid = tnxa.nxa_fsid;
1343			unxa.nxa_expid = tnxa.nxa_expid;
1344			unxa.nxa_fspath = CAST_USER_ADDR_T(tnxa.nxa_fspath);
1345			unxa.nxa_exppath = CAST_USER_ADDR_T(tnxa.nxa_exppath);
1346			unxa.nxa_flags = tnxa.nxa_flags;
1347			unxa.nxa_netcount = tnxa.nxa_netcount;
1348			unxa.nxa_nets = CAST_USER_ADDR_T(tnxa.nxa_nets);
1349		}
1350	}
1351	if (error)
1352		return (error);
1353
1354	error = nfsrv_export(&unxa, ctx);
1355
1356	return (error);
1357}
1358
1359/*
1360 * Shut down a socket associated with an nfsrv_sock structure.
1361 * Should be called with the send lock set, if required.
1362 * The trick here is to increment the sref at the start, so that the nfsds
1363 * will stop using it and clear ns_flag at the end so that it will not be
1364 * reassigned during cleanup.
1365 */
1366void
1367nfsrv_zapsock(struct nfsrv_sock *slp)
1368{
1369	socket_t so;
1370
1371	if ((slp->ns_flag & SLP_VALID) == 0)
1372		return;
1373	slp->ns_flag &= ~SLP_ALLFLAGS;
1374
1375	so = slp->ns_so;
1376	if (so == NULL)
1377		return;
1378
1379	/*
1380	 * Attempt to deter future up-calls, but leave the
1381	 * up-call info in place to avoid a race with the
1382	 * networking code.
1383	 */
1384	socket_lock(so, 1);
1385	so->so_rcv.sb_flags &= ~SB_UPCALL;
1386	socket_unlock(so, 1);
1387
1388	sock_shutdown(so, SHUT_RDWR);
1389
1390	/*
1391	 * Remove from the up-call queue
1392	 */
1393	nfsrv_uc_dequeue(slp);
1394}
1395
1396/*
1397 * cleanup and release a server socket structure.
1398 */
1399void
1400nfsrv_slpfree(struct nfsrv_sock *slp)
1401{
1402	struct nfsrv_descript *nwp, *nnwp;
1403
1404	if (slp->ns_so) {
1405		sock_release(slp->ns_so);
1406		slp->ns_so = NULL;
1407	}
1408	if (slp->ns_nam)
1409		mbuf_free(slp->ns_nam);
1410	if (slp->ns_raw)
1411		mbuf_freem(slp->ns_raw);
1412	if (slp->ns_rec)
1413		mbuf_freem(slp->ns_rec);
1414	if (slp->ns_frag)
1415		mbuf_freem(slp->ns_frag);
1416	slp->ns_nam = slp->ns_raw = slp->ns_rec = slp->ns_frag = NULL;
1417	slp->ns_reccnt = 0;
1418
1419	if (slp->ns_ua)
1420		FREE(slp->ns_ua, M_NFSSVC);
1421
1422	for (nwp = slp->ns_tq.lh_first; nwp; nwp = nnwp) {
1423		nnwp = nwp->nd_tq.le_next;
1424		LIST_REMOVE(nwp, nd_tq);
1425		nfsm_chain_cleanup(&nwp->nd_nmreq);
1426		if (nwp->nd_mrep)
1427			mbuf_freem(nwp->nd_mrep);
1428		if (nwp->nd_nam2)
1429			mbuf_freem(nwp->nd_nam2);
1430		if (IS_VALID_CRED(nwp->nd_cr))
1431			kauth_cred_unref(&nwp->nd_cr);
1432		if (nwp->nd_gss_context)
1433			nfs_gss_svc_ctx_deref(nwp->nd_gss_context);
1434		FREE_ZONE(nwp, sizeof(*nwp), M_NFSRVDESC);
1435	}
1436	LIST_INIT(&slp->ns_tq);
1437
1438	lck_rw_destroy(&slp->ns_rwlock, nfsrv_slp_rwlock_group);
1439	lck_mtx_destroy(&slp->ns_wgmutex, nfsrv_slp_mutex_group);
1440	FREE(slp, M_NFSSVC);
1441}
1442
1443/*
1444 * Derefence a server socket structure. If it has no more references and
1445 * is no longer valid, you can throw it away.
1446 */
1447void
1448nfsrv_slpderef(struct nfsrv_sock *slp)
1449{
1450	struct timeval now;
1451
1452	lck_mtx_lock(nfsd_mutex);
1453	lck_rw_lock_exclusive(&slp->ns_rwlock);
1454	slp->ns_sref--;
1455
1456	if (slp->ns_sref || (slp->ns_flag & SLP_VALID)) {
1457		if ((slp->ns_flag & SLP_QUEUED) && !(slp->ns_flag & SLP_WORKTODO)) {
1458			/* remove socket from queue since there's no work */
1459			if (slp->ns_flag & SLP_WAITQ)
1460				TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
1461			else
1462				TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
1463			slp->ns_flag &= ~SLP_QUEUED;
1464		}
1465		lck_rw_done(&slp->ns_rwlock);
1466		lck_mtx_unlock(nfsd_mutex);
1467		return;
1468	}
1469
1470	/* This socket is no longer valid, so we'll get rid of it */
1471
1472	if (slp->ns_flag & SLP_QUEUED) {
1473		if (slp->ns_flag & SLP_WAITQ)
1474			TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
1475		else
1476			TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
1477		slp->ns_flag &= ~SLP_QUEUED;
1478	}
1479
1480	/*
1481	 * Queue the socket up for deletion
1482	 * and start the timer to delete it
1483	 * after it has been in limbo for
1484	 * a while.
1485	 */
1486	microuptime(&now);
1487	slp->ns_timestamp = now.tv_sec;
1488	TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain);
1489	TAILQ_INSERT_TAIL(&nfsrv_deadsocklist, slp, ns_chain);
1490	if (!nfsrv_deadsock_timer_on) {
1491		nfsrv_deadsock_timer_on = 1;
1492		nfs_interval_timer_start(nfsrv_deadsock_timer_call,
1493				NFSRV_DEADSOCKDELAY * 1000);
1494	}
1495
1496	lck_rw_done(&slp->ns_rwlock);
1497	/* now remove from the write gather socket list */
1498	if (slp->ns_wgq.tqe_next != SLPNOLIST) {
1499		TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq);
1500		slp->ns_wgq.tqe_next = SLPNOLIST;
1501	}
1502	lck_mtx_unlock(nfsd_mutex);
1503}
1504
1505/*
1506 * Check periodically for dead sockets pending delete.
1507 * If a socket has been dead for more than NFSRV_DEADSOCKDELAY
1508 * seconds then we assume it's safe to free.
1509 */
1510void
1511nfsrv_deadsock_timer(__unused void *param0, __unused void *param1)
1512{
1513	struct nfsrv_sock *slp;
1514	struct timeval now;
1515	time_t time_to_wait;
1516
1517	microuptime(&now);
1518	lck_mtx_lock(nfsd_mutex);
1519
1520	while ((slp = TAILQ_FIRST(&nfsrv_deadsocklist))) {
1521		if ((slp->ns_timestamp + NFSRV_DEADSOCKDELAY) > now.tv_sec)
1522			break;
1523		TAILQ_REMOVE(&nfsrv_deadsocklist, slp, ns_chain);
1524		nfsrv_slpfree(slp);
1525	}
1526	if (TAILQ_EMPTY(&nfsrv_deadsocklist)) {
1527		nfsrv_deadsock_timer_on = 0;
1528		lck_mtx_unlock(nfsd_mutex);
1529		return;
1530	}
1531	time_to_wait = (slp->ns_timestamp + NFSRV_DEADSOCKDELAY) - now.tv_sec;
1532	if (time_to_wait < 1)
1533		time_to_wait = 1;
1534
1535	lck_mtx_unlock(nfsd_mutex);
1536
1537	nfs_interval_timer_start(nfsrv_deadsock_timer_call,
1538		time_to_wait * 1000);
1539}
1540
1541/*
1542 * Clean up the data structures for the server.
1543 */
1544void
1545nfsrv_cleanup(void)
1546{
1547	struct nfsrv_sock *slp, *nslp;
1548	struct timeval now;
1549#if CONFIG_FSE
1550	struct nfsrv_fmod *fp, *nfp;
1551	int i;
1552#endif
1553
1554	microuptime(&now);
1555	for (slp = TAILQ_FIRST(&nfsrv_socklist); slp != 0; slp = nslp) {
1556		nslp = TAILQ_NEXT(slp, ns_chain);
1557		if (slp->ns_flag & SLP_VALID) {
1558			lck_rw_lock_exclusive(&slp->ns_rwlock);
1559			nfsrv_zapsock(slp);
1560			lck_rw_done(&slp->ns_rwlock);
1561		}
1562		if (slp->ns_flag & SLP_QUEUED) {
1563			if (slp->ns_flag & SLP_WAITQ)
1564				TAILQ_REMOVE(&nfsrv_sockwait, slp, ns_svcq);
1565			else
1566				TAILQ_REMOVE(&nfsrv_sockwork, slp, ns_svcq);
1567			slp->ns_flag &= ~SLP_QUEUED;
1568		}
1569		if (slp->ns_wgq.tqe_next != SLPNOLIST) {
1570			TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq);
1571			slp->ns_wgq.tqe_next = SLPNOLIST;
1572		}
1573		/* queue the socket up for deletion */
1574		slp->ns_timestamp = now.tv_sec;
1575		TAILQ_REMOVE(&nfsrv_socklist, slp, ns_chain);
1576		TAILQ_INSERT_TAIL(&nfsrv_deadsocklist, slp, ns_chain);
1577		if (!nfsrv_deadsock_timer_on) {
1578			nfsrv_deadsock_timer_on = 1;
1579			nfs_interval_timer_start(nfsrv_deadsock_timer_call,
1580				NFSRV_DEADSOCKDELAY * 1000);
1581		}
1582	}
1583
1584#if CONFIG_FSE
1585	/*
1586	 * Flush pending file write fsevents
1587	 */
1588	lck_mtx_lock(nfsrv_fmod_mutex);
1589	for (i = 0; i < NFSRVFMODHASHSZ; i++) {
1590		for (fp = LIST_FIRST(&nfsrv_fmod_hashtbl[i]); fp; fp = nfp) {
1591			/*
1592			 * Fire off the content modified fsevent for each
1593			 * entry, remove it from the list, and free it.
1594			 */
1595			if (nfsrv_fsevents_enabled) {
1596				fp->fm_context.vc_thread = current_thread();
1597				add_fsevent(FSE_CONTENT_MODIFIED, &fp->fm_context,
1598						FSE_ARG_VNODE, fp->fm_vp,
1599						FSE_ARG_DONE);
1600			}
1601			vnode_put(fp->fm_vp);
1602			kauth_cred_unref(&fp->fm_context.vc_ucred);
1603			nfp = LIST_NEXT(fp, fm_link);
1604			LIST_REMOVE(fp, fm_link);
1605			FREE(fp, M_TEMP);
1606		}
1607	}
1608	nfsrv_fmod_pending = 0;
1609	lck_mtx_unlock(nfsrv_fmod_mutex);
1610#endif
1611
1612	nfsrv_uc_cleanup();     /* Stop nfs socket up-call threads */
1613
1614	nfs_gss_svc_cleanup();	/* Remove any RPCSEC_GSS contexts */
1615
1616	nfsrv_cleancache();	/* And clear out server cache */
1617
1618	nfsrv_udpsock = NULL;
1619	nfsrv_udp6sock = NULL;
1620}
1621
1622#endif /* NFS_NOSERVER */
1623