/*
 * Copyright (c) 2002-2007 Apple Inc.  All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*-
 * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Berkeley Software Design Inc's name may not be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>		/* for hz */
#include <sys/file_internal.h>
#include <sys/malloc.h>
#include <sys/lockf.h>		/* Must come after sys/malloc.h */
#include <sys/kpi_mbuf.h>
#include <sys/mount_internal.h>
#include <sys/proc_internal.h>	/* for p_start */
#include <sys/kauth.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode_internal.h>

#include <kern/thread.h>
#include <kern/host.h>

#include <machine/limits.h>

#include <net/if.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_lock.h>

#include <mach/host_priv.h>
#include <mach/mig_errors.h>
#include <mach/host_special_ports.h>
#include <lockd/lockd_mach.h>

extern void ipc_port_release_send(ipc_port_t);

#define OFF_MAX QUAD_MAX

/*
 * Pending lock request messages are kept in this queue, which is
 * sorted by transaction ID (xid).
 */
static uint64_t nfs_lockxid = 0;
static LOCKD_MSG_QUEUE nfs_pendlockq;

/*
 * This structure is used to identify processes which have acquired NFS locks.
 * Knowing which processes have ever acquired locks allows us to short-circuit
 * unlock requests for processes that have never had an NFS file lock,
 * avoiding a costly and unnecessary lockd request.
 */
struct nfs_lock_pid {
	TAILQ_ENTRY(nfs_lock_pid)	lp_lru;		/* LRU list */
	LIST_ENTRY(nfs_lock_pid)	lp_hash;	/* hash chain */
	int				lp_valid;	/* valid entry? */
	int				lp_time;	/* last time seen valid */
	pid_t				lp_pid;		/* The process ID. */
	struct timeval			lp_pid_start;	/* Start time of process id */
};
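
/*
 * Example: pids are recycled, so a pid alone is not a stable identity.
 * Pairing lp_pid with lp_pid_start disambiguates: if pid 1234 acquired
 * an NFS lock and exited, a new process reusing pid 1234 will have a
 * different p_start, so the timevalcmp() check in nfs_lock_pid_check()
 * fails and the stale entry is invalidated rather than trusted.
 */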

#define NFS_LOCK_PID_HASH_SIZE		64	// XXX tune me
#define	NFS_LOCK_PID_HASH(pid)	\
	(&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash])
static LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl;
static TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru;
static u_long nfs_lock_pid_hash, nfs_lock_pid_hash_trusted;

static lck_grp_t *nfs_lock_lck_grp;
static lck_mtx_t *nfs_lock_mutex;


/*
 * initialize global nfs lock state
 */
void
nfs_lockinit(void)
{
	TAILQ_INIT(&nfs_pendlockq);
	nfs_lock_pid_hash_trusted = 1;
	nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE,
					 M_TEMP, &nfs_lock_pid_hash);
	TAILQ_INIT(&nfs_lock_pid_lru);

	nfs_lock_lck_grp = lck_grp_alloc_init("nfs_lock", LCK_GRP_ATTR_NULL);
	nfs_lock_mutex = lck_mtx_alloc_init(nfs_lock_lck_grp, LCK_ATTR_NULL);
}

/*
 * change the count of NFS mounts that may need to make lockd requests
 *
 * If the mount count drops to zero, then send a shutdown request to
 * lockd if we've sent any requests to it.
 */
void
nfs_lockd_mount_change(int i)
{
	mach_port_t lockd_port = IPC_PORT_NULL;
	kern_return_t kr;
	int send_shutdown;

	lck_mtx_lock(nfs_lock_mutex);

	nfs_lockd_mounts += i;

	/* send a shutdown request if there are no more lockd mounts */
	send_shutdown = ((nfs_lockd_mounts == 0) && nfs_lockd_request_sent);
	if (send_shutdown)
		nfs_lockd_request_sent = 0;

	lck_mtx_unlock(nfs_lock_mutex);

	if (!send_shutdown)
		return;

	/*
	 * Let lockd know that it is no longer needed for any NFS mounts.
	 */
	kr = host_get_lockd_port(host_priv_self(), &lockd_port);
	if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(lockd_port)) {
		printf("nfs_lockd_mount_change: shutdown couldn't get port, kr %d, port %s\n",
			kr, (lockd_port == IPC_PORT_NULL) ? "NULL" :
			(lockd_port == IPC_PORT_DEAD) ? "DEAD" : "VALID");
		return;
	}

	kr = lockd_shutdown(lockd_port);
	if (kr != KERN_SUCCESS)
		printf("nfs_lockd_mount_change: shutdown %d\n", kr);

	ipc_port_release_send(lockd_port);
}

/*
 * insert a lock request message into the pending queue
 * (nfs_lock_mutex must be held)
 */
static inline void
nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq)
{
	LOCKD_MSG_REQUEST *mr;

	mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue);
	if (!mr || (msgreq->lmr_msg.lm_xid > mr->lmr_msg.lm_xid)) {
		/* fast path: empty queue or new largest xid */
		TAILQ_INSERT_TAIL(&nfs_pendlockq, msgreq, lmr_next);
		return;
	}
	/* slow path: walk backward from the tail to find insertion point */
	while (mr && (msgreq->lmr_msg.lm_xid < mr->lmr_msg.lm_xid)) {
		mr = TAILQ_PREV(mr, nfs_lock_msg_queue, lmr_next);
	}
	if (mr) {
		TAILQ_INSERT_AFTER(&nfs_pendlockq, mr, msgreq, lmr_next);
	} else {
		TAILQ_INSERT_HEAD(&nfs_pendlockq, msgreq, lmr_next);
	}
}
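
/*
 * Example: the queue keeps xids in increasing order from head to tail,
 * e.g. [5, 7, 9].  Enqueueing xid 10 takes the fast path (new tail),
 * while enqueueing xid 8 walks backward from the tail past 9 and is
 * inserted after 7, preserving the sort that nfs_lockdmsg_find_by_xid()
 * relies on for its early exit.
 */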

/*
 * remove a lock request message from the pending queue
 * (nfs_lock_mutex must be held)
 */
static inline void
nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq)
{
	TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next);
}

/*
 * find a pending lock request message by xid
 *
 * We search from the head of the list assuming that the message we're
 * looking for is for an older request (because we have an answer to it).
 * This assumes that lock requests will be answered primarily in FIFO order.
 * However, this may not be the case if there are blocked requests.  We may
 * want to move blocked requests to a separate queue (but that'll complicate
 * duplicate xid checking).
 *
 * (nfs_lock_mutex must be held)
 */
static inline LOCKD_MSG_REQUEST *
nfs_lockdmsg_find_by_xid(uint64_t lockxid)
{
	LOCKD_MSG_REQUEST *mr;

	TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
		if (mr->lmr_msg.lm_xid == lockxid)
			return mr;
		if (mr->lmr_msg.lm_xid > lockxid)
			return NULL;
	}
	return mr;	/* NULL: ran off the end of the queue */
}

/*
 * Because we can't depend on nlm_granted messages containing the same
 * cookie we sent with the original lock request, we need code to test
 * whether an nlm_granted answer matches a lock request.  We also need
 * code that can find a lockd message based solely on the nlm_granted
 * answer.
 */

/*
 * compare lockd message to answer
 *
 * returns 0 on equality and 1 if different
 */
static inline int
nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp)
{
	if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
		return 1;
	if (msgreq->lmr_msg.lm_fl.l_pid != ansp->la_pid)
		return 1;
	if (msgreq->lmr_msg.lm_fl.l_start != ansp->la_start)
		return 1;
	if (msgreq->lmr_msg.lm_fl.l_len != ansp->la_len)
		return 1;
	if (msgreq->lmr_msg.lm_fh_len != ansp->la_fh_len)
		return 1;
	if (bcmp(msgreq->lmr_msg.lm_fh, ansp->la_fh, ansp->la_fh_len))
		return 1;
	return 0;
}

/*
 * find a pending lock request message based on the lock info provided
 * in the lockd_ans/nlm_granted data.  We need this because we can't
 * depend on nlm_granted messages containing the same cookie we sent
 * with the original lock request.
 *
 * We search from the head of the list assuming that the message we're
 * looking for is for an older request (because we have an answer to it).
 * This assumes that lock requests will be answered primarily in FIFO order.
 * However, this may not be the case if there are blocked requests.  We may
 * want to move blocked requests to a separate queue (but that'll complicate
 * duplicate xid checking).
 *
 * (nfs_lock_mutex must be held)
 */
static inline LOCKD_MSG_REQUEST *
nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp)
{
	LOCKD_MSG_REQUEST *mr;

	if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO))
		return NULL;
	TAILQ_FOREACH(mr, &nfs_pendlockq, lmr_next) {
		if (!nfs_lockdmsg_compare_to_answer(mr, ansp))
			break;
	}
	return mr;
}

/*
 * return the next unique lock request transaction ID
 * (nfs_lock_mutex must be held)
 */
static inline uint64_t
nfs_lockxid_get(void)
{
	LOCKD_MSG_REQUEST *mr;

	/* derive initial lock xid from system time */
	if (!nfs_lockxid) {
		/*
		 * Note: it's OK if this code inits nfs_lockxid to 0 (for example,
		 * due to a broken clock) because we immediately increment it
		 * and we guarantee to never use xid 0.  So, nfs_lockxid should only
		 * ever be 0 the first time this function is called.
		 */
		struct timeval tv;
		microtime(&tv);
		nfs_lockxid = (uint64_t)tv.tv_sec << 12;
	}
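
	/*
	 * Example: with tv.tv_sec == 1000000000 the initial xid is
	 * 1000000000 << 12 == 4096000000000.  The low 12 bits start at
	 * zero, so about 4096 xids can be handed out before the counter
	 * would overlap the seed chosen by a reboot one second later.
	 */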

	/* make sure we get a unique xid */
	do {
		/* Skip zero xid if it should ever happen.  */
		if (++nfs_lockxid == 0)
			nfs_lockxid++;
		if (!(mr = TAILQ_LAST(&nfs_pendlockq, nfs_lock_msg_queue)) ||
		     (mr->lmr_msg.lm_xid < nfs_lockxid)) {
			/* fast path: empty queue or new largest xid */
			break;
		}
		/* check if xid is already in use */
	} while (nfs_lockdmsg_find_by_xid(nfs_lockxid));

	return nfs_lockxid;
}


/*
 * Check the nfs_lock_pid hash table for an entry and, if requested,
 * add the entry if it is not found.
 *
 * (Also, if adding, try to clean up some stale entries.)
 * (nfs_lock_mutex must be held)
 */
static int
nfs_lock_pid_check(proc_t p, int addflag)
{
	struct nfs_lock_pid *lp, *lplru, *lplru_next, *mlp;
	TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_free;
	proc_t plru = PROC_NULL;
	pid_t pid;
	int error = 0;
	struct timeval now;

	TAILQ_INIT(&nfs_lock_pid_free);
	mlp = NULL;

loop:
	/* Search hash chain */
	pid = proc_pid(p);
	error = ENOENT;
	lp = NFS_LOCK_PID_HASH(pid)->lh_first;
	for (; lp != NULL; lp = lp->lp_hash.le_next)
		if (lp->lp_pid == pid) {
			/* found pid... */
			if (timevalcmp(&lp->lp_pid_start, &p->p_start, ==)) {
				/* ...and it's valid */
				/* move to tail of LRU */
				TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
				microuptime(&now);
				lp->lp_time = now.tv_sec;
				TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
				error = 0;
				break;
			}
			/* ...but it's no longer valid */
			/* remove from hash, invalidate, and move to lru head */
			LIST_REMOVE(lp, lp_hash);
			lp->lp_valid = 0;
			TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru);
			TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru);
			lp = NULL;
			break;
		}

	/* if we didn't find it (valid), use any newly allocated one */
	if (!lp)
		lp = mlp;

	/* if we don't have an lp and we've been asked to add it */
	if ((error == ENOENT) && addflag && !lp) {
		/* scan lru list for invalid, stale entries to reuse/free */
		int lrucnt = 0;
		microuptime(&now);
		for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) {
			lplru_next = TAILQ_NEXT(lplru, lp_lru);
			if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) {
				/*
				 * If the oldest LRU entry is relatively new, then don't
				 * bother scanning any further.
				 */
				break;
			}
			/* remove entry from LRU, and check if it's still in use */
			TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru);
			if (!lplru->lp_valid || !(plru = proc_find(lplru->lp_pid)) ||
			    timevalcmp(&lplru->lp_pid_start, &plru->p_start, !=)) {
				if (plru != PROC_NULL) {
					proc_rele(plru);
					plru = PROC_NULL;
				}
				/* no longer in use */
				LIST_REMOVE(lplru, lp_hash);
				if (!lp) {
					/* we'll reuse this one */
					lp = lplru;
				} else {
					/* queue it up for freeing */
					TAILQ_INSERT_HEAD(&nfs_lock_pid_free, lplru, lp_lru);
				}
			} else {
				/* still in use */
				if (plru != PROC_NULL) {
					proc_rele(plru);
					plru = PROC_NULL;
				}
				lplru->lp_time = now.tv_sec;
				TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru);
			}
			/* don't check too many entries at once */
			if (++lrucnt > 8)
				break;
		}
		if (!lp) {
			/* we need to allocate a new one */
			lck_mtx_unlock(nfs_lock_mutex);
			MALLOC(mlp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid),
				M_TEMP, M_WAITOK | M_ZERO);
			lck_mtx_lock(nfs_lock_mutex);
			if (mlp) /* make sure somebody hasn't already added this guy */
				goto loop;
			error = ENOMEM;
		}
	}
	if ((error == ENOENT) && addflag && lp) {
		/* (re)initialize nfs_lock_pid info */
		lp->lp_pid = pid;
		lp->lp_pid_start = p->p_start;
		/* insert pid in hash */
		LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash);
		lp->lp_valid = 1;
		lp->lp_time = now.tv_sec;
		TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru);
		error = 0;
	}

	if ((mlp && (lp != mlp)) || TAILQ_FIRST(&nfs_lock_pid_free)) {
		lck_mtx_unlock(nfs_lock_mutex);
		if (mlp && (lp != mlp)) {
			/* we didn't need this one, so we can free it */
			FREE(mlp, M_TEMP);
		}
		/* free up any stale entries */
		while ((lp = TAILQ_FIRST(&nfs_lock_pid_free))) {
			TAILQ_REMOVE(&nfs_lock_pid_free, lp, lp_lru);
			FREE(lp, M_TEMP);
		}
		lck_mtx_lock(nfs_lock_mutex);
	}

	return (error);
}
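
/*
 * Usage sketch (mirrors the calls in nfs3_vnop_advlock below): with
 * nfs_lock_mutex held, a probe passes addflag == 0 and sees 0 if this
 * process has locked before, or ENOENT if it hasn't; after a successful
 * lock the pid is registered with addflag == 1 so later unlocks can't
 * be short-circuited:
 *
 *	lck_mtx_lock(nfs_lock_mutex);
 *	error = nfs_lock_pid_check(p, 0);	(probe only, don't add)
 *	lck_mtx_unlock(nfs_lock_mutex);
 */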

#define MACH_MAX_TRIES 3

static int
send_request(LOCKD_MSG *msg, int interruptable)
{
	kern_return_t kr;
	int retries = 0;
	mach_port_t lockd_port = IPC_PORT_NULL;

	kr = host_get_lockd_port(host_priv_self(), &lockd_port);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(lockd_port))
		return (ENOTSUP);

	do {
		/* In the kernel all Mach messaging is interruptible */
		do {
			kr = lockd_request(
				lockd_port,
				msg->lm_version,
				msg->lm_flags,
				msg->lm_xid,
				msg->lm_fl.l_start,
				msg->lm_fl.l_len,
				msg->lm_fl.l_pid,
				msg->lm_fl.l_type,
				msg->lm_fl.l_whence,
				(uint32_t *)&msg->lm_addr,
				(uint32_t *)&msg->lm_cred,
				msg->lm_fh_len,
				msg->lm_fh);
			if (kr != KERN_SUCCESS)
				printf("lockd_request received %d!\n", kr);
		} while (!interruptable && kr == MACH_SEND_INTERRUPTED);
	} while (kr == MIG_SERVER_DIED && retries++ < MACH_MAX_TRIES);

	ipc_port_release_send(lockd_port);
	switch (kr) {
	case KERN_SUCCESS:
		return (0);
	case MACH_SEND_INTERRUPTED:
		return (EINTR);
	default:
		/*
		 * Other MACH or MIG errors we will retry. Eventually
		 * we will call nfs_down and allow the user to disable
		 * locking.
		 */
		return (EAGAIN);
	}
	/* NOTREACHED */
	return (kr);
}
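
/*
 * Example of send_request()'s retry behavior: MACH_SEND_INTERRUPTED
 * results are resent immediately unless the mount is interruptible
 * (in which case the caller sees EINTR), and MIG_SERVER_DIED (lockd
 * crashed or was restarted) is retried up to MACH_MAX_TRIES (3) times.
 * Any other failure surfaces as EAGAIN, which the caller treats as
 * "keep waiting for an answer" in its own timeout loop.
 */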


/*
 * NFS advisory byte-level locks (client)
 */
int
nfs3_vnop_advlock(
	struct vnop_advlock_args /* {
		struct vnodeop_desc *a_desc;
		vnode_t a_vp;
		caddr_t a_id;
		int a_op;
		struct flock *a_fl;
		int a_flags;
		vfs_context_t a_context;
	} */ *ap)
{
	vfs_context_t ctx;
	proc_t p;
	LOCKD_MSG_REQUEST msgreq;
	LOCKD_MSG *msg;
	vnode_t vp;
	nfsnode_t np;
	int error, error2;
	int interruptable;
	struct flock *fl;
	struct nfsmount *nmp;
	struct nfs_vattr nvattr;
	off_t start, end;
	struct timeval now;
	int timeo, endtime, lastmsg, wentdown = 0;
	int lockpidcheck, nfsvers;
	struct sockaddr *saddr;
	struct timespec ts;

	ctx = ap->a_context;
	p = vfs_context_proc(ctx);
	vp = ap->a_vp;
	fl = ap->a_fl;
	np = VTONFS(vp);

	nmp = VTONMP(vp);
	if (!nmp)
		return (ENXIO);
	lck_mtx_lock(&nmp->nm_lock);
	if (nmp->nm_flag & NFSMNT_NOLOCKS) {
		lck_mtx_unlock(&nmp->nm_lock);
		return (ENOTSUP);
	}
	nfsvers = nmp->nm_vers;
	lck_mtx_unlock(&nmp->nm_lock);

	/*
	 * The NLM protocol doesn't allow the server to return an error
	 * on ranges, so we do it here.  Pre-LFS (Large File Summit)
	 * standards required EINVAL for these range errors.  More recent
	 * standards use EOVERFLOW, but their EINVAL wording still
	 * encompasses these errors.
	 * Any code sensitive to this is either:
	 *  1) written pre-LFS and so can handle only EINVAL, or
	 *  2) written post-LFS and thus ought to be tolerant of pre-LFS
	 *     implementations.
	 * Since returning EOVERFLOW certainly breaks 1), we return EINVAL.
	 */
	if (fl->l_whence != SEEK_END) {
		if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) ||
		    fl->l_start < 0 ||
		    (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) ||
		    (fl->l_len < 0 && fl->l_start + fl->l_len < 0))
			return (EINVAL);
	}
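
	/*
	 * Example: with l_start == OFF_MAX and l_len == 2 the requested
	 * range would end past the maximum file offset; the check above
	 * catches this because l_len - 1 == 1 exceeds OFF_MAX - l_start
	 * == 0, so EINVAL is returned without involving the server.
	 */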

	lck_mtx_lock(nfs_lock_mutex);

	/*
	 * Need to check if this process has successfully acquired an NFS lock before.
	 * If not, and this is an unlock request, we can simply return success here.
	 */
	lockpidcheck = nfs_lock_pid_check(p, 0);
	lck_mtx_unlock(nfs_lock_mutex);
	if (lockpidcheck) {
		if (lockpidcheck != ENOENT)
			return (lockpidcheck);
		if ((ap->a_op == F_UNLCK) && nfs_lock_pid_hash_trusted)
			return (0);
	}

	/*
	 * The NFS Lock Manager protocol doesn't directly handle
	 * negative lengths or SEEK_END, so we need to normalize
	 * things here where we have all the info.
	 * (Note: SEEK_CUR is already adjusted for at this point)
	 */
	/* Convert the flock structure into a start and end. */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * to fl->l_start when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;
	case SEEK_END:
		/* need to flush, and refetch attributes to make */
		/* sure we have the correct end of file offset   */
		error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
		if (error)
			return (error);
		NATTRINVALIDATE(np);
		if (np->n_flag & NMODIFIED) {
			nfs_unlock(np);
			error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
			if (error)
				return (error);
		} else
			nfs_unlock(np);

		error = nfs_getattr(np, &nvattr, ctx, 0);
		nfs_data_lock(np, NFS_NODE_LOCK_SHARED);
		if (!error)
			error = nfs_lock(np, NFS_NODE_LOCK_SHARED);
		if (error) {
			nfs_data_unlock(np);
			return (error);
		}
		start = np->n_size + fl->l_start;
		nfs_unlock(np);
		nfs_data_unlock(np);
		break;
	default:
		return (EINVAL);
	}
	if (fl->l_len == 0)
		end = -1;
	else if (fl->l_len > 0)
		end = start + fl->l_len - 1;
	else { /* l_len is negative */
		end = start - 1;
		start += fl->l_len;
	}
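	/*
	 * Example: l_start == 100 with l_len == -10 describes the ten
	 * bytes preceding offset 100, so end = 100 - 1 = 99 and start =
	 * 100 - 10 = 90, i.e. the inclusive range [90, 99].  An l_len of
	 * zero means "to end of file", encoded here as end == -1.
	 */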
	if (start < 0)
		return (EINVAL);

	if ((nfsvers == NFS_VER2) &&
	    ((start >= 0x80000000) || (end >= 0x80000000)))
		return (EINVAL);

	/*
	 * Fill in the information structure.
	 * We set all values to zero with bzero to clear
	 * out any information in the sockaddr_storage
	 * and nfs_filehandle contained in msgreq so that
	 * we will not leak extraneous information out of
	 * the kernel when calling up to lockd via our
	 * MIG-generated routine.
	 */
	bzero(&msgreq, sizeof(msgreq));
	msg = &msgreq.lmr_msg;
	msg->lm_version = LOCKD_MSG_VERSION;
	msg->lm_flags = 0;

	msg->lm_fl = *fl;
	msg->lm_fl.l_start = start;
	if (end != -1)
		msg->lm_fl.l_len = end - start + 1;
	msg->lm_fl.l_pid = vfs_context_pid(ctx);

	if (ap->a_flags & F_WAIT)
		msg->lm_flags |= LOCKD_MSG_BLOCK;
	if (ap->a_op == F_GETLK)
		msg->lm_flags |= LOCKD_MSG_TEST;

	nmp = VTONMP(vp);
	if (!nmp)
		return (ENXIO);

	lck_mtx_lock(&nmp->nm_lock);
	saddr = mbuf_data(nmp->nm_nam);
	bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len));
	msg->lm_fh_len = (nfsvers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize;
	bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len);
	if (nfsvers == NFS_VER3)
		msg->lm_flags |= LOCKD_MSG_NFSV3;
	cru2x(vfs_context_ucred(ctx), &msg->lm_cred);

	microuptime(&now);
	lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
	interruptable = nmp->nm_flag & NFSMNT_INT;
	lck_mtx_unlock(&nmp->nm_lock);

	lck_mtx_lock(nfs_lock_mutex);

	/* allocate unique xid */
	msg->lm_xid = nfs_lockxid_get();
	nfs_lockdmsg_enqueue(&msgreq);

	timeo = 2;

	for (;;) {
		nfs_lockd_request_sent = 1;

		/* need to drop nfs_lock_mutex while calling send_request() */
		lck_mtx_unlock(nfs_lock_mutex);
		error = send_request(msg, interruptable);
		lck_mtx_lock(nfs_lock_mutex);
		if (error && error != EAGAIN)
			break;

		/*
		 * Always wait for an answer.  Not waiting for unlocks could
		 * cause a lock to be left behind if the unlock request gets
		 * dropped.
		 */

		/*
		 * Retry if it takes too long to get a response.
		 *
		 * The timeout numbers were picked out of thin air... they start
		 * at 2 seconds and double on each timeout, capped at 60 seconds.
		 *
		 * In order to maintain responsiveness, we pass a small timeout
		 * to msleep and calculate the timeouts ourselves.  This allows
		 * us to pick up on mount changes more quickly.
		 */
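		/* Example schedule: timeo doubles 2, 4, 8, 16, 32, then is capped at 60. */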
wait_for_granted:
		error = EWOULDBLOCK;
		ts.tv_sec = 2;
		ts.tv_nsec = 0;
		microuptime(&now);
		endtime = now.tv_sec + timeo;
		while (now.tv_sec < endtime) {
			error = error2 = 0;
			if (!msgreq.lmr_answered)
				error = msleep(&msgreq, nfs_lock_mutex, PCATCH | PUSER, "lockd", &ts);
			if (msgreq.lmr_answered) {
				/*
				 * Note: it's possible to have a lock granted at
				 * essentially the same time that we get interrupted.
				 * Since the lock may be granted, we can't return an
				 * error from this request or we might not unlock the
				 * lock that's been granted.
				 */
				nmp = VTONMP(vp);
				if ((msgreq.lmr_errno == ENOTSUP) && nmp &&
				    (nmp->nm_state & NFSSTA_LOCKSWORK)) {
					/*
					 * We have evidence that locks work, yet lockd
					 * returned ENOTSUP.  This is probably because
					 * it was unable to contact the server's lockd
					 * to send it the request.
					 *
					 * Because we know locks work, we'll consider
					 * this failure to be a timeout.
					 */
					error = EWOULDBLOCK;
				} else {
					error = 0;
				}
				break;
			}
			if (error != EWOULDBLOCK)
				break;
			/* check that we still have our mount... */
			/* ...and that we still support locks */
			nmp = VTONMP(vp);
			if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) {
				error = error2;
				if (fl->l_type == F_UNLCK)
					printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error);
				break;
			}
			lck_mtx_lock(&nmp->nm_lock);
			if (nmp->nm_flag & NFSMNT_NOLOCKS) {
				lck_mtx_unlock(&nmp->nm_lock);
				break;
			}
			interruptable = nmp->nm_flag & NFSMNT_INT;
			lck_mtx_unlock(&nmp->nm_lock);
			microuptime(&now);
		}
		if (error) {
			/* check that we still have our mount... */
			nmp = VTONMP(vp);
			if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) {
				error = error2;
				if (error2 != EINTR) {
					if (fl->l_type == F_UNLCK)
						printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error);
					break;
				}
			}
			/* ...and that we still support locks */
			lck_mtx_lock(&nmp->nm_lock);
			if (nmp->nm_flag & NFSMNT_NOLOCKS) {
				if (error == EWOULDBLOCK)
					error = ENOTSUP;
				lck_mtx_unlock(&nmp->nm_lock);
				break;
			}
			interruptable = nmp->nm_flag & NFSMNT_INT;
			if (error != EWOULDBLOCK) {
				lck_mtx_unlock(&nmp->nm_lock);
				/*
				 * We're going to bail on this request.
				 * If we were a blocked lock request, send a cancel.
				 */
				if ((msgreq.lmr_errno == EINPROGRESS) &&
				    !(msg->lm_flags & LOCKD_MSG_CANCEL)) {
					/* set this request up as a cancel */
					msg->lm_flags |= LOCKD_MSG_CANCEL;
					nfs_lockdmsg_dequeue(&msgreq);
					msg->lm_xid = nfs_lockxid_get();
					nfs_lockdmsg_enqueue(&msgreq);
					msgreq.lmr_saved_errno = error;
					msgreq.lmr_errno = 0;
					msgreq.lmr_answered = 0;
					/* reset timeout */
					timeo = 2;
					/* send cancel request */
					continue;
				}
				break;
			}

			/* warn if we're not getting any response */
			microuptime(&now);
			if ((msgreq.lmr_errno != EINPROGRESS) &&
			    (nmp->nm_tprintf_initial_delay != 0) &&
			    ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) {
				lck_mtx_unlock(&nmp->nm_lock);
				lastmsg = now.tv_sec;
				nfs_down(nmp, vfs_context_thread(ctx), 0, NFSSTA_LOCKTIMEO, "lockd not responding");
				wentdown = 1;
			} else
				lck_mtx_unlock(&nmp->nm_lock);

			if (msgreq.lmr_errno == EINPROGRESS) {
				/*
				 * We've got a blocked lock request that we are
				 * going to retry.  First, we'll want to try to
				 * send a cancel for the previous request.
				 *
				 * Clear errno so if we don't get a response
				 * to the resend we'll call nfs_down().
				 * Also reset timeout because we'll expect a
				 * quick response to the cancel/resend (even if
				 * it is NLM_BLOCKED).
				 */
				msg->lm_flags |= LOCKD_MSG_CANCEL;
				nfs_lockdmsg_dequeue(&msgreq);
				msg->lm_xid = nfs_lockxid_get();
				nfs_lockdmsg_enqueue(&msgreq);
				msgreq.lmr_saved_errno = msgreq.lmr_errno;
				msgreq.lmr_errno = 0;
				msgreq.lmr_answered = 0;
				timeo = 2;
				/* send cancel then resend request */
				continue;
			}
			/*
			 * We timed out, so we will resend the request.
			 */
			timeo *= 2;
			if (timeo > 60)
				timeo = 60;
			/* resend request */
			continue;
		}

		/* we got a response, so the server's lockd is OK */
		nfs_up(VTONMP(vp), vfs_context_thread(ctx), NFSSTA_LOCKTIMEO,
			wentdown ? "lockd alive again" : NULL);
		wentdown = 0;

		if (msgreq.lmr_errno == EINPROGRESS) {
			/* got NLM_BLOCKED response */
			/* need to wait for NLM_GRANTED */
			timeo = 60;
			msgreq.lmr_answered = 0;
			goto wait_for_granted;
		}

		if ((msg->lm_flags & LOCKD_MSG_CANCEL) &&
		    (msgreq.lmr_saved_errno == EINPROGRESS)) {
			/*
			 * We just got a successful reply to the
			 * cancel of the previous blocked lock request.
			 * Now, go ahead and resend the request.
			 */
			msg->lm_flags &= ~LOCKD_MSG_CANCEL;
			nfs_lockdmsg_dequeue(&msgreq);
			msg->lm_xid = nfs_lockxid_get();
			nfs_lockdmsg_enqueue(&msgreq);
			msgreq.lmr_saved_errno = 0;
			msgreq.lmr_errno = 0;
			msgreq.lmr_answered = 0;
			timeo = 2;
			/* resend request */
			continue;
		}

		if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) {
			if (msg->lm_fl.l_type != F_UNLCK) {
				fl->l_type = msg->lm_fl.l_type;
				fl->l_pid = msg->lm_fl.l_pid;
				fl->l_start = msg->lm_fl.l_start;
				fl->l_len = msg->lm_fl.l_len;
				fl->l_whence = SEEK_SET;
			} else
				fl->l_type = F_UNLCK;
		}

		/*
		 * If the blocked lock request was cancelled, restore
		 * the error condition from when we originally bailed
		 * on the request.
		 */
		if (msg->lm_flags & LOCKD_MSG_CANCEL) {
			msg->lm_flags &= ~LOCKD_MSG_CANCEL;
			error = msgreq.lmr_saved_errno;
		} else
			error = msgreq.lmr_errno;

		nmp = VTONMP(vp);
		if ((error == ENOTSUP) && nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK)) {
			/*
			 * We have NO evidence that locks work and lockd
			 * returned ENOTSUP.  Let's take this as a hint
			 * that locks aren't supported and disable them
			 * for this mount.
			 */
			lck_mtx_lock(&nmp->nm_lock);
			nmp->nm_flag |= NFSMNT_NOLOCKS;
			nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
			lck_mtx_unlock(&nmp->nm_lock);
			printf("lockd returned ENOTSUP, disabling locks for nfs server: %s\n",
				vfs_statfs(nmp->nm_mountp)->f_mntfromname);
		}
		if (!error) {
			/* record that NFS file locking has worked on this mount */
			if (nmp) {
				lck_mtx_lock(&nmp->nm_lock);
				if (!(nmp->nm_state & NFSSTA_LOCKSWORK))
					nmp->nm_state |= NFSSTA_LOCKSWORK;
				lck_mtx_unlock(&nmp->nm_lock);
			}
			/*
			 * If we successfully acquired a lock, make sure this pid
			 * is in the nfs_lock_pid hash table so we know we can't
			 * short-circuit unlock requests.
			 */
			if ((lockpidcheck == ENOENT) &&
			    ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW))) {
				error = nfs_lock_pid_check(p, 1);
				if (error) {
					/*
					 * We couldn't add the pid to the table,
					 * so we can no longer trust that a pid
					 * not in the table has no locks.
					 */
					nfs_lock_pid_hash_trusted = 0;
					printf("nfs_vnop_advlock: pid add failed - no longer trusted\n");
				}
			}
		}
		break;
	}

	nfs_lockdmsg_dequeue(&msgreq);

	lck_mtx_unlock(nfs_lock_mutex);

	return (error);
}

/*
 * nfslockdans --
 *      NFS advisory byte-level locks answer from the lock daemon.
 */
int
nfslockdans(proc_t p, struct lockd_ans *ansp)
{
	LOCKD_MSG_REQUEST *msgreq;
	int error;

	/* Only root is allowed to make this call. */
	error = proc_suser(p);
	if (error)
		return (error);

	/* the version should match, or we're out of sync */
	if (ansp->la_version != LOCKD_ANS_VERSION)
		return (EINVAL);

	lck_mtx_lock(nfs_lock_mutex);

	/* try to find the lockd message by transaction id (cookie) */
	msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
	if (ansp->la_flags & LOCKD_ANS_GRANTED) {
		/*
		 * We can't depend on the granted message having our cookie,
		 * so we check the answer against the lockd message found.
		 * If no message was found or it doesn't match the answer,
		 * we look for the lockd message by the answer's lock info.
		 */
		if (!msgreq || nfs_lockdmsg_compare_to_answer(msgreq, ansp))
			msgreq = nfs_lockdmsg_find_by_answer(ansp);
		/*
		 * We need to make sure this request isn't being cancelled.
		 * If it is, we don't want to accept the granted message.
		 */
		if (msgreq && (msgreq->lmr_msg.lm_flags & LOCKD_MSG_CANCEL))
			msgreq = NULL;
	}
	if (!msgreq) {
		lck_mtx_unlock(nfs_lock_mutex);
		return (EPIPE);
	}

	msgreq->lmr_errno = ansp->la_errno;
	if ((msgreq->lmr_msg.lm_flags & LOCKD_MSG_TEST) && msgreq->lmr_errno == 0) {
		if (ansp->la_flags & LOCKD_ANS_LOCK_INFO) {
			if (ansp->la_flags & LOCKD_ANS_LOCK_EXCL)
				msgreq->lmr_msg.lm_fl.l_type = F_WRLCK;
			else
				msgreq->lmr_msg.lm_fl.l_type = F_RDLCK;
			msgreq->lmr_msg.lm_fl.l_pid = ansp->la_pid;
			msgreq->lmr_msg.lm_fl.l_start = ansp->la_start;
			msgreq->lmr_msg.lm_fl.l_len = ansp->la_len;
		} else {
			msgreq->lmr_msg.lm_fl.l_type = F_UNLCK;
		}
	}

	msgreq->lmr_answered = 1;
	lck_mtx_unlock(nfs_lock_mutex);
	wakeup(msgreq);

	return (0);
}
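
/*
 * Round-trip sketch: nfs3_vnop_advlock() above enqueues a LOCKD_MSG_REQUEST
 * with a fresh xid and msleep()s on it; user-space lockd eventually answers
 * back into the kernel, and nfslockdans() locates the request (by xid, or
 * by lock info for granted messages whose cookie can't be trusted), copies
 * the answer in, sets lmr_answered, and wakeup()s the sleeping thread.
 */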