vfs_lockf.c revision 1.72
/*	$NetBSD: vfs_lockf.c,v 1.72 2009/08/05 19:39:50 dsl Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.4 (Berkeley) 10/26/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lockf.c,v 1.72 2009/08/05 19:39:50 dsl Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/pool.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>

/*
 * The lockf structure is a kernel structure which contains the information
 * associated with a byte range lock.  The lockf structures are linked into
 * the vnode structure.  Locks are sorted by the starting byte of the lock for
 * efficiency.
 *
 * lf_next is used for two purposes, depending on whether the lock is
 * being held, or is in conflict with an existing lock.  If this lock
 * is held, it indicates the next lock on the same vnode.
 * For pending locks, if lock->lf_next is non-NULL, then lock->lf_block
 * must be queued on the lf_blkhd TAILQ of lock->lf_next.
 */
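
/*
 * Illustration of the two uses of lf_next described above: the active
 * locks on a vnode form a singly linked list sorted by lf_start,
 *
 *	*head -> [0,9] -> [20,29] -> [40,EOF] -> NULL
 *
 * while a pending request sits on the lf_blkhd queue of the active
 * lock that blocks it, with its own lf_next pointing back at that
 * blocking lock until lf_wakelock() clears it.
 */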

TAILQ_HEAD(locklist, lockf);

struct lockf {
	kcondvar_t lf_cv;	 /* Signalling */
	short	lf_flags;	 /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
	short	lf_type;	 /* Lock type: F_RDLCK, F_WRLCK */
	off_t	lf_start;	 /* The byte # of the start of the lock */
	off_t	lf_end;		 /* The byte # of the end of the lock (-1=EOF) */
	void	*lf_id;		 /* process or file description holding lock */
	struct	lockf **lf_head; /* Back pointer to the head of lockf list */
	struct	lockf *lf_next;	 /* Next lock on this vnode, or blocking lock */
	struct  locklist lf_blkhd; /* List of requests blocked on this lock */
	TAILQ_ENTRY(lockf) lf_block; /* A request waiting for a lock */
	uid_t	lf_uid;		 /* User ID responsible */
};

/* Maximum length of sleep chains to traverse to try and detect deadlock. */
#define MAXDEPTH 50

static pool_cache_t lockf_cache;
static kmutex_t *lockf_lock;
static char lockstr[] = "lockf";

/*
 * This variable controls the maximum number of processes that will
 * be checked in doing deadlock detection.
 */
int maxlockdepth = MAXDEPTH;

#ifdef LOCKF_DEBUG
int	lockf_debug = 0;
#endif

#define SELF	0x1
#define OTHERS	0x2

/*
 * XXX TODO
 * Misc cleanups: "void *id" should be visible in the API as a
 * "struct proc *".
 * (This requires rototilling all VFS's which support advisory locking).
 */

/*
 * If there's a lot of lock contention on a single vnode, locking
 * schemes which allow for more parallelism would be needed.  Given how
 * infrequently byte-range locks are actually used in typical BSD
 * code, a more complex approach probably isn't worth it.
 */

/*
 * We enforce a limit on locks by uid, so that a single user cannot
 * run the kernel out of memory.  For now, the limit is pretty coarse.
 * There is no limit on root.
 *
 * Splitting a lock will always succeed, regardless of current allocations.
 * If you're slightly above the limit, we still have to permit an allocation
 * so that the unlock can succeed.  If the unlocking causes too many splits,
 * however, you're totally cut off.
 */
int maxlocksperuid = 1024;

#ifdef LOCKF_DEBUG
/*
 * Print out a lock.
 */
static void
lf_print(const char *tag, struct lockf *lock)
{

	printf("%s: lock %p for ", tag, lock);
	if (lock->lf_flags & F_POSIX)
		printf("proc %d", ((struct proc *)lock->lf_id)->p_pid);
	else
		printf("file %p", (struct file *)lock->lf_id);
	printf(" %s, start %qx, end %qx",
		lock->lf_type == F_RDLCK ? "shared" :
		lock->lf_type == F_WRLCK ? "exclusive" :
		lock->lf_type == F_UNLCK ? "unlock" :
		"unknown", lock->lf_start, lock->lf_end);
	if (TAILQ_FIRST(&lock->lf_blkhd))
		printf(" block %p\n", TAILQ_FIRST(&lock->lf_blkhd));
	else
		printf("\n");
}

static void
lf_printlist(const char *tag, struct lockf *lock)
{
	struct lockf *lf, *blk;

	printf("%s: Lock list:\n", tag);
	for (lf = *lock->lf_head; lf; lf = lf->lf_next) {
		printf("\tlock %p for ", lf);
		if (lf->lf_flags & F_POSIX)
			printf("proc %d", ((struct proc *)lf->lf_id)->p_pid);
		else
			printf("file %p", (struct file *)lf->lf_id);
		printf(", %s, start %qx, end %qx",
			lf->lf_type == F_RDLCK ? "shared" :
			lf->lf_type == F_WRLCK ? "exclusive" :
			lf->lf_type == F_UNLCK ? "unlock" :
			"unknown", lf->lf_start, lf->lf_end);
		TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
			if (blk->lf_flags & F_POSIX)
				printf("; proc %d",
				    ((struct proc *)blk->lf_id)->p_pid);
			else
				printf("; file %p", (struct file *)blk->lf_id);
			printf(", %s, start %qx, end %qx",
				blk->lf_type == F_RDLCK ? "shared" :
				blk->lf_type == F_WRLCK ? "exclusive" :
				blk->lf_type == F_UNLCK ? "unlock" :
				"unknown", blk->lf_start, blk->lf_end);
			if (TAILQ_FIRST(&blk->lf_blkhd))
				panic("lf_printlist: bad list");
		}
		printf("\n");
	}
}
#endif /* LOCKF_DEBUG */

/*
 * 3 options for allowfail.
 * 0 - always allocate.  1 - cutoff at limit.  2 - cutoff at double limit.
 */
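/*
 * (For reference, lf_advlock() below passes 1 when allocating the lock
 * for a new F_SETLK request, 2 for an unlock that may have to split an
 * existing lock, and 0 where the allocation must not fail: F_GETLK,
 * whole-range unlocks, and the spare lock reserved for splits.)
 */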
static struct lockf *
lf_alloc(int allowfail)
{
	struct uidinfo *uip;
	struct lockf *lock;
	u_long lcnt;
	const uid_t uid = kauth_cred_geteuid(kauth_cred_get());

	uip = uid_find(uid);
	lcnt = atomic_inc_ulong_nv(&uip->ui_lockcnt);
	if (uid && allowfail && lcnt >
	    (allowfail == 1 ? maxlocksperuid : (maxlocksperuid * 2))) {
		atomic_dec_ulong(&uip->ui_lockcnt);
		return NULL;
	}

	lock = pool_cache_get(lockf_cache, PR_WAITOK);
	lock->lf_uid = uid;
	return lock;
}

static void
lf_free(struct lockf *lock)
{
	struct uidinfo *uip;

	uip = uid_find(lock->lf_uid);
	atomic_dec_ulong(&uip->ui_lockcnt);
	pool_cache_put(lockf_cache, lock);
}

static int
lf_ctor(void *arg, void *obj, int flag)
{
	struct lockf *lock;

	lock = obj;
	cv_init(&lock->lf_cv, lockstr);

	return 0;
}

static void
lf_dtor(void *arg, void *obj)
{
	struct lockf *lock;

	lock = obj;
	cv_destroy(&lock->lf_cv);
}

/*
 * Walk the list of locks for an inode to
 * find an overlapping lock (if any).
 *
 * NOTE: this returns only the FIRST overlapping lock.  There
 *	 may be more than one.
 */
static int
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
    struct lockf ***prev, struct lockf **overlap)
{
	off_t start, end;

	*overlap = lf;
	if (lf == NULL)
		return 0;
#ifdef LOCKF_DEBUG
	if (lockf_debug & 2)
		lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUG */
	start = lock->lf_start;
	end = lock->lf_end;
	while (lf != NULL) {
		if (((type == SELF) && lf->lf_id != lock->lf_id) ||
		    ((type == OTHERS) && lf->lf_id == lock->lf_id)) {
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
#ifdef LOCKF_DEBUG
		if (lockf_debug & 2)
			lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUG */
		/*
		 * OK, check for overlap
		 *
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
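		/*
		 * Pictorially, on a common axis ('r' = requested range
		 * only, 'x' = existing lock only, 'B' = both):
		 *
		 *	0:  xxx...rrr	no overlap
		 *	1:  ..BBBB...	identical range
		 *	2:  .xBBBBx..	existing lock contains request
		 *	3:  .rBBBBr..	request contains existing lock
		 *	4:  .xxBBrr..	existing lock starts before request
		 *	5:  .rrBBxx..	existing lock ends after request
		 */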
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
			/* Case 0 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("no overlap\n");
#endif /* LOCKF_DEBUG */
			if ((type & SELF) && end != -1 && lf->lf_start > end)
				return 0;
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
		if ((lf->lf_start == start) && (lf->lf_end == end)) {
			/* Case 1 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap == lock\n");
#endif /* LOCKF_DEBUG */
			return 1;
		}
		if ((lf->lf_start <= start) &&
		    (end != -1) &&
		    ((lf->lf_end >= end) || (lf->lf_end == -1))) {
			/* Case 2 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap contains lock\n");
#endif /* LOCKF_DEBUG */
			return 2;
		}
		if ((start <= lf->lf_start) &&
		    (end == -1 ||
		    (lf->lf_end != -1 && end >= lf->lf_end))) {
			/* Case 3 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("lock contains overlap\n");
#endif /* LOCKF_DEBUG */
			return 3;
		}
		if ((lf->lf_start < start) &&
		    ((lf->lf_end >= start) || (lf->lf_end == -1))) {
			/* Case 4 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap starts before lock\n");
#endif /* LOCKF_DEBUG */
			return 4;
		}
		if ((lf->lf_start > start) &&
		    (end != -1) &&
		    ((lf->lf_end > end) || (lf->lf_end == -1))) {
			/* Case 5 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap ends after lock\n");
#endif /* LOCKF_DEBUG */
			return 5;
		}
		panic("lf_findoverlap: default");
	}
	return 0;
}

/*
 * Split a lock and a contained region into
 * two or three locks as necessary.
 */
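/*
 * Illustrative example: splitting [40, 59] out of an existing lock
 * covering [0, 99] shrinks lock1 to [0, 39] and turns the spare lock
 * into a copy of lock1 covering [60, 99], with lock2 linked between
 * the two remaining pieces.  If the two ranges share a start or an
 * end, only two pieces result and the spare lock is left untouched.
 */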
static void
lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **sparelock)
{
	struct lockf *splitlock;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 2) {
		lf_print("lf_split", lock1);
		lf_print("splitting from", lock2);
	}
#endif /* LOCKF_DEBUG */
	/*
	 * Check to see if we are splitting into only two pieces.
	 */
	if (lock1->lf_start == lock2->lf_start) {
		lock1->lf_start = lock2->lf_end + 1;
		lock2->lf_next = lock1;
		return;
	}
	if (lock1->lf_end == lock2->lf_end) {
		lock1->lf_end = lock2->lf_start - 1;
		lock2->lf_next = lock1->lf_next;
		lock1->lf_next = lock2;
		return;
	}
	/*
	 * Make a new lock consisting of the last part of
	 * the encompassing lock
	 */
	splitlock = *sparelock;
	*sparelock = NULL;
	cv_destroy(&splitlock->lf_cv);
	memcpy(splitlock, lock1, sizeof(*splitlock));
	cv_init(&splitlock->lf_cv, lockstr);

	splitlock->lf_start = lock2->lf_end + 1;
	TAILQ_INIT(&splitlock->lf_blkhd);
	lock1->lf_end = lock2->lf_start - 1;
	/*
	 * OK, now link it in
	 */
	splitlock->lf_next = lock1->lf_next;
	lock2->lf_next = splitlock;
	lock1->lf_next = lock2;
}

/*
 * Wakeup a blocklist
 */
static void
lf_wakelock(struct lockf *listhead)
{
	struct lockf *wakelock;

	while ((wakelock = TAILQ_FIRST(&listhead->lf_blkhd))) {
		KASSERT(wakelock->lf_next == listhead);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
		wakelock->lf_next = NULL;
#ifdef LOCKF_DEBUG
		if (lockf_debug & 2)
			lf_print("lf_wakelock: awakening", wakelock);
#endif
		cv_broadcast(&wakelock->lf_cv);
	}
}

/*
 * Remove a byte-range lock on an inode.
 *
 * Generally, find the lock (or an overlap to that lock)
 * and remove it (or shrink it), then wakeup anyone we can.
 */
static int
lf_clearlock(struct lockf *unlock, struct lockf **sparelock)
{
	struct lockf **head = unlock->lf_head;
	struct lockf *lf = *head;
	struct lockf *overlap, **prev;
	int ovcase;

	if (lf == NULL)
		return 0;
#ifdef LOCKF_DEBUG
	if (unlock->lf_type != F_UNLCK)
		panic("lf_clearlock: bad type");
	if (lockf_debug & 1)
		lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
	prev = head;
	while ((ovcase = lf_findoverlap(lf, unlock, SELF,
	    &prev, &overlap)) != 0) {
		/*
		 * Wakeup the list of locks to be retried.
		 */
		lf_wakelock(overlap);

		switch (ovcase) {

		case 1: /* overlap == lock */
			*prev = overlap->lf_next;
			lf_free(overlap);
			break;

		case 2: /* overlap contains lock: split it */
			if (overlap->lf_start == unlock->lf_start) {
				overlap->lf_start = unlock->lf_end + 1;
				break;
			}
			lf_split(overlap, unlock, sparelock);
			overlap->lf_next = unlock->lf_next;
			break;

		case 3: /* lock contains overlap */
			*prev = overlap->lf_next;
			lf = overlap->lf_next;
			lf_free(overlap);
			continue;

		case 4: /* overlap starts before lock */
			overlap->lf_end = unlock->lf_start - 1;
			prev = &overlap->lf_next;
			lf = overlap->lf_next;
			continue;

		case 5: /* overlap ends after lock */
			overlap->lf_start = unlock->lf_end + 1;
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
	return 0;
}

/*
 * Walk the list of locks for an inode and
 * return the first blocking lock.
 */
static struct lockf *
lf_getblock(struct lockf *lock)
{
	struct lockf **prev, *overlap, *lf = *(lock->lf_head);

	prev = lock->lf_head;
	while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) {
		/*
		 * We've found an overlap, see if it blocks us
		 */
		if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
			return overlap;
		/*
		 * Nope, point to the next one on the list and
		 * see if it blocks us
		 */
		lf = overlap->lf_next;
	}
	return NULL;
}

/*
 * Set a byte-range lock.
 */
static int
lf_setlock(struct lockf *lock, struct lockf **sparelock,
    kmutex_t *interlock)
{
	struct lockf *block;
	struct lockf **head = lock->lf_head;
	struct lockf **prev, *overlap, *ltmp;
	int ovcase, needtolink, error;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_print("lf_setlock", lock);
#endif /* LOCKF_DEBUG */

	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	while ((block = lf_getblock(lock)) != NULL) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0) {
			lf_free(lock);
			return EAGAIN;
		}
		/*
		 * We are blocked. Since flock style locks cover
		 * the whole file, there is no chance for deadlock.
		 * For byte-range locks we must check for deadlock.
		 *
		 * Deadlock detection is done by looking through the
		 * wait channels to see if there are any cycles that
		 * involve us. MAXDEPTH is set just to make sure we
		 * do not go off into neverneverland.
		 */
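		/*
		 * For example: process A holds a lock on [0, 9] and is
		 * itself asleep waiting for our lock on [10, 19].  If we
		 * now request [0, 9], walking from A's wait channel leads
		 * back to curproc, and EDEADLK is returned rather than
		 * sleeping forever.
		 */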
		if ((lock->lf_flags & F_POSIX) &&
		    (block->lf_flags & F_POSIX)) {
			struct lwp *wlwp;
			volatile const struct lockf *waitblock;
			int i = 0;
			struct proc *p;

			p = (struct proc *)block->lf_id;
			KASSERT(p != NULL);
			while (i++ < maxlockdepth) {
				mutex_enter(p->p_lock);
				if (p->p_nlwps > 1) {
					mutex_exit(p->p_lock);
					break;
				}
				wlwp = LIST_FIRST(&p->p_lwps);
				lwp_lock(wlwp);
				if (wlwp->l_wchan == NULL ||
				    wlwp->l_wmesg != lockstr) {
					lwp_unlock(wlwp);
					mutex_exit(p->p_lock);
					break;
				}
				waitblock = wlwp->l_wchan;
				lwp_unlock(wlwp);
				mutex_exit(p->p_lock);
				/* Get the owner of the blocking lock */
				waitblock = waitblock->lf_next;
				if ((waitblock->lf_flags & F_POSIX) == 0)
					break;
				p = (struct proc *)waitblock->lf_id;
				if (p == curproc) {
					lf_free(lock);
					return EDEADLK;
				}
			}
			/*
			 * If we're still following a dependency chain
			 * after maxlockdepth iterations, assume we're in
			 * a cycle to be safe.
			 */
			if (i >= maxlockdepth) {
				lf_free(lock);
				return EDEADLK;
			}
		}
		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			(void) lf_clearlock(lock, NULL);
			lock->lf_type = F_WRLCK;
		}
		/*
		 * Add our lock to the blocked list and sleep until we're free.
		 * Remember who blocked us (for deadlock detection).
		 */
		lock->lf_next = block;
		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
#ifdef LOCKF_DEBUG
		if (lockf_debug & 1) {
			lf_print("lf_setlock: blocking on", block);
			lf_printlist("lf_setlock", block);
		}
#endif /* LOCKF_DEBUG */
		error = cv_wait_sig(&lock->lf_cv, interlock);

		/*
		 * We may have been awoken by a signal (in
		 * which case we must remove ourselves from the
		 * blocked list) and/or by another process
		 * releasing a lock (in which case we have already
		 * been removed from the blocked list and our
		 * lf_next field set to NULL).
		 */
		if (lock->lf_next != NULL) {
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
			lock->lf_next = NULL;
		}
		if (error) {
			lf_free(lock);
			return error;
		}
	}
	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 *
	 * Skip over locks owned by other processes.
	 * Handle any locks that overlap and are owned by ourselves.
	 */
	prev = head;
	block = *head;
	needtolink = 1;
	for (;;) {
		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
		if (ovcase)
			block = overlap->lf_next;
		/*
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
		switch (ovcase) {
		case 0: /* no overlap */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			break;

		case 1: /* overlap == lock */
			/*
			 * If downgrading lock, others may be
			 * able to acquire it.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK)
				lf_wakelock(overlap);
			overlap->lf_type = lock->lf_type;
			lf_free(lock);
			lock = overlap; /* for debug output below */
			break;

		case 2: /* overlap contains lock */
			/*
			 * Check for common starting point and different types.
			 */
			if (overlap->lf_type == lock->lf_type) {
				lf_free(lock);
				lock = overlap; /* for debug output below */
				break;
			}
			if (overlap->lf_start == lock->lf_start) {
				*prev = lock;
				lock->lf_next = overlap;
				overlap->lf_start = lock->lf_end + 1;
			} else
				lf_split(overlap, lock, sparelock);
			lf_wakelock(overlap);
			break;

		case 3: /* lock contains overlap */
			/*
			 * If downgrading lock, others may be able to
			 * acquire it, otherwise take the list.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap);
			} else {
				while ((ltmp = TAILQ_FIRST(&overlap->lf_blkhd))) {
					KASSERT(ltmp->lf_next == overlap);
					TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
					    lf_block);
					ltmp->lf_next = lock;
					TAILQ_INSERT_TAIL(&lock->lf_blkhd,
					    ltmp, lf_block);
				}
			}
			/*
			 * Add the new lock if necessary and delete the overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap->lf_next;
				prev = &lock->lf_next;
				needtolink = 0;
			} else
				*prev = overlap->lf_next;
			lf_free(overlap);
			continue;

		case 4: /* overlap starts before lock */
			/*
			 * Add lock after overlap on the list.
			 */
			lock->lf_next = overlap->lf_next;
			overlap->lf_next = lock;
			overlap->lf_end = lock->lf_start - 1;
			prev = &lock->lf_next;
			lf_wakelock(overlap);
			needtolink = 0;
			continue;

		case 5: /* overlap ends after lock */
			/*
			 * Add the new lock before overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			overlap->lf_start = lock->lf_end + 1;
			lf_wakelock(overlap);
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUG
	if (lockf_debug & 1) {
		lf_print("lf_setlock: got the lock", lock);
		lf_printlist("lf_setlock", lock);
	}
#endif /* LOCKF_DEBUG */
	return 0;
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct lockf *lock, struct flock *fl)
{
	struct lockf *block;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUG */

	if ((block = lf_getblock(lock)) != NULL) {
		fl->l_type = block->lf_type;
		fl->l_whence = SEEK_SET;
		fl->l_start = block->lf_start;
		if (block->lf_end == -1)
			fl->l_len = 0;
		else
			fl->l_len = block->lf_end - block->lf_start + 1;
		if (block->lf_flags & F_POSIX)
			fl->l_pid = ((struct proc *)block->lf_id)->p_pid;
		else
			fl->l_pid = -1;
	} else {
		fl->l_type = F_UNLCK;
	}
	return 0;
}

/*
 * Do an advisory lock operation.
 */
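/*
 * A file system's VOP_ADVLOCK implementation normally just forwards here,
 * passing the head of its per-node lock list and the current file size.
 * As an illustrative sketch (the field names are those used by ufs; other
 * file systems keep an equivalent list head):
 *
 *	return lf_advlock(ap, &ip->i_lockf, ip->i_size);
 */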
int
lf_advlock(struct vop_advlock_args *ap, struct lockf **head, off_t size)
{
	struct flock *fl = ap->a_fl;
	struct lockf *lock = NULL;
	struct lockf *sparelock;
	kmutex_t *interlock = lockf_lock;
	off_t start, end;
	int error = 0;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return EINVAL;
	}

	if (fl->l_len == 0)
		end = -1;
	else {
		if (fl->l_len > 0)
			end = start + fl->l_len - 1;
		else {
			/* lockf() allows negative lengths */
			end = start - 1;
			start += fl->l_len;
		}
	}
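	/*
	 * For example, l_start = 100 with l_len = 50 yields the range
	 * [100, 149]; l_len = 0 yields [100, EOF] (end = -1); and a
	 * negative length such as l_len = -50 covers the 50 bytes
	 * preceding l_start, i.e. [50, 99].
	 */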
	if (start < 0)
		return EINVAL;

	/*
	 * Allocate locks before acquiring the interlock.  We need two
	 * locks in the worst case.
	 */
	switch (ap->a_op) {
	case F_SETLK:
	case F_UNLCK:
		/*
		 * XXX For F_UNLCK case, we can re-use the lock.
		 */
		if ((ap->a_flags & F_FLOCK) == 0) {
			/*
			 * Byte-range lock might need one more lock.
			 */
			sparelock = lf_alloc(0);
			if (sparelock == NULL) {
				error = ENOMEM;
				goto quit;
			}
			break;
		}
		/* FALLTHROUGH */

	case F_GETLK:
		sparelock = NULL;
		break;

	default:
		return EINVAL;
	}

	switch (ap->a_op) {
	case F_SETLK:
		lock = lf_alloc(1);
		break;
	case F_UNLCK:
		if (start == 0 || end == -1) {
			/* never split */
			lock = lf_alloc(0);
		} else {
			/* might split */
			lock = lf_alloc(2);
		}
		break;
	case F_GETLK:
		lock = lf_alloc(0);
		break;
	}
	if (lock == NULL) {
		error = ENOMEM;
		goto quit;
	}

	mutex_enter(interlock);

	/*
	 * Avoid the common case of unlocking when inode has no locks.
	 */
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
			error = 0;
			goto quit_unlock;
		}
	}

	/*
	 * Create the lockf structure.
	 */
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_head = head;
	lock->lf_type = fl->l_type;
	lock->lf_next = (struct lockf *)0;
	TAILQ_INIT(&lock->lf_blkhd);
	lock->lf_flags = ap->a_flags;
	if (lock->lf_flags & F_POSIX) {
		KASSERT(curproc == (struct proc *)ap->a_id);
	}
	lock->lf_id = ap->a_id;

	/*
	 * Do the requested operation.
	 */
	switch (ap->a_op) {

	case F_SETLK:
		error = lf_setlock(lock, &sparelock, interlock);
		lock = NULL; /* lf_setlock freed it */
		break;

	case F_UNLCK:
		error = lf_clearlock(lock, &sparelock);
		break;

	case F_GETLK:
		error = lf_getlock(lock, fl);
		break;

	default:
		break;
		/* NOTREACHED */
	}

quit_unlock:
	mutex_exit(interlock);
quit:
	if (lock)
		lf_free(lock);
	if (sparelock)
		lf_free(sparelock);

	return error;
}

/*
 * Initialize subsystem.  XXX We use a global lock.  This could be the
 * vnode interlock, but the deadlock detection code may need to inspect
 * locks belonging to other files.
 */
void
lf_init(void)
{

	lockf_cache = pool_cache_init(sizeof(struct lockf), 0, 0, 0, "lockf",
	    NULL, IPL_NONE, lf_ctor, lf_dtor, NULL);
	lockf_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}