1/*-
2 * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
3 * Authors: Doug Rabson <dfr@rabson.org>
4 * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27/*-
28 * Copyright (c) 1982, 1986, 1989, 1993
29 *	The Regents of the University of California.  All rights reserved.
30 *
31 * This code is derived from software contributed to Berkeley by
32 * Scooter Morris at Genentech Inc.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 *    notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 *    notice, this list of conditions and the following disclaimer in the
41 *    documentation and/or other materials provided with the distribution.
42 * 4. Neither the name of the University nor the names of its contributors
43 *    may be used to endorse or promote products derived from this software
44 *    without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
59 */
60
61#include <sys/cdefs.h>
62__FBSDID("$FreeBSD: stable/11/sys/kern/kern_lockf.c 313728 2017-02-14 13:45:20Z avg $");
63
64#include "opt_debug_lockf.h"
65
66#include <sys/param.h>
67#include <sys/systm.h>
68#include <sys/hash.h>
69#include <sys/kernel.h>
70#include <sys/limits.h>
71#include <sys/lock.h>
72#include <sys/mount.h>
73#include <sys/mutex.h>
74#include <sys/proc.h>
75#include <sys/sx.h>
76#include <sys/unistd.h>
77#include <sys/vnode.h>
78#include <sys/malloc.h>
79#include <sys/fcntl.h>
80#include <sys/lockf.h>
81#include <sys/taskqueue.h>
82
83#ifdef LOCKF_DEBUG
84#include <sys/sysctl.h>
85
86#include <ufs/ufs/extattr.h>
87#include <ufs/ufs/quota.h>
88#include <ufs/ufs/ufsmount.h>
89#include <ufs/ufs/inode.h>
90
91static int	lockf_debug = 0; /* control debug output */
92SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
93#endif
94
95static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
96
97struct owner_edge;
98struct owner_vertex;
99struct owner_vertex_list;
100struct owner_graph;
101
102#define NOLOCKF (struct lockf_entry *)0
103#define SELF	0x1
104#define OTHERS	0x2
105static void	 lf_init(void *);
106static int	 lf_hash_owner(caddr_t, struct flock *, int);
107static int	 lf_owner_matches(struct lock_owner *, caddr_t, struct flock *,
108    int);
109static struct lockf_entry *
110		 lf_alloc_lock(struct lock_owner *);
111static int	 lf_free_lock(struct lockf_entry *);
112static int	 lf_clearlock(struct lockf *, struct lockf_entry *);
113static int	 lf_overlaps(struct lockf_entry *, struct lockf_entry *);
114static int	 lf_blocks(struct lockf_entry *, struct lockf_entry *);
115static void	 lf_free_edge(struct lockf_edge *);
116static struct lockf_edge *
117		 lf_alloc_edge(void);
118static void	 lf_alloc_vertex(struct lockf_entry *);
119static int	 lf_add_edge(struct lockf_entry *, struct lockf_entry *);
120static void	 lf_remove_edge(struct lockf_edge *);
121static void	 lf_remove_outgoing(struct lockf_entry *);
122static void	 lf_remove_incoming(struct lockf_entry *);
123static int	 lf_add_outgoing(struct lockf *, struct lockf_entry *);
124static int	 lf_add_incoming(struct lockf *, struct lockf_entry *);
125static int	 lf_findoverlap(struct lockf_entry **, struct lockf_entry *,
126    int);
127static struct lockf_entry *
128		 lf_getblock(struct lockf *, struct lockf_entry *);
129static int	 lf_getlock(struct lockf *, struct lockf_entry *, struct flock *);
130static void	 lf_insert_lock(struct lockf *, struct lockf_entry *);
131static void	 lf_wakeup_lock(struct lockf *, struct lockf_entry *);
132static void	 lf_update_dependancies(struct lockf *, struct lockf_entry *,
133    int all, struct lockf_entry_list *);
134static void	 lf_set_start(struct lockf *, struct lockf_entry *, off_t,
135	struct lockf_entry_list*);
136static void	 lf_set_end(struct lockf *, struct lockf_entry *, off_t,
137	struct lockf_entry_list*);
138static int	 lf_setlock(struct lockf *, struct lockf_entry *,
139    struct vnode *, void **cookiep);
140static int	 lf_cancel(struct lockf *, struct lockf_entry *, void *);
141static void	 lf_split(struct lockf *, struct lockf_entry *,
142    struct lockf_entry *, struct lockf_entry_list *);
143#ifdef LOCKF_DEBUG
144static int	 graph_reaches(struct owner_vertex *x, struct owner_vertex *y,
145    struct owner_vertex_list *path);
146static void	 graph_check(struct owner_graph *g, int checkorder);
147static void	 graph_print_vertices(struct owner_vertex_list *set);
148#endif
149static int	 graph_delta_forward(struct owner_graph *g,
150    struct owner_vertex *x, struct owner_vertex *y,
151    struct owner_vertex_list *delta);
152static int	 graph_delta_backward(struct owner_graph *g,
153    struct owner_vertex *x, struct owner_vertex *y,
154    struct owner_vertex_list *delta);
155static int	 graph_add_indices(int *indices, int n,
156    struct owner_vertex_list *set);
157static int	 graph_assign_indices(struct owner_graph *g, int *indices,
158    int nextunused, struct owner_vertex_list *set);
159static int	 graph_add_edge(struct owner_graph *g,
160    struct owner_vertex *x, struct owner_vertex *y);
161static void	 graph_remove_edge(struct owner_graph *g,
162    struct owner_vertex *x, struct owner_vertex *y);
163static struct owner_vertex *graph_alloc_vertex(struct owner_graph *g,
164    struct lock_owner *lo);
165static void	 graph_free_vertex(struct owner_graph *g,
166    struct owner_vertex *v);
167static struct owner_graph * graph_init(struct owner_graph *g);
168#ifdef LOCKF_DEBUG
169static void	 lf_print(char *, struct lockf_entry *);
170static void	 lf_printlist(char *, struct lockf_entry *);
171static void	 lf_print_owner(struct lock_owner *);
172#endif
173
174/*
175 * This structure is used to keep track of both local and remote lock
176 * owners. The lf_owner field of the struct lockf_entry points back at
177 * the lock owner structure. Each possible lock owner (local proc for
178 * POSIX fcntl locks, local file for BSD flock locks or <pid,sysid>
179 * pair for remote locks) is represented by a unique instance of
180 * struct lock_owner.
181 *
182 * If a lock owner has a lock that blocks some other lock or a lock
183 * that is waiting for some other lock, it also has a vertex in the
184 * owner_graph below.
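 *
 * For example, a POSIX fcntl(2) lock taken by a local process has
 * lo_id set to the struct proc pointer passed to lf_advlock, a BSD
 * flock(2) lock is typically identified by its struct file pointer,
 * and a remote lock is identified by the <pid, sysid> pair taken from
 * the flock argument.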
185 *
186 * Locks:
187 * (s)		locked by state->ls_lock
188 * (S)		locked by lf_lock_states_lock
189 * (l)		locked by lf_lock_owners_lock
190 * (g)		locked by lf_owner_graph_lock
191 * (c)		const until freeing
192 */
193#define	LOCK_OWNER_HASH_SIZE	256
194
195struct lock_owner {
196	LIST_ENTRY(lock_owner) lo_link; /* (l) hash chain */
197	int	lo_refs;	    /* (l) Number of locks referring to this */
	int	lo_flags;	    /* (c) Flags passed to lf_advlock */
199	caddr_t	lo_id;		    /* (c) Id value passed to lf_advlock */
200	pid_t	lo_pid;		    /* (c) Process Id of the lock owner */
201	int	lo_sysid;	    /* (c) System Id of the lock owner */
202	struct owner_vertex *lo_vertex; /* (g) entry in deadlock graph */
203};
204
205LIST_HEAD(lock_owner_list, lock_owner);
206
207static struct sx		lf_lock_states_lock;
208static struct lockf_list	lf_lock_states; /* (S) */
209static struct sx		lf_lock_owners_lock;
210static struct lock_owner_list	lf_lock_owners[LOCK_OWNER_HASH_SIZE]; /* (l) */
211
212/*
213 * Structures for deadlock detection.
214 *
215 * We have two types of directed graph, the first is the set of locks,
216 * both active and pending on a vnode. Within this graph, active locks
217 * are terminal nodes in the graph (i.e. have no out-going
218 * edges). Pending locks have out-going edges to each blocking active
219 * lock that prevents the lock from being granted and also to each
220 * older pending lock that would block them if it was active. The
 * graph for each vnode is naturally acyclic; new edges are only ever
 * added to or from new nodes (either new pending locks, which only add
 * out-going edges, or new active locks, which only add in-coming
 * edges), so they cannot create loops in the lock graph.
225 *
226 * The second graph is a global graph of lock owners. Each lock owner
227 * is a vertex in that graph and an edge is added to the graph
228 * whenever an edge is added to a vnode graph, with end points
 * corresponding to the owner of the new pending lock and the owner of
230 * lock upon which it waits. In order to prevent deadlock, we only add
231 * an edge to this graph if the new edge would not create a cycle.
232 *
233 * The lock owner graph is topologically sorted, i.e. if a node has
234 * any outgoing edges, then it has an order strictly less than any
235 * node to which it has an outgoing edge. We preserve this ordering
236 * (and detect cycles) on edge insertion using Algorithm PK from the
237 * paper "A Dynamic Topological Sort Algorithm for Directed Acyclic
 * Graphs" (ACM Journal of Experimental Algorithmics, Vol 11, Article
239 * No. 1.7)
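 *
 * For example, with three owners A, B and C, if A waits on a lock
 * held by B and B waits on a lock held by C, the owner graph contains
 * the edges A->B and B->C and the ordering satisfies order(A) <
 * order(B) < order(C). A request from C that would add the edge C->A
 * is rejected with EDEADLK, since completing it would create a cycle.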
240 */
241struct owner_vertex;
242
243struct owner_edge {
244	LIST_ENTRY(owner_edge) e_outlink; /* (g) link from's out-edge list */
245	LIST_ENTRY(owner_edge) e_inlink;  /* (g) link to's in-edge list */
246	int		e_refs;		  /* (g) number of times added */
247	struct owner_vertex *e_from;	  /* (c) out-going from here */
248	struct owner_vertex *e_to;	  /* (c) in-coming to here */
249};
250LIST_HEAD(owner_edge_list, owner_edge);
251
252struct owner_vertex {
253	TAILQ_ENTRY(owner_vertex) v_link; /* (g) workspace for edge insertion */
254	uint32_t	v_gen;		  /* (g) workspace for edge insertion */
255	int		v_order;	  /* (g) order of vertex in graph */
256	struct owner_edge_list v_outedges;/* (g) list of out-edges */
257	struct owner_edge_list v_inedges; /* (g) list of in-edges */
258	struct lock_owner *v_owner;	  /* (c) corresponding lock owner */
259};
260TAILQ_HEAD(owner_vertex_list, owner_vertex);
261
262struct owner_graph {
263	struct owner_vertex** g_vertices; /* (g) pointers to vertices */
264	int		g_size;		  /* (g) number of vertices */
265	int		g_space;	  /* (g) space allocated for vertices */
266	int		*g_indexbuf;	  /* (g) workspace for loop detection */
267	uint32_t	g_gen;		  /* (g) increment when re-ordering */
268};
269
270static struct sx		lf_owner_graph_lock;
271static struct owner_graph	lf_owner_graph;
272
273/*
274 * Initialise various structures and locks.
275 */
276static void
277lf_init(void *dummy)
278{
279	int i;
280
281	sx_init(&lf_lock_states_lock, "lock states lock");
282	LIST_INIT(&lf_lock_states);
283
284	sx_init(&lf_lock_owners_lock, "lock owners lock");
285	for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++)
286		LIST_INIT(&lf_lock_owners[i]);
287
288	sx_init(&lf_owner_graph_lock, "owner graph lock");
289	graph_init(&lf_owner_graph);
290}
291SYSINIT(lf_init, SI_SUB_LOCK, SI_ORDER_FIRST, lf_init, NULL);
292
293/*
294 * Generate a hash value for a lock owner.
295 */
296static int
297lf_hash_owner(caddr_t id, struct flock *fl, int flags)
298{
299	uint32_t h;
300
301	if (flags & F_REMOTE) {
302		h = HASHSTEP(0, fl->l_pid);
303		h = HASHSTEP(h, fl->l_sysid);
304	} else if (flags & F_FLOCK) {
305		h = ((uintptr_t) id) >> 7;
306	} else {
307		struct proc *p = (struct proc *) id;
308		h = HASHSTEP(0, p->p_pid);
309		h = HASHSTEP(h, 0);
310	}
311
312	return (h % LOCK_OWNER_HASH_SIZE);
313}
314
315/*
316 * Return true if a lock owner matches the details passed to
317 * lf_advlock.
318 */
319static int
320lf_owner_matches(struct lock_owner *lo, caddr_t id, struct flock *fl,
321    int flags)
322{
323	if (flags & F_REMOTE) {
324		return lo->lo_pid == fl->l_pid
325			&& lo->lo_sysid == fl->l_sysid;
326	} else {
327		return lo->lo_id == id;
328	}
329}
330
331static struct lockf_entry *
332lf_alloc_lock(struct lock_owner *lo)
333{
334	struct lockf_entry *lf;
335
336	lf = malloc(sizeof(struct lockf_entry), M_LOCKF, M_WAITOK|M_ZERO);
337
338#ifdef LOCKF_DEBUG
339	if (lockf_debug & 4)
340		printf("Allocated lock %p\n", lf);
341#endif
342	if (lo) {
343		sx_xlock(&lf_lock_owners_lock);
344		lo->lo_refs++;
345		sx_xunlock(&lf_lock_owners_lock);
346		lf->lf_owner = lo;
347	}
348
349	return (lf);
350}
351
352static int
353lf_free_lock(struct lockf_entry *lock)
354{
355
356	KASSERT(lock->lf_refs > 0, ("lockf_entry negative ref count %p", lock));
357	if (--lock->lf_refs > 0)
358		return (0);
359	/*
360	 * Adjust the lock_owner reference count and
361	 * reclaim the entry if this is the last lock
362	 * for that owner.
363	 */
364	struct lock_owner *lo = lock->lf_owner;
365	if (lo) {
366		KASSERT(LIST_EMPTY(&lock->lf_outedges),
367		    ("freeing lock with dependencies"));
368		KASSERT(LIST_EMPTY(&lock->lf_inedges),
369		    ("freeing lock with dependants"));
370		sx_xlock(&lf_lock_owners_lock);
371		KASSERT(lo->lo_refs > 0, ("lock owner refcount"));
372		lo->lo_refs--;
373		if (lo->lo_refs == 0) {
374#ifdef LOCKF_DEBUG
375			if (lockf_debug & 1)
376				printf("lf_free_lock: freeing lock owner %p\n",
377				    lo);
378#endif
379			if (lo->lo_vertex) {
380				sx_xlock(&lf_owner_graph_lock);
381				graph_free_vertex(&lf_owner_graph,
382				    lo->lo_vertex);
383				sx_xunlock(&lf_owner_graph_lock);
384			}
385			LIST_REMOVE(lo, lo_link);
386			free(lo, M_LOCKF);
387#ifdef LOCKF_DEBUG
388			if (lockf_debug & 4)
389				printf("Freed lock owner %p\n", lo);
390#endif
391		}
392		sx_unlock(&lf_lock_owners_lock);
393	}
394	if ((lock->lf_flags & F_REMOTE) && lock->lf_vnode) {
395		vrele(lock->lf_vnode);
396		lock->lf_vnode = NULL;
397	}
398#ifdef LOCKF_DEBUG
399	if (lockf_debug & 4)
400		printf("Freed lock %p\n", lock);
401#endif
402	free(lock, M_LOCKF);
403	return (1);
404}
405
406/*
407 * Advisory record locking support
408 */
409int
410lf_advlockasync(struct vop_advlockasync_args *ap, struct lockf **statep,
411    u_quad_t size)
412{
413	struct lockf *state, *freestate = NULL;
414	struct flock *fl = ap->a_fl;
415	struct lockf_entry *lock;
416	struct vnode *vp = ap->a_vp;
417	caddr_t id = ap->a_id;
418	int flags = ap->a_flags;
419	int hash;
420	struct lock_owner *lo;
421	off_t start, end, oadd;
422	int error;
423
424	/*
	 * Handle the F_UNLCKSYS case first - no need to mess about
426	 * creating a lock owner for this one.
427	 */
428	if (ap->a_op == F_UNLCKSYS) {
429		lf_clearremotesys(fl->l_sysid);
430		return (0);
431	}
432
433	/*
434	 * Convert the flock structure into a start and end.
435	 */
436	switch (fl->l_whence) {
437
438	case SEEK_SET:
439	case SEEK_CUR:
440		/*
441		 * Caller is responsible for adding any necessary offset
442		 * when SEEK_CUR is used.
443		 */
444		start = fl->l_start;
445		break;
446
447	case SEEK_END:
448		if (size > OFF_MAX ||
449		    (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
450			return (EOVERFLOW);
451		start = size + fl->l_start;
452		break;
453
454	default:
455		return (EINVAL);
456	}
457	if (start < 0)
458		return (EINVAL);
459	if (fl->l_len < 0) {
460		if (start == 0)
461			return (EINVAL);
462		end = start - 1;
463		start += fl->l_len;
464		if (start < 0)
465			return (EINVAL);
466	} else if (fl->l_len == 0) {
467		end = OFF_MAX;
468	} else {
469		oadd = fl->l_len - 1;
470		if (oadd > OFF_MAX - start)
471			return (EOVERFLOW);
472		end = start + oadd;
473	}
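	/*
	 * For example, with l_whence == SEEK_SET: l_start 100 and
	 * l_len 10 describe the byte range [100..109]; l_len 0 locks
	 * from l_start to the end of the file, i.e. [100..OFF_MAX];
	 * and a negative length such as l_len -10 locks the range
	 * ending just before l_start, i.e. [90..99].
	 */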
474
475retry_setlock:
476
477	/*
478	 * Avoid the common case of unlocking when inode has no locks.
479	 */
480	VI_LOCK(vp);
481	if ((*statep) == NULL) {
482		if (ap->a_op != F_SETLK) {
483			fl->l_type = F_UNLCK;
484			VI_UNLOCK(vp);
485			return (0);
486		}
487	}
488	VI_UNLOCK(vp);
489
490	/*
491	 * Map our arguments to an existing lock owner or create one
492	 * if this is the first time we have seen this owner.
493	 */
494	hash = lf_hash_owner(id, fl, flags);
495	sx_xlock(&lf_lock_owners_lock);
496	LIST_FOREACH(lo, &lf_lock_owners[hash], lo_link)
497		if (lf_owner_matches(lo, id, fl, flags))
498			break;
499	if (!lo) {
500		/*
		 * We initialise the lock owner with a reference
		 * count of one to account for the new lockf_entry
		 * structure created below.
504		 */
505		lo = malloc(sizeof(struct lock_owner), M_LOCKF,
506		    M_WAITOK|M_ZERO);
507#ifdef LOCKF_DEBUG
508		if (lockf_debug & 4)
509			printf("Allocated lock owner %p\n", lo);
510#endif
511
512		lo->lo_refs = 1;
513		lo->lo_flags = flags;
514		lo->lo_id = id;
515		if (flags & F_REMOTE) {
516			lo->lo_pid = fl->l_pid;
517			lo->lo_sysid = fl->l_sysid;
518		} else if (flags & F_FLOCK) {
519			lo->lo_pid = -1;
520			lo->lo_sysid = 0;
521		} else {
522			struct proc *p = (struct proc *) id;
523			lo->lo_pid = p->p_pid;
524			lo->lo_sysid = 0;
525		}
526		lo->lo_vertex = NULL;
527
528#ifdef LOCKF_DEBUG
529		if (lockf_debug & 1) {
530			printf("lf_advlockasync: new lock owner %p ", lo);
531			lf_print_owner(lo);
532			printf("\n");
533		}
534#endif
535
536		LIST_INSERT_HEAD(&lf_lock_owners[hash], lo, lo_link);
537	} else {
538		/*
539		 * We have seen this lock owner before, increase its
540		 * reference count to account for the new lockf_entry
541		 * structure we create below.
542		 */
543		lo->lo_refs++;
544	}
545	sx_xunlock(&lf_lock_owners_lock);
546
547	/*
548	 * Create the lockf structure. We initialise the lf_owner
549	 * field here instead of in lf_alloc_lock() to avoid paying
550	 * the lf_lock_owners_lock tax twice.
551	 */
552	lock = lf_alloc_lock(NULL);
553	lock->lf_refs = 1;
554	lock->lf_start = start;
555	lock->lf_end = end;
556	lock->lf_owner = lo;
557	lock->lf_vnode = vp;
558	if (flags & F_REMOTE) {
559		/*
560		 * For remote locks, the caller may release its ref to
561		 * the vnode at any time - we have to ref it here to
562		 * prevent it from being recycled unexpectedly.
563		 */
564		vref(vp);
565	}
566
567	/*
	 * XXX The problem is that VTOI is UFS-specific, so it will
	 * break LOCKF_DEBUG for all filesystems other than UFS because
	 * it casts the vnode's v_data pointer to struct inode *.
571	 */
572/*	lock->lf_inode = VTOI(ap->a_vp); */
573	lock->lf_inode = (struct inode *)0;
574	lock->lf_type = fl->l_type;
575	LIST_INIT(&lock->lf_outedges);
576	LIST_INIT(&lock->lf_inedges);
577	lock->lf_async_task = ap->a_task;
578	lock->lf_flags = ap->a_flags;
579
580	/*
581	 * Do the requested operation. First find our state structure
582	 * and create a new one if necessary - the caller's *statep
583	 * variable and the state's ls_threads count is protected by
584	 * the vnode interlock.
585	 */
586	VI_LOCK(vp);
587	if (vp->v_iflag & VI_DOOMED) {
588		VI_UNLOCK(vp);
589		lf_free_lock(lock);
590		return (ENOENT);
591	}
592
593	/*
594	 * Allocate a state structure if necessary.
595	 */
596	state = *statep;
597	if (state == NULL) {
598		struct lockf *ls;
599
600		VI_UNLOCK(vp);
601
602		ls = malloc(sizeof(struct lockf), M_LOCKF, M_WAITOK|M_ZERO);
603		sx_init(&ls->ls_lock, "ls_lock");
604		LIST_INIT(&ls->ls_active);
605		LIST_INIT(&ls->ls_pending);
606		ls->ls_threads = 1;
607
608		sx_xlock(&lf_lock_states_lock);
609		LIST_INSERT_HEAD(&lf_lock_states, ls, ls_link);
610		sx_xunlock(&lf_lock_states_lock);
611
612		/*
613		 * Cope if we lost a race with some other thread while
614		 * trying to allocate memory.
615		 */
616		VI_LOCK(vp);
617		if (vp->v_iflag & VI_DOOMED) {
618			VI_UNLOCK(vp);
619			sx_xlock(&lf_lock_states_lock);
620			LIST_REMOVE(ls, ls_link);
621			sx_xunlock(&lf_lock_states_lock);
622			sx_destroy(&ls->ls_lock);
623			free(ls, M_LOCKF);
624			lf_free_lock(lock);
625			return (ENOENT);
626		}
627		if ((*statep) == NULL) {
628			state = *statep = ls;
629			VI_UNLOCK(vp);
630		} else {
631			state = *statep;
632			state->ls_threads++;
633			VI_UNLOCK(vp);
634
635			sx_xlock(&lf_lock_states_lock);
636			LIST_REMOVE(ls, ls_link);
637			sx_xunlock(&lf_lock_states_lock);
638			sx_destroy(&ls->ls_lock);
639			free(ls, M_LOCKF);
640		}
641	} else {
642		state->ls_threads++;
643		VI_UNLOCK(vp);
644	}
645
646	sx_xlock(&state->ls_lock);
647	/*
648	 * Recheck the doomed vnode after state->ls_lock is
649	 * locked. lf_purgelocks() requires that no new threads add
650	 * pending locks when vnode is marked by VI_DOOMED flag.
651	 */
652	VI_LOCK(vp);
653	if (vp->v_iflag & VI_DOOMED) {
654		state->ls_threads--;
655		wakeup(state);
656		VI_UNLOCK(vp);
657		sx_xunlock(&state->ls_lock);
658		lf_free_lock(lock);
659		return (ENOENT);
660	}
661	VI_UNLOCK(vp);
662
663	switch (ap->a_op) {
664	case F_SETLK:
665		error = lf_setlock(state, lock, vp, ap->a_cookiep);
666		break;
667
668	case F_UNLCK:
669		error = lf_clearlock(state, lock);
670		lf_free_lock(lock);
671		break;
672
673	case F_GETLK:
674		error = lf_getlock(state, lock, fl);
675		lf_free_lock(lock);
676		break;
677
678	case F_CANCEL:
679		if (ap->a_cookiep)
680			error = lf_cancel(state, lock, *ap->a_cookiep);
681		else
682			error = EINVAL;
683		lf_free_lock(lock);
684		break;
685
686	default:
687		lf_free_lock(lock);
688		error = EINVAL;
689		break;
690	}
691
692#ifdef DIAGNOSTIC
693	/*
	 * Check for conditions that cannot happen: the active lock
	 * list becoming disordered or containing mutually blocking
	 * locks. We also check the pending list for locks which
	 * should be active (i.e. have no out-going edges).
698	 */
699	LIST_FOREACH(lock, &state->ls_active, lf_link) {
700		struct lockf_entry *lf;
701		if (LIST_NEXT(lock, lf_link))
702			KASSERT((lock->lf_start
703				<= LIST_NEXT(lock, lf_link)->lf_start),
704			    ("locks disordered"));
705		LIST_FOREACH(lf, &state->ls_active, lf_link) {
706			if (lock == lf)
707				break;
708			KASSERT(!lf_blocks(lock, lf),
709			    ("two conflicting active locks"));
710			if (lock->lf_owner == lf->lf_owner)
711				KASSERT(!lf_overlaps(lock, lf),
712				    ("two overlapping locks from same owner"));
713		}
714	}
715	LIST_FOREACH(lock, &state->ls_pending, lf_link) {
716		KASSERT(!LIST_EMPTY(&lock->lf_outedges),
717		    ("pending lock which should be active"));
718	}
719#endif
720	sx_xunlock(&state->ls_lock);
721
722	/*
723	 * If we have removed the last active lock on the vnode and
724	 * this is the last thread that was in-progress, we can free
725	 * the state structure. We update the caller's pointer inside
726	 * the vnode interlock but call free outside.
727	 *
728	 * XXX alternatively, keep the state structure around until
729	 * the filesystem recycles - requires a callback from the
730	 * filesystem.
731	 */
732	VI_LOCK(vp);
733
734	state->ls_threads--;
735	wakeup(state);
736	if (LIST_EMPTY(&state->ls_active) && state->ls_threads == 0) {
737		KASSERT(LIST_EMPTY(&state->ls_pending),
738		    ("freeing state with pending locks"));
739		freestate = state;
740		*statep = NULL;
741	}
742
743	VI_UNLOCK(vp);
744
745	if (freestate != NULL) {
746		sx_xlock(&lf_lock_states_lock);
747		LIST_REMOVE(freestate, ls_link);
748		sx_xunlock(&lf_lock_states_lock);
749		sx_destroy(&freestate->ls_lock);
750		free(freestate, M_LOCKF);
751		freestate = NULL;
752	}
753
754	if (error == EDOOFUS) {
755		KASSERT(ap->a_op == F_SETLK, ("EDOOFUS"));
756		goto retry_setlock;
757	}
758	return (error);
759}
760
761int
762lf_advlock(struct vop_advlock_args *ap, struct lockf **statep, u_quad_t size)
763{
764	struct vop_advlockasync_args a;
765
766	a.a_vp = ap->a_vp;
767	a.a_id = ap->a_id;
768	a.a_op = ap->a_op;
769	a.a_fl = ap->a_fl;
770	a.a_flags = ap->a_flags;
771	a.a_task = NULL;
772	a.a_cookiep = NULL;
773
774	return (lf_advlockasync(&a, statep, size));
775}
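
/*
 * A filesystem normally wires its VOP_ADVLOCK implementation straight
 * to lf_advlock(), handing it a pointer to the per-vnode lock list
 * head and the current file size (used to resolve SEEK_END ranges).
 * A minimal sketch, using the hypothetical names myfs_advlock and
 * myfs_filesize():
 *
 *	static int
 *	myfs_advlock(struct vop_advlock_args *ap)
 *	{
 *		struct vnode *vp = ap->a_vp;
 *
 *		return (lf_advlock(ap, &vp->v_lockf, myfs_filesize(vp)));
 *	}
 */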
776
777void
778lf_purgelocks(struct vnode *vp, struct lockf **statep)
779{
780	struct lockf *state;
781	struct lockf_entry *lock, *nlock;
782
783	/*
784	 * For this to work correctly, the caller must ensure that no
785	 * other threads enter the locking system for this vnode,
786	 * e.g. by checking VI_DOOMED. We wake up any threads that are
787	 * sleeping waiting for locks on this vnode and then free all
788	 * the remaining locks.
789	 */
790	VI_LOCK(vp);
791	KASSERT(vp->v_iflag & VI_DOOMED,
792	    ("lf_purgelocks: vp %p has not vgone yet", vp));
793	state = *statep;
794	if (state) {
795		*statep = NULL;
796		state->ls_threads++;
797		VI_UNLOCK(vp);
798
799		sx_xlock(&state->ls_lock);
800		sx_xlock(&lf_owner_graph_lock);
801		LIST_FOREACH_SAFE(lock, &state->ls_pending, lf_link, nlock) {
802			LIST_REMOVE(lock, lf_link);
803			lf_remove_outgoing(lock);
804			lf_remove_incoming(lock);
805
806			/*
			 * If it's an async lock, we can just free it
808			 * here, otherwise we let the sleeping thread
809			 * free it.
810			 */
811			if (lock->lf_async_task) {
812				lf_free_lock(lock);
813			} else {
814				lock->lf_flags |= F_INTR;
815				wakeup(lock);
816			}
817		}
818		sx_xunlock(&lf_owner_graph_lock);
819		sx_xunlock(&state->ls_lock);
820
821		/*
		 * Wait for all other threads, sleeping and otherwise,
		 * to leave.
824		 */
825		VI_LOCK(vp);
826		while (state->ls_threads > 1)
827			msleep(state, VI_MTX(vp), 0, "purgelocks", 0);
828		VI_UNLOCK(vp);
829
830		/*
831		 * We can just free all the active locks since they
832		 * will have no dependencies (we removed them all
833		 * above). We don't need to bother locking since we
834		 * are the last thread using this state structure.
835		 */
836		KASSERT(LIST_EMPTY(&state->ls_pending),
837		    ("lock pending for %p", state));
838		LIST_FOREACH_SAFE(lock, &state->ls_active, lf_link, nlock) {
839			LIST_REMOVE(lock, lf_link);
840			lf_free_lock(lock);
841		}
842		sx_xlock(&lf_lock_states_lock);
843		LIST_REMOVE(state, ls_link);
844		sx_xunlock(&lf_lock_states_lock);
845		sx_destroy(&state->ls_lock);
846		free(state, M_LOCKF);
847	} else {
848		VI_UNLOCK(vp);
849	}
850}
851
852/*
853 * Return non-zero if locks 'x' and 'y' overlap.
854 */
855static int
856lf_overlaps(struct lockf_entry *x, struct lockf_entry *y)
857{
858
859	return (x->lf_start <= y->lf_end && x->lf_end >= y->lf_start);
860}
861
862/*
863 * Return non-zero if lock 'x' is blocked by lock 'y' (or vice versa).
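 *
 * For example, a write lock on [0..99] held by owner A blocks a read
 * or write request for [50..149] from owner B; two read locks never
 * block each other, and locks with the same owner never conflict.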
864 */
865static int
866lf_blocks(struct lockf_entry *x, struct lockf_entry *y)
867{
868
869	return x->lf_owner != y->lf_owner
870		&& (x->lf_type == F_WRLCK || y->lf_type == F_WRLCK)
871		&& lf_overlaps(x, y);
872}
873
874/*
 * Allocate a lock edge.
876 */
877static struct lockf_edge *
878lf_alloc_edge(void)
879{
880
881	return (malloc(sizeof(struct lockf_edge), M_LOCKF, M_WAITOK|M_ZERO));
882}
883
884/*
885 * Free a lock edge.
886 */
887static void
888lf_free_edge(struct lockf_edge *e)
889{
890
891	free(e, M_LOCKF);
892}
893
894
895/*
896 * Ensure that the lock's owner has a corresponding vertex in the
897 * owner graph.
898 */
899static void
900lf_alloc_vertex(struct lockf_entry *lock)
901{
902	struct owner_graph *g = &lf_owner_graph;
903
904	if (!lock->lf_owner->lo_vertex)
905		lock->lf_owner->lo_vertex =
906			graph_alloc_vertex(g, lock->lf_owner);
907}
908
909/*
910 * Attempt to record an edge from lock x to lock y. Return EDEADLK if
911 * the new edge would cause a cycle in the owner graph.
912 */
913static int
914lf_add_edge(struct lockf_entry *x, struct lockf_entry *y)
915{
916	struct owner_graph *g = &lf_owner_graph;
917	struct lockf_edge *e;
918	int error;
919
920#ifdef DIAGNOSTIC
921	LIST_FOREACH(e, &x->lf_outedges, le_outlink)
922		KASSERT(e->le_to != y, ("adding lock edge twice"));
923#endif
924
925	/*
926	 * Make sure the two owners have entries in the owner graph.
927	 */
928	lf_alloc_vertex(x);
929	lf_alloc_vertex(y);
930
931	error = graph_add_edge(g, x->lf_owner->lo_vertex,
932	    y->lf_owner->lo_vertex);
933	if (error)
934		return (error);
935
936	e = lf_alloc_edge();
937	LIST_INSERT_HEAD(&x->lf_outedges, e, le_outlink);
938	LIST_INSERT_HEAD(&y->lf_inedges, e, le_inlink);
939	e->le_from = x;
940	e->le_to = y;
941
942	return (0);
943}
944
945/*
946 * Remove an edge from the lock graph.
947 */
948static void
949lf_remove_edge(struct lockf_edge *e)
950{
951	struct owner_graph *g = &lf_owner_graph;
952	struct lockf_entry *x = e->le_from;
953	struct lockf_entry *y = e->le_to;
954
955	graph_remove_edge(g, x->lf_owner->lo_vertex, y->lf_owner->lo_vertex);
956	LIST_REMOVE(e, le_outlink);
957	LIST_REMOVE(e, le_inlink);
958	e->le_from = NULL;
959	e->le_to = NULL;
960	lf_free_edge(e);
961}
962
963/*
964 * Remove all out-going edges from lock x.
965 */
966static void
967lf_remove_outgoing(struct lockf_entry *x)
968{
969	struct lockf_edge *e;
970
971	while ((e = LIST_FIRST(&x->lf_outedges)) != NULL) {
972		lf_remove_edge(e);
973	}
974}
975
976/*
977 * Remove all in-coming edges from lock x.
978 */
979static void
980lf_remove_incoming(struct lockf_entry *x)
981{
982	struct lockf_edge *e;
983
984	while ((e = LIST_FIRST(&x->lf_inedges)) != NULL) {
985		lf_remove_edge(e);
986	}
987}
988
989/*
990 * Walk the list of locks for the file and create an out-going edge
991 * from lock to each blocking lock.
992 */
993static int
994lf_add_outgoing(struct lockf *state, struct lockf_entry *lock)
995{
996	struct lockf_entry *overlap;
997	int error;
998
999	LIST_FOREACH(overlap, &state->ls_active, lf_link) {
1000		/*
1001		 * We may assume that the active list is sorted by
1002		 * lf_start.
1003		 */
1004		if (overlap->lf_start > lock->lf_end)
1005			break;
1006		if (!lf_blocks(lock, overlap))
1007			continue;
1008
1009		/*
1010		 * We've found a blocking lock. Add the corresponding
1011		 * edge to the graphs and see if it would cause a
1012		 * deadlock.
1013		 */
1014		error = lf_add_edge(lock, overlap);
1015
1016		/*
1017		 * The only error that lf_add_edge returns is EDEADLK.
1018		 * Remove any edges we added and return the error.
1019		 */
1020		if (error) {
1021			lf_remove_outgoing(lock);
1022			return (error);
1023		}
1024	}
1025
1026	/*
1027	 * We also need to add edges to sleeping locks that block
1028	 * us. This ensures that lf_wakeup_lock cannot grant two
1029	 * mutually blocking locks simultaneously and also enforces a
1030	 * 'first come, first served' fairness model. Note that this
1031	 * only happens if we are blocked by at least one active lock
1032	 * due to the call to lf_getblock in lf_setlock below.
1033	 */
1034	LIST_FOREACH(overlap, &state->ls_pending, lf_link) {
1035		if (!lf_blocks(lock, overlap))
1036			continue;
1037		/*
1038		 * We've found a blocking lock. Add the corresponding
1039		 * edge to the graphs and see if it would cause a
1040		 * deadlock.
1041		 */
1042		error = lf_add_edge(lock, overlap);
1043
1044		/*
1045		 * The only error that lf_add_edge returns is EDEADLK.
1046		 * Remove any edges we added and return the error.
1047		 */
1048		if (error) {
1049			lf_remove_outgoing(lock);
1050			return (error);
1051		}
1052	}
1053
1054	return (0);
1055}
1056
1057/*
 * Walk the list of pending locks for the file and create an in-coming
 * edge to 'lock' from each pending lock that it blocks.
1060 */
1061static int
1062lf_add_incoming(struct lockf *state, struct lockf_entry *lock)
1063{
1064	struct lockf_entry *overlap;
1065	int error;
1066
1067	LIST_FOREACH(overlap, &state->ls_pending, lf_link) {
1068		if (!lf_blocks(lock, overlap))
1069			continue;
1070
1071		/*
1072		 * We've found a blocking lock. Add the corresponding
1073		 * edge to the graphs and see if it would cause a
1074		 * deadlock.
1075		 */
1076		error = lf_add_edge(overlap, lock);
1077
1078		/*
1079		 * The only error that lf_add_edge returns is EDEADLK.
1080		 * Remove any edges we added and return the error.
1081		 */
1082		if (error) {
1083			lf_remove_incoming(lock);
1084			return (error);
1085		}
1086	}
1087	return (0);
1088}
1089
1090/*
1091 * Insert lock into the active list, keeping list entries ordered by
1092 * increasing values of lf_start.
1093 */
1094static void
1095lf_insert_lock(struct lockf *state, struct lockf_entry *lock)
1096{
1097	struct lockf_entry *lf, *lfprev;
1098
1099	if (LIST_EMPTY(&state->ls_active)) {
1100		LIST_INSERT_HEAD(&state->ls_active, lock, lf_link);
1101		return;
1102	}
1103
1104	lfprev = NULL;
1105	LIST_FOREACH(lf, &state->ls_active, lf_link) {
1106		if (lf->lf_start > lock->lf_start) {
1107			LIST_INSERT_BEFORE(lf, lock, lf_link);
1108			return;
1109		}
1110		lfprev = lf;
1111	}
1112	LIST_INSERT_AFTER(lfprev, lock, lf_link);
1113}
1114
1115/*
1116 * Wake up a sleeping lock and remove it from the pending list now
1117 * that all its dependencies have been resolved. The caller should
1118 * arrange for the lock to be added to the active list, adjusting any
1119 * existing locks for the same owner as needed.
1120 */
1121static void
1122lf_wakeup_lock(struct lockf *state, struct lockf_entry *wakelock)
1123{
1124
1125	/*
1126	 * Remove from ls_pending list and wake up the caller
1127	 * or start the async notification, as appropriate.
1128	 */
1129	LIST_REMOVE(wakelock, lf_link);
1130#ifdef LOCKF_DEBUG
1131	if (lockf_debug & 1)
1132		lf_print("lf_wakeup_lock: awakening", wakelock);
1133#endif /* LOCKF_DEBUG */
1134	if (wakelock->lf_async_task) {
1135		taskqueue_enqueue(taskqueue_thread, wakelock->lf_async_task);
1136	} else {
1137		wakeup(wakelock);
1138	}
1139}
1140
1141/*
1142 * Re-check all dependent locks and remove edges to locks that we no
1143 * longer block. If 'all' is non-zero, the lock has been removed and
1144 * we must remove all the dependencies, otherwise it has simply been
 * reduced but remains active. Any pending locks which have been
 * unblocked are added to 'granted'.
1147 */
1148static void
1149lf_update_dependancies(struct lockf *state, struct lockf_entry *lock, int all,
1150	struct lockf_entry_list *granted)
1151{
1152	struct lockf_edge *e, *ne;
1153	struct lockf_entry *deplock;
1154
1155	LIST_FOREACH_SAFE(e, &lock->lf_inedges, le_inlink, ne) {
1156		deplock = e->le_from;
1157		if (all || !lf_blocks(lock, deplock)) {
1158			sx_xlock(&lf_owner_graph_lock);
1159			lf_remove_edge(e);
1160			sx_xunlock(&lf_owner_graph_lock);
1161			if (LIST_EMPTY(&deplock->lf_outedges)) {
1162				lf_wakeup_lock(state, deplock);
1163				LIST_INSERT_HEAD(granted, deplock, lf_link);
1164			}
1165		}
1166	}
1167}
1168
1169/*
1170 * Set the start of an existing active lock, updating dependencies and
1171 * adding any newly woken locks to 'granted'.
1172 */
1173static void
1174lf_set_start(struct lockf *state, struct lockf_entry *lock, off_t new_start,
1175	struct lockf_entry_list *granted)
1176{
1177
1178	KASSERT(new_start >= lock->lf_start, ("can't increase lock"));
1179	lock->lf_start = new_start;
1180	LIST_REMOVE(lock, lf_link);
1181	lf_insert_lock(state, lock);
1182	lf_update_dependancies(state, lock, FALSE, granted);
1183}
1184
1185/*
1186 * Set the end of an existing active lock, updating dependencies and
1187 * adding any newly woken locks to 'granted'.
1188 */
1189static void
1190lf_set_end(struct lockf *state, struct lockf_entry *lock, off_t new_end,
1191	struct lockf_entry_list *granted)
1192{
1193
1194	KASSERT(new_end <= lock->lf_end, ("can't increase lock"));
1195	lock->lf_end = new_end;
1196	lf_update_dependancies(state, lock, FALSE, granted);
1197}
1198
1199/*
1200 * Add a lock to the active list, updating or removing any current
1201 * locks owned by the same owner and processing any pending locks that
1202 * become unblocked as a result. This code is also used for unlock
1203 * since the logic for updating existing locks is identical.
1204 *
1205 * As a result of processing the new lock, we may unblock existing
1206 * pending locks as a result of downgrading/unlocking. We simply
1207 * activate the newly granted locks by looping.
1208 *
1209 * Since the new lock already has its dependencies set up, we always
 * add it to the list (unless it's an unlock request). This may
 * fragment the lock list in some pathological cases but it's probably
1212 * not a real problem.
1213 */
1214static void
1215lf_activate_lock(struct lockf *state, struct lockf_entry *lock)
1216{
1217	struct lockf_entry *overlap, *lf;
1218	struct lockf_entry_list granted;
1219	int ovcase;
1220
1221	LIST_INIT(&granted);
1222	LIST_INSERT_HEAD(&granted, lock, lf_link);
1223
1224	while (!LIST_EMPTY(&granted)) {
1225		lock = LIST_FIRST(&granted);
1226		LIST_REMOVE(lock, lf_link);
1227
1228		/*
1229		 * Skip over locks owned by other processes.  Handle
1230		 * any locks that overlap and are owned by ourselves.
1231		 */
1232		overlap = LIST_FIRST(&state->ls_active);
1233		for (;;) {
1234			ovcase = lf_findoverlap(&overlap, lock, SELF);
1235
1236#ifdef LOCKF_DEBUG
1237			if (ovcase && (lockf_debug & 2)) {
1238				printf("lf_setlock: overlap %d", ovcase);
1239				lf_print("", overlap);
1240			}
1241#endif
1242			/*
1243			 * Six cases:
1244			 *	0) no overlap
1245			 *	1) overlap == lock
1246			 *	2) overlap contains lock
1247			 *	3) lock contains overlap
1248			 *	4) overlap starts before lock
1249			 *	5) overlap ends after lock
1250			 */
1251			switch (ovcase) {
1252			case 0: /* no overlap */
1253				break;
1254
1255			case 1: /* overlap == lock */
1256				/*
				 * We have already set up the
				 * dependants for the new lock, taking
1259				 * into account a possible downgrade
1260				 * or unlock. Remove the old lock.
1261				 */
1262				LIST_REMOVE(overlap, lf_link);
1263				lf_update_dependancies(state, overlap, TRUE,
1264					&granted);
1265				lf_free_lock(overlap);
1266				break;
1267
1268			case 2: /* overlap contains lock */
1269				/*
1270				 * Just split the existing lock.
1271				 */
1272				lf_split(state, overlap, lock, &granted);
1273				break;
1274
1275			case 3: /* lock contains overlap */
1276				/*
1277				 * Delete the overlap and advance to
1278				 * the next entry in the list.
1279				 */
1280				lf = LIST_NEXT(overlap, lf_link);
1281				LIST_REMOVE(overlap, lf_link);
1282				lf_update_dependancies(state, overlap, TRUE,
1283					&granted);
1284				lf_free_lock(overlap);
1285				overlap = lf;
1286				continue;
1287
1288			case 4: /* overlap starts before lock */
1289				/*
1290				 * Just update the overlap end and
1291				 * move on.
1292				 */
1293				lf_set_end(state, overlap, lock->lf_start - 1,
1294				    &granted);
1295				overlap = LIST_NEXT(overlap, lf_link);
1296				continue;
1297
1298			case 5: /* overlap ends after lock */
1299				/*
1300				 * Change the start of overlap and
1301				 * re-insert.
1302				 */
1303				lf_set_start(state, overlap, lock->lf_end + 1,
1304				    &granted);
1305				break;
1306			}
1307			break;
1308		}
1309#ifdef LOCKF_DEBUG
1310		if (lockf_debug & 1) {
1311			if (lock->lf_type != F_UNLCK)
1312				lf_print("lf_activate_lock: activated", lock);
1313			else
1314				lf_print("lf_activate_lock: unlocked", lock);
1315			lf_printlist("lf_activate_lock", lock);
1316		}
1317#endif /* LOCKF_DEBUG */
1318		if (lock->lf_type != F_UNLCK)
1319			lf_insert_lock(state, lock);
1320	}
1321}
1322
1323/*
1324 * Cancel a pending lock request, either as a result of a signal or a
1325 * cancel request for an async lock.
1326 */
1327static void
1328lf_cancel_lock(struct lockf *state, struct lockf_entry *lock)
1329{
1330	struct lockf_entry_list granted;
1331
1332	/*
1333	 * Note it is theoretically possible that cancelling this lock
1334	 * may allow some other pending lock to become
1335	 * active. Consider this case:
1336	 *
1337	 * Owner	Action		Result		Dependencies
1338	 *
1339	 * A:		lock [0..0]	succeeds
1340	 * B:		lock [2..2]	succeeds
1341	 * C:		lock [1..2]	blocked		C->B
1342	 * D:		lock [0..1]	blocked		C->B,D->A,D->C
1343	 * A:		unlock [0..0]			C->B,D->C
1344	 * C:		cancel [1..2]
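	 *
	 * Cancelling C's request removes the edges C->B and D->C; D
	 * then has no remaining out-going edges and can be granted.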
1345	 */
1346
1347	LIST_REMOVE(lock, lf_link);
1348
1349	/*
1350	 * Removing out-going edges is simple.
1351	 */
1352	sx_xlock(&lf_owner_graph_lock);
1353	lf_remove_outgoing(lock);
1354	sx_xunlock(&lf_owner_graph_lock);
1355
1356	/*
1357	 * Removing in-coming edges may allow some other lock to
1358	 * become active - we use lf_update_dependancies to figure
1359	 * this out.
1360	 */
1361	LIST_INIT(&granted);
1362	lf_update_dependancies(state, lock, TRUE, &granted);
1363	lf_free_lock(lock);
1364
1365	/*
1366	 * Feed any newly active locks to lf_activate_lock.
1367	 */
1368	while (!LIST_EMPTY(&granted)) {
1369		lock = LIST_FIRST(&granted);
1370		LIST_REMOVE(lock, lf_link);
1371		lf_activate_lock(state, lock);
1372	}
1373}
1374
1375/*
1376 * Set a byte-range lock.
1377 */
1378static int
1379lf_setlock(struct lockf *state, struct lockf_entry *lock, struct vnode *vp,
1380    void **cookiep)
1381{
1382	static char lockstr[] = "lockf";
1383	int error, priority, stops_deferred;
1384
1385#ifdef LOCKF_DEBUG
1386	if (lockf_debug & 1)
1387		lf_print("lf_setlock", lock);
1388#endif /* LOCKF_DEBUG */
1389
1390	/*
1391	 * Set the priority
1392	 */
1393	priority = PLOCK;
1394	if (lock->lf_type == F_WRLCK)
1395		priority += 4;
1396	if (!(lock->lf_flags & F_NOINTR))
1397		priority |= PCATCH;
1398	/*
1399	 * Scan lock list for this file looking for locks that would block us.
1400	 */
1401	if (lf_getblock(state, lock)) {
1402		/*
1403		 * Free the structure and return if nonblocking.
1404		 */
1405		if ((lock->lf_flags & F_WAIT) == 0
1406		    && lock->lf_async_task == NULL) {
1407			lf_free_lock(lock);
1408			error = EAGAIN;
1409			goto out;
1410		}
1411
1412		/*
1413		 * For flock type locks, we must first remove
1414		 * any shared locks that we hold before we sleep
1415		 * waiting for an exclusive lock.
1416		 */
1417		if ((lock->lf_flags & F_FLOCK) &&
1418		    lock->lf_type == F_WRLCK) {
1419			lock->lf_type = F_UNLCK;
1420			lf_activate_lock(state, lock);
1421			lock->lf_type = F_WRLCK;
1422		}
1423
1424		/*
1425		 * We are blocked. Create edges to each blocking lock,
1426		 * checking for deadlock using the owner graph. For
1427		 * simplicity, we run deadlock detection for all
1428		 * locks, posix and otherwise.
1429		 */
1430		sx_xlock(&lf_owner_graph_lock);
1431		error = lf_add_outgoing(state, lock);
1432		sx_xunlock(&lf_owner_graph_lock);
1433
1434		if (error) {
1435#ifdef LOCKF_DEBUG
1436			if (lockf_debug & 1)
1437				lf_print("lf_setlock: deadlock", lock);
1438#endif
1439			lf_free_lock(lock);
1440			goto out;
1441		}
1442
1443		/*
1444		 * We have added edges to everything that blocks
1445		 * us. Sleep until they all go away.
1446		 */
1447		LIST_INSERT_HEAD(&state->ls_pending, lock, lf_link);
1448#ifdef LOCKF_DEBUG
1449		if (lockf_debug & 1) {
1450			struct lockf_edge *e;
1451			LIST_FOREACH(e, &lock->lf_outedges, le_outlink) {
1452				lf_print("lf_setlock: blocking on", e->le_to);
1453				lf_printlist("lf_setlock", e->le_to);
1454			}
1455		}
1456#endif /* LOCKF_DEBUG */
1457
1458		if ((lock->lf_flags & F_WAIT) == 0) {
1459			/*
1460			 * The caller requested async notification -
1461			 * this callback happens when the blocking
1462			 * lock is released, allowing the caller to
1463			 * make another attempt to take the lock.
1464			 */
1465			*cookiep = (void *) lock;
1466			error = EINPROGRESS;
1467			goto out;
1468		}
1469
1470		lock->lf_refs++;
1471		stops_deferred = sigdeferstop(SIGDEFERSTOP_ERESTART);
1472		error = sx_sleep(lock, &state->ls_lock, priority, lockstr, 0);
1473		sigallowstop(stops_deferred);
1474		if (lf_free_lock(lock)) {
1475			error = EDOOFUS;
1476			goto out;
1477		}
1478
1479		/*
1480		 * We may have been awakened by a signal and/or by a
1481		 * debugger continuing us (in which cases we must
1482		 * remove our lock graph edges) and/or by another
1483		 * process releasing a lock (in which case our edges
1484		 * have already been removed and we have been moved to
1485		 * the active list). We may also have been woken by
1486		 * lf_purgelocks which we report to the caller as
1487		 * EINTR. In that case, lf_purgelocks will have
1488		 * removed our lock graph edges.
1489		 *
1490		 * Note that it is possible to receive a signal after
1491		 * we were successfully woken (and moved to the active
1492		 * list) but before we resumed execution. In this
1493		 * case, our lf_outedges list will be clear. We
1494		 * pretend there was no error.
1495		 *
1496		 * Note also, if we have been sleeping long enough, we
1497		 * may now have incoming edges from some newer lock
1498		 * which is waiting behind us in the queue.
1499		 */
1500		if (lock->lf_flags & F_INTR) {
1501			error = EINTR;
1502			lf_free_lock(lock);
1503			goto out;
1504		}
1505		if (LIST_EMPTY(&lock->lf_outedges)) {
1506			error = 0;
1507		} else {
1508			lf_cancel_lock(state, lock);
1509			goto out;
1510		}
1511#ifdef LOCKF_DEBUG
1512		if (lockf_debug & 1) {
1513			lf_print("lf_setlock: granted", lock);
1514		}
1515#endif
1516		goto out;
1517	}
1518	/*
1519	 * It looks like we are going to grant the lock. First add
1520	 * edges from any currently pending lock that the new lock
1521	 * would block.
1522	 */
1523	sx_xlock(&lf_owner_graph_lock);
1524	error = lf_add_incoming(state, lock);
1525	sx_xunlock(&lf_owner_graph_lock);
1526	if (error) {
1527#ifdef LOCKF_DEBUG
1528		if (lockf_debug & 1)
1529			lf_print("lf_setlock: deadlock", lock);
1530#endif
1531		lf_free_lock(lock);
1532		goto out;
1533	}
1534
1535	/*
1536	 * No blocks!!  Add the lock.  Note that we will
1537	 * downgrade or upgrade any overlapping locks this
1538	 * process already owns.
1539	 */
1540	lf_activate_lock(state, lock);
1541	error = 0;
1542out:
1543	return (error);
1544}
1545
1546/*
1547 * Remove a byte-range lock on an inode.
1548 *
1549 * Generally, find the lock (or an overlap to that lock)
1550 * and remove it (or shrink it), then wakeup anyone we can.
1551 */
1552static int
1553lf_clearlock(struct lockf *state, struct lockf_entry *unlock)
1554{
1555	struct lockf_entry *overlap;
1556
1557	overlap = LIST_FIRST(&state->ls_active);
1558
1559	if (overlap == NOLOCKF)
1560		return (0);
1561#ifdef LOCKF_DEBUG
1562	if (unlock->lf_type != F_UNLCK)
1563		panic("lf_clearlock: bad type");
1564	if (lockf_debug & 1)
1565		lf_print("lf_clearlock", unlock);
1566#endif /* LOCKF_DEBUG */
1567
1568	lf_activate_lock(state, unlock);
1569
1570	return (0);
1571}
1572
1573/*
1574 * Check whether there is a blocking lock, and if so return its
1575 * details in '*fl'.
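 *
 * For example, if the request conflicts with a write lock on [0..0]
 * held by pid 42, '*fl' is filled in with l_type F_WRLCK, l_whence
 * SEEK_SET, l_start 0, l_len 1 and the blocking owner's l_pid and
 * l_sysid; if nothing blocks the request, only l_type is set (to
 * F_UNLCK).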
1576 */
1577static int
1578lf_getlock(struct lockf *state, struct lockf_entry *lock, struct flock *fl)
1579{
1580	struct lockf_entry *block;
1581
1582#ifdef LOCKF_DEBUG
1583	if (lockf_debug & 1)
1584		lf_print("lf_getlock", lock);
1585#endif /* LOCKF_DEBUG */
1586
1587	if ((block = lf_getblock(state, lock))) {
1588		fl->l_type = block->lf_type;
1589		fl->l_whence = SEEK_SET;
1590		fl->l_start = block->lf_start;
1591		if (block->lf_end == OFF_MAX)
1592			fl->l_len = 0;
1593		else
1594			fl->l_len = block->lf_end - block->lf_start + 1;
1595		fl->l_pid = block->lf_owner->lo_pid;
1596		fl->l_sysid = block->lf_owner->lo_sysid;
1597	} else {
1598		fl->l_type = F_UNLCK;
1599	}
1600	return (0);
1601}
1602
1603/*
1604 * Cancel an async lock request.
1605 */
1606static int
1607lf_cancel(struct lockf *state, struct lockf_entry *lock, void *cookie)
1608{
1609	struct lockf_entry *reallock;
1610
1611	/*
1612	 * We need to match this request with an existing lock
1613	 * request.
1614	 */
1615	LIST_FOREACH(reallock, &state->ls_pending, lf_link) {
1616		if ((void *) reallock == cookie) {
1617			/*
1618			 * Double-check that this lock looks right
1619			 * (maybe use a rolling ID for the cancel
1620			 * cookie instead?)
1621			 */
1622			if (!(reallock->lf_vnode == lock->lf_vnode
1623				&& reallock->lf_start == lock->lf_start
1624				&& reallock->lf_end == lock->lf_end)) {
1625				return (ENOENT);
1626			}
1627
1628			/*
1629			 * Make sure this lock was async and then just
1630			 * remove it from its wait lists.
1631			 */
1632			if (!reallock->lf_async_task) {
1633				return (ENOENT);
1634			}
1635
1636			/*
1637			 * Note that since any other thread must take
1638			 * state->ls_lock before it can possibly
1639			 * trigger the async callback, we are safe
1640			 * from a race with lf_wakeup_lock, i.e. we
1641			 * can free the lock (actually our caller does
1642			 * this).
1643			 */
1644			lf_cancel_lock(state, reallock);
1645			return (0);
1646		}
1647	}
1648
1649	/*
1650	 * We didn't find a matching lock - not much we can do here.
1651	 */
1652	return (ENOENT);
1653}
1654
1655/*
1656 * Walk the list of locks for an inode and
1657 * return the first blocking lock.
1658 */
1659static struct lockf_entry *
1660lf_getblock(struct lockf *state, struct lockf_entry *lock)
1661{
1662	struct lockf_entry *overlap;
1663
1664	LIST_FOREACH(overlap, &state->ls_active, lf_link) {
1665		/*
1666		 * We may assume that the active list is sorted by
1667		 * lf_start.
1668		 */
1669		if (overlap->lf_start > lock->lf_end)
1670			break;
1671		if (!lf_blocks(lock, overlap))
1672			continue;
1673		return (overlap);
1674	}
1675	return (NOLOCKF);
1676}
1677
1678/*
1679 * Walk the list of locks for an inode to find an overlapping lock (if
1680 * any) and return a classification of that overlap.
1681 *
1682 * Arguments:
1683 *	*overlap	The place in the lock list to start looking
1684 *	lock		The lock which is being tested
1685 *	type		Pass 'SELF' to test only locks with the same
 *			owner as lock, or 'OTHERS' to test only locks
1687 *			with a different owner
1688 *
1689 * Returns one of six values:
1690 *	0) no overlap
1691 *	1) overlap == lock
1692 *	2) overlap contains lock
1693 *	3) lock contains overlap
1694 *	4) overlap starts before lock
1695 *	5) overlap ends after lock
1696 *
1697 * If there is an overlapping lock, '*overlap' is set to point at the
1698 * overlapping lock.
1699 *
1700 * NOTE: this returns only the FIRST overlapping lock.  There
1701 *	 may be more than one.
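 *
 * For example, when testing a lock covering [5..10]: an entry
 * covering [3..12] is case 2, [6..9] is case 3, [3..7] is case 4 and
 * [7..12] is case 5.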
1702 */
1703static int
1704lf_findoverlap(struct lockf_entry **overlap, struct lockf_entry *lock, int type)
1705{
1706	struct lockf_entry *lf;
1707	off_t start, end;
1708	int res;
1709
1710	if ((*overlap) == NOLOCKF) {
1711		return (0);
1712	}
1713#ifdef LOCKF_DEBUG
1714	if (lockf_debug & 2)
1715		lf_print("lf_findoverlap: looking for overlap in", lock);
1716#endif /* LOCKF_DEBUG */
1717	start = lock->lf_start;
1718	end = lock->lf_end;
1719	res = 0;
1720	while (*overlap) {
1721		lf = *overlap;
1722		if (lf->lf_start > end)
1723			break;
1724		if (((type & SELF) && lf->lf_owner != lock->lf_owner) ||
1725		    ((type & OTHERS) && lf->lf_owner == lock->lf_owner)) {
1726			*overlap = LIST_NEXT(lf, lf_link);
1727			continue;
1728		}
1729#ifdef LOCKF_DEBUG
1730		if (lockf_debug & 2)
1731			lf_print("\tchecking", lf);
1732#endif /* LOCKF_DEBUG */
1733		/*
1734		 * OK, check for overlap
1735		 *
1736		 * Six cases:
1737		 *	0) no overlap
1738		 *	1) overlap == lock
1739		 *	2) overlap contains lock
1740		 *	3) lock contains overlap
1741		 *	4) overlap starts before lock
1742		 *	5) overlap ends after lock
1743		 */
1744		if (start > lf->lf_end) {
1745			/* Case 0 */
1746#ifdef LOCKF_DEBUG
1747			if (lockf_debug & 2)
1748				printf("no overlap\n");
1749#endif /* LOCKF_DEBUG */
1750			*overlap = LIST_NEXT(lf, lf_link);
1751			continue;
1752		}
1753		if (lf->lf_start == start && lf->lf_end == end) {
1754			/* Case 1 */
1755#ifdef LOCKF_DEBUG
1756			if (lockf_debug & 2)
1757				printf("overlap == lock\n");
1758#endif /* LOCKF_DEBUG */
1759			res = 1;
1760			break;
1761		}
1762		if (lf->lf_start <= start && lf->lf_end >= end) {
1763			/* Case 2 */
1764#ifdef LOCKF_DEBUG
1765			if (lockf_debug & 2)
1766				printf("overlap contains lock\n");
1767#endif /* LOCKF_DEBUG */
1768			res = 2;
1769			break;
1770		}
1771		if (start <= lf->lf_start && end >= lf->lf_end) {
1772			/* Case 3 */
1773#ifdef LOCKF_DEBUG
1774			if (lockf_debug & 2)
1775				printf("lock contains overlap\n");
1776#endif /* LOCKF_DEBUG */
1777			res = 3;
1778			break;
1779		}
1780		if (lf->lf_start < start && lf->lf_end >= start) {
1781			/* Case 4 */
1782#ifdef LOCKF_DEBUG
1783			if (lockf_debug & 2)
1784				printf("overlap starts before lock\n");
1785#endif /* LOCKF_DEBUG */
1786			res = 4;
1787			break;
1788		}
1789		if (lf->lf_start > start && lf->lf_end > end) {
1790			/* Case 5 */
1791#ifdef LOCKF_DEBUG
1792			if (lockf_debug & 2)
1793				printf("overlap ends after lock\n");
1794#endif /* LOCKF_DEBUG */
1795			res = 5;
1796			break;
1797		}
1798		panic("lf_findoverlap: default");
1799	}
1800	return (res);
1801}
1802
1803/*
 * Split the existing 'lock1', based on the extent of the lock
1805 * described by 'lock2'. The existing lock should cover 'lock2'
1806 * entirely.
1807 *
 * Any pending locks which have been unblocked are added to
 * 'granted'.
1810 */
1811static void
1812lf_split(struct lockf *state, struct lockf_entry *lock1,
1813    struct lockf_entry *lock2, struct lockf_entry_list *granted)
1814{
1815	struct lockf_entry *splitlock;
1816
1817#ifdef LOCKF_DEBUG
1818	if (lockf_debug & 2) {
1819		lf_print("lf_split", lock1);
1820		lf_print("splitting from", lock2);
1821	}
1822#endif /* LOCKF_DEBUG */
1823	/*
1824	 * Check to see if we don't need to split at all.
1825	 */
1826	if (lock1->lf_start == lock2->lf_start) {
1827		lf_set_start(state, lock1, lock2->lf_end + 1, granted);
1828		return;
1829	}
1830	if (lock1->lf_end == lock2->lf_end) {
1831		lf_set_end(state, lock1, lock2->lf_start - 1, granted);
1832		return;
1833	}
1834	/*
1835	 * Make a new lock consisting of the last part of
1836	 * the encompassing lock.
1837	 */
1838	splitlock = lf_alloc_lock(lock1->lf_owner);
1839	memcpy(splitlock, lock1, sizeof *splitlock);
1840	splitlock->lf_refs = 1;
1841	if (splitlock->lf_flags & F_REMOTE)
1842		vref(splitlock->lf_vnode);
1843
1844	/*
1845	 * This cannot cause a deadlock since any edges we would add
1846	 * to splitlock already exist in lock1. We must be sure to add
1847	 * necessary dependencies to splitlock before we reduce lock1
1848	 * otherwise we may accidentally grant a pending lock that
1849	 * was blocked by the tail end of lock1.
1850	 */
1851	splitlock->lf_start = lock2->lf_end + 1;
1852	LIST_INIT(&splitlock->lf_outedges);
1853	LIST_INIT(&splitlock->lf_inedges);
1854	sx_xlock(&lf_owner_graph_lock);
1855	lf_add_incoming(state, splitlock);
1856	sx_xunlock(&lf_owner_graph_lock);
1857
1858	lf_set_end(state, lock1, lock2->lf_start - 1, granted);
1859
1860	/*
1861	 * OK, now link it in
1862	 */
1863	lf_insert_lock(state, splitlock);
1864}
1865
1866struct lockdesc {
1867	STAILQ_ENTRY(lockdesc) link;
1868	struct vnode *vp;
1869	struct flock fl;
1870};
1871STAILQ_HEAD(lockdesclist, lockdesc);
1872
1873int
1874lf_iteratelocks_sysid(int sysid, lf_iterator *fn, void *arg)
1875{
1876	struct lockf *ls;
1877	struct lockf_entry *lf;
1878	struct lockdesc *ldesc;
1879	struct lockdesclist locks;
1880	int error;
1881
1882	/*
1883	 * In order to keep the locking simple, we iterate over the
1884	 * active lock lists to build a list of locks that need
1885	 * releasing. We then call the iterator for each one in turn.
1886	 *
1887	 * We take an extra reference to the vnode for the duration to
1888	 * make sure it doesn't go away before we are finished.
1889	 */
1890	STAILQ_INIT(&locks);
1891	sx_xlock(&lf_lock_states_lock);
1892	LIST_FOREACH(ls, &lf_lock_states, ls_link) {
1893		sx_xlock(&ls->ls_lock);
1894		LIST_FOREACH(lf, &ls->ls_active, lf_link) {
1895			if (lf->lf_owner->lo_sysid != sysid)
1896				continue;
1897
1898			ldesc = malloc(sizeof(struct lockdesc), M_LOCKF,
1899			    M_WAITOK);
1900			ldesc->vp = lf->lf_vnode;
1901			vref(ldesc->vp);
1902			ldesc->fl.l_start = lf->lf_start;
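			/*
			 * An lf_end of OFF_MAX is the internal encoding
			 * for "to end of file"; it maps back to
			 * l_len == 0 in the flock interface. Otherwise
			 * l_len is the length of the inclusive range.
			 */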
1903			if (lf->lf_end == OFF_MAX)
1904				ldesc->fl.l_len = 0;
1905			else
1906				ldesc->fl.l_len =
1907					lf->lf_end - lf->lf_start + 1;
1908			ldesc->fl.l_whence = SEEK_SET;
1909			ldesc->fl.l_type = F_UNLCK;
1910			ldesc->fl.l_pid = lf->lf_owner->lo_pid;
1911			ldesc->fl.l_sysid = sysid;
1912			STAILQ_INSERT_TAIL(&locks, ldesc, link);
1913		}
1914		sx_xunlock(&ls->ls_lock);
1915	}
1916	sx_xunlock(&lf_lock_states_lock);
1917
1918	/*
1919	 * Call the iterator function for each lock in turn. If the
1920	 * iterator returns an error code, just free the rest of the
1921	 * lockdesc structures.
1922	 */
1923	error = 0;
1924	while ((ldesc = STAILQ_FIRST(&locks)) != NULL) {
1925		STAILQ_REMOVE_HEAD(&locks, link);
1926		if (!error)
1927			error = fn(ldesc->vp, &ldesc->fl, arg);
1928		vrele(ldesc->vp);
1929		free(ldesc, M_LOCKF);
1930	}
1931
1932	return (error);
1933}
1934
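/*
 * As above, but iterate over all active locks recorded against a
 * single vnode rather than those belonging to a particular remote
 * system.
 */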
1935int
1936lf_iteratelocks_vnode(struct vnode *vp, lf_iterator *fn, void *arg)
1937{
1938	struct lockf *ls;
1939	struct lockf_entry *lf;
1940	struct lockdesc *ldesc;
1941	struct lockdesclist locks;
1942	int error;
1943
1944	/*
1945	 * In order to keep the locking simple, we iterate over the
1946	 * active lock lists to build a list of locks that need
1947	 * releasing. We then call the iterator for each one in turn.
1948	 *
1949	 * We take an extra reference to the vnode for the duration to
1950	 * make sure it doesn't go away before we are finished.
1951	 */
1952	STAILQ_INIT(&locks);
1953	VI_LOCK(vp);
1954	ls = vp->v_lockf;
1955	if (!ls) {
1956		VI_UNLOCK(vp);
1957		return (0);
1958	}
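	/*
	 * Take a thread reference on the lock state so that it cannot
	 * be torn down while we drop the vnode interlock and walk the
	 * active list; the matching decrement and wakeup below release
	 * it for any waiters.
	 */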
1959	ls->ls_threads++;
1960	VI_UNLOCK(vp);
1961
1962	sx_xlock(&ls->ls_lock);
1963	LIST_FOREACH(lf, &ls->ls_active, lf_link) {
1964		ldesc = malloc(sizeof(struct lockdesc), M_LOCKF,
1965		    M_WAITOK);
1966		ldesc->vp = lf->lf_vnode;
1967		vref(ldesc->vp);
1968		ldesc->fl.l_start = lf->lf_start;
1969		if (lf->lf_end == OFF_MAX)
1970			ldesc->fl.l_len = 0;
1971		else
1972			ldesc->fl.l_len =
1973				lf->lf_end - lf->lf_start + 1;
1974		ldesc->fl.l_whence = SEEK_SET;
1975		ldesc->fl.l_type = F_UNLCK;
1976		ldesc->fl.l_pid = lf->lf_owner->lo_pid;
1977		ldesc->fl.l_sysid = lf->lf_owner->lo_sysid;
1978		STAILQ_INSERT_TAIL(&locks, ldesc, link);
1979	}
1980	sx_xunlock(&ls->ls_lock);
1981	VI_LOCK(vp);
1982	ls->ls_threads--;
1983	wakeup(ls);
1984	VI_UNLOCK(vp);
1985
1986	/*
1987	 * Call the iterator function for each lock in turn. If the
1988	 * iterator returns an error code, just free the rest of the
1989	 * lockdesc structures.
1990	 */
1991	error = 0;
1992	while ((ldesc = STAILQ_FIRST(&locks)) != NULL) {
1993		STAILQ_REMOVE_HEAD(&locks, link);
1994		if (!error)
1995			error = fn(ldesc->vp, &ldesc->fl, arg);
1996		vrele(ldesc->vp);
1997		free(ldesc, M_LOCKF);
1998	}
1999
2000	return (error);
2001}
2002
2003static int
2004lf_clearremotesys_iterator(struct vnode *vp, struct flock *fl, void *arg)
2005{
2006
2007	VOP_ADVLOCK(vp, 0, F_UNLCK, fl, F_REMOTE);
2008	return (0);
2009}
2010
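/*
 * Release all locks held on behalf of the remote system 'sysid'.
 */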
2011void
2012lf_clearremotesys(int sysid)
2013{
2014
2015	KASSERT(sysid != 0, ("Can't clear local locks with F_UNLCKSYS"));
2016	lf_iteratelocks_sysid(sysid, lf_clearremotesys_iterator, NULL);
2017}
2018
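/*
 * Return the sum of the reference counts of all lock owners belonging
 * to the remote system 'sysid'.
 */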
2019int
2020lf_countlocks(int sysid)
2021{
2022	int i;
2023	struct lock_owner *lo;
2024	int count;
2025
2026	count = 0;
2027	sx_xlock(&lf_lock_owners_lock);
2028	for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++)
2029		LIST_FOREACH(lo, &lf_lock_owners[i], lo_link)
2030			if (lo->lo_sysid == sysid)
2031				count += lo->lo_refs;
2032	sx_xunlock(&lf_lock_owners_lock);
2033
2034	return (count);
2035}
2036
2037#ifdef LOCKF_DEBUG
2038
2039/*
2040 * Return non-zero if y is reachable from x using a brute force
2041 * search. If reachable and path is non-null, return the route taken
2042 * in path.
2043 */
2044static int
2045graph_reaches(struct owner_vertex *x, struct owner_vertex *y,
2046    struct owner_vertex_list *path)
2047{
2048	struct owner_edge *e;
2049
2050	if (x == y) {
2051		if (path)
2052			TAILQ_INSERT_HEAD(path, x, v_link);
2053		return 1;
2054	}
2055
2056	LIST_FOREACH(e, &x->v_outedges, e_outlink) {
2057		if (graph_reaches(e->e_to, y, path)) {
2058			if (path)
2059				TAILQ_INSERT_HEAD(path, x, v_link);
2060			return 1;
2061		}
2062	}
2063	return 0;
2064}
2065
2066/*
2067 * Perform consistency checks on the graph. Make sure the values of
2068 * v_order are correct. If checkorder is non-zero, check no vertex can
2069 * reach any other vertex with a smaller order.
2070 */
2071static void
2072graph_check(struct owner_graph *g, int checkorder)
2073{
2074	int i, j;
2075
2076	for (i = 0; i < g->g_size; i++) {
2077		if (!g->g_vertices[i]->v_owner)
2078			continue;
2079		KASSERT(g->g_vertices[i]->v_order == i,
2080		    ("lock graph vertices disordered"));
2081		if (checkorder) {
2082			for (j = 0; j < i; j++) {
2083				if (!g->g_vertices[j]->v_owner)
2084					continue;
2085				KASSERT(!graph_reaches(g->g_vertices[i],
2086					g->g_vertices[j], NULL),
2087				    ("lock graph vertices disordered"));
2088			}
2089		}
2090	}
2091}
2092
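/*
 * Print the vertices in 'set' as "{ order:owner, ... }".
 */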
2093static void
2094graph_print_vertices(struct owner_vertex_list *set)
2095{
2096	struct owner_vertex *v;
2097
2098	printf("{ ");
2099	TAILQ_FOREACH(v, set, v_link) {
2100		printf("%d:", v->v_order);
2101		lf_print_owner(v->v_owner);
2102		if (TAILQ_NEXT(v, v_link))
2103			printf(", ");
2104	}
2105	printf(" }\n");
2106}
2107
2108#endif
2109
2110/*
2111 * Calculate the sub-set of vertices v from the affected region [y..x]
2112 * where v is reachable from y. Return -1 if a loop was detected
2113 * (i.e. x is reachable from y), otherwise return the number of
2114 * vertices in this subset.
2115 */
2116static int
2117graph_delta_forward(struct owner_graph *g, struct owner_vertex *x,
2118    struct owner_vertex *y, struct owner_vertex_list *delta)
2119{
2120	uint32_t gen;
2121	struct owner_vertex *v;
2122	struct owner_edge *e;
2123	int n;
2124
2125	/*
2126	 * We start with a set containing just y. Then for each vertex
2127	 * v in the set so far unprocessed, we add each vertex that v
2128	 * has an out-edge to and that is within the affected region
2129	 * [y..x]. If we see the vertex x on our travels, stop
2130	 * immediately.
2131	 */
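	/*
	 * Note that the 'delta' list doubles as the work queue: newly
	 * discovered vertices are appended to it and 'v' simply walks
	 * forward until it falls off the end.
	 */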
2132	TAILQ_INIT(delta);
2133	TAILQ_INSERT_TAIL(delta, y, v_link);
2134	v = y;
2135	n = 1;
2136	gen = g->g_gen;
2137	while (v) {
2138		LIST_FOREACH(e, &v->v_outedges, e_outlink) {
2139			if (e->e_to == x)
2140				return -1;
2141			if (e->e_to->v_order < x->v_order
2142			    && e->e_to->v_gen != gen) {
2143				e->e_to->v_gen = gen;
2144				TAILQ_INSERT_TAIL(delta, e->e_to, v_link);
2145				n++;
2146			}
2147		}
2148		v = TAILQ_NEXT(v, v_link);
2149	}
2150
2151	return (n);
2152}
2153
2154/*
2155 * Calculate the sub-set of vertices v from the affected region [y..x]
2156 * where v reaches x. Return the number of vertices in this subset.
2157 */
2158static int
2159graph_delta_backward(struct owner_graph *g, struct owner_vertex *x,
2160    struct owner_vertex *y, struct owner_vertex_list *delta)
2161{
2162	uint32_t gen;
2163	struct owner_vertex *v;
2164	struct owner_edge *e;
2165	int n;
2166
2167	/*
2168	 * We start with a set containing just x. Then for each vertex
2169	 * v in the set so far unprocessed, we add each vertex that v
2170	 * has an in-edge from and that is within the affected region
2171	 * [y..x].
2172	 */
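	/*
	 * Here the 'delta' list again serves as the work queue, but
	 * newly discovered vertices are inserted at the head and 'v'
	 * walks backwards (towards the head) using TAILQ_PREV.
	 */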
2173	TAILQ_INIT(delta);
2174	TAILQ_INSERT_TAIL(delta, x, v_link);
2175	v = x;
2176	n = 1;
2177	gen = g->g_gen;
2178	while (v) {
2179		LIST_FOREACH(e, &v->v_inedges, e_inlink) {
2180			if (e->e_from->v_order > y->v_order
2181			    && e->e_from->v_gen != gen) {
2182				e->e_from->v_gen = gen;
2183				TAILQ_INSERT_HEAD(delta, e->e_from, v_link);
2184				n++;
2185			}
2186		}
2187		v = TAILQ_PREV(v, owner_vertex_list, v_link);
2188	}
2189
2190	return (n);
2191}
2192
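/*
 * Insertion-sort the order values of the vertices in 'set' into the
 * already-sorted array 'indices', which holds 'n' entries on entry.
 * Return the new number of entries.
 */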
2193static int
2194graph_add_indices(int *indices, int n, struct owner_vertex_list *set)
2195{
2196	struct owner_vertex *v;
2197	int i, j;
2198
2199	TAILQ_FOREACH(v, set, v_link) {
2200		for (i = n;
2201		     i > 0 && indices[i - 1] > v->v_order; i--)
2202			;
2203		for (j = n - 1; j >= i; j--)
2204			indices[j + 1] = indices[j];
2205		indices[i] = v->v_order;
2206		n++;
2207	}
2208
2209	return (n);
2210}
2211
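/*
 * Pull vertices out of 'set' in increasing order of their old v_order
 * and give each the next unused value from 'indices', updating the
 * graph's vertex array to match. Return the next unused index.
 */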
2212static int
2213graph_assign_indices(struct owner_graph *g, int *indices, int nextunused,
2214    struct owner_vertex_list *set)
2215{
2216	struct owner_vertex *v, *vlowest;
2217
2218	while (!TAILQ_EMPTY(set)) {
2219		vlowest = NULL;
2220		TAILQ_FOREACH(v, set, v_link) {
2221			if (!vlowest || v->v_order < vlowest->v_order)
2222				vlowest = v;
2223		}
2224		TAILQ_REMOVE(set, vlowest, v_link);
2225		vlowest->v_order = indices[nextunused];
2226		g->g_vertices[vlowest->v_order] = vlowest;
2227		nextunused++;
2228	}
2229
2230	return (nextunused);
2231}
2232
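/*
 * Add an edge x->y to the owner graph, re-ordering the vertices if
 * necessary so that the topological ordering is preserved. If the edge
 * already exists, just bump its reference count. Return EDEADLK if the
 * new edge would create a cycle.
 */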
2233static int
2234graph_add_edge(struct owner_graph *g, struct owner_vertex *x,
2235    struct owner_vertex *y)
2236{
2237	struct owner_edge *e;
2238	struct owner_vertex_list deltaF, deltaB;
2239	int nF, nB, n, vi, i;
2240	int *indices;
2241
2242	sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
2243
2244	LIST_FOREACH(e, &x->v_outedges, e_outlink) {
2245		if (e->e_to == y) {
2246			e->e_refs++;
2247			return (0);
2248		}
2249	}
2250
2251#ifdef LOCKF_DEBUG
2252	if (lockf_debug & 8) {
2253		printf("adding edge %d:", x->v_order);
2254		lf_print_owner(x->v_owner);
2255		printf(" -> %d:", y->v_order);
2256		lf_print_owner(y->v_owner);
2257		printf("\n");
2258	}
2259#endif
2260	if (y->v_order < x->v_order) {
2261		/*
2262		 * The new edge violates the order. First find the set
2263		 * of affected vertices reachable from y (deltaF) and
2264		 * the set of affected vertices that reach x
2265		 * (deltaB), using the graph generation number to
2266		 * detect whether we have visited a given vertex
2267		 * already. We re-order the graph so that each vertex
2268		 * in deltaB appears before each vertex in deltaF.
2269		 *
2270		 * If x is a member of deltaF, then the new edge would
2271		 * create a cycle. Otherwise, we may assume that
2272		 * deltaF and deltaB are disjoint.
2273		 */
2274		g->g_gen++;
2275		if (g->g_gen == 0) {
2276			/*
2277			 * Generation wrap.
2278			 */
2279			for (vi = 0; vi < g->g_size; vi++) {
2280				g->g_vertices[vi]->v_gen = 0;
2281			}
2282			g->g_gen++;
2283		}
2284		nF = graph_delta_forward(g, x, y, &deltaF);
2285		if (nF < 0) {
2286#ifdef LOCKF_DEBUG
2287			if (lockf_debug & 8) {
2288				struct owner_vertex_list path;
2289				printf("deadlock: ");
2290				TAILQ_INIT(&path);
2291				graph_reaches(y, x, &path);
2292				graph_print_vertices(&path);
2293			}
2294#endif
2295			return (EDEADLK);
2296		}
2297
2298#ifdef LOCKF_DEBUG
2299		if (lockf_debug & 8) {
2300			printf("re-ordering graph vertices\n");
2301			printf("deltaF = ");
2302			graph_print_vertices(&deltaF);
2303		}
2304#endif
2305
2306		nB = graph_delta_backward(g, x, y, &deltaB);
2307
2308#ifdef LOCKF_DEBUG
2309		if (lockf_debug & 8) {
2310			printf("deltaB = ");
2311			graph_print_vertices(&deltaB);
2312		}
2313#endif
2314
2315		/*
2316		 * We first build a set of vertex indices (vertex
2317		 * order values) that we may use, then we re-assign
2318		 * orders first to those vertices in deltaB, then to
2319		 * deltaF. Note that the contents of deltaF and deltaB
2320		 * may be partially disordered - we perform an
2321		 * insertion sort while building our index set.
2322		 */
2323		indices = g->g_indexbuf;
2324		n = graph_add_indices(indices, 0, &deltaF);
2325		graph_add_indices(indices, n, &deltaB);
2326
2327		/*
2328		 * We must also be sure to maintain the relative
2329		 * ordering of deltaF and deltaB when re-assigning
2330		 * vertices. We do this by iteratively removing the
2331		 * lowest ordered element from the set and assigning
2332		 * it the next value from our new ordering.
2333		 */
2334		i = graph_assign_indices(g, indices, 0, &deltaB);
2335		graph_assign_indices(g, indices, i, &deltaF);
2336
2337#ifdef LOCKF_DEBUG
2338		if (lockf_debug & 8) {
2339			struct owner_vertex_list set;
2340			TAILQ_INIT(&set);
2341			for (i = 0; i < nB + nF; i++)
2342				TAILQ_INSERT_TAIL(&set,
2343				    g->g_vertices[indices[i]], v_link);
2344			printf("new ordering = ");
2345			graph_print_vertices(&set);
2346		}
2347#endif
2348	}
2349
2350	KASSERT(x->v_order < y->v_order, ("Failed to re-order graph"));
2351
2352#ifdef LOCKF_DEBUG
2353	if (lockf_debug & 8) {
2354		graph_check(g, TRUE);
2355	}
2356#endif
2357
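	/*
	 * Either the edge already respected the topological order or
	 * the affected vertices have just been re-numbered so that it
	 * does. Record the new edge with a single reference; duplicate
	 * requests for the same edge only bump e_refs in the check at
	 * the top of this function.
	 */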
2358	e = malloc(sizeof(struct owner_edge), M_LOCKF, M_WAITOK);
2359
2360	LIST_INSERT_HEAD(&x->v_outedges, e, e_outlink);
2361	LIST_INSERT_HEAD(&y->v_inedges, e, e_inlink);
2362	e->e_refs = 1;
2363	e->e_from = x;
2364	e->e_to = y;
2365
2366	return (0);
2367}
2368
2369/*
2370 * Remove an edge x->y from the graph.
2371 */
2372static void
2373graph_remove_edge(struct owner_graph *g, struct owner_vertex *x,
2374    struct owner_vertex *y)
2375{
2376	struct owner_edge *e;
2377
2378	sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
2379
2380	LIST_FOREACH(e, &x->v_outedges, e_outlink) {
2381		if (e->e_to == y)
2382			break;
2383	}
2384	KASSERT(e, ("Removing non-existent edge from deadlock graph"));
2385
2386	e->e_refs--;
2387	if (e->e_refs == 0) {
2388#ifdef LOCKF_DEBUG
2389		if (lockf_debug & 8) {
2390			printf("removing edge %d:", x->v_order);
2391			lf_print_owner(x->v_owner);
2392			printf(" -> %d:", y->v_order);
2393			lf_print_owner(y->v_owner);
2394			printf("\n");
2395		}
2396#endif
2397		LIST_REMOVE(e, e_outlink);
2398		LIST_REMOVE(e, e_inlink);
2399		free(e, M_LOCKF);
2400	}
2401}
2402
2403/*
2404 * Allocate a new vertex for the lock owner 'lo' and add it to the
2405 * graph, growing the vertex array and index buffer if necessary.
2406 */
2407static struct owner_vertex *
2408graph_alloc_vertex(struct owner_graph *g, struct lock_owner *lo)
2409{
2410	struct owner_vertex *v;
2411
2412	sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
2413
2414	v = malloc(sizeof(struct owner_vertex), M_LOCKF, M_WAITOK);
2415	if (g->g_size == g->g_space) {
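		/*
		 * The vertex array is full; double it. The scratch
		 * index buffer only needs to match the new capacity,
		 * so it is freed and reallocated rather than preserved.
		 */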
2416		g->g_vertices = realloc(g->g_vertices,
2417		    2 * g->g_space * sizeof(struct owner_vertex *),
2418		    M_LOCKF, M_WAITOK);
2419		free(g->g_indexbuf, M_LOCKF);
2420		g->g_indexbuf = malloc(2 * g->g_space * sizeof(int),
2421		    M_LOCKF, M_WAITOK);
2422		g->g_space = 2 * g->g_space;
2423	}
2424	v->v_order = g->g_size;
2425	v->v_gen = g->g_gen;
2426	g->g_vertices[g->g_size] = v;
2427	g->g_size++;
2428
2429	LIST_INIT(&v->v_outedges);
2430	LIST_INIT(&v->v_inedges);
2431	v->v_owner = lo;
2432
2433	return (v);
2434}
2435
2436static void
2437graph_free_vertex(struct owner_graph *g, struct owner_vertex *v)
2438{
2439	struct owner_vertex *w;
2440	int i;
2441
2442	sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
2443
2444	KASSERT(LIST_EMPTY(&v->v_outedges), ("Freeing vertex with edges"));
2445	KASSERT(LIST_EMPTY(&v->v_inedges), ("Freeing vertex with edges"));
2446
2447	/*
2448	 * Remove from the graph's array and close up the gap,
2449	 * renumbering the other vertices.
2450	 */
2451	for (i = v->v_order + 1; i < g->g_size; i++) {
2452		w = g->g_vertices[i];
2453		w->v_order--;
2454		g->g_vertices[i - 1] = w;
2455	}
2456	g->g_size--;
2457
2458	free(v, M_LOCKF);
2459}
2460
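/*
 * Initialize an owner graph with room for an initial ten vertices.
 */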
2461static struct owner_graph *
2462graph_init(struct owner_graph *g)
2463{
2464
2465	g->g_vertices = malloc(10 * sizeof(struct owner_vertex *),
2466	    M_LOCKF, M_WAITOK);
2467	g->g_size = 0;
2468	g->g_space = 10;
2469	g->g_indexbuf = malloc(g->g_space * sizeof(int), M_LOCKF, M_WAITOK);
2470	g->g_gen = 0;
2471
2472	return (g);
2473}
2474
2475#ifdef LOCKF_DEBUG
2476/*
2477 * Print a description of a lock owner.
2478 */
2479static void
2480lf_print_owner(struct lock_owner *lo)
2481{
2482
2483	if (lo->lo_flags & F_REMOTE) {
2484		printf("remote pid %d, system %d",
2485		    lo->lo_pid, lo->lo_sysid);
2486	} else if (lo->lo_flags & F_FLOCK) {
2487		printf("file %p", lo->lo_id);
2488	} else {
2489		printf("local pid %d", lo->lo_pid);
2490	}
2491}
2492
2493/*
2494 * Print out a lock.
2495 */
2496static void
2497lf_print(char *tag, struct lockf_entry *lock)
2498{
2499
2500	printf("%s: lock %p for ", tag, (void *)lock);
2501	lf_print_owner(lock->lf_owner);
2502	if (lock->lf_inode != (struct inode *)0)
2503		printf(" in ino %ju on dev <%s>,",
2504		    (uintmax_t)lock->lf_inode->i_number,
2505		    devtoname(ITODEV(lock->lf_inode)));
2506	printf(" %s, start %jd, end ",
2507	    lock->lf_type == F_RDLCK ? "shared" :
2508	    lock->lf_type == F_WRLCK ? "exclusive" :
2509	    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
2510	    (intmax_t)lock->lf_start);
2511	if (lock->lf_end == OFF_MAX)
2512		printf("EOF");
2513	else
2514		printf("%jd", (intmax_t)lock->lf_end);
2515	if (!LIST_EMPTY(&lock->lf_outedges))
2516		printf(" block %p\n",
2517		    (void *)LIST_FIRST(&lock->lf_outedges)->le_to);
2518	else
2519		printf("\n");
2520}
2521
2522static void
2523lf_printlist(char *tag, struct lockf_entry *lock)
2524{
2525	struct lockf_entry *lf, *blk;
2526	struct lockf_edge *e;
2527
2528	if (lock->lf_inode == (struct inode *)0)
2529		return;
2530
2531	printf("%s: Lock list for ino %ju on dev <%s>:\n",
2532	    tag, (uintmax_t)lock->lf_inode->i_number,
2533	    devtoname(ITODEV(lock->lf_inode)));
2534	LIST_FOREACH(lf, &lock->lf_vnode->v_lockf->ls_active, lf_link) {
2535		printf("\tlock %p for ", (void *)lf);
2536		lf_print_owner(lf->lf_owner);
2537		printf(", %s, start %jd, end %jd",
2538		    lf->lf_type == F_RDLCK ? "shared" :
2539		    lf->lf_type == F_WRLCK ? "exclusive" :
2540		    lf->lf_type == F_UNLCK ? "unlock" :
2541		    "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
2542		LIST_FOREACH(e, &lf->lf_outedges, le_outlink) {
2543			blk = e->le_to;
2544			printf("\n\t\tlock request %p for ", (void *)blk);
2545			lf_print_owner(blk->lf_owner);
2546			printf(", %s, start %jd, end %jd",
2547			    blk->lf_type == F_RDLCK ? "shared" :
2548			    blk->lf_type == F_WRLCK ? "exclusive" :
2549			    blk->lf_type == F_UNLCK ? "unlock" :
2550			    "unknown", (intmax_t)blk->lf_start,
2551			    (intmax_t)blk->lf_end);
2552			if (!LIST_EMPTY(&blk->lf_inedges))
2553				panic("lf_printlist: bad list");
2554		}
2555		printf("\n");
2556	}
2557}
2558#endif /* LOCKF_DEBUG */
2559