subr_witness.c revision 75464
1/*-
2 * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 * 3. Berkeley Software Design Inc's name may not be used to endorse or
13 *    promote products derived from this software without specific prior
14 *    written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
29 *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
30 * $FreeBSD: head/sys/kern/subr_witness.c 75464 2001-04-13 08:31:38Z jhb $
31 */
32
33/*
34 * Implementation of the `witness' lock verifier.  Originally implemented for
35 * mutexes in BSD/OS.  Extended to handle generic lock objects and lock
36 * classes in FreeBSD.
37 */
38
39/*
40 *	Main Entry: witness
41 *	Pronunciation: 'wit-n&s
42 *	Function: noun
43 *	Etymology: Middle English witnesse, from Old English witnes knowledge,
44 *	    testimony, witness, from 2wit
45 *	Date: before 12th century
46 *	1 : attestation of a fact or event : TESTIMONY
47 *	2 : one that gives evidence; specifically : one who testifies in
48 *	    a cause or before a judicial tribunal
49 *	3 : one asked to be present at a transaction so as to be able to
50 *	    testify to its having taken place
51 *	4 : one who has personal knowledge of something
52 *	5 a : something serving as evidence or proof : SIGN
53 *	  b : public affirmation by word or example of usually
54 *	      religious faith or conviction <the heroic witness to divine
55 *	      life -- Pilot>
56 *	6 capitalized : a member of the Jehovah's Witnesses
57 */
58
59#include "opt_ddb.h"
60#include "opt_witness.h"
61
62#include <sys/param.h>
63#include <sys/bus.h>
64#include <sys/kernel.h>
65#include <sys/ktr.h>
66#include <sys/lock.h>
67#include <sys/malloc.h>
68#include <sys/mutex.h>
69#include <sys/proc.h>
70#include <sys/sysctl.h>
71#include <sys/systm.h>
72
73#include <ddb/ddb.h>
74
75#define WITNESS_COUNT 200
76#define WITNESS_CHILDCOUNT (WITNESS_COUNT * 4)
77/*
78 * XXX: This is somewhat bogus, as we assume here that at most 1024 processes
79 * will hold LOCK_NCHILDREN * 2 locks.  We handle failure ok, and we should
80 * probably be safe for the most part, but it's still a SWAG.
81 */
82#define LOCK_CHILDCOUNT (MAXCPU + 1024) * 2
83
84#define	WITNESS_NCHILDREN 6
85
86struct witness_child_list_entry;
87
88struct witness {
89	const	char *w_name;
90	struct	lock_class *w_class;
91	STAILQ_ENTRY(witness) w_list;		/* List of all witnesses. */
92	STAILQ_ENTRY(witness) w_typelist;	/* Witnesses of a type. */
93	struct	witness_child_list_entry *w_children;	/* Great evilness... */
94	const	char *w_file;
95	int	w_line;
96	u_int	w_level;
97	u_int	w_refcount;
98	u_char	w_Giant_squawked:1;
99	u_char	w_other_squawked:1;
100	u_char	w_same_squawked:1;
101};
102
103struct witness_child_list_entry {
104	struct	witness_child_list_entry *wcl_next;
105	struct	witness *wcl_children[WITNESS_NCHILDREN];
106	u_int	wcl_count;
107};
108
109STAILQ_HEAD(witness_list, witness);
110
111struct witness_blessed {
112	const	char *b_lock1;
113	const	char *b_lock2;
114};
115
116struct witness_order_list_entry {
117	const	char *w_name;
118	struct	lock_class *w_class;
119};
120
121static struct	witness *enroll(const char *description,
122				struct lock_class *lock_class);
123static int	itismychild(struct witness *parent, struct witness *child);
124static void	removechild(struct witness *parent, struct witness *child);
125static int	isitmychild(struct witness *parent, struct witness *child);
126static int	isitmydescendant(struct witness *parent, struct witness *child);
127static int	dup_ok(struct witness *);
128static int	blessed(struct witness *, struct witness *);
129static void	witness_display_list(void(*prnt)(const char *fmt, ...),
130				     struct witness_list *list);
131static void	witness_displaydescendants(void(*)(const char *fmt, ...),
132					   struct witness *);
133static void	witness_leveldescendents(struct witness *parent, int level);
134static void	witness_levelall(void);
135static struct	witness *witness_get(void);
136static void	witness_free(struct witness *m);
137static struct	witness_child_list_entry *witness_child_get(void);
138static void	witness_child_free(struct witness_child_list_entry *wcl);
139static struct	lock_list_entry *witness_lock_list_get(void);
140static void	witness_lock_list_free(struct lock_list_entry *lle);
141static void	witness_display(void(*)(const char *fmt, ...));
142
143MALLOC_DEFINE(M_WITNESS, "witness", "witness structure");
144
145static int witness_watch;
146TUNABLE_INT_DECL("debug.witness_watch", 1, witness_watch);
147SYSCTL_INT(_debug, OID_AUTO, witness_watch, CTLFLAG_RD, &witness_watch, 0, "");
148
149#ifdef DDB
150/*
151 * When DDB is enabled and witness_ddb is set to 1, it will cause the system to
152 * drop into kdebug() when:
153 *	- a lock heirarchy violation occurs
154 *	- locks are held when going to sleep.
155 */
156int	witness_ddb;
157#ifdef WITNESS_DDB
158TUNABLE_INT_DECL("debug.witness_ddb", 1, witness_ddb);
159#else
160TUNABLE_INT_DECL("debug.witness_ddb", 0, witness_ddb);
161#endif
162SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, "");
163#endif /* DDB */
164
165int	witness_skipspin;
166#ifdef WITNESS_SKIPSPIN
167TUNABLE_INT_DECL("debug.witness_skipspin", 1, witness_skipspin);
168#else
169TUNABLE_INT_DECL("debug.witness_skipspin", 0, witness_skipspin);
170#endif
171SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0,
172    "");
173
174static struct mtx w_mtx;
175static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
176static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
177static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
178static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
179static struct witness_child_list_entry *w_child_free = NULL;
180static struct lock_list_entry *w_lock_list_free = NULL;
181static int witness_dead;	/* fatal error, probably no memory */
182
183static struct witness w_data[WITNESS_COUNT];
184static struct witness_child_list_entry w_childdata[WITNESS_CHILDCOUNT];
185static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
186
187static struct witness_order_list_entry order_lists[] = {
188	{ "Giant", &lock_class_mtx_sleep },
189	{ "proctree", &lock_class_sx },
190	{ "allproc", &lock_class_sx },
191	{ "process lock", &lock_class_mtx_sleep },
192	{ "uidinfo hash", &lock_class_mtx_sleep },
193	{ "uidinfo struct", &lock_class_mtx_sleep },
194	{ NULL, NULL },
195	/*
196	 * spin locks
197	 */
198#if defined(__i386__) && defined (SMP)
199	{ "com", &lock_class_mtx_spin },
200#endif
201	{ "sio", &lock_class_mtx_spin },
202#ifdef __i386__
203	{ "cy", &lock_class_mtx_spin },
204#endif
205	{ "ng_node", &lock_class_mtx_spin },
206	{ "ng_worklist", &lock_class_mtx_spin },
207	{ "ithread table lock", &lock_class_mtx_spin },
208	{ "ithread list lock", &lock_class_mtx_spin },
209	{ "sched lock", &lock_class_mtx_spin },
210#ifdef __i386__
211	{ "clk", &lock_class_mtx_spin },
212#endif
213	{ "callout", &lock_class_mtx_spin },
214	/*
215	 * leaf locks
216	 */
217#ifdef SMP
218	{ "ap boot", &lock_class_mtx_spin },
219#ifdef __i386__
220	{ "imen", &lock_class_mtx_spin },
221#endif
222	{ "smp rendezvous", &lock_class_mtx_spin },
223#endif
224	{ NULL, NULL },
225	{ NULL, NULL }
226};
227
228static const char *dup_list[] = {
229	"process lock",
230	NULL
231};
232
233/*
234 * Pairs of locks which have been blessed
235 * Don't complain about order problems with blessed locks
236 */
237static struct witness_blessed blessed_list[] = {
238};
239static int blessed_count =
240	sizeof(blessed_list) / sizeof(struct witness_blessed);
241
242/*
243 * List of all locks in the system.
244 */
245STAILQ_HEAD(, lock_object) all_locks = STAILQ_HEAD_INITIALIZER(all_locks);
246
247static struct mtx all_mtx = {
248	{ &lock_class_mtx_sleep,	/* mtx_object.lo_class */
249	  "All locks list",		/* mtx_object.lo_name */
250	  NULL,				/* mtx_object.lo_file */
251	  0,				/* mtx_object.lo_line */
252	  LO_INITIALIZED,		/* mtx_object.lo_flags */
253	  { NULL },			/* mtx_object.lo_list */
254	  NULL },			/* mtx_object.lo_witness */
255	MTX_UNOWNED, 0,			/* mtx_lock, mtx_recurse */
256	0,				/* mtx_savecrit */
257	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
258	{ NULL, NULL }			/* mtx_contested */
259};
260
261/*
262 * This global is set to 0 once it becomes safe to use the witness code.
263 */
264static int witness_cold = 1;
265
266/*
267 * Global variables for book keeping.
268 */
269static int lock_cur_cnt;
270static int lock_max_cnt;
271
272/*
273 * The WITNESS-enabled diagnostic code.
274 */
275static void
276witness_initialize(void *dummy __unused)
277{
278	struct lock_object *lock;
279	struct witness_order_list_entry *order;
280	struct witness *w, *w1;
281	int i;
282
283	/*
284	 * We have to release Giant before initializing its witness
285	 * structure so that WITNESS doesn't get confused.
286	 */
287	mtx_unlock(&Giant);
288	mtx_assert(&Giant, MA_NOTOWNED);
289
290	STAILQ_INSERT_HEAD(&all_locks, &all_mtx.mtx_object, lo_list);
291	mtx_init(&w_mtx, "witness lock", MTX_SPIN | MTX_QUIET | MTX_NOWITNESS);
292	for (i = 0; i < WITNESS_COUNT; i++)
293		witness_free(&w_data[i]);
294	for (i = 0; i < WITNESS_CHILDCOUNT; i++)
295		witness_child_free(&w_childdata[i]);
296	for (i = 0; i < LOCK_CHILDCOUNT; i++)
297		witness_lock_list_free(&w_locklistdata[i]);
298
299	/* First add in all the specified order lists. */
300	for (order = order_lists; order->w_name != NULL; order++) {
301		w = enroll(order->w_name, order->w_class);
302		w->w_file = "order list";
303		for (order++; order->w_name != NULL; order++) {
304			w1 = enroll(order->w_name, order->w_class);
305			w1->w_file = "order list";
306			itismychild(w, w1);
307			w = w1;
308		}
309	}
310
311	/* Iterate through all locks and add them to witness. */
312	mtx_lock(&all_mtx);
313	STAILQ_FOREACH(lock, &all_locks, lo_list) {
314		if (lock->lo_flags & LO_WITNESS)
315			lock->lo_witness = enroll(lock->lo_name,
316			    lock->lo_class);
317		else
318			lock->lo_witness = NULL;
319	}
320	mtx_unlock(&all_mtx);
321
322	/* Mark the witness code as being ready for use. */
323	atomic_store_rel_int(&witness_cold, 0);
324
325	mtx_lock(&Giant);
326}
327SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize, NULL)
328
329void
330witness_init(struct lock_object *lock)
331{
332	struct lock_class *class;
333
334	class = lock->lo_class;
335	if (lock->lo_flags & LO_INITIALIZED)
336		panic("%s: lock (%s) %s is already initialized!\n", __func__,
337		    class->lc_name, lock->lo_name);
338
339	if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
340	    (class->lc_flags & LC_RECURSABLE) == 0)
341		panic("%s: lock (%s) %s can not be recursable!\n", __func__,
342		    class->lc_name, lock->lo_name);
343
344	if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
345	    (class->lc_flags & LC_SLEEPABLE) == 0)
346		panic("%s: lock (%s) %s can not be sleepable!\n", __func__,
347		    class->lc_name, lock->lo_name);
348
349	mtx_lock(&all_mtx);
350	STAILQ_INSERT_TAIL(&all_locks, lock, lo_list);
351	lock->lo_flags |= LO_INITIALIZED;
352	lock_cur_cnt++;
353	if (lock_cur_cnt > lock_max_cnt)
354		lock_max_cnt = lock_cur_cnt;
355	mtx_unlock(&all_mtx);
356	if (!witness_cold && !witness_dead &&
357	    (lock->lo_flags & LO_WITNESS) != 0)
358		lock->lo_witness = enroll(lock->lo_name, class);
359	else
360		lock->lo_witness = NULL;
361}
362
363void
364witness_destroy(struct lock_object *lock)
365{
366	struct witness *w;
367
368	if (witness_cold)
369		panic("lock (%s) %s destroyed while witness_cold",
370		    lock->lo_class->lc_name, lock->lo_name);
371
372	if ((lock->lo_flags & LO_INITIALIZED) == 0)
373		panic("%s: lock (%s) %s is not initialized!\n", __func__,
374		    lock->lo_class->lc_name, lock->lo_name);
375
376	if (lock->lo_flags & LO_LOCKED)
377		panic("lock (%s) %s destroyed while held",
378		    lock->lo_class->lc_name, lock->lo_name);
379
380	w = lock->lo_witness;
381	if (w != NULL) {
382		mtx_lock_spin(&w_mtx);
383		w->w_refcount--;
384		if (w->w_refcount == 0) {
385			w->w_name = "(dead)";
386			w->w_file = "(dead)";
387			w->w_line = 0;
388		}
389		mtx_unlock_spin(&w_mtx);
390	}
391
392	mtx_lock(&all_mtx);
393	lock_cur_cnt--;
394	STAILQ_REMOVE(&all_locks, lock, lock_object, lo_list);
395	lock->lo_flags &= LO_INITIALIZED;
396	mtx_unlock(&all_mtx);
397}
398
399static void
400witness_display_list(void(*prnt)(const char *fmt, ...),
401		     struct witness_list *list)
402{
403	struct witness *w, *w1;
404	int found;
405
406	STAILQ_FOREACH(w, list, w_typelist) {
407		if (w->w_file == NULL)
408			continue;
409		found = 0;
410		STAILQ_FOREACH(w1, list, w_typelist) {
411			if (isitmychild(w1, w)) {
412				found++;
413				break;
414			}
415		}
416		if (found)
417			continue;
418		/*
419		 * This lock has no anscestors, display its descendants.
420		 */
421		witness_displaydescendants(prnt, w);
422	}
423}
424
425static void
426witness_display(void(*prnt)(const char *fmt, ...))
427{
428	struct witness *w;
429
430	KASSERT(!witness_cold, ("%s: witness_cold\n", __func__));
431	witness_levelall();
432
433	/*
434	 * First, handle sleep locks which have been acquired at least
435	 * once.
436	 */
437	prnt("Sleep locks:\n");
438	witness_display_list(prnt, &w_sleep);
439
440	/*
441	 * Now do spin locks which have been acquired at least once.
442	 */
443	prnt("\nSpin locks:\n");
444	witness_display_list(prnt, &w_spin);
445
446	/*
447	 * Finally, any locks which have not been acquired yet.
448	 */
449	prnt("\nLocks which were never acquired:\n");
450	STAILQ_FOREACH(w, &w_all, w_list) {
451		if (w->w_file != NULL)
452			continue;
453		prnt("%s\n", w->w_name);
454	}
455}
456
457void
458witness_lock(struct lock_object *lock, int flags, const char *file, int line)
459{
460	struct lock_list_entry **lock_list, *lle;
461	struct lock_object *lock1, *lock2;
462	struct lock_class *class;
463	struct witness *w, *w1;
464	struct proc *p;
465	int i, j;
466#ifdef DDB
467	int go_into_ddb = 0;
468#endif /* DDB */
469
470	if (witness_cold || witness_dead || lock->lo_witness == NULL ||
471	    panicstr)
472		return;
473	w = lock->lo_witness;
474	class = lock->lo_class;
475	p = curproc;
476
477	if ((lock->lo_flags & LO_LOCKED) == 0)
478		panic("%s: lock (%s) %s is not locked @ %s:%d", __func__,
479		    class->lc_name, lock->lo_name, file, line);
480
481	if ((lock->lo_flags & LO_RECURSED) != 0) {
482		if ((lock->lo_flags & LO_RECURSABLE) == 0)
483			panic(
484			"%s: recursed on non-recursive lock (%s) %s @ %s:%d",
485			    __func__, class->lc_name, lock->lo_name, file,
486			    line);
487		return;
488	}
489
490	/*
491	 * We have to hold a spinlock to keep lock_list valid across the check
492	 * in the LC_SLEEPLOCK case.  In the LC_SPINLOCK case, it is already
493	 * protected by the spinlock we are currently performing the witness
494	 * checks on, so it is ok to release the lock after performing this
495	 * check.  All we have to protect is the LC_SLEEPLOCK case when no
496	 * spinlocks are held as we may get preempted during this check and
497	 * lock_list could end up pointing to some other CPU's spinlock list.
498	 */
499	mtx_lock_spin(&w_mtx);
500	lock_list = PCPU_PTR(spinlocks);
501	if (class->lc_flags & LC_SLEEPLOCK) {
502		if (*lock_list != NULL) {
503			mtx_unlock_spin(&w_mtx);
504			panic("blockable sleep lock (%s) %s @ %s:%d",
505			    class->lc_name, lock->lo_name, file, line);
506		}
507		lock_list = &p->p_sleeplocks;
508	}
509	mtx_unlock_spin(&w_mtx);
510
511	if (flags & LOP_TRYLOCK)
512		goto out;
513
514	/*
515	 * Is this the first lock acquired?  If so, then no order checking
516	 * is needed.
517	 */
518	if (*lock_list == NULL)
519		goto out;
520
521	/*
522	 * Check for duplicate locks of the same type.  Note that we only
523	 * have to check for this on the last lock we just acquired.  Any
524	 * other cases will be caught as lock order violations.
525	 */
526	lock1 = (*lock_list)->ll_children[(*lock_list)->ll_count - 1];
527	w1 = lock1->lo_witness;
528	if (w1 == w) {
529		if (w->w_same_squawked || dup_ok(w))
530			goto out;
531		w->w_same_squawked = 1;
532		printf("acquring duplicate lock of same type: \"%s\"\n",
533			lock->lo_name);
534		printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
535		printf(" 2nd @ %s:%d\n", file, line);
536#ifdef DDB
537		go_into_ddb = 1;
538#endif /* DDB */
539		goto out;
540	}
541	MPASS(!mtx_owned(&w_mtx));
542	mtx_lock_spin(&w_mtx);
543	/*
544	 * If we have a known higher number just say ok
545	 */
546	if (witness_watch > 1 && w->w_level > w1->w_level) {
547		mtx_unlock_spin(&w_mtx);
548		goto out;
549	}
550	if (isitmydescendant(w1, w)) {
551		mtx_unlock_spin(&w_mtx);
552		goto out;
553	}
554	for (j = 0, lle = *lock_list; lle != NULL; lle = lle->ll_next) {
555		for (i = lle->ll_count - 1; i >= 0; i--, j++) {
556
557			MPASS(j < WITNESS_COUNT);
558			lock1 = lle->ll_children[i];
559			w1 = lock1->lo_witness;
560
561			/*
562			 * If this lock doesn't undergo witness checking,
563			 * then skip it.
564			 */
565			if (w1 == NULL) {
566				KASSERT((lock1->lo_flags & LO_WITNESS) == 0,
567				    ("lock missing witness structure"));
568				continue;
569			}
570			if (!isitmydescendant(w, w1))
571				continue;
572			/*
573			 * We have a lock order violation, check to see if it
574			 * is allowed or has already been yelled about.
575			 */
576			mtx_unlock_spin(&w_mtx);
577			if (blessed(w, w1))
578				goto out;
579			if (lock1 == &Giant.mtx_object) {
580				if (w1->w_Giant_squawked)
581					goto out;
582				else
583					w1->w_Giant_squawked = 1;
584			} else {
585				if (w1->w_other_squawked)
586					goto out;
587				else
588					w1->w_other_squawked = 1;
589			}
590			/*
591			 * Ok, yell about it.
592			 */
593			printf("lock order reversal\n");
594			/*
595			 * Try to locate an earlier lock with
596			 * witness w in our list.
597			 */
598			do {
599				lock2 = lle->ll_children[i];
600				MPASS(lock2 != NULL);
601				if (lock2->lo_witness == w)
602					break;
603				i--;
604				if (i == 0 && lle->ll_next != NULL) {
605					lle = lle->ll_next;
606					i = lle->ll_count - 1;
607					MPASS(i != 0);
608				}
609			} while (i >= 0);
610			if (i < 0)
611				/*
612				 * We are very likely bogus in this case.
613				 */
614				printf(" 1st %s last acquired @ %s:%d\n",
615				    w->w_name, w->w_file, w->w_line);
616			else
617				printf(" 1st %p %s @ %s:%d\n", lock2,
618				    lock2->lo_name, lock2->lo_file,
619				    lock2->lo_line);
620			printf(" 2nd %p %s @ %s:%d\n",
621			    lock1, lock1->lo_name, lock1->lo_file,
622			    lock1->lo_line);
623			printf(" 3rd %p %s @ %s:%d\n",
624			    lock, lock->lo_name, file, line);
625#ifdef DDB
626			go_into_ddb = 1;
627#endif /* DDB */
628			goto out;
629		}
630	}
631	lock1 = (*lock_list)->ll_children[(*lock_list)->ll_count - 1];
632	if (!itismychild(lock1->lo_witness, w))
633		mtx_unlock_spin(&w_mtx);
634
635out:
636#ifdef DDB
637	if (witness_ddb && go_into_ddb)
638		Debugger("witness_enter");
639#endif /* DDB */
640	w->w_file = file;
641	w->w_line = line;
642	lock->lo_line = line;
643	lock->lo_file = file;
644
645	lle = *lock_list;
646	if (lle == NULL || lle->ll_count == LOCK_CHILDCOUNT) {
647		*lock_list = witness_lock_list_get();
648		if (*lock_list == NULL)
649			return;
650		(*lock_list)->ll_next = lle;
651		lle = *lock_list;
652	}
653	lle->ll_children[lle->ll_count++] = lock;
654}
655
656void
657witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
658{
659	struct lock_list_entry **lock_list, *lle;
660	struct lock_class *class;
661	struct proc *p;
662	int i, j;
663
664	if (witness_cold || witness_dead || lock->lo_witness == NULL ||
665	    panicstr)
666		return;
667	p = curproc;
668	class = lock->lo_class;
669
670	if (lock->lo_flags & LO_RECURSED) {
671		if ((lock->lo_flags & LO_LOCKED) == 0)
672			panic("%s: recursed lock (%s) %s is not locked @ %s:%d",
673			    __func__, class->lc_name, lock->lo_name, file,
674			    line);
675		return;
676	}
677
678	/*
679	 * We don't need to protect this PCPU_GET() here against preemption
680	 * because if we hold any spinlocks then we are already protected,
681	 * and if we don't we will get NULL if we hold no spinlocks even if
682	 * we switch CPU's while reading it.
683	 */
684	if (class->lc_flags & LC_SLEEPLOCK) {
685		if ((flags & LOP_NOSWITCH) == 0 && PCPU_GET(spinlocks) != NULL)
686			panic("switchable sleep unlock (%s) %s @ %s:%d",
687			    class->lc_name, lock->lo_name, file, line);
688		lock_list = &p->p_sleeplocks;
689	} else
690		lock_list = PCPU_PTR(spinlocks);
691
692	for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
693		for (i = 0; i < (*lock_list)->ll_count; i++)
694			if ((*lock_list)->ll_children[i] == lock) {
695				(*lock_list)->ll_count--;
696				for (j = i; j < (*lock_list)->ll_count; j++)
697					(*lock_list)->ll_children[j] =
698					    (*lock_list)->ll_children[j + 1];
699				if ((*lock_list)->ll_count == 0) {
700					lle = *lock_list;
701					*lock_list = lle->ll_next;
702					witness_lock_list_free(lle);
703				}
704				return;
705			}
706}
707
708/*
709 * Warn if any held locks are not sleepable.  Note that Giant and the lock
710 * passed in are both special cases since they are both released during the
711 * sleep process and aren't actually held while the process is asleep.
712 */
713int
714witness_sleep(int check_only, struct lock_object *lock, const char *file,
715	      int line)
716{
717	struct lock_list_entry **lock_list, *lle;
718	struct lock_object *lock1;
719	struct proc *p;
720	critical_t savecrit;
721	int i, n;
722
723	if (witness_dead || panicstr)
724		return (0);
725	KASSERT(!witness_cold, ("%s: witness_cold\n", __func__));
726	n = 0;
727	/*
728	 * Preemption bad because we need PCPU_PTR(spinlocks) to not change.
729	 */
730	savecrit = critical_enter();
731	p = curproc;
732	lock_list = &p->p_sleeplocks;
733again:
734	for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
735		for (i = lle->ll_count - 1; i >= 0; i--) {
736			lock1 = lle->ll_children[i];
737			if (lock1 == lock || lock1 == &Giant.mtx_object ||
738			    (lock1->lo_flags & LO_SLEEPABLE))
739				continue;
740			n++;
741			printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
742			    file, line, check_only ? "could sleep" : "sleeping",
743			    lock1->lo_name, lock1->lo_file, lock1->lo_line);
744		}
745	if (lock_list == &p->p_sleeplocks) {
746		lock_list = PCPU_PTR(spinlocks);
747		goto again;
748	}
749#ifdef DDB
750	if (witness_ddb && n)
751		Debugger("witness_sleep");
752#endif /* DDB */
753	critical_exit(savecrit);
754	return (n);
755}
756
757static struct witness *
758enroll(const char *description, struct lock_class *lock_class)
759{
760	struct witness *w;
761
762	if (!witness_watch)
763		return (NULL);
764
765	if ((lock_class->lc_flags & LC_SPINLOCK) && witness_skipspin)
766		return (NULL);
767	mtx_lock_spin(&w_mtx);
768	STAILQ_FOREACH(w, &w_all, w_list) {
769		if (strcmp(description, w->w_name) == 0) {
770			w->w_refcount++;
771			mtx_unlock_spin(&w_mtx);
772			if (lock_class != w->w_class)
773				panic(
774				"lock (%s) %s does not match earlier (%s) lock",
775				    description, lock_class->lc_name,
776				    w->w_class->lc_name);
777			return (w);
778		}
779	}
780	/*
781	 * This isn't quite right, as witness_cold is still 0 while we
782	 * enroll all the locks initialized before witness_initialize().
783	 */
784	if ((lock_class->lc_flags & LC_SPINLOCK) && !witness_cold) {
785		mtx_unlock_spin(&w_mtx);
786		panic("spin lock %s not in order list", description);
787	}
788	if ((w = witness_get()) == NULL)
789		return (NULL);
790	w->w_name = description;
791	w->w_class = lock_class;
792	w->w_refcount = 1;
793	STAILQ_INSERT_HEAD(&w_all, w, w_list);
794	if (lock_class->lc_flags & LC_SPINLOCK)
795		STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
796	else if (lock_class->lc_flags & LC_SLEEPLOCK)
797		STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
798	else {
799		mtx_unlock_spin(&w_mtx);
800		panic("lock class %s is not sleep or spin",
801		    lock_class->lc_name);
802	}
803	mtx_unlock_spin(&w_mtx);
804
805	return (w);
806}
807
808static int
809itismychild(struct witness *parent, struct witness *child)
810{
811	static int recursed;
812	struct witness_child_list_entry **wcl;
813	struct witness_list *list;
814
815	MPASS(child != NULL && parent != NULL);
816	if ((parent->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) !=
817	    (child->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)))
818		panic(
819		"%s: parent (%s) and child (%s) are not the same lock type",
820		    __func__, parent->w_class->lc_name,
821		    child->w_class->lc_name);
822
823	/*
824	 * Insert "child" after "parent"
825	 */
826	wcl = &parent->w_children;
827	while (*wcl != NULL && (*wcl)->wcl_count == WITNESS_NCHILDREN)
828		wcl = &(*wcl)->wcl_next;
829
830	if (*wcl == NULL) {
831		*wcl = witness_child_get();
832		if (*wcl == NULL)
833			return (1);
834	}
835
836	(*wcl)->wcl_children[(*wcl)->wcl_count++] = child;
837
838	/*
839	 * Now prune whole tree.  We look for cases where a lock is now
840	 * both a descendant and a direct child of a given lock.  In that
841	 * case, we want to remove the direct child link from the tree.
842	 */
843	if (recursed)
844		return (0);
845	recursed = 1;
846	if (parent->w_class->lc_flags & LC_SLEEPLOCK)
847		list = &w_sleep;
848	else
849		list = &w_spin;
850	STAILQ_FOREACH(child, list, w_typelist) {
851		STAILQ_FOREACH(parent, list, w_typelist) {
852			if (!isitmychild(parent, child))
853				continue;
854			removechild(parent, child);
855			if (isitmydescendant(parent, child))
856				continue;
857			itismychild(parent, child);
858		}
859	}
860	recursed = 0;
861	witness_levelall();
862	return (0);
863}
864
865static void
866removechild(struct witness *parent, struct witness *child)
867{
868	struct witness_child_list_entry **wcl, *wcl1;
869	int i;
870
871	for (wcl = &parent->w_children; *wcl != NULL; wcl = &(*wcl)->wcl_next)
872		for (i = 0; i < (*wcl)->wcl_count; i++)
873			if ((*wcl)->wcl_children[i] == child)
874				goto found;
875	return;
876found:
877	(*wcl)->wcl_count--;
878	if ((*wcl)->wcl_count > i)
879		(*wcl)->wcl_children[i] =
880		    (*wcl)->wcl_children[(*wcl)->wcl_count];
881	MPASS((*wcl)->wcl_children[i] != NULL);
882
883	if ((*wcl)->wcl_count != 0)
884		return;
885
886	wcl1 = *wcl;
887	*wcl = wcl1->wcl_next;
888	witness_child_free(wcl1);
889}
890
891static int
892isitmychild(struct witness *parent, struct witness *child)
893{
894	struct witness_child_list_entry *wcl;
895	int i;
896
897	for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) {
898		for (i = 0; i < wcl->wcl_count; i++) {
899			if (wcl->wcl_children[i] == child)
900				return (1);
901		}
902	}
903	return (0);
904}
905
906static int
907isitmydescendant(struct witness *parent, struct witness *child)
908{
909	struct witness_child_list_entry *wcl;
910	int i, j;
911
912	if (isitmychild(parent, child))
913		return (1);
914	j = 0;
915	for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next) {
916		MPASS(j < 1000);
917		for (i = 0; i < wcl->wcl_count; i++) {
918			if (isitmydescendant(wcl->wcl_children[i], child))
919				return (1);
920		}
921		j++;
922	}
923	return (0);
924}
925
926void
927witness_levelall (void)
928{
929	struct witness_list *list;
930	struct witness *w, *w1;
931
932	/*
933	 * First clear all levels.
934	 */
935	STAILQ_FOREACH(w, &w_all, w_list) {
936		w->w_level = 0;
937	}
938
939	/*
940	 * Look for locks with no parent and level all their descendants.
941	 */
942	STAILQ_FOREACH(w, &w_all, w_list) {
943		/*
944		 * This is just an optimization, technically we could get
945		 * away just walking the all list each time.
946		 */
947		if (w->w_class->lc_flags & LC_SLEEPLOCK)
948			list = &w_sleep;
949		else
950			list = &w_spin;
951		STAILQ_FOREACH(w1, list, w_typelist) {
952			if (isitmychild(w1, w))
953				goto skip;
954		}
955		witness_leveldescendents(w, 0);
956	skip:
957	}
958}
959
960static void
961witness_leveldescendents(struct witness *parent, int level)
962{
963	struct witness_child_list_entry *wcl;
964	int i;
965
966	if (parent->w_level < level)
967		parent->w_level = level;
968	level++;
969	for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
970		for (i = 0; i < wcl->wcl_count; i++)
971			witness_leveldescendents(wcl->wcl_children[i], level);
972}
973
974static void
975witness_displaydescendants(void(*prnt)(const char *fmt, ...),
976			   struct witness *parent)
977{
978	struct witness_child_list_entry *wcl;
979	int i, level;
980
981	level =  parent->w_level;
982
983	prnt("%-2d", level);
984	for (i = 0; i < level; i++)
985		prnt(" ");
986	prnt("%s", parent->w_name);
987	if (parent->w_file != NULL)
988		prnt(" -- last acquired @ %s:%d\n", parent->w_file,
989		    parent->w_line);
990
991	for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
992		for (i = 0; i < wcl->wcl_count; i++)
993			    witness_displaydescendants(prnt,
994				wcl->wcl_children[i]);
995}
996
997static int
998dup_ok(struct witness *w)
999{
1000	const char **dup;
1001
1002	for (dup = dup_list; *dup != NULL; dup++)
1003		if (strcmp(w->w_name, *dup) == 0)
1004			return (1);
1005	return (0);
1006}
1007
1008static int
1009blessed(struct witness *w1, struct witness *w2)
1010{
1011	int i;
1012	struct witness_blessed *b;
1013
1014	for (i = 0; i < blessed_count; i++) {
1015		b = &blessed_list[i];
1016		if (strcmp(w1->w_name, b->b_lock1) == 0) {
1017			if (strcmp(w2->w_name, b->b_lock2) == 0)
1018				return (1);
1019			continue;
1020		}
1021		if (strcmp(w1->w_name, b->b_lock2) == 0)
1022			if (strcmp(w2->w_name, b->b_lock1) == 0)
1023				return (1);
1024	}
1025	return (0);
1026}
1027
1028static struct witness *
1029witness_get(void)
1030{
1031	struct witness *w;
1032
1033	if (STAILQ_EMPTY(&w_free)) {
1034		witness_dead = 1;
1035		mtx_unlock_spin(&w_mtx);
1036		printf("%s: witness exhausted\n", __func__);
1037		return (NULL);
1038	}
1039	w = STAILQ_FIRST(&w_free);
1040	STAILQ_REMOVE_HEAD(&w_free, w_list);
1041	bzero(w, sizeof(*w));
1042	return (w);
1043}
1044
1045static void
1046witness_free(struct witness *w)
1047{
1048
1049	STAILQ_INSERT_HEAD(&w_free, w, w_list);
1050}
1051
1052static struct witness_child_list_entry *
1053witness_child_get(void)
1054{
1055	struct witness_child_list_entry *wcl;
1056
1057	wcl = w_child_free;
1058	if (wcl == NULL) {
1059		witness_dead = 1;
1060		mtx_unlock_spin(&w_mtx);
1061		printf("%s: witness exhausted\n", __func__);
1062		return (NULL);
1063	}
1064	w_child_free = wcl->wcl_next;
1065	bzero(wcl, sizeof(*wcl));
1066	return (wcl);
1067}
1068
1069static void
1070witness_child_free(struct witness_child_list_entry *wcl)
1071{
1072
1073	wcl->wcl_next = w_child_free;
1074	w_child_free = wcl;
1075}
1076
1077static struct lock_list_entry *
1078witness_lock_list_get(void)
1079{
1080	struct lock_list_entry *lle;
1081
1082	mtx_lock_spin(&w_mtx);
1083	lle = w_lock_list_free;
1084	if (lle == NULL) {
1085		witness_dead = 1;
1086		mtx_unlock_spin(&w_mtx);
1087		printf("%s: witness exhausted\n", __func__);
1088		return (NULL);
1089	}
1090	w_lock_list_free = lle->ll_next;
1091	mtx_unlock_spin(&w_mtx);
1092	bzero(lle, sizeof(*lle));
1093	return (lle);
1094}
1095
1096static void
1097witness_lock_list_free(struct lock_list_entry *lle)
1098{
1099
1100	mtx_lock_spin(&w_mtx);
1101	lle->ll_next = w_lock_list_free;
1102	w_lock_list_free = lle;
1103	mtx_unlock_spin(&w_mtx);
1104}
1105
1106int
1107witness_list_locks(struct lock_list_entry **lock_list)
1108{
1109	struct lock_list_entry *lle;
1110	struct lock_object *lock;
1111	int i, nheld;
1112
1113	nheld = 0;
1114	for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
1115		for (i = lle->ll_count - 1; i >= 0; i--) {
1116			lock = lle->ll_children[i];
1117			printf("\t(%s) %s (%p) locked at %s:%d\n",
1118			    lock->lo_class->lc_name, lock->lo_name, lock,
1119			    lock->lo_file, lock->lo_line);
1120			nheld++;
1121		}
1122	return (nheld);
1123}
1124
1125/*
1126 * Calling this on p != curproc is bad unless we are in ddb.
1127 */
1128int
1129witness_list(struct proc *p)
1130{
1131	critical_t savecrit;
1132	int nheld;
1133
1134	KASSERT(p == curproc || db_active,
1135	    ("%s: p != curproc and we aren't in the debugger", __func__));
1136	KASSERT(!witness_cold, ("%s: witness_cold", __func__));
1137
1138	nheld = witness_list_locks(&p->p_sleeplocks);
1139
1140	/*
1141	 * We only handle spinlocks if p == curproc.  This is somewhat broken
1142	 * if p is currently executing on some other CPU and holds spin locks
1143	 * as we won't display those locks.  If we had a MI way of getting
1144	 * the per-cpu data for a given cpu then we could use p->p_oncpu to
1145	 * get the list of spinlocks for this process and "fix" this.
1146	 */
1147	if (p == curproc) {
1148		/*
1149		 * Preemption bad because we need PCPU_PTR(spinlocks) to not
1150		 * change.
1151		 */
1152		savecrit = critical_enter();
1153		nheld += witness_list_locks(PCPU_PTR(spinlocks));
1154		critical_exit(savecrit);
1155	}
1156
1157	return (nheld);
1158}
1159
1160void
1161witness_save(struct lock_object *lock, const char **filep, int *linep)
1162{
1163
1164	KASSERT(!witness_cold, ("%s: witness_cold\n", __func__));
1165	if (lock->lo_witness == NULL)
1166		return;
1167
1168	*filep = lock->lo_file;
1169	*linep = lock->lo_line;
1170}
1171
1172void
1173witness_restore(struct lock_object *lock, const char *file, int line)
1174{
1175
1176	KASSERT(!witness_cold, ("%s: witness_cold\n", __func__));
1177	if (lock->lo_witness == NULL)
1178		return;
1179
1180	lock->lo_witness->w_file = file;
1181	lock->lo_witness->w_line = line;
1182	lock->lo_file = file;
1183	lock->lo_line = line;
1184}
1185
1186#ifdef DDB
1187
1188DB_SHOW_COMMAND(locks, db_witness_list)
1189{
1190	struct proc *p;
1191	pid_t pid;
1192
1193	if (have_addr) {
1194		pid = (addr % 16) + ((addr >> 4) % 16) * 10 +
1195		    ((addr >> 8) % 16) * 100 + ((addr >> 12) % 16) * 1000 +
1196		    ((addr >> 16) % 16) * 10000;
1197
1198		/* sx_slock(&allproc_lock); */
1199		LIST_FOREACH(p, &allproc, p_list) {
1200			if (p->p_pid == pid)
1201				break;
1202		}
1203		/* sx_sunlock(&allproc_lock); */
1204		if (p == NULL) {
1205			db_printf("pid %d not found\n", pid);
1206			return;
1207		}
1208	} else
1209		p = curproc;
1210
1211	witness_list(p);
1212}
1213
1214DB_SHOW_COMMAND(witness, db_witness_display)
1215{
1216
1217	witness_display(db_printf);
1218}
1219#endif
1220