subr_witness.c revision 70861
1/*-
2 * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 * 3. Berkeley Software Design Inc's name may not be used to endorse or
13 *    promote products derived from this software without specific prior
14 *    written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
29 *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
30 * $FreeBSD: head/sys/kern/subr_witness.c 70861 2001-01-10 04:43:51Z jake $
31 */
32
33/*
34 *	Main Entry: witness
35 *	Pronunciation: 'wit-n&s
36 *	Function: noun
37 *	Etymology: Middle English witnesse, from Old English witnes knowledge,
38 *	    testimony, witness, from 2wit
39 *	Date: before 12th century
40 *	1 : attestation of a fact or event : TESTIMONY
41 *	2 : one that gives evidence; specifically : one who testifies in
42 *	    a cause or before a judicial tribunal
43 *	3 : one asked to be present at a transaction so as to be able to
44 *	    testify to its having taken place
45 *	4 : one who has personal knowledge of something
46 *	5 a : something serving as evidence or proof : SIGN
47 *	  b : public affirmation by word or example of usually
48 *	      religious faith or conviction <the heroic witness to divine
49 *	      life -- Pilot>
50 *	6 capitalized : a member of the Jehovah's Witnesses
51 */
52
53#include "opt_ddb.h"
54#include "opt_witness.h"
55
56/*
57 * Cause non-inlined mtx_*() to be compiled.
58 * Must be defined early because other system headers may include mutex.h.
59 */
60#define _KERN_MUTEX_C_
61
62#include <sys/param.h>
63#include <sys/bus.h>
64#include <sys/kernel.h>
65#include <sys/malloc.h>
66#include <sys/proc.h>
67#include <sys/sysctl.h>
68#include <sys/systm.h>
69#include <sys/vmmeter.h>
70#include <sys/ktr.h>
71
72#include <machine/atomic.h>
73#include <machine/bus.h>
74#include <machine/clock.h>
75#include <machine/cpu.h>
76
77#include <ddb/ddb.h>
78
79#include <vm/vm.h>
80#include <vm/vm_extern.h>
81
82#include <sys/mutex.h>
83
84/*
85 * Machine independent bits of the mutex implementation
86 */
/*
 * All mutexes in system (used for debug/panic).
 *
 * all_mtx plays two roles: it is a mutex in its own right (mtx_init() and
 * mtx_destroy() take it around updates of the list) and it is the sentinel
 * of the list of all mutexes that mtx_validate() walks via mtx_next.  The
 * two trailing &all_mtx initializers make the empty list point back at the
 * sentinel (presumably the mtx_next/mtx_prev members -- confirm against
 * struct mtx in <sys/mutex.h>).
 *
 * With WITNESS the description string lives in a separate struct mtx_debug
 * that the mutex points to; without WITNESS it is stored in the mutex
 * directly.
 */
#ifdef WITNESS
static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0,
	"All mutexes queue head" };
static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, &all_mtx_debug,
	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
	{ NULL, NULL }, &all_mtx, &all_mtx };
#else	/* WITNESS */
static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, "All mutexes queue head",
	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
	{ NULL, NULL }, &all_mtx, &all_mtx };
#endif	/* WITNESS */
99
/* Number of mutexes currently linked on the all_mtx list. */
static int	mtx_cur_cnt;
/* Largest value mtx_cur_cnt has reached (updated in mtx_init()). */
static int	mtx_max_cnt;

/* Out-of-line Giant wrappers, defined below. */
void	_mtx_enter_giant_def(void);
void	_mtx_exit_giant_def(void);
/* Priority lending helper; expects sched_lock to be held. */
static void propagate_priority(struct proc *);
106
/* True when no process owns m (lock word is exactly MTX_UNOWNED). */
#define	mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
/* Owning process of m, or NULL; the flag bits are masked off the lock word. */
#define	mtx_owner(m)	(mtx_unowned(m) ? NULL \
			    : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK))

/*
 * The word stored immediately below x in memory.  NOTE(review): used on a
 * function's first argument to log the caller's return address, which
 * presumes a particular stack layout -- confirm per architecture.
 * Argument and expansion are fully parenthesised so the macro can be used
 * inside larger expressions.
 */
#define RETIP(x)		(*(((uintptr_t *)(&(x))) - 1))
/*
 * Set the scheduling priority of process p.  Parenthesised so that the
 * assignment cannot swallow operators that follow the macro invocation.
 */
#define	SET_PRIO(p, pri)	((p)->p_priority = (pri))
113
114/*
115 * XXX Temporary, for use from assembly language
116 */
117
/*
 * Out-of-line wrapper that acquires Giant with mtx_enter(&Giant, MTX_DEF).
 * Takes no arguments and returns nothing.
 */
void
_mtx_enter_giant_def(void)
{

	mtx_enter(&Giant, MTX_DEF);
}
124
/*
 * Out-of-line wrapper that releases Giant with mtx_exit(&Giant, MTX_DEF).
 * Takes no arguments and returns nothing.
 */
void
_mtx_exit_giant_def(void)
{

	mtx_exit(&Giant, MTX_DEF);
}
131
132static void
133propagate_priority(struct proc *p)
134{
135	int pri = p->p_priority;
136	struct mtx *m = p->p_blocked;
137
138	mtx_assert(&sched_lock, MA_OWNED);
139	for (;;) {
140		struct proc *p1;
141
142		p = mtx_owner(m);
143
144		if (p == NULL) {
145			/*
146			 * This really isn't quite right. Really
147			 * ought to bump priority of process that
148			 * next acquires the mutex.
149			 */
150			MPASS(m->mtx_lock == MTX_CONTESTED);
151			return;
152		}
153		MPASS(p->p_magic == P_MAGIC);
154		KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex"));
155		if (p->p_priority <= pri)
156			return;
157
158		/*
159		 * Bump this process' priority.
160		 */
161		SET_PRIO(p, pri);
162
163		/*
164		 * If lock holder is actually running, just bump priority.
165		 */
166#ifdef SMP
167		/*
168		 * For SMP, we can check the p_oncpu field to see if we are
169		 * running.
170		 */
171		if (p->p_oncpu != 0xff) {
172			MPASS(p->p_stat == SRUN || p->p_stat == SZOMB);
173			return;
174		}
175#else
176		/*
177		 * For UP, we check to see if p is curproc (this shouldn't
178		 * ever happen however as it would mean we are in a deadlock.)
179		 */
180		if (p == curproc) {
181			panic("Deadlock detected");
182			return;
183		}
184#endif
185		/*
186		 * If on run queue move to new run queue, and
187		 * quit.
188		 */
189		if (p->p_stat == SRUN) {
190			printf("XXX: moving process %d(%s) to a new run queue\n",
191			       p->p_pid, p->p_comm);
192			MPASS(p->p_blocked == NULL);
193			remrunqueue(p);
194			setrunqueue(p);
195			return;
196		}
197
198		/*
199		 * If we aren't blocked on a mutex, we should be.
200		 */
201		KASSERT(p->p_stat == SMTX, (
202		    "process %d(%s):%d holds %s but isn't blocked on a mutex\n",
203		    p->p_pid, p->p_comm, p->p_stat,
204		    m->mtx_description));
205
206		/*
207		 * Pick up the mutex that p is blocked on.
208		 */
209		m = p->p_blocked;
210		MPASS(m != NULL);
211
212		printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid,
213		    p->p_comm, m->mtx_description);
214		/*
215		 * Check if the proc needs to be moved up on
216		 * the blocked chain
217		 */
218		if (p == TAILQ_FIRST(&m->mtx_blocked)) {
219			printf("XXX: process at head of run queue\n");
220			continue;
221		}
222		p1 = TAILQ_PREV(p, rq, p_procq);
223		if (p1->p_priority <= pri) {
224			printf(
225	"XXX: previous process %d(%s) has higher priority\n",
226	                    p->p_pid, p->p_comm);
227			continue;
228		}
229
230		/*
231		 * Remove proc from blocked chain and determine where
232		 * it should be moved up to.  Since we know that p1 has
233		 * a lower priority than p, we know that at least one
234		 * process in the chain has a lower priority and that
235		 * p1 will thus not be NULL after the loop.
236		 */
237		TAILQ_REMOVE(&m->mtx_blocked, p, p_procq);
238		TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) {
239			MPASS(p1->p_magic == P_MAGIC);
240			if (p1->p_priority > pri)
241				break;
242		}
243		MPASS(p1 != NULL);
244		TAILQ_INSERT_BEFORE(p1, p, p_procq);
245		CTR4(KTR_LOCK,
246		    "propagate_priority: p 0x%p moved before 0x%p on [0x%p] %s",
247		    p, p1, m, m->mtx_description);
248	}
249}
250
251void
252mtx_enter_hard(struct mtx *m, int type, int saveintr)
253{
254	struct proc *p = CURPROC;
255
256	KASSERT(p != NULL, ("curproc is NULL in mutex"));
257
258	switch (type) {
259	case MTX_DEF:
260		if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) {
261			m->mtx_recurse++;
262			atomic_set_ptr(&m->mtx_lock, MTX_RECURSE);
263			if ((type & MTX_QUIET) == 0)
264				CTR1(KTR_LOCK, "mtx_enter: 0x%p recurse", m);
265			return;
266		}
267		if ((type & MTX_QUIET) == 0)
268			CTR3(KTR_LOCK,
269			    "mtx_enter: 0x%p contested (lock=%p) [0x%p]",
270			    m, (void *)m->mtx_lock, (void *)RETIP(m));
271
272		/*
273		 * Save our priority.  Even though p_nativepri is protected
274		 * by sched_lock, we don't obtain it here as it can be
275		 * expensive.  Since this is the only place p_nativepri is
276		 * set, and since two CPUs will not be executing the same
277		 * process concurrently, we know that no other CPU is going
278		 * to be messing with this.  Also, p_nativepri is only read
279		 * when we are blocked on a mutex, so that can't be happening
280		 * right now either.
281		 */
282		p->p_nativepri = p->p_priority;
283		while (!_obtain_lock(m, p)) {
284			uintptr_t v;
285			struct proc *p1;
286
287			mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY);
288			/*
289			 * check if the lock has been released while
290			 * waiting for the schedlock.
291			 */
292			if ((v = m->mtx_lock) == MTX_UNOWNED) {
293				mtx_exit(&sched_lock, MTX_SPIN);
294				continue;
295			}
296			/*
297			 * The mutex was marked contested on release. This
298			 * means that there are processes blocked on it.
299			 */
300			if (v == MTX_CONTESTED) {
301				p1 = TAILQ_FIRST(&m->mtx_blocked);
302				KASSERT(p1 != NULL, ("contested mutex has no contesters"));
303				KASSERT(p != NULL, ("curproc is NULL for contested mutex"));
304				m->mtx_lock = (uintptr_t)p | MTX_CONTESTED;
305				if (p1->p_priority < p->p_priority) {
306					SET_PRIO(p, p1->p_priority);
307				}
308				mtx_exit(&sched_lock, MTX_SPIN);
309				return;
310			}
311			/*
312			 * If the mutex isn't already contested and
313			 * a failure occurs setting the contested bit the
314			 * mutex was either release or the
315			 * state of the RECURSION bit changed.
316			 */
317			if ((v & MTX_CONTESTED) == 0 &&
318			    !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
319				               (void *)(v | MTX_CONTESTED))) {
320				mtx_exit(&sched_lock, MTX_SPIN);
321				continue;
322			}
323
324			/* We definitely have to sleep for this lock */
325			mtx_assert(m, MA_NOTOWNED);
326
327#ifdef notyet
328			/*
329			 * If we're borrowing an interrupted thread's VM
330			 * context must clean up before going to sleep.
331			 */
332			if (p->p_flag & (P_ITHD | P_SITHD)) {
333				ithd_t *it = (ithd_t *)p;
334
335				if (it->it_interrupted) {
336					if ((type & MTX_QUIET) == 0)
337						CTR2(KTR_LOCK,
338					    "mtx_enter: 0x%x interrupted 0x%x",
339						    it, it->it_interrupted);
340					intr_thd_fixup(it);
341				}
342			}
343#endif
344
345			/* Put us on the list of procs blocked on this mutex */
346			if (TAILQ_EMPTY(&m->mtx_blocked)) {
347				p1 = (struct proc *)(m->mtx_lock &
348						     MTX_FLAGMASK);
349				LIST_INSERT_HEAD(&p1->p_contested, m,
350						 mtx_contested);
351				TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
352			} else {
353				TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq)
354					if (p1->p_priority > p->p_priority)
355						break;
356				if (p1)
357					TAILQ_INSERT_BEFORE(p1, p, p_procq);
358				else
359					TAILQ_INSERT_TAIL(&m->mtx_blocked, p,
360							  p_procq);
361			}
362
363			p->p_blocked = m;	/* Who we're blocked on */
364			p->p_mtxname = m->mtx_description;
365			p->p_stat = SMTX;
366#if 0
367			propagate_priority(p);
368#endif
369			if ((type & MTX_QUIET) == 0)
370				CTR3(KTR_LOCK,
371				    "mtx_enter: p 0x%p blocked on [0x%p] %s",
372				    p, m, m->mtx_description);
373			mi_switch();
374			if ((type & MTX_QUIET) == 0)
375				CTR3(KTR_LOCK,
376			    "mtx_enter: p 0x%p free from blocked on [0x%p] %s",
377				    p, m, m->mtx_description);
378			mtx_exit(&sched_lock, MTX_SPIN);
379		}
380		return;
381	case MTX_SPIN:
382	case MTX_SPIN | MTX_FIRST:
383	case MTX_SPIN | MTX_TOPHALF:
384	    {
385		int i = 0;
386
387		if (m->mtx_lock == (uintptr_t)p) {
388			m->mtx_recurse++;
389			return;
390		}
391		if ((type & MTX_QUIET) == 0)
392			CTR1(KTR_LOCK, "mtx_enter: %p spinning", m);
393		for (;;) {
394			if (_obtain_lock(m, p))
395				break;
396			while (m->mtx_lock != MTX_UNOWNED) {
397				if (i++ < 1000000)
398					continue;
399				if (i++ < 6000000)
400					DELAY (1);
401#ifdef DDB
402				else if (!db_active)
403#else
404				else
405#endif
406					panic(
407				"spin lock %s held by 0x%p for > 5 seconds",
408					    m->mtx_description,
409					    (void *)m->mtx_lock);
410			}
411		}
412
413#ifdef MUTEX_DEBUG
414		if (type != MTX_SPIN)
415			m->mtx_saveintr = 0xbeefface;
416		else
417#endif
418			m->mtx_saveintr = saveintr;
419		if ((type & MTX_QUIET) == 0)
420			CTR1(KTR_LOCK, "mtx_enter: 0x%p spin done", m);
421		return;
422	    }
423	}
424}
425
/*
 * Out-of-line release of mutex m by the current process.
 *
 * m     mutex to release.
 * type  one of the combinations handled by the switch below; any other
 *       value panics.
 *
 *  MTX_DEF, MTX_DEF | MTX_NOSWITCH
 *	If the mutex is recursed, drop one level and return.  Otherwise,
 *	under sched_lock: take the first process off m's blocked queue,
 *	release the lock word (or leave it MTX_CONTESTED if other waiters
 *	remain), recompute our own priority from the mutexes we still hold
 *	contested and p_nativepri, and make the dequeued process runnable.
 *	Unless MTX_NOSWITCH is given we switch away when that process has
 *	the better (numerically lower) priority.
 *	NOTE(review): the first waiter is dereferenced without a NULL
 *	check, so this path assumes the blocked queue is not empty.
 *  MTX_SPIN, MTX_SPIN | MTX_FIRST
 *	Drop recursion, or release the lock and then enable interrupts
 *	(MTX_FIRST) or restore the state saved in m->mtx_saveintr.
 *  MTX_SPIN | MTX_TOPHALF
 *	Drop recursion, or release the lock; interrupt state is untouched.
 */
void
mtx_exit_hard(struct mtx *m, int type)
{
	struct proc *p, *p1;
	struct mtx *m1;
	int pri;

	p = CURPROC;
	switch (type) {
	case MTX_DEF:
	case MTX_DEF | MTX_NOSWITCH:
		if (m->mtx_recurse != 0) {
			/* Clear the RECURSE flag once the last level is gone. */
			if (--(m->mtx_recurse) == 0)
				atomic_clear_ptr(&m->mtx_lock, MTX_RECURSE);
			if ((type & MTX_QUIET) == 0)
				CTR1(KTR_LOCK, "mtx_exit: 0x%p unrecurse", m);
			return;
		}
		mtx_enter(&sched_lock, MTX_SPIN);
		if ((type & MTX_QUIET) == 0)
			CTR1(KTR_LOCK, "mtx_exit: 0x%p contested", m);
		/* p1 is the waiter we are about to wake. */
		p1 = TAILQ_FIRST(&m->mtx_blocked);
		MPASS(p->p_magic == P_MAGIC);
		MPASS(p1->p_magic == P_MAGIC);
		TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq);
		if (TAILQ_EMPTY(&m->mtx_blocked)) {
			/*
			 * That was the last waiter: the mutex is no longer
			 * contested, so take it off the contested list it
			 * was linked on and release it outright.
			 */
			LIST_REMOVE(m, mtx_contested);
			_release_lock_quick(m);
			if ((type & MTX_QUIET) == 0)
				CTR1(KTR_LOCK, "mtx_exit: 0x%p not held", m);
		} else
			/* Unowned, but flagged so the next owner knows of waiters. */
			atomic_store_rel_ptr(&m->mtx_lock,
			    (void *)MTX_CONTESTED);
		/*
		 * Recompute our priority: the best priority among the first
		 * waiters of the mutexes still on our contested list, but
		 * never worse than our native priority.
		 */
		pri = MAXPRI;
		LIST_FOREACH(m1, &p->p_contested, mtx_contested) {
			int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority;
			if (cp < pri)
				pri = cp;
		}
		if (pri > p->p_nativepri)
			pri = p->p_nativepri;
		SET_PRIO(p, pri);
		if ((type & MTX_QUIET) == 0)
			CTR2(KTR_LOCK,
			    "mtx_exit: 0x%p contested setrunqueue 0x%p", m, p1);
		/* Wake the waiter. */
		p1->p_blocked = NULL;
		p1->p_mtxname = NULL;
		p1->p_stat = SRUN;
		setrunqueue(p1);
		/* Yield if the woken process should run before us. */
		if ((type & MTX_NOSWITCH) == 0 && p1->p_priority < pri) {
#ifdef notyet
			if (p->p_flag & (P_ITHD | P_SITHD)) {
				ithd_t *it = (ithd_t *)p;

				if (it->it_interrupted) {
					if ((type & MTX_QUIET) == 0)
						CTR2(KTR_LOCK,
					    "mtx_exit: 0x%x interruped 0x%x",
						    it, it->it_interrupted);
					intr_thd_fixup(it);
				}
			}
#endif
			setrunqueue(p);
			if ((type & MTX_QUIET) == 0)
				CTR2(KTR_LOCK,
				    "mtx_exit: 0x%p switching out lock=0x%p",
				    m, (void *)m->mtx_lock);
			mi_switch();
			if ((type & MTX_QUIET) == 0)
				CTR2(KTR_LOCK,
				    "mtx_exit: 0x%p resuming lock=0x%p",
				    m, (void *)m->mtx_lock);
		}
		mtx_exit(&sched_lock, MTX_SPIN);
		break;
	case MTX_SPIN:
	case MTX_SPIN | MTX_FIRST:
		if (m->mtx_recurse != 0) {
			m->mtx_recurse--;
			return;
		}
		MPASS(mtx_owned(m));
		_release_lock_quick(m);
		if (type & MTX_FIRST)
			enable_intr();	/* XXX is this kosher? */
		else {
			/* 0xbeefface is the poison stored by mtx_enter_hard(). */
			MPASS(m->mtx_saveintr != 0xbeefface);
			restore_intr(m->mtx_saveintr);
		}
		break;
	case MTX_SPIN | MTX_TOPHALF:
		if (m->mtx_recurse != 0) {
			m->mtx_recurse--;
			return;
		}
		MPASS(mtx_owned(m));
		_release_lock_quick(m);
		break;
	default:
		panic("mtx_exit_hard: unsupported type 0x%x\n", type);
	}
}
529
/* Values for the `when' argument of mtx_validate(). */
#define MV_DESTROY	0	/* validate before destroy */
#define MV_INIT		1	/* validate before init */
532
533#ifdef MUTEX_DEBUG
534
535int mtx_validate __P((struct mtx *, int));
536
537int
538mtx_validate(struct mtx *m, int when)
539{
540	struct mtx *mp;
541	int i;
542	int retval = 0;
543
544	if (m == &all_mtx || cold)
545		return 0;
546
547	mtx_enter(&all_mtx, MTX_DEF);
548/*
549 * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
550 * we can re-enable the kernacc() checks.
551 */
552#ifndef __alpha__
553	MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t),
554	    VM_PROT_READ) == 1);
555#endif
556	MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx);
557	for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) {
558#ifndef __alpha__
559		if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t),
560		    VM_PROT_READ) != 1) {
561			panic("mtx_validate: mp=%p mp->mtx_next=%p",
562			    mp, mp->mtx_next);
563		}
564#endif
565		i++;
566		if (i > mtx_cur_cnt) {
567			panic("mtx_validate: too many in chain, known=%d\n",
568			    mtx_cur_cnt);
569		}
570	}
571	MPASS(i == mtx_cur_cnt);
572	switch (when) {
573	case MV_DESTROY:
574		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
575			if (mp == m)
576				break;
577		MPASS(mp == m);
578		break;
579	case MV_INIT:
580		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
581		if (mp == m) {
582			/*
583			 * Not good. This mutex already exists.
584			 */
585			printf("re-initing existing mutex %s\n",
586			    m->mtx_description);
587			MPASS(m->mtx_lock == MTX_UNOWNED);
588			retval = 1;
589		}
590	}
591	mtx_exit(&all_mtx, MTX_DEF);
592	return (retval);
593}
594#endif
595
596void
597mtx_init(struct mtx *m, const char *t, int flag)
598{
599#ifdef WITNESS
600	struct mtx_debug *debug;
601#endif
602
603	if ((flag & MTX_QUIET) == 0)
604		CTR2(KTR_LOCK, "mtx_init 0x%p (%s)", m, t);
605#ifdef MUTEX_DEBUG
606	if (mtx_validate(m, MV_INIT))	/* diagnostic and error correction */
607		return;
608#endif
609#ifdef WITNESS
610	if (flag & MTX_COLD)
611		debug = m->mtx_debug;
612	else
613		debug = NULL;
614	if (debug == NULL) {
615#ifdef DIAGNOSTIC
616		if(cold && bootverbose)
617			printf("malloc'ing mtx_debug while cold for %s\n", t);
618#endif
619
620		/* XXX - should not use DEVBUF */
621		debug = malloc(sizeof(struct mtx_debug), M_DEVBUF,
622		    M_NOWAIT | M_ZERO);
623		MPASS(debug != NULL);
624	}
625#endif
626	bzero((void *)m, sizeof *m);
627	TAILQ_INIT(&m->mtx_blocked);
628#ifdef WITNESS
629	m->mtx_debug = debug;
630#endif
631	m->mtx_description = t;
632	m->mtx_lock = MTX_UNOWNED;
633	/* Put on all mutex queue */
634	mtx_enter(&all_mtx, MTX_DEF);
635	m->mtx_next = &all_mtx;
636	m->mtx_prev = all_mtx.mtx_prev;
637	m->mtx_prev->mtx_next = m;
638	all_mtx.mtx_prev = m;
639	if (++mtx_cur_cnt > mtx_max_cnt)
640		mtx_max_cnt = mtx_cur_cnt;
641	mtx_exit(&all_mtx, MTX_DEF);
642	witness_init(m, flag);
643}
644
645void
646mtx_destroy(struct mtx *m)
647{
648
649	CTR2(KTR_LOCK, "mtx_destroy 0x%p (%s)", m, m->mtx_description);
650#ifdef MUTEX_DEBUG
651	if (m->mtx_next == NULL)
652		panic("mtx_destroy: %p (%s) already destroyed",
653		    m, m->mtx_description);
654
655	if (!mtx_owned(m)) {
656		MPASS(m->mtx_lock == MTX_UNOWNED);
657	} else {
658		MPASS((m->mtx_lock & (MTX_RECURSE|MTX_CONTESTED)) == 0);
659	}
660	mtx_validate(m, MV_DESTROY);		/* diagnostic */
661#endif
662
663#ifdef WITNESS
664	if (m->mtx_witness)
665		witness_destroy(m);
666#endif /* WITNESS */
667
668	/* Remove from the all mutex queue */
669	mtx_enter(&all_mtx, MTX_DEF);
670	m->mtx_next->mtx_prev = m->mtx_prev;
671	m->mtx_prev->mtx_next = m->mtx_next;
672#ifdef MUTEX_DEBUG
673	m->mtx_next = m->mtx_prev = NULL;
674#endif
675#ifdef WITNESS
676	free(m->mtx_debug, M_DEVBUF);
677	m->mtx_debug = NULL;
678#endif
679	mtx_cur_cnt--;
680	mtx_exit(&all_mtx, MTX_DEF);
681}
682
683/*
684 * The non-inlined versions of the mtx_*() functions are always built (above),
685 * but the witness code depends on the WITNESS kernel option being specified.
686 */
687#ifdef WITNESS
688
/* Size of the static pool of witness structures (w_data[]). */
#define WITNESS_COUNT 200
/* Child slots per witness; further children go to w_morechildren nodes. */
#define	WITNESS_NCHILDREN 2

/*
 * Master switch.  When 0, enroll() returns NULL for every mutex.  A value
 * greater than 1 additionally lets witness_enter() accept an acquisition
 * on a comparison of w_level values alone.
 */
int witness_watch = 1;
693
/*
 * One node of the lock-order graph.  enroll() looks witnesses up by
 * description, so all mutexes with the same description share one node.
 */
struct witness {
	struct witness	*w_next;	/* link on the w_all list */
	const char	*w_description;	/* name enroll() matches on */
	const char	*w_file;	/* where last acquired; NULL until set */
	int		 w_line;	/* line number belonging to w_file */
	struct witness	*w_morechildren; /* overflow node with more children */
	u_char		 w_childcnt;	/* slots of w_children[] in use */
	u_char		 w_Giant_squawked:1; /* reversal against Giant reported */
	u_char		 w_other_squawked:1; /* other reversal reported */
	u_char		 w_same_squawked:1; /* duplicate acquire reported */
	u_char		 w_sleep:1;	/* not used in the code visible here
					   -- TODO document */
	u_char		 w_spin:1;	/* this is a spin mutex */
	u_int		 w_level;	/* spin mutexes: single bit taken from
					   the position in spin_order_list;
					   others: presumably assigned by
					   witness_levelall() -- confirm */
	struct witness	*w_children[WITNESS_NCHILDREN]; /* direct children */
};
709
/*
 * A pair of lock names making up one entry of blessed_list[].
 * NOTE(review): the consumer, blessed(), is not visible here; presumably
 * the names are compared against w_description -- confirm.
 */
struct witness_blessed {
	char 	*b_lock1;	/* name of one lock of the pair */
	char	*b_lock2;	/* name of the other lock */
};
714
#ifdef DDB
/*
 * When DDB is enabled and witness_ddb is set to 1, it will cause the system to
 * drop into the debugger (via Debugger()) when:
 *	- a lock hierarchy violation occurs
 *	- locks are held when going to sleep.
 *
 * The default is 1 if the kernel is built with WITNESS_DDB and 0 otherwise;
 * the value is exported read-write as the sysctl debug.witness_ddb.
 */
#ifdef WITNESS_DDB
int	witness_ddb = 1;
#else
int	witness_ddb = 0;
#endif
SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, "");
#endif /* DDB */
729
/*
 * When non-zero, enroll() returns NULL for mutexes created with MTX_SPIN,
 * i.e. spin mutexes get no witness.  The default is 1 if the kernel is
 * built with WITNESS_SKIPSPIN and 0 otherwise; the value is exported
 * read-only as the sysctl debug.witness_skipspin.
 */
#ifdef WITNESS_SKIPSPIN
int	witness_skipspin = 1;
#else
int	witness_skipspin = 0;
#endif
SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0,
    "");
737
/* Spin mutex ("witness lock") taken around updates of the witness data. */
MUTEX_DECLARE(static,w_mtx);
/* Presumably the list of unused witnesses kept by witness_get()/witness_free() -- confirm. */
static struct witness	*w_free;
/* All enrolled witnesses, linked through w_next. */
static struct witness	*w_all;
/* Set to 1 by enroll() once the one-time setup has run. */
static int		 w_inited;
static int		 witness_dead;	/* fatal error, probably no memory */

/* Backing storage; every element is passed to witness_free() at setup. */
static struct witness	 w_data[WITNESS_COUNT];
745
/* Look up or create the witness for a description. */
static struct witness	 *enroll __P((const char *description, int flag));
/* Graph editing: add a child edge, remove a child edge. */
static int itismychild __P((struct witness *parent, struct witness *child));
static void removechild __P((struct witness *parent, struct witness *child));
/* Graph queries: direct child, any descendant. */
static int isitmychild __P((struct witness *parent, struct witness *child));
static int isitmydescendant __P((struct witness *parent, struct witness *child));
/* Exception tables: duplicate acquisition allowed, pair blessed. */
static int dup_ok __P((struct witness *));
static int blessed __P((struct witness *, struct witness *));
/* Display and level computation. */
static void witness_displaydescendants
    __P((void(*)(const char *fmt, ...), struct witness *));
static void witness_leveldescendents __P((struct witness *parent, int level));
static void witness_levelall __P((void));
/* Allocation of witness structures. */
static struct witness * witness_get __P((void));
static void witness_free __P((struct witness *m));
759
760
/*
 * Descriptions for which enroll() returns NULL, i.e. mutexes that get no
 * witness.  The witness lock itself has to be listed: enroll() initializes
 * it with mtx_init(), which calls back into enroll() through
 * witness_init().
 */
static char *ignore_list[] = {
	"witness lock",
	NULL
};

/*
 * Spin mutexes in the order they must be acquired.  The position of a name
 * selects the bit enroll() stores in w_level; a spin mutex that is not
 * listed makes enroll() panic.
 */
static char *spin_order_list[] = {
	"sio",
	"sched lock",
#ifdef __i386__
	"clk",
#endif
	"callout",
	/*
	 * leaf locks
	 */
	NULL
};

/*
 * Lock orders known in advance.  Each run of names up to a NULL is one
 * chain that enroll() links parent-to-child when it first runs; a second
 * NULL ends the table.
 */
static char *order_list[] = {
	"uidinfo hash", "uidinfo struct", NULL,
	NULL
};

/*
 * NOTE(review): presumably the names for which dup_ok() permits acquiring
 * two locks of the same type; dup_ok() is not visible here -- confirm.
 */
static char *dup_list[] = {
	NULL
};

/*
 * Mutexes that witness_sleep() does not complain about when they are held
 * across a sleep.
 */
static char *sleep_list[] = {
	"Giant",
	NULL
};
792
/*
 * Pairs of locks which have been blessed
 * Don't complain about order problems with blessed locks
 *
 * The table is currently empty.  NOTE(review): an empty brace initializer
 * for an array of unknown size relies on a compiler extension.
 * blessed_count is the number of entries, computed from the array size.
 */
static struct witness_blessed blessed_list[] = {
};
static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed);
800
/*
 * Attach a witness to a newly initialized mutex.
 *
 * m     mutex whose mtx_description is already set.
 * flag  MTX_* flags of the mutex, passed through to enroll().
 *
 * m->mtx_witness is set to whatever enroll() returns, which may be NULL.
 */
void
witness_init(struct mtx *m, int flag)
{
	m->mtx_witness = enroll(m->mtx_description, flag);
}
806
807void
808witness_destroy(struct mtx *m)
809{
810	struct mtx *m1;
811	struct proc *p;
812	p = CURPROC;
813	for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL;
814		m1 = LIST_NEXT(m1, mtx_held)) {
815		if (m1 == m) {
816			LIST_REMOVE(m, mtx_held);
817			break;
818		}
819	}
820	return;
821
822}
823
/*
 * Check and record the acquisition of mutex m by the current process.
 *
 * m           mutex being acquired.  m->mtx_witness is dereferenced
 *             without a NULL check -- presumably callers skip mutexes
 *             that have no witness; confirm.
 * flags       MTX_* flags of the acquisition; only MTX_SPIN is examined.
 * file, line  location of the acquisition, remembered for later reports.
 *
 * Nothing is done once the system has panicked.
 *
 * Spin mutexes: the class must match the witness (else panic), recursive
 * acquisitions are ignored, and the witness' w_level bit is checked
 * against and then added to the per-CPU witness_spin_check mask; an
 * out-of-order acquisition panics.
 *
 * Sleep mutexes: the class must match (else panic) and recursive
 * acquisitions are ignored.  Unless the witness code is dead or the system
 * is cold, the acquisition is compared with the mutexes the process
 * already holds: duplicates of the same type and lock order reversals are
 * printed (once per witness), otherwise the new order is learned.  In all
 * non-recursive cases the location is recorded and m is put at the head of
 * the process' held list.
 */
void
witness_enter(struct mtx *m, int flags, const char *file, int line)
{
	struct witness *w, *w1;
	struct mtx *m1;
	struct proc *p;
	int i;
#ifdef DDB
	int go_into_ddb = 0;
#endif /* DDB */

	if (panicstr)
		return;
	w = m->mtx_witness;
	p = CURPROC;

	if (flags & MTX_SPIN) {
		if (!w->w_spin)
			panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @"
			    " %s:%d", m->mtx_description, file, line);
		if (m->mtx_recurse != 0)
			return;
		mtx_enter(&w_mtx, MTX_SPIN | MTX_QUIET);
		/*
		 * i is the mask of spin levels already held on this CPU.
		 * w_level is a single bit, so it is numerically smaller
		 * than i exactly when a later-ordered lock is held.
		 */
		i = PCPU_GET(witness_spin_check);
		if (i != 0 && w->w_level < i) {
			mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
			panic("mutex_enter(%s:%x, MTX_SPIN) out of order @"
			    " %s:%d already holding %s:%x",
			    m->mtx_description, w->w_level, file, line,
			    spin_order_list[ffs(i)-1], i);
		}
		PCPU_SET(witness_spin_check, i | w->w_level);
		mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
		w->w_file = file;
		w->w_line = line;
		m->mtx_line = line;
		m->mtx_file = file;
		return;
	}
	if (w->w_spin)
		panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
		    m->mtx_description, file, line);

	if (m->mtx_recurse != 0)
		return;
	/* Skip the order checks, but still record the acquisition. */
	if (witness_dead)
		goto out;
	if (cold)
		goto out;

	if (!mtx_legal2block())
		panic("blockable mtx_enter() of %s when not legal @ %s:%d",
			    m->mtx_description, file, line);
	/*
	 * Is this the first mutex acquired
	 */
	if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL)
		goto out;

	/*
	 * m1 is the most recently acquired mutex (insertions are at the
	 * head).  Same witness means two locks of the same type.
	 */
	if ((w1 = m1->mtx_witness) == w) {
		if (w->w_same_squawked || dup_ok(w))
			goto out;
		w->w_same_squawked = 1;
		printf("acquring duplicate lock of same type: \"%s\"\n",
			m->mtx_description);
		printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
		printf(" 2nd @ %s:%d\n", file, line);
#ifdef DDB
		go_into_ddb = 1;
#endif /* DDB */
		goto out;
	}
	MPASS(!mtx_owned(&w_mtx));
	mtx_enter(&w_mtx, MTX_SPIN | MTX_QUIET);
	/*
	 * If we have a known higher number just say ok
	 */
	if (witness_watch > 1 && w->w_level > w1->w_level) {
		mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
		goto out;
	}
	/* The order m1 -> m is already known: nothing to learn or report. */
	if (isitmydescendant(m1->mtx_witness, w)) {
		mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
		goto out;
	}
	/*
	 * Look for a reversal: a held mutex whose witness is already known
	 * to come after w.
	 */
	for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) {

		MPASS(i < 200);
		w1 = m1->mtx_witness;
		if (isitmydescendant(w, w1)) {
			mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
			if (blessed(w, w1))
				goto out;
			/* Report each offending witness only once. */
			if (m1 == &Giant) {
				if (w1->w_Giant_squawked)
					goto out;
				else
					w1->w_Giant_squawked = 1;
			} else {
				if (w1->w_other_squawked)
					goto out;
				else
					w1->w_other_squawked = 1;
			}
			printf("lock order reversal\n");
			printf(" 1st %s last acquired @ %s:%d\n",
			    w->w_description, w->w_file, w->w_line);
			printf(" 2nd %p %s @ %s:%d\n",
			    m1, w1->w_description, w1->w_file, w1->w_line);
			printf(" 3rd %p %s @ %s:%d\n",
			    m, w->w_description, file, line);
#ifdef DDB
			go_into_ddb = 1;
#endif /* DDB */
			goto out;
		}
	}
	/*
	 * No conflict: learn that w follows the most recently acquired
	 * mutex.  w_mtx is released here only when itismychild() returns 0.
	 * NOTE(review): presumably on a non-zero return the failing
	 * witness_get() has already dropped w_mtx -- confirm.
	 */
	m1 = LIST_FIRST(&p->p_heldmtx);
	if (!itismychild(m1->mtx_witness, w))
		mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);

out:
#ifdef DDB
	if (witness_ddb && go_into_ddb)
		Debugger("witness_enter");
#endif /* DDB */
	w->w_file = file;
	w->w_line = line;
	m->mtx_line = line;
	m->mtx_file = file;

	/*
	 * If this pays off it likely means that a mutex being witnessed
	 * is acquired in hardclock. Put it in the ignore list. It is
	 * likely not the mutex this assert fails on.
	 */
	MPASS(m->mtx_held.le_prev == NULL);
	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
}
963
964void
965witness_exit(struct mtx *m, int flags, const char *file, int line)
966{
967	struct witness *w;
968
969	if (panicstr)
970		return;
971	w = m->mtx_witness;
972
973	if (flags & MTX_SPIN) {
974		if (!w->w_spin)
975			panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @"
976			    " %s:%d", m->mtx_description, file, line);
977		if (m->mtx_recurse != 0)
978			return;
979		mtx_enter(&w_mtx, MTX_SPIN | MTX_QUIET);
980		PCPU_SET(witness_spin_check,
981		    PCPU_GET(witness_spin_check) & ~w->w_level);
982		mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
983		return;
984	}
985	if (w->w_spin)
986		panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
987		    m->mtx_description, file, line);
988
989	if (m->mtx_recurse != 0)
990		return;
991
992	if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold)
993		panic("switchable mtx_exit() of %s when not legal @ %s:%d",
994			    m->mtx_description, file, line);
995	LIST_REMOVE(m, mtx_held);
996	m->mtx_held.le_prev = NULL;
997}
998
999void
1000witness_try_enter(struct mtx *m, int flags, const char *file, int line)
1001{
1002	struct proc *p;
1003	struct witness *w = m->mtx_witness;
1004
1005	if (panicstr)
1006		return;
1007	if (flags & MTX_SPIN) {
1008		if (!w->w_spin)
1009			panic("mutex_try_enter: "
1010			    "MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
1011			    m->mtx_description, file, line);
1012		if (m->mtx_recurse != 0)
1013			return;
1014		mtx_enter(&w_mtx, MTX_SPIN | MTX_QUIET);
1015		PCPU_SET(witness_spin_check,
1016		    PCPU_GET(witness_spin_check) | w->w_level);
1017		mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
1018		w->w_file = file;
1019		w->w_line = line;
1020		m->mtx_line = line;
1021		m->mtx_file = file;
1022		return;
1023	}
1024
1025	if (w->w_spin)
1026		panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
1027		    m->mtx_description, file, line);
1028
1029	if (m->mtx_recurse != 0)
1030		return;
1031
1032	w->w_file = file;
1033	w->w_line = line;
1034	m->mtx_line = line;
1035	m->mtx_file = file;
1036	p = CURPROC;
1037	MPASS(m->mtx_held.le_prev == NULL);
1038	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
1039}
1040
1041void
1042witness_display(void(*prnt)(const char *fmt, ...))
1043{
1044	struct witness *w, *w1;
1045
1046	witness_levelall();
1047
1048	for (w = w_all; w; w = w->w_next) {
1049		if (w->w_file == NULL)
1050			continue;
1051		for (w1 = w_all; w1; w1 = w1->w_next) {
1052			if (isitmychild(w1, w))
1053				break;
1054		}
1055		if (w1 != NULL)
1056			continue;
1057		/*
1058		 * This lock has no anscestors, display its descendants.
1059		 */
1060		witness_displaydescendants(prnt, w);
1061	}
1062	prnt("\nMutex which were never acquired\n");
1063	for (w = w_all; w; w = w->w_next) {
1064		if (w->w_file != NULL)
1065			continue;
1066		prnt("%s\n", w->w_description);
1067	}
1068}
1069
1070int
1071witness_sleep(int check_only, struct mtx *mtx, const char *file, int line)
1072{
1073	struct mtx *m;
1074	struct proc *p;
1075	char **sleep;
1076	int n = 0;
1077
1078	p = CURPROC;
1079	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
1080	    m = LIST_NEXT(m, mtx_held)) {
1081		if (m == mtx)
1082			continue;
1083		for (sleep = sleep_list; *sleep!= NULL; sleep++)
1084			if (strcmp(m->mtx_description, *sleep) == 0)
1085				goto next;
1086		printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
1087			file, line, check_only ? "could sleep" : "sleeping",
1088			m->mtx_description,
1089			m->mtx_witness->w_file, m->mtx_witness->w_line);
1090		n++;
1091	next:
1092	}
1093#ifdef DDB
1094	if (witness_ddb && n)
1095		Debugger("witness_sleep");
1096#endif /* DDB */
1097	return (n);
1098}
1099
/*
 * Look up, or create, the witness that stands for all mutexes named
 * `description'.
 *
 * description  mutex name; the pointer is stored in a new witness, not
 *              copied.
 * flag         MTX_* flags of the mutex; only MTX_SPIN is examined.
 *
 * Returns the witness, or NULL when witness_watch is 0, when the
 * description is in ignore_list, when the mutex is a spin mutex and
 * witness_skipspin is set, or when witness_get() fails.
 *
 * The first call that gets past the ignore list performs the one-time
 * setup: it initializes w_mtx, releases all of w_data[] with
 * witness_free() and enrolls the chains given in order_list.  A new spin
 * witness gets w_spin set and a w_level bit from its position in
 * spin_order_list; a spin mutex missing from that list panics.
 */
static struct witness *
enroll(const char *description, int flag)
{
	int i;
	struct witness *w, *w1;
	char **ignore;
	char **order;

	if (!witness_watch)
		return (NULL);
	for (ignore = ignore_list; *ignore != NULL; ignore++)
		if (strcmp(description, *ignore) == 0)
			return (NULL);

	if (w_inited == 0) {
		/*
		 * mtx_init() comes back here via witness_init(); that nested
		 * call returns above because "witness lock" is ignored.
		 */
		mtx_init(&w_mtx, "witness lock", MTX_COLD | MTX_SPIN);
		for (i = 0; i < WITNESS_COUNT; i++) {
			w = &w_data[i];
			witness_free(w);
		}
		/* Set before the recursive enroll() calls below. */
		w_inited = 1;
		/*
		 * Each outer iteration handles one NULL-terminated chain;
		 * the inner loop leaves `order' on that NULL and the outer
		 * increment steps over it.
		 */
		for (order = order_list; *order != NULL; order++) {
			w = enroll(*order, MTX_DEF);
			w->w_file = "order list";
			for (order++; *order != NULL; order++) {
				w1 = enroll(*order, MTX_DEF);
				w1->w_file = "order list";
				itismychild(w, w1);
				w = w1;
    	    	    	}
		}
	}
	if ((flag & MTX_SPIN) && witness_skipspin)
		return (NULL);
	mtx_enter(&w_mtx, MTX_SPIN | MTX_QUIET);
	/* Reuse an existing witness with the same description. */
	for (w = w_all; w; w = w->w_next) {
		if (strcmp(description, w->w_description) == 0) {
			mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
			return (w);
		}
	}
	/*
	 * NOTE(review): this returns without an mtx_exit() of w_mtx;
	 * presumably witness_get() drops the lock itself when it fails --
	 * confirm.
	 */
	if ((w = witness_get()) == NULL)
		return (NULL);
	w->w_next = w_all;
	w_all = w;
	w->w_description = description;
	mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
	if (flag & MTX_SPIN) {
		w->w_spin = 1;

		/* i becomes 1 << (index of description in spin_order_list). */
		i = 1;
		for (order = spin_order_list; *order != NULL; order++) {
			if (strcmp(description, *order) == 0)
				break;
			i <<= 1;
		}
		if (*order == NULL)
			panic("spin lock %s not in order list", description);
		w->w_level = i;
	}
	return (w);
}
1162
1163static int
1164itismychild(struct witness *parent, struct witness *child)
1165{
1166	static int recursed;
1167
1168	/*
1169	 * Insert "child" after "parent"
1170	 */
1171	while (parent->w_morechildren)
1172		parent = parent->w_morechildren;
1173
1174	if (parent->w_childcnt == WITNESS_NCHILDREN) {
1175		if ((parent->w_morechildren = witness_get()) == NULL)
1176			return (1);
1177		parent = parent->w_morechildren;
1178	}
1179	MPASS(child != NULL);
1180	parent->w_children[parent->w_childcnt++] = child;
1181	/*
1182	 * now prune whole tree
1183	 */
1184	if (recursed)
1185		return (0);
1186	recursed = 1;
1187	for (child = w_all; child != NULL; child = child->w_next) {
1188		for (parent = w_all; parent != NULL;
1189		    parent = parent->w_next) {
1190			if (!isitmychild(parent, child))
1191				continue;
1192			removechild(parent, child);
1193			if (isitmydescendant(parent, child))
1194				continue;
1195			itismychild(parent, child);
1196		}
1197	}
1198	recursed = 0;
1199	witness_levelall();
1200	return (0);
1201}
1202
1203static void
1204removechild(struct witness *parent, struct witness *child)
1205{
1206	struct witness *w, *w1;
1207	int i;
1208
1209	for (w = parent; w != NULL; w = w->w_morechildren)
1210		for (i = 0; i < w->w_childcnt; i++)
1211			if (w->w_children[i] == child)
1212				goto found;
1213	return;
1214found:
1215	for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren)
1216		continue;
1217	w->w_children[i] = w1->w_children[--w1->w_childcnt];
1218	MPASS(w->w_children[i] != NULL);
1219
1220	if (w1->w_childcnt != 0)
1221		return;
1222
1223	if (w1 == parent)
1224		return;
1225	for (w = parent; w->w_morechildren != w1; w = w->w_morechildren)
1226		continue;
1227	w->w_morechildren = 0;
1228	witness_free(w1);
1229}
1230
1231static int
1232isitmychild(struct witness *parent, struct witness *child)
1233{
1234	struct witness *w;
1235	int i;
1236
1237	for (w = parent; w != NULL; w = w->w_morechildren) {
1238		for (i = 0; i < w->w_childcnt; i++) {
1239			if (w->w_children[i] == child)
1240				return (1);
1241		}
1242	}
1243	return (0);
1244}
1245
1246static int
1247isitmydescendant(struct witness *parent, struct witness *child)
1248{
1249	struct witness *w;
1250	int i;
1251	int j;
1252
1253	for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) {
1254		MPASS(j < 1000);
1255		for (i = 0; i < w->w_childcnt; i++) {
1256			if (w->w_children[i] == child)
1257				return (1);
1258		}
1259		for (i = 0; i < w->w_childcnt; i++) {
1260			if (isitmydescendant(w->w_children[i], child))
1261				return (1);
1262		}
1263	}
1264	return (0);
1265}
1266
1267void
1268witness_levelall (void)
1269{
1270	struct witness *w, *w1;
1271
1272	for (w = w_all; w; w = w->w_next)
1273		if (!w->w_spin)
1274			w->w_level = 0;
1275	for (w = w_all; w; w = w->w_next) {
1276		if (w->w_spin)
1277			continue;
1278		for (w1 = w_all; w1; w1 = w1->w_next) {
1279			if (isitmychild(w1, w))
1280				break;
1281		}
1282		if (w1 != NULL)
1283			continue;
1284		witness_leveldescendents(w, 0);
1285	}
1286}
1287
1288static void
1289witness_leveldescendents(struct witness *parent, int level)
1290{
1291	int i;
1292	struct witness *w;
1293
1294	if (parent->w_level < level)
1295		parent->w_level = level;
1296	level++;
1297	for (w = parent; w != NULL; w = w->w_morechildren)
1298		for (i = 0; i < w->w_childcnt; i++)
1299			witness_leveldescendents(w->w_children[i], level);
1300}
1301
1302static void
1303witness_displaydescendants(void(*prnt)(const char *fmt, ...),
1304			   struct witness *parent)
1305{
1306	struct witness *w;
1307	int i;
1308	int level = parent->w_level;
1309
1310	prnt("%d", level);
1311	if (level < 10)
1312		prnt(" ");
1313	for (i = 0; i < level; i++)
1314		prnt(" ");
1315	prnt("%s", parent->w_description);
1316	if (parent->w_file != NULL) {
1317		prnt(" -- last acquired @ %s", parent->w_file);
1318#ifndef W_USE_WHERE
1319		prnt(":%d", parent->w_line);
1320#endif
1321		prnt("\n");
1322	}
1323
1324	for (w = parent; w != NULL; w = w->w_morechildren)
1325		for (i = 0; i < w->w_childcnt; i++)
1326			    witness_displaydescendants(prnt, w->w_children[i]);
1327    }
1328
1329static int
1330dup_ok(struct witness *w)
1331{
1332	char **dup;
1333
1334	for (dup = dup_list; *dup!= NULL; dup++)
1335		if (strcmp(w->w_description, *dup) == 0)
1336			return (1);
1337	return (0);
1338}
1339
1340static int
1341blessed(struct witness *w1, struct witness *w2)
1342{
1343	int i;
1344	struct witness_blessed *b;
1345
1346	for (i = 0; i < blessed_count; i++) {
1347		b = &blessed_list[i];
1348		if (strcmp(w1->w_description, b->b_lock1) == 0) {
1349			if (strcmp(w2->w_description, b->b_lock2) == 0)
1350				return (1);
1351			continue;
1352		}
1353		if (strcmp(w1->w_description, b->b_lock2) == 0)
1354			if (strcmp(w2->w_description, b->b_lock1) == 0)
1355				return (1);
1356	}
1357	return (0);
1358}
1359
1360static struct witness *
1361witness_get()
1362{
1363	struct witness *w;
1364
1365	if ((w = w_free) == NULL) {
1366		witness_dead = 1;
1367		mtx_exit(&w_mtx, MTX_SPIN | MTX_QUIET);
1368		printf("witness exhausted\n");
1369		return (NULL);
1370	}
1371	w_free = w->w_next;
1372	bzero(w, sizeof(*w));
1373	return (w);
1374}
1375
1376static void
1377witness_free(struct witness *w)
1378{
1379	w->w_next = w_free;
1380	w_free = w;
1381}
1382
1383int
1384witness_list(struct proc *p)
1385{
1386	struct mtx *m;
1387	int nheld;
1388
1389	nheld = 0;
1390	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
1391	    m = LIST_NEXT(m, mtx_held)) {
1392		printf("\t\"%s\" (%p) locked at %s:%d\n",
1393		    m->mtx_description, m,
1394		    m->mtx_witness->w_file, m->mtx_witness->w_line);
1395		nheld++;
1396	}
1397
1398	return (nheld);
1399}
1400
1401void
1402witness_save(struct mtx *m, const char **filep, int *linep)
1403{
1404	*filep = m->mtx_witness->w_file;
1405	*linep = m->mtx_witness->w_line;
1406}
1407
1408void
1409witness_restore(struct mtx *m, const char *file, int line)
1410{
1411	m->mtx_witness->w_file = file;
1412	m->mtx_witness->w_line = line;
1413}
1414
1415#endif	/* WITNESS */
1416