subr_turnstile.c revision 68889
1/*-
2 * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 * 3. Berkeley Software Design Inc's name may not be used to endorse or
13 *    promote products derived from this software without specific prior
14 *    written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
29 *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
30 * $FreeBSD: head/sys/kern/subr_turnstile.c 68889 2000-11-19 06:02:32Z jake $
31 */
32
33/*
34 *	Main Entry: witness
35 *	Pronunciation: 'wit-n&s
36 *	Function: noun
37 *	Etymology: Middle English witnesse, from Old English witnes knowledge,
38 *	    testimony, witness, from 2wit
39 *	Date: before 12th century
40 *	1 : attestation of a fact or event : TESTIMONY
41 *	2 : one that gives evidence; specifically : one who testifies in
42 *	    a cause or before a judicial tribunal
43 *	3 : one asked to be present at a transaction so as to be able to
44 *	    testify to its having taken place
45 *	4 : one who has personal knowledge of something
46 *	5 a : something serving as evidence or proof : SIGN
47 *	  b : public affirmation by word or example of usually
48 *	      religious faith or conviction <the heroic witness to divine
49 *	      life -- Pilot>
50 *	6 capitalized : a member of the Jehovah's Witnesses
51 */
52
53#include "opt_ddb.h"
54#include "opt_witness.h"
55
56#include <sys/param.h>
57#include <sys/bus.h>
58#include <sys/kernel.h>
59#include <sys/malloc.h>
60#include <sys/proc.h>
61#include <sys/sysctl.h>
62#include <sys/systm.h>
63#include <sys/vmmeter.h>
64#include <sys/ktr.h>
65
66#include <machine/atomic.h>
67#include <machine/bus.h>
68#include <machine/clock.h>
69#include <machine/cpu.h>
70
71#include <ddb/ddb.h>
72
73#include <vm/vm.h>
74#include <vm/vm_extern.h>
75
76#define _KERN_MUTEX_C_		/* Cause non-inlined mtx_*() to be compiled. */
77#include <sys/mutex.h>
78
79/*
80 * Machine independent bits of the mutex implementation
81 */
82/* All mutexes in system (used for debug/panic) */
83#ifdef MUTEX_DEBUG
84static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0,
85	"All mutexes queue head" };
86static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, &all_mtx_debug,
87	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
88	{ NULL, NULL }, &all_mtx, &all_mtx };
89#else	/* MUTEX_DEBUG */
90static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, "All mutexes queue head",
91	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
92	{ NULL, NULL }, &all_mtx, &all_mtx };
93#endif	/* MUTEX_DEBUG */
94
95static int	mtx_cur_cnt;
96static int	mtx_max_cnt;
97
98void	_mtx_enter_giant_def(void);
99void	_mtx_exit_giant_def(void);
100static void propagate_priority(struct proc *) __unused;
101
102#define	mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
103#define	mtx_owner(m)	(mtx_unowned(m) ? NULL \
104			    : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK))
105
106#define RETIP(x)		*(((uintptr_t *)(&x)) - 1)
107#define	SET_PRIO(p, pri)	(p)->p_priority = (pri)
108
109/*
110 * XXX Temporary, for use from assembly language
111 */
112
113void
114_mtx_enter_giant_def(void)
115{
116
117	mtx_enter(&Giant, MTX_DEF);
118}
119
120void
121_mtx_exit_giant_def(void)
122{
123
124	mtx_exit(&Giant, MTX_DEF);
125}
126
127static void
128propagate_priority(struct proc *p)
129{
130	int pri = p->p_priority;
131	struct mtx *m = p->p_blocked;
132
133	for (;;) {
134		struct proc *p1;
135
136		p = mtx_owner(m);
137
138		if (p == NULL) {
139			/*
140			 * This really isn't quite right. Really
141			 * ought to bump priority of process that
142			 * next acquires the mutex.
143			 */
144			MPASS(m->mtx_lock == MTX_CONTESTED);
145			return;
146		}
147		MPASS(p->p_magic == P_MAGIC);
148		if (p->p_priority <= pri)
149			return;
150		/*
151		 * If lock holder is actually running, just bump priority.
152		 */
153		if (TAILQ_NEXT(p, p_procq) == NULL) {
154			MPASS(p->p_stat == SRUN || p->p_stat == SZOMB);
155			SET_PRIO(p, pri);
156			return;
157		}
158		/*
159		 * If on run queue move to new run queue, and
160		 * quit.
161		 */
162		if (p->p_stat == SRUN) {
163			MPASS(p->p_blocked == NULL);
164			remrunqueue(p);
165			SET_PRIO(p, pri);
166			setrunqueue(p);
167			return;
168		}
169
170		/*
171		 * If we aren't blocked on a mutex, give up and quit.
172		 */
173		if (p->p_stat != SMTX) {
174			printf(
175	"XXX: process %d(%s):%d holds %s but isn't blocked on a mutex\n",
176			    p->p_pid, p->p_comm, p->p_stat, m->mtx_description);
177			return;
178		}
179
180		/*
181		 * Pick up the mutex that p is blocked on.
182		 */
183		m = p->p_blocked;
184		MPASS(m != NULL);
185
186		printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid,
187		    p->p_comm, m->mtx_description);
188		/*
189		 * Check if the proc needs to be moved up on
190		 * the blocked chain
191		 */
192		if ((p1 = TAILQ_PREV(p, rq, p_procq)) == NULL ||
193		    p1->p_priority <= pri) {
194			if (p1)
195				printf(
196	"XXX: previous process %d(%s) has higher priority\n",
197				    p->p_pid, p->p_comm);
198			else
199				printf("XXX: process at head of run queue\n");
200			continue;
201		}
202
203		/*
204		 * Remove proc from blocked chain
205		 */
206		TAILQ_REMOVE(&m->mtx_blocked, p, p_procq);
207		TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) {
208			MPASS(p1->p_magic == P_MAGIC);
209			if (p1->p_priority > pri)
210				break;
211		}
212		if (p1)
213			TAILQ_INSERT_BEFORE(p1, p, p_procq);
214		else
215			TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
216		CTR4(KTR_LOCK,
217		    "propagate priority: p 0x%p moved before 0x%p on [0x%p] %s",
218		    p, p1, m, m->mtx_description);
219	}
220}
221
222void
223mtx_enter_hard(struct mtx *m, int type, int saveintr)
224{
225	struct proc *p = CURPROC;
226
227	KASSERT(p != NULL, ("curproc is NULL in mutex"));
228
229	switch (type) {
230	case MTX_DEF:
231		if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) {
232			m->mtx_recurse++;
233			atomic_set_ptr(&m->mtx_lock, MTX_RECURSE);
234			CTR1(KTR_LOCK, "mtx_enter: 0x%p recurse", m);
235			return;
236		}
237		CTR3(KTR_LOCK, "mtx_enter: 0x%p contested (lock=%p) [0x%p]",
238		    m, (void *)m->mtx_lock, (void *)RETIP(m));
239		while (!_obtain_lock(m, p)) {
240			uintptr_t v;
241			struct proc *p1;
242
243			mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY);
244			/*
245			 * check if the lock has been released while
246			 * waiting for the schedlock.
247			 */
248			if ((v = m->mtx_lock) == MTX_UNOWNED) {
249				mtx_exit(&sched_lock, MTX_SPIN);
250				continue;
251			}
252			/*
253			 * The mutex was marked contested on release. This
254			 * means that there are processes blocked on it.
255			 */
256			if (v == MTX_CONTESTED) {
257				p1 = TAILQ_FIRST(&m->mtx_blocked);
258				KASSERT(p1 != NULL, ("contested mutex has no contesters"));
259				KASSERT(p != NULL, ("curproc is NULL for contested mutex"));
260				m->mtx_lock = (uintptr_t)p | MTX_CONTESTED;
261				if (p1->p_priority < p->p_priority) {
262					SET_PRIO(p, p1->p_priority);
263				}
264				mtx_exit(&sched_lock, MTX_SPIN);
265				return;
266			}
267			/*
268			 * If the mutex isn't already contested and
269			 * a failure occurs setting the contested bit the
270			 * mutex was either release or the
271			 * state of the RECURSION bit changed.
272			 */
273			if ((v & MTX_CONTESTED) == 0 &&
274			    !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
275				               (void *)(v | MTX_CONTESTED))) {
276				mtx_exit(&sched_lock, MTX_SPIN);
277				continue;
278			}
279
280			/* We definitely have to sleep for this lock */
281			mtx_assert(m, MA_NOTOWNED);
282
283#ifdef notyet
284			/*
285			 * If we're borrowing an interrupted thread's VM
286			 * context must clean up before going to sleep.
287			 */
288			if (p->p_flag & (P_ITHD | P_SITHD)) {
289				ithd_t *it = (ithd_t *)p;
290
291				if (it->it_interrupted) {
292					CTR2(KTR_LOCK,
293					    "mtx_enter: 0x%x interrupted 0x%x",
294					    it, it->it_interrupted);
295					intr_thd_fixup(it);
296				}
297			}
298#endif
299
300			/* Put us on the list of procs blocked on this mutex */
301			if (TAILQ_EMPTY(&m->mtx_blocked)) {
302				p1 = (struct proc *)(m->mtx_lock &
303						     MTX_FLAGMASK);
304				LIST_INSERT_HEAD(&p1->p_contested, m,
305						 mtx_contested);
306				TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
307			} else {
308				TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq)
309					if (p1->p_priority > p->p_priority)
310						break;
311				if (p1)
312					TAILQ_INSERT_BEFORE(p1, p, p_procq);
313				else
314					TAILQ_INSERT_TAIL(&m->mtx_blocked, p,
315							  p_procq);
316			}
317
318			p->p_blocked = m;	/* Who we're blocked on */
319			p->p_stat = SMTX;
320#if 0
321			propagate_priority(p);
322#endif
323			CTR3(KTR_LOCK, "mtx_enter: p 0x%p blocked on [0x%p] %s",
324			    p, m, m->mtx_description);
325			mi_switch();
326			CTR3(KTR_LOCK,
327			    "mtx_enter: p 0x%p free from blocked on [0x%p] %s",
328			    p, m, m->mtx_description);
329			mtx_exit(&sched_lock, MTX_SPIN);
330		}
331		return;
332	case MTX_SPIN:
333	case MTX_SPIN | MTX_FIRST:
334	case MTX_SPIN | MTX_TOPHALF:
335	    {
336		int i = 0;
337
338		if (m->mtx_lock == (uintptr_t)p) {
339			m->mtx_recurse++;
340			return;
341		}
342		CTR1(KTR_LOCK, "mtx_enter: %p spinning", m);
343		for (;;) {
344			if (_obtain_lock(m, p))
345				break;
346			while (m->mtx_lock != MTX_UNOWNED) {
347				if (i++ < 1000000)
348					continue;
349				if (i++ < 6000000)
350					DELAY (1);
351#ifdef DDB
352				else if (!db_active)
353#else
354				else
355#endif
356					panic(
357				"spin lock %s held by 0x%p for > 5 seconds",
358					    m->mtx_description,
359					    (void *)m->mtx_lock);
360			}
361		}
362
363#ifdef MUTEX_DEBUG
364		if (type != MTX_SPIN)
365			m->mtx_saveintr = 0xbeefface;
366		else
367#endif
368			m->mtx_saveintr = saveintr;
369		CTR1(KTR_LOCK, "mtx_enter: 0x%p spin done", m);
370		return;
371	    }
372	}
373}
374
375void
376mtx_exit_hard(struct mtx *m, int type)
377{
378	struct proc *p, *p1;
379	struct mtx *m1;
380	int pri;
381
382	p = CURPROC;
383	switch (type) {
384	case MTX_DEF:
385	case MTX_DEF | MTX_NOSWITCH:
386		if (m->mtx_recurse != 0) {
387			if (--(m->mtx_recurse) == 0)
388				atomic_clear_ptr(&m->mtx_lock, MTX_RECURSE);
389			CTR1(KTR_LOCK, "mtx_exit: 0x%p unrecurse", m);
390			return;
391		}
392		mtx_enter(&sched_lock, MTX_SPIN);
393		CTR1(KTR_LOCK, "mtx_exit: 0x%p contested", m);
394		p1 = TAILQ_FIRST(&m->mtx_blocked);
395		MPASS(p->p_magic == P_MAGIC);
396		MPASS(p1->p_magic == P_MAGIC);
397		TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq);
398		if (TAILQ_EMPTY(&m->mtx_blocked)) {
399			LIST_REMOVE(m, mtx_contested);
400			_release_lock_quick(m);
401			CTR1(KTR_LOCK, "mtx_exit: 0x%p not held", m);
402		} else
403			m->mtx_lock = MTX_CONTESTED;
404		pri = MAXPRI;
405		LIST_FOREACH(m1, &p->p_contested, mtx_contested) {
406			int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority;
407			if (cp < pri)
408				pri = cp;
409		}
410		if (pri > p->p_nativepri)
411			pri = p->p_nativepri;
412		SET_PRIO(p, pri);
413		CTR2(KTR_LOCK, "mtx_exit: 0x%p contested setrunqueue 0x%p",
414		    m, p1);
415		p1->p_blocked = NULL;
416		p1->p_stat = SRUN;
417		setrunqueue(p1);
418		if ((type & MTX_NOSWITCH) == 0 && p1->p_priority < pri) {
419#ifdef notyet
420			if (p->p_flag & (P_ITHD | P_SITHD)) {
421				ithd_t *it = (ithd_t *)p;
422
423				if (it->it_interrupted) {
424					CTR2(KTR_LOCK,
425					    "mtx_exit: 0x%x interruped 0x%x",
426					    it, it->it_interrupted);
427					intr_thd_fixup(it);
428				}
429			}
430#endif
431			setrunqueue(p);
432			CTR2(KTR_LOCK, "mtx_exit: 0x%p switching out lock=0x%p",
433			    m, (void *)m->mtx_lock);
434			mi_switch();
435			CTR2(KTR_LOCK, "mtx_exit: 0x%p resuming lock=0x%p",
436			    m, (void *)m->mtx_lock);
437		}
438		mtx_exit(&sched_lock, MTX_SPIN);
439		break;
440	case MTX_SPIN:
441	case MTX_SPIN | MTX_FIRST:
442		if (m->mtx_recurse != 0) {
443			m->mtx_recurse--;
444			return;
445		}
446		MPASS(mtx_owned(m));
447		_release_lock_quick(m);
448		if (type & MTX_FIRST)
449			enable_intr();	/* XXX is this kosher? */
450		else {
451			MPASS(m->mtx_saveintr != 0xbeefface);
452			restore_intr(m->mtx_saveintr);
453		}
454		break;
455	case MTX_SPIN | MTX_TOPHALF:
456		if (m->mtx_recurse != 0) {
457			m->mtx_recurse--;
458			return;
459		}
460		MPASS(mtx_owned(m));
461		_release_lock_quick(m);
462		break;
463	default:
464		panic("mtx_exit_hard: unsupported type 0x%x\n", type);
465	}
466}
467
468#define MV_DESTROY	0	/* validate before destory */
469#define MV_INIT		1	/* validate before init */
470
471#ifdef MUTEX_DEBUG
472
473int mtx_validate __P((struct mtx *, int));
474
475int
476mtx_validate(struct mtx *m, int when)
477{
478	struct mtx *mp;
479	int i;
480	int retval = 0;
481
482	if (m == &all_mtx || cold)
483		return 0;
484
485	mtx_enter(&all_mtx, MTX_DEF);
486/*
487 * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
488 * we can re-enable the kernacc() checks.
489 */
490#ifndef __alpha__
491	MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t),
492	    VM_PROT_READ) == 1);
493#endif
494	MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx);
495	for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) {
496#ifndef __alpha__
497		if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t),
498		    VM_PROT_READ) != 1) {
499			panic("mtx_validate: mp=%p mp->mtx_next=%p",
500			    mp, mp->mtx_next);
501		}
502#endif
503		i++;
504		if (i > mtx_cur_cnt) {
505			panic("mtx_validate: too many in chain, known=%d\n",
506			    mtx_cur_cnt);
507		}
508	}
509	MPASS(i == mtx_cur_cnt);
510	switch (when) {
511	case MV_DESTROY:
512		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
513			if (mp == m)
514				break;
515		MPASS(mp == m);
516		break;
517	case MV_INIT:
518		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
519		if (mp == m) {
520			/*
521			 * Not good. This mutex already exists.
522			 */
523			printf("re-initing existing mutex %s\n",
524			    m->mtx_description);
525			MPASS(m->mtx_lock == MTX_UNOWNED);
526			retval = 1;
527		}
528	}
529	mtx_exit(&all_mtx, MTX_DEF);
530	return (retval);
531}
532#endif
533
534void
535mtx_init(struct mtx *m, const char *t, int flag)
536{
537#ifdef MUTEX_DEBUG
538	struct mtx_debug *debug;
539#endif
540
541	CTR2(KTR_LOCK, "mtx_init 0x%p (%s)", m, t);
542#ifdef MUTEX_DEBUG
543	if (mtx_validate(m, MV_INIT))	/* diagnostic and error correction */
544		return;
545	if (flag & MTX_COLD)
546		debug = m->mtx_debug;
547	else
548		debug = NULL;
549	if (debug == NULL) {
550#ifdef DIAGNOSTIC
551		if(cold && bootverbose)
552			printf("malloc'ing mtx_debug while cold for %s\n", t);
553#endif
554
555		/* XXX - should not use DEVBUF */
556		debug = malloc(sizeof(struct mtx_debug), M_DEVBUF, M_NOWAIT);
557		MPASS(debug != NULL);
558		bzero(debug, sizeof(struct mtx_debug));
559	}
560#endif
561	bzero((void *)m, sizeof *m);
562	TAILQ_INIT(&m->mtx_blocked);
563#ifdef MUTEX_DEBUG
564	m->mtx_debug = debug;
565#endif
566	m->mtx_description = t;
567	m->mtx_lock = MTX_UNOWNED;
568	/* Put on all mutex queue */
569	mtx_enter(&all_mtx, MTX_DEF);
570	m->mtx_next = &all_mtx;
571	m->mtx_prev = all_mtx.mtx_prev;
572	m->mtx_prev->mtx_next = m;
573	all_mtx.mtx_prev = m;
574	if (++mtx_cur_cnt > mtx_max_cnt)
575		mtx_max_cnt = mtx_cur_cnt;
576	mtx_exit(&all_mtx, MTX_DEF);
577	witness_init(m, flag);
578}
579
580void
581mtx_destroy(struct mtx *m)
582{
583
584	CTR2(KTR_LOCK, "mtx_destroy 0x%p (%s)", m, m->mtx_description);
585#ifdef MUTEX_DEBUG
586	if (m->mtx_next == NULL)
587		panic("mtx_destroy: %p (%s) already destroyed",
588		    m, m->mtx_description);
589
590	if (!mtx_owned(m)) {
591		MPASS(m->mtx_lock == MTX_UNOWNED);
592	} else {
593		MPASS((m->mtx_lock & (MTX_RECURSE|MTX_CONTESTED)) == 0);
594	}
595	mtx_validate(m, MV_DESTROY);		/* diagnostic */
596#endif
597
598#ifdef WITNESS
599	if (m->mtx_witness)
600		witness_destroy(m);
601#endif /* WITNESS */
602
603	/* Remove from the all mutex queue */
604	mtx_enter(&all_mtx, MTX_DEF);
605	m->mtx_next->mtx_prev = m->mtx_prev;
606	m->mtx_prev->mtx_next = m->mtx_next;
607#ifdef MUTEX_DEBUG
608	m->mtx_next = m->mtx_prev = NULL;
609	free(m->mtx_debug, M_DEVBUF);
610	m->mtx_debug = NULL;
611#endif
612	mtx_cur_cnt--;
613	mtx_exit(&all_mtx, MTX_DEF);
614}
615
616/*
617 * The non-inlined versions of the mtx_*() functions are always built (above),
618 * but the witness code depends on the MUTEX_DEBUG and WITNESS kernel options
619 * being specified.
620 */
621#if (defined(MUTEX_DEBUG) && defined(WITNESS))
622
623#define WITNESS_COUNT 200
624#define	WITNESS_NCHILDREN 2
625
626int witness_watch = 1;
627
628struct witness {
629	struct witness	*w_next;
630	const char	*w_description;
631	const char	*w_file;
632	int		 w_line;
633	struct witness	*w_morechildren;
634	u_char		 w_childcnt;
635	u_char		 w_Giant_squawked:1;
636	u_char		 w_other_squawked:1;
637	u_char		 w_same_squawked:1;
638	u_char		 w_sleep:1;
639	u_char		 w_spin:1;	/* this is a spin mutex */
640	u_int		 w_level;
641	struct witness	*w_children[WITNESS_NCHILDREN];
642};
643
644struct witness_blessed {
645	char 	*b_lock1;
646	char	*b_lock2;
647};
648
649#ifdef DDB
650/*
651 * When DDB is enabled and witness_ddb is set to 1, it will cause the system to
652 * drop into kdebug() when:
653 *	- a lock heirarchy violation occurs
654 *	- locks are held when going to sleep.
655 */
656#ifdef WITNESS_DDB
657int	witness_ddb = 1;
658#else
659int	witness_ddb = 0;
660#endif
661SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, "");
662#endif /* DDB */
663
664#ifdef WITNESS_SKIPSPIN
665int	witness_skipspin = 1;
666#else
667int	witness_skipspin = 0;
668#endif
669SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0,
670    "");
671
672MUTEX_DECLARE(static,w_mtx);
673static struct witness	*w_free;
674static struct witness	*w_all;
675static int		 w_inited;
676static int		 witness_dead;	/* fatal error, probably no memory */
677
678static struct witness	 w_data[WITNESS_COUNT];
679
680static struct witness	 *enroll __P((const char *description, int flag));
681static int itismychild __P((struct witness *parent, struct witness *child));
682static void removechild __P((struct witness *parent, struct witness *child));
683static int isitmychild __P((struct witness *parent, struct witness *child));
684static int isitmydescendant __P((struct witness *parent, struct witness *child));
685static int dup_ok __P((struct witness *));
686static int blessed __P((struct witness *, struct witness *));
687static void witness_displaydescendants
688    __P((void(*)(const char *fmt, ...), struct witness *));
689static void witness_leveldescendents __P((struct witness *parent, int level));
690static void witness_levelall __P((void));
691static struct witness * witness_get __P((void));
692static void witness_free __P((struct witness *m));
693
694
695static char *ignore_list[] = {
696	"witness lock",
697	NULL
698};
699
700static char *spin_order_list[] = {
701	"sched lock",
702	"sio",
703#ifdef __i386__
704	"clk",
705#endif
706	"callout",
707	/*
708	 * leaf locks
709	 */
710	NULL
711};
712
713static char *order_list[] = {
714	NULL
715};
716
717static char *dup_list[] = {
718	NULL
719};
720
721static char *sleep_list[] = {
722	"Giant",
723	NULL
724};
725
726/*
727 * Pairs of locks which have been blessed
728 * Don't complain about order problems with blessed locks
729 */
730static struct witness_blessed blessed_list[] = {
731};
732static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed);
733
734void
735witness_init(struct mtx *m, int flag)
736{
737	m->mtx_witness = enroll(m->mtx_description, flag);
738}
739
740void
741witness_destroy(struct mtx *m)
742{
743	struct mtx *m1;
744	struct proc *p;
745	p = CURPROC;
746	for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL;
747		m1 = LIST_NEXT(m1, mtx_held)) {
748		if (m1 == m) {
749			LIST_REMOVE(m, mtx_held);
750			break;
751		}
752	}
753	return;
754
755}
756
757void
758witness_enter(struct mtx *m, int flags, const char *file, int line)
759{
760	struct witness *w, *w1;
761	struct mtx *m1;
762	struct proc *p;
763	int i;
764#ifdef DDB
765	int go_into_ddb = 0;
766#endif /* DDB */
767
768	w = m->mtx_witness;
769	p = CURPROC;
770
771	if (flags & MTX_SPIN) {
772		if (!w->w_spin)
773			panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @"
774			    " %s:%d", m->mtx_description, file, line);
775		if (m->mtx_recurse != 0)
776			return;
777		mtx_enter(&w_mtx, MTX_SPIN);
778		i = witness_spin_check;
779		if (i != 0 && w->w_level < i) {
780			mtx_exit(&w_mtx, MTX_SPIN);
781			panic("mutex_enter(%s:%x, MTX_SPIN) out of order @"
782			    " %s:%d already holding %s:%x",
783			    m->mtx_description, w->w_level, file, line,
784			    spin_order_list[ffs(i)-1], i);
785		}
786		PCPU_SET(witness_spin_check, i | w->w_level);
787		mtx_exit(&w_mtx, MTX_SPIN);
788		return;
789	}
790	if (w->w_spin)
791		panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
792		    m->mtx_description, file, line);
793
794	if (m->mtx_recurse != 0)
795		return;
796	if (witness_dead)
797		goto out;
798	if (cold || panicstr)
799		goto out;
800
801	if (!mtx_legal2block())
802		panic("blockable mtx_enter() of %s when not legal @ %s:%d",
803			    m->mtx_description, file, line);
804	/*
805	 * Is this the first mutex acquired
806	 */
807	if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL)
808		goto out;
809
810	if ((w1 = m1->mtx_witness) == w) {
811		if (w->w_same_squawked || dup_ok(w))
812			goto out;
813		w->w_same_squawked = 1;
814		printf("acquring duplicate lock of same type: \"%s\"\n",
815			m->mtx_description);
816		printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
817		printf(" 2nd @ %s:%d\n", file, line);
818#ifdef DDB
819		go_into_ddb = 1;
820#endif /* DDB */
821		goto out;
822	}
823	MPASS(!mtx_owned(&w_mtx));
824	mtx_enter(&w_mtx, MTX_SPIN);
825	/*
826	 * If we have a known higher number just say ok
827	 */
828	if (witness_watch > 1 && w->w_level > w1->w_level) {
829		mtx_exit(&w_mtx, MTX_SPIN);
830		goto out;
831	}
832	if (isitmydescendant(m1->mtx_witness, w)) {
833		mtx_exit(&w_mtx, MTX_SPIN);
834		goto out;
835	}
836	for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) {
837
838		MPASS(i < 200);
839		w1 = m1->mtx_witness;
840		if (isitmydescendant(w, w1)) {
841			mtx_exit(&w_mtx, MTX_SPIN);
842			if (blessed(w, w1))
843				goto out;
844			if (m1 == &Giant) {
845				if (w1->w_Giant_squawked)
846					goto out;
847				else
848					w1->w_Giant_squawked = 1;
849			} else {
850				if (w1->w_other_squawked)
851					goto out;
852				else
853					w1->w_other_squawked = 1;
854			}
855			printf("lock order reversal\n");
856			printf(" 1st %s last acquired @ %s:%d\n",
857			    w->w_description, w->w_file, w->w_line);
858			printf(" 2nd %p %s @ %s:%d\n",
859			    m1, w1->w_description, w1->w_file, w1->w_line);
860			printf(" 3rd %p %s @ %s:%d\n",
861			    m, w->w_description, file, line);
862#ifdef DDB
863			go_into_ddb = 1;
864#endif /* DDB */
865			goto out;
866		}
867	}
868	m1 = LIST_FIRST(&p->p_heldmtx);
869	if (!itismychild(m1->mtx_witness, w))
870		mtx_exit(&w_mtx, MTX_SPIN);
871
872out:
873#ifdef DDB
874	if (witness_ddb && go_into_ddb)
875		Debugger("witness_enter");
876#endif /* DDB */
877	w->w_file = file;
878	w->w_line = line;
879	m->mtx_line = line;
880	m->mtx_file = file;
881
882	/*
883	 * If this pays off it likely means that a mutex being witnessed
884	 * is acquired in hardclock. Put it in the ignore list. It is
885	 * likely not the mutex this assert fails on.
886	 */
887	MPASS(m->mtx_held.le_prev == NULL);
888	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
889}
890
891void
892witness_exit(struct mtx *m, int flags, const char *file, int line)
893{
894	struct witness *w;
895
896	w = m->mtx_witness;
897
898	if (flags & MTX_SPIN) {
899		if (!w->w_spin)
900			panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @"
901			    " %s:%d", m->mtx_description, file, line);
902		if (m->mtx_recurse != 0)
903			return;
904		mtx_enter(&w_mtx, MTX_SPIN);
905		PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level);
906		mtx_exit(&w_mtx, MTX_SPIN);
907		return;
908	}
909	if (w->w_spin)
910		panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
911		    m->mtx_description, file, line);
912
913	if (m->mtx_recurse != 0)
914		return;
915
916	if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold)
917		panic("switchable mtx_exit() of %s when not legal @ %s:%d",
918			    m->mtx_description, file, line);
919	LIST_REMOVE(m, mtx_held);
920	m->mtx_held.le_prev = NULL;
921}
922
923void
924witness_try_enter(struct mtx *m, int flags, const char *file, int line)
925{
926	struct proc *p;
927	struct witness *w = m->mtx_witness;
928
929	if (flags & MTX_SPIN) {
930		if (!w->w_spin)
931			panic("mutex_try_enter: "
932			    "MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
933			    m->mtx_description, file, line);
934		if (m->mtx_recurse != 0)
935			return;
936		mtx_enter(&w_mtx, MTX_SPIN);
937		PCPU_SET(witness_spin_check, witness_spin_check | w->w_level);
938		mtx_exit(&w_mtx, MTX_SPIN);
939		return;
940	}
941
942	if (w->w_spin)
943		panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
944		    m->mtx_description, file, line);
945
946	if (m->mtx_recurse != 0)
947		return;
948
949	w->w_file = file;
950	w->w_line = line;
951	m->mtx_line = line;
952	m->mtx_file = file;
953	p = CURPROC;
954	MPASS(m->mtx_held.le_prev == NULL);
955	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
956}
957
958void
959witness_display(void(*prnt)(const char *fmt, ...))
960{
961	struct witness *w, *w1;
962
963	witness_levelall();
964
965	for (w = w_all; w; w = w->w_next) {
966		if (w->w_file == NULL)
967			continue;
968		for (w1 = w_all; w1; w1 = w1->w_next) {
969			if (isitmychild(w1, w))
970				break;
971		}
972		if (w1 != NULL)
973			continue;
974		/*
975		 * This lock has no anscestors, display its descendants.
976		 */
977		witness_displaydescendants(prnt, w);
978	}
979	prnt("\nMutex which were never acquired\n");
980	for (w = w_all; w; w = w->w_next) {
981		if (w->w_file != NULL)
982			continue;
983		prnt("%s\n", w->w_description);
984	}
985}
986
987int
988witness_sleep(int check_only, struct mtx *mtx, const char *file, int line)
989{
990	struct mtx *m;
991	struct proc *p;
992	char **sleep;
993	int n = 0;
994
995	p = CURPROC;
996	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
997	    m = LIST_NEXT(m, mtx_held)) {
998		if (m == mtx)
999			continue;
1000		for (sleep = sleep_list; *sleep!= NULL; sleep++)
1001			if (strcmp(m->mtx_description, *sleep) == 0)
1002				goto next;
1003		printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
1004			file, line, check_only ? "could sleep" : "sleeping",
1005			m->mtx_description,
1006			m->mtx_witness->w_file, m->mtx_witness->w_line);
1007		n++;
1008	next:
1009	}
1010#ifdef DDB
1011	if (witness_ddb && n)
1012		Debugger("witness_sleep");
1013#endif /* DDB */
1014	return (n);
1015}
1016
1017static struct witness *
1018enroll(const char *description, int flag)
1019{
1020	int i;
1021	struct witness *w, *w1;
1022	char **ignore;
1023	char **order;
1024
1025	if (!witness_watch)
1026		return (NULL);
1027	for (ignore = ignore_list; *ignore != NULL; ignore++)
1028		if (strcmp(description, *ignore) == 0)
1029			return (NULL);
1030
1031	if (w_inited == 0) {
1032		mtx_init(&w_mtx, "witness lock", MTX_COLD | MTX_DEF);
1033		for (i = 0; i < WITNESS_COUNT; i++) {
1034			w = &w_data[i];
1035			witness_free(w);
1036		}
1037		w_inited = 1;
1038		for (order = order_list; *order != NULL; order++) {
1039			w = enroll(*order, MTX_DEF);
1040			w->w_file = "order list";
1041			for (order++; *order != NULL; order++) {
1042				w1 = enroll(*order, MTX_DEF);
1043				w1->w_file = "order list";
1044				itismychild(w, w1);
1045				w = w1;
1046    	    	    	}
1047		}
1048	}
1049	if ((flag & MTX_SPIN) && witness_skipspin)
1050		return (NULL);
1051	mtx_enter(&w_mtx, MTX_SPIN);
1052	for (w = w_all; w; w = w->w_next) {
1053		if (strcmp(description, w->w_description) == 0) {
1054			mtx_exit(&w_mtx, MTX_SPIN);
1055			return (w);
1056		}
1057	}
1058	if ((w = witness_get()) == NULL)
1059		return (NULL);
1060	w->w_next = w_all;
1061	w_all = w;
1062	w->w_description = description;
1063	mtx_exit(&w_mtx, MTX_SPIN);
1064	if (flag & MTX_SPIN) {
1065		w->w_spin = 1;
1066
1067		i = 1;
1068		for (order = spin_order_list; *order != NULL; order++) {
1069			if (strcmp(description, *order) == 0)
1070				break;
1071			i <<= 1;
1072		}
1073		if (*order == NULL)
1074			panic("spin lock %s not in order list", description);
1075		w->w_level = i;
1076	}
1077	return (w);
1078}
1079
1080static int
1081itismychild(struct witness *parent, struct witness *child)
1082{
1083	static int recursed;
1084
1085	/*
1086	 * Insert "child" after "parent"
1087	 */
1088	while (parent->w_morechildren)
1089		parent = parent->w_morechildren;
1090
1091	if (parent->w_childcnt == WITNESS_NCHILDREN) {
1092		if ((parent->w_morechildren = witness_get()) == NULL)
1093			return (1);
1094		parent = parent->w_morechildren;
1095	}
1096	MPASS(child != NULL);
1097	parent->w_children[parent->w_childcnt++] = child;
1098	/*
1099	 * now prune whole tree
1100	 */
1101	if (recursed)
1102		return (0);
1103	recursed = 1;
1104	for (child = w_all; child != NULL; child = child->w_next) {
1105		for (parent = w_all; parent != NULL;
1106		    parent = parent->w_next) {
1107			if (!isitmychild(parent, child))
1108				continue;
1109			removechild(parent, child);
1110			if (isitmydescendant(parent, child))
1111				continue;
1112			itismychild(parent, child);
1113		}
1114	}
1115	recursed = 0;
1116	witness_levelall();
1117	return (0);
1118}
1119
1120static void
1121removechild(struct witness *parent, struct witness *child)
1122{
1123	struct witness *w, *w1;
1124	int i;
1125
1126	for (w = parent; w != NULL; w = w->w_morechildren)
1127		for (i = 0; i < w->w_childcnt; i++)
1128			if (w->w_children[i] == child)
1129				goto found;
1130	return;
1131found:
1132	for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren)
1133		continue;
1134	w->w_children[i] = w1->w_children[--w1->w_childcnt];
1135	MPASS(w->w_children[i] != NULL);
1136
1137	if (w1->w_childcnt != 0)
1138		return;
1139
1140	if (w1 == parent)
1141		return;
1142	for (w = parent; w->w_morechildren != w1; w = w->w_morechildren)
1143		continue;
1144	w->w_morechildren = 0;
1145	witness_free(w1);
1146}
1147
1148static int
1149isitmychild(struct witness *parent, struct witness *child)
1150{
1151	struct witness *w;
1152	int i;
1153
1154	for (w = parent; w != NULL; w = w->w_morechildren) {
1155		for (i = 0; i < w->w_childcnt; i++) {
1156			if (w->w_children[i] == child)
1157				return (1);
1158		}
1159	}
1160	return (0);
1161}
1162
1163static int
1164isitmydescendant(struct witness *parent, struct witness *child)
1165{
1166	struct witness *w;
1167	int i;
1168	int j;
1169
1170	for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) {
1171		MPASS(j < 1000);
1172		for (i = 0; i < w->w_childcnt; i++) {
1173			if (w->w_children[i] == child)
1174				return (1);
1175		}
1176		for (i = 0; i < w->w_childcnt; i++) {
1177			if (isitmydescendant(w->w_children[i], child))
1178				return (1);
1179		}
1180	}
1181	return (0);
1182}
1183
1184void
1185witness_levelall (void)
1186{
1187	struct witness *w, *w1;
1188
1189	for (w = w_all; w; w = w->w_next)
1190		if (!w->w_spin)
1191			w->w_level = 0;
1192	for (w = w_all; w; w = w->w_next) {
1193		if (w->w_spin)
1194			continue;
1195		for (w1 = w_all; w1; w1 = w1->w_next) {
1196			if (isitmychild(w1, w))
1197				break;
1198		}
1199		if (w1 != NULL)
1200			continue;
1201		witness_leveldescendents(w, 0);
1202	}
1203}
1204
1205static void
1206witness_leveldescendents(struct witness *parent, int level)
1207{
1208	int i;
1209	struct witness *w;
1210
1211	if (parent->w_level < level)
1212		parent->w_level = level;
1213	level++;
1214	for (w = parent; w != NULL; w = w->w_morechildren)
1215		for (i = 0; i < w->w_childcnt; i++)
1216			witness_leveldescendents(w->w_children[i], level);
1217}
1218
1219static void
1220witness_displaydescendants(void(*prnt)(const char *fmt, ...),
1221			   struct witness *parent)
1222{
1223	struct witness *w;
1224	int i;
1225	int level = parent->w_level;
1226
1227	prnt("%d", level);
1228	if (level < 10)
1229		prnt(" ");
1230	for (i = 0; i < level; i++)
1231		prnt(" ");
1232	prnt("%s", parent->w_description);
1233	if (parent->w_file != NULL) {
1234		prnt(" -- last acquired @ %s", parent->w_file);
1235#ifndef W_USE_WHERE
1236		prnt(":%d", parent->w_line);
1237#endif
1238		prnt("\n");
1239	}
1240
1241	for (w = parent; w != NULL; w = w->w_morechildren)
1242		for (i = 0; i < w->w_childcnt; i++)
1243			    witness_displaydescendants(prnt, w->w_children[i]);
1244    }
1245
1246static int
1247dup_ok(struct witness *w)
1248{
1249	char **dup;
1250
1251	for (dup = dup_list; *dup!= NULL; dup++)
1252		if (strcmp(w->w_description, *dup) == 0)
1253			return (1);
1254	return (0);
1255}
1256
1257static int
1258blessed(struct witness *w1, struct witness *w2)
1259{
1260	int i;
1261	struct witness_blessed *b;
1262
1263	for (i = 0; i < blessed_count; i++) {
1264		b = &blessed_list[i];
1265		if (strcmp(w1->w_description, b->b_lock1) == 0) {
1266			if (strcmp(w2->w_description, b->b_lock2) == 0)
1267				return (1);
1268			continue;
1269		}
1270		if (strcmp(w1->w_description, b->b_lock2) == 0)
1271			if (strcmp(w2->w_description, b->b_lock1) == 0)
1272				return (1);
1273	}
1274	return (0);
1275}
1276
1277static struct witness *
1278witness_get()
1279{
1280	struct witness *w;
1281
1282	if ((w = w_free) == NULL) {
1283		witness_dead = 1;
1284		mtx_exit(&w_mtx, MTX_SPIN);
1285		printf("witness exhausted\n");
1286		return (NULL);
1287	}
1288	w_free = w->w_next;
1289	bzero(w, sizeof(*w));
1290	return (w);
1291}
1292
1293static void
1294witness_free(struct witness *w)
1295{
1296	w->w_next = w_free;
1297	w_free = w;
1298}
1299
1300void
1301witness_list(struct proc *p)
1302{
1303	struct mtx *m;
1304
1305	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
1306	    m = LIST_NEXT(m, mtx_held)) {
1307		printf("\t\"%s\" (%p) locked at %s:%d\n",
1308		    m->mtx_description, m,
1309		    m->mtx_witness->w_file, m->mtx_witness->w_line);
1310	}
1311}
1312
1313void
1314witness_save(struct mtx *m, const char **filep, int *linep)
1315{
1316	*filep = m->mtx_witness->w_file;
1317	*linep = m->mtx_witness->w_line;
1318}
1319
1320void
1321witness_restore(struct mtx *m, const char *file, int line)
1322{
1323	m->mtx_witness->w_file = file;
1324	m->mtx_witness->w_line = line;
1325}
1326
1327#endif	/* (defined(MUTEX_DEBUG) && defined(WITNESS)) */
1328