subr_witness.c revision 68785
1/*-
2 * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 * 3. Berkeley Software Design Inc's name may not be used to endorse or
13 *    promote products derived from this software without specific prior
14 *    written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
29 *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
30 * $FreeBSD: head/sys/kern/subr_witness.c 68785 2000-11-15 22:00:31Z jhb $
31 */
32
33/*
34 *	Main Entry: witness
35 *	Pronunciation: 'wit-n&s
36 *	Function: noun
37 *	Etymology: Middle English witnesse, from Old English witnes knowledge,
38 *	    testimony, witness, from 2wit
39 *	Date: before 12th century
40 *	1 : attestation of a fact or event : TESTIMONY
41 *	2 : one that gives evidence; specifically : one who testifies in
42 *	    a cause or before a judicial tribunal
43 *	3 : one asked to be present at a transaction so as to be able to
44 *	    testify to its having taken place
45 *	4 : one who has personal knowledge of something
46 *	5 a : something serving as evidence or proof : SIGN
47 *	  b : public affirmation by word or example of usually
48 *	      religious faith or conviction <the heroic witness to divine
49 *	      life -- Pilot>
50 *	6 capitalized : a member of the Jehovah's Witnesses
51 */
52
53#include "opt_witness.h"
54
55#include <sys/param.h>
56#include <sys/bus.h>
57#include <sys/kernel.h>
58#include <sys/malloc.h>
59#include <sys/proc.h>
60#include <sys/sysctl.h>
61#include <sys/systm.h>
62#include <sys/vmmeter.h>
63#include <sys/ktr.h>
64
65#include <machine/atomic.h>
66#include <machine/bus.h>
67#include <machine/clock.h>
68#include <machine/cpu.h>
69
70#include <vm/vm.h>
71#include <vm/vm_extern.h>
72
73#define _KERN_MUTEX_C_		/* Cause non-inlined mtx_*() to be compiled. */
74#include <sys/mutex.h>
75
76/*
77 * Machine independent bits of the mutex implementation
78 */
79/* All mutexes in system (used for debug/panic) */
/*
 * Statically initialised head of the list of all mutexes.  mtx_init() and
 * mtx_destroy() link and unlink mutexes around this head through mtx_next
 * and mtx_prev, and mtx_validate() walks the list until it arrives back at
 * &all_mtx.  With MUTEX_DEBUG the description string is placed in a separate
 * mtx_debug record; otherwise it sits directly in the mutex initialiser.
 * NOTE(review): the initialisers are positional, so they presumably have to
 * track the field order of struct mtx in <sys/mutex.h> -- confirm whenever
 * that structure changes.
 */
80#ifdef MUTEX_DEBUG
81static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0,
82	"All mutexes queue head" };
83static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, &all_mtx_debug,
84	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
85	{ NULL, NULL }, &all_mtx, &all_mtx };
86#else	/* MUTEX_DEBUG */
87static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, "All mutexes queue head",
88	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
89	{ NULL, NULL }, &all_mtx, &all_mtx };
90#endif	/* MUTEX_DEBUG */
91
/* Number of mutexes currently linked on the all_mtx list. */
92static int	mtx_cur_cnt;
/* Largest value mtx_cur_cnt has reached (updated in mtx_init()). */
93static int	mtx_max_cnt;
94
/* Out-of-line Giant wrappers; see the XXX note ahead of their definitions. */
95void	_mtx_enter_giant_def(void);
96void	_mtx_exit_giant_def(void);
/* Marked __unused because its only call site is compiled out with #if 0. */
97static void propagate_priority(struct proc *) __unused;
98
/* True when the lock word of mutex (m) is exactly MTX_UNOWNED. */
#define	mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
/*
 * The lock word of (m) masked with MTX_FLAGMASK and viewed as a
 * struct proc pointer, or NULL when the mutex is unowned.
 */
#define	mtx_owner(m)	(mtx_unowned(m) ? NULL \
			    : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK))

/*
 * The uintptr_t-sized word stored immediately below the address of (x).
 * NOTE(review): presumably the caller's return address when (x) is the first
 * stack-passed argument -- this depends on the ABI; confirm per platform.
 * The expansion is fully parenthesised so that it can be used safely inside
 * larger expressions.
 */
#define	RETIP(x)		(*(((uintptr_t *)&(x)) - 1))
/* Assign (pri) to the p_priority field of (p); yields the assigned value. */
#define	SET_PRIO(p, pri)	((p)->p_priority = (pri))
105
106/*
107 * XXX Temporary, for use from assembly language
108 */
109
110void
111_mtx_enter_giant_def(void)
112{
113
114	mtx_enter(&Giant, MTX_DEF);
115}
116
117void
118_mtx_exit_giant_def(void)
119{
120
121	mtx_exit(&Giant, MTX_DEF);
122}
123
124static void
125propagate_priority(struct proc *p)
126{
127	int pri = p->p_priority;
128	struct mtx *m = p->p_blocked;
129
130	for (;;) {
131		struct proc *p1;
132
133		p = mtx_owner(m);
134
135		if (p == NULL) {
136			/*
137			 * This really isn't quite right. Really
138			 * ought to bump priority of process that
139			 * next acquires the mutex.
140			 */
141			MPASS(m->mtx_lock == MTX_CONTESTED);
142			return;
143		}
144		MPASS(p->p_magic == P_MAGIC);
145		if (p->p_priority <= pri)
146			return;
147		/*
148		 * If lock holder is actually running, just bump priority.
149		 */
150		if (TAILQ_NEXT(p, p_procq) == NULL) {
151			MPASS(p->p_stat == SRUN || p->p_stat == SZOMB);
152			SET_PRIO(p, pri);
153			return;
154		}
155		/*
156		 * If on run queue move to new run queue, and
157		 * quit.
158		 */
159		if (p->p_stat == SRUN) {
160			MPASS(p->p_blocked == NULL);
161			remrunqueue(p);
162			SET_PRIO(p, pri);
163			setrunqueue(p);
164			return;
165		}
166
167		/*
168		 * If we aren't blocked on a mutex, give up and quit.
169		 */
170		if (p->p_stat != SMTX) {
171			printf(
172	"XXX: process %d(%s):%d holds %s but isn't blocked on a mutex\n",
173			    p->p_pid, p->p_comm, p->p_stat, m->mtx_description);
174			return;
175		}
176
177		/*
178		 * Pick up the mutex that p is blocked on.
179		 */
180		m = p->p_blocked;
181		MPASS(m != NULL);
182
183		printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid,
184		    p->p_comm, m->mtx_description);
185		/*
186		 * Check if the proc needs to be moved up on
187		 * the blocked chain
188		 */
189		if ((p1 = TAILQ_PREV(p, rq, p_procq)) == NULL ||
190		    p1->p_priority <= pri) {
191			if (p1)
192				printf(
193	"XXX: previous process %d(%s) has higher priority\n",
194				    p->p_pid, p->p_comm);
195			else
196				printf("XXX: process at head of run queue\n");
197			continue;
198		}
199
200		/*
201		 * Remove proc from blocked chain
202		 */
203		TAILQ_REMOVE(&m->mtx_blocked, p, p_procq);
204		TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) {
205			MPASS(p1->p_magic == P_MAGIC);
206			if (p1->p_priority > pri)
207				break;
208		}
209		if (p1)
210			TAILQ_INSERT_BEFORE(p1, p, p_procq);
211		else
212			TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
213		CTR4(KTR_LOCK,
214		    "propagate priority: p 0x%p moved before 0x%p on [0x%p] %s",
215		    p, p1, m, m->mtx_description);
216	}
217}
218
/*
 * Slow path of mutex acquisition for the current process (CURPROC).
 *
 * m		mutex to acquire
 * type		MTX_DEF, or MTX_SPIN optionally combined with MTX_FIRST or
 *		MTX_TOPHALF; selects the blocking or the spinning path
 * saveintr	value stored in m->mtx_saveintr once a spin mutex is obtained
 *
 * MTX_DEF: if the caller already owns m the recursion count is bumped and
 * MTX_RECURSE is set.  Otherwise the function loops on _obtain_lock(); on
 * each failure it takes sched_lock, marks the mutex contested, queues the
 * caller on m->mtx_blocked ordered by p_priority, sets p_stat to SMTX and
 * calls cpu_switch().  If the lock word is found to be exactly MTX_CONTESTED
 * the caller takes ownership directly (keeping the contested bit) and adopts
 * the first waiter's priority when that is numerically lower.
 *
 * Spin types: recursion just increments mtx_recurse; otherwise the caller
 * busy-waits for the lock word to become MTX_UNOWNED.
 *
 * NOTE(review): the switch has no default case, so any other `type' value
 * returns without acquiring anything, whereas mtx_exit_hard() panics on an
 * unsupported type -- confirm this asymmetry is intended.
 */
219void
220mtx_enter_hard(struct mtx *m, int type, int saveintr)
221{
222	struct proc *p = CURPROC;
223	struct timeval new_switchtime;
224
225	KASSERT(p != NULL, ("curproc is NULL in mutex"));
226
227	switch (type) {
228	case MTX_DEF:
229		if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) {
230			m->mtx_recurse++;
231			atomic_set_ptr(&m->mtx_lock, MTX_RECURSE);
232			CTR1(KTR_LOCK, "mtx_enter: 0x%p recurse", m);
233			return;
234		}
235		CTR3(KTR_LOCK, "mtx_enter: 0x%p contested (lock=%p) [0x%p]",
236		    m, (void *)m->mtx_lock, (void *)RETIP(m));
237		while (!_obtain_lock(m, p)) {
238			uintptr_t v;
239			struct proc *p1;
240
241			mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY);
242			/*
243			 * check if the lock has been released while
244			 * waiting for the schedlock.
245			 */
246			if ((v = m->mtx_lock) == MTX_UNOWNED) {
247				mtx_exit(&sched_lock, MTX_SPIN);
248				continue;
249			}
250			/*
251			 * The mutex was marked contested on release. This
252			 * means that there are processes blocked on it.
253			 */
254			if (v == MTX_CONTESTED) {
255				p1 = TAILQ_FIRST(&m->mtx_blocked);
256				KASSERT(p1 != NULL, ("contested mutex has no contesters"));
257				KASSERT(p != NULL, ("curproc is NULL for contested mutex"));
258				m->mtx_lock = (uintptr_t)p | MTX_CONTESTED;
259				if (p1->p_priority < p->p_priority) {
260					SET_PRIO(p, p1->p_priority);
261				}
262				mtx_exit(&sched_lock, MTX_SPIN);
263				return;
264			}
265			/*
266			 * If the mutex isn't already contested and
267			 * a failure occurs setting the contested bit the
268			 * mutex was either released or the
269			 * state of the RECURSION bit changed.
270			 */
271			if ((v & MTX_CONTESTED) == 0 &&
272			    !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
273				               (void *)(v | MTX_CONTESTED))) {
274				mtx_exit(&sched_lock, MTX_SPIN);
275				continue;
276			}
277
278			/* We definitely have to sleep for this lock */
279			mtx_assert(m, MA_NOTOWNED);
280
281#ifdef notyet
282			/*
283			 * If we're borrowing an interrupted thread's VM
284			 * context must clean up before going to sleep.
285			 */
286			if (p->p_flag & (P_ITHD | P_SITHD)) {
287				ithd_t *it = (ithd_t *)p;
288
289				if (it->it_interrupted) {
290					CTR2(KTR_LOCK,
291					    "mtx_enter: 0x%x interrupted 0x%x",
292					    it, it->it_interrupted);
293					intr_thd_fixup(it);
294				}
295			}
296#endif
297
298			/* Put us on the list of procs blocked on this mutex */
299			if (TAILQ_EMPTY(&m->mtx_blocked)) {
				/*
				 * First waiter: also hang the mutex on the
				 * owner's list of contested mutexes.
				 */
300				p1 = (struct proc *)(m->mtx_lock &
301						     MTX_FLAGMASK);
302				LIST_INSERT_HEAD(&p1->p_contested, m,
303						 mtx_contested);
304				TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
305			} else {
				/*
				 * Insert ahead of the first waiter with a
				 * numerically larger p_priority.
				 */
306				TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq)
307					if (p1->p_priority > p->p_priority)
308						break;
309				if (p1)
310					TAILQ_INSERT_BEFORE(p1, p, p_procq);
311				else
312					TAILQ_INSERT_TAIL(&m->mtx_blocked, p,
313							  p_procq);
314			}
315
316			p->p_blocked = m;	/* Who we're blocked on */
317			p->p_stat = SMTX;
318#if 0
319			propagate_priority(p);
320#endif
321			CTR3(KTR_LOCK, "mtx_enter: p 0x%p blocked on [0x%p] %s",
322			    p, m, m->mtx_description);
323			/*
324			 * Blatantly copied from mi_switch nearly verbatim.
325			 * When Giant goes away and we stop dinking with it
326			 * in mi_switch, we can go back to calling mi_switch
327			 * directly here.
328			 */
329
330			/*
331			 * Compute the amount of time during which the current
332			 * process was running, and add that to its total so
333			 * far.
334			 */
335			microuptime(&new_switchtime);
336			if (timevalcmp(&new_switchtime, &switchtime, <)) {
337				printf(
338		    "microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
339		    		    switchtime.tv_sec, switchtime.tv_usec,
340		    		    new_switchtime.tv_sec,
341		    		    new_switchtime.tv_usec);
342				new_switchtime = switchtime;
343			} else {
344				p->p_runtime += (new_switchtime.tv_usec -
345				    switchtime.tv_usec) +
346				    (new_switchtime.tv_sec - switchtime.tv_sec) *
347				    (int64_t)1000000;
348			}
349
350			/*
351			 * Pick a new current process and record its start time.
352			 */
353			cnt.v_swtch++;
354			switchtime = new_switchtime;
355			cpu_switch();
356			if (switchtime.tv_sec == 0)
357				microuptime(&switchtime);
358			switchticks = ticks;
359			CTR3(KTR_LOCK,
360			    "mtx_enter: p 0x%p free from blocked on [0x%p] %s",
361			    p, m, m->mtx_description);
362			mtx_exit(&sched_lock, MTX_SPIN);
363		}
364		return;
365	case MTX_SPIN:
366	case MTX_SPIN | MTX_FIRST:
367	case MTX_SPIN | MTX_TOPHALF:
368	    {
369		int i = 0;
370
371		if (m->mtx_lock == (uintptr_t)p) {
372			m->mtx_recurse++;
373			return;
374		}
375		CTR1(KTR_LOCK, "mtx_enter: %p spinning", m);
376		for (;;) {
377			if (_obtain_lock(m, p))
378				break;
			/*
			 * Poll the lock word.  The first million polls spin
			 * flat out; after that each poll calls DELAY(1), and
			 * once the counter reaches 6000000 the kernel
			 * panics (with DDB, only when db_active is clear).
			 * Note that i is incremented twice per poll once the
			 * first test starts failing.
			 */
379			while (m->mtx_lock != MTX_UNOWNED) {
380				if (i++ < 1000000)
381					continue;
382				if (i++ < 6000000)
383					DELAY (1);
384#ifdef DDB
385				else if (!db_active)
386#else
387				else
388#endif
389					panic(
390				"spin lock %s held by 0x%p for > 5 seconds",
391					    m->mtx_description,
392					    (void *)m->mtx_lock);
393			}
394		}
395
		/*
		 * With MUTEX_DEBUG, variants other than plain MTX_SPIN store
		 * the 0xbeefface marker instead of saveintr;
		 * mtx_exit_hard() asserts against that marker before
		 * restoring interrupts.
		 */
396#ifdef MUTEX_DEBUG
397		if (type != MTX_SPIN)
398			m->mtx_saveintr = 0xbeefface;
399		else
400#endif
401			m->mtx_saveintr = saveintr;
402		CTR1(KTR_LOCK, "mtx_enter: 0x%p spin done", m);
403		return;
404	    }
405	}
406}
407
/*
 * Slow path of mutex release for the current process (CURPROC).
 *
 * m	mutex being released
 * type	MTX_DEF (optionally with MTX_NOSWITCH), MTX_SPIN (optionally with
 *	MTX_FIRST) or MTX_SPIN | MTX_TOPHALF; any other value panics
 *
 * MTX_DEF: a recursed mutex just has its count decremented (MTX_RECURSE is
 * cleared when the count reaches zero).  Otherwise, under sched_lock, the
 * first blocked process is dequeued; if it was the last waiter the mutex
 * leaves the owner's contested list and is released, else the lock word
 * becomes MTX_CONTESTED.  The caller's priority is then recomputed from the
 * first waiters of the mutexes still on p->p_contested, bounded by
 * p_nativepri, and the dequeued process is made runnable.  Unless
 * MTX_NOSWITCH was given, the caller yields through mi_switch() when the
 * woken process has a numerically lower p_priority.
 *
 * Spin types: recursion is unwound first; otherwise the lock is released
 * and, except for MTX_TOPHALF, interrupts are re-enabled (MTX_FIRST) or
 * restored from m->mtx_saveintr.
 */
408void
409mtx_exit_hard(struct mtx *m, int type)
410{
411	struct proc *p, *p1;
412	struct mtx *m1;
413	int pri;
414
415	p = CURPROC;
416	switch (type) {
417	case MTX_DEF:
418	case MTX_DEF | MTX_NOSWITCH:
419		if (m->mtx_recurse != 0) {
420			if (--(m->mtx_recurse) == 0)
421				atomic_clear_ptr(&m->mtx_lock, MTX_RECURSE);
422			CTR1(KTR_LOCK, "mtx_exit: 0x%p unrecurse", m);
423			return;
424		}
425		mtx_enter(&sched_lock, MTX_SPIN);
426		CTR1(KTR_LOCK, "mtx_exit: 0x%p contested", m);
		/*
		 * NOTE(review): p1 is dereferenced without a NULL check, so
		 * this path presumably is only reached while at least one
		 * process is queued on m->mtx_blocked -- confirm at callers.
		 */
427		p1 = TAILQ_FIRST(&m->mtx_blocked);
428		MPASS(p->p_magic == P_MAGIC);
429		MPASS(p1->p_magic == P_MAGIC);
430		TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq);
431		if (TAILQ_EMPTY(&m->mtx_blocked)) {
432			LIST_REMOVE(m, mtx_contested);
433			_release_lock_quick(m);
434			CTR1(KTR_LOCK, "mtx_exit: 0x%p not held", m);
435		} else
436			m->mtx_lock = MTX_CONTESTED;
		/*
		 * New priority: the smallest p_priority among the first
		 * waiters of every mutex still contested against us, but
		 * never numerically above our native priority.
		 */
437		pri = MAXPRI;
438		LIST_FOREACH(m1, &p->p_contested, mtx_contested) {
439			int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority;
440			if (cp < pri)
441				pri = cp;
442		}
443		if (pri > p->p_nativepri)
444			pri = p->p_nativepri;
445		SET_PRIO(p, pri);
446		CTR2(KTR_LOCK, "mtx_exit: 0x%p contested setrunqueue 0x%p",
447		    m, p1);
448		p1->p_blocked = NULL;
449		p1->p_stat = SRUN;
450		setrunqueue(p1);
451		if ((type & MTX_NOSWITCH) == 0 && p1->p_priority < pri) {
452#ifdef notyet
453			if (p->p_flag & (P_ITHD | P_SITHD)) {
454				ithd_t *it = (ithd_t *)p;
455
456				if (it->it_interrupted) {
457					CTR2(KTR_LOCK,
458					    "mtx_exit: 0x%x interruped 0x%x",
459					    it, it->it_interrupted);
460					intr_thd_fixup(it);
461				}
462			}
463#endif
464			setrunqueue(p);
465			CTR2(KTR_LOCK, "mtx_exit: 0x%p switching out lock=0x%p",
466			    m, (void *)m->mtx_lock);
467			mi_switch();
468			CTR2(KTR_LOCK, "mtx_exit: 0x%p resuming lock=0x%p",
469			    m, (void *)m->mtx_lock);
470		}
471		mtx_exit(&sched_lock, MTX_SPIN);
472		break;
473	case MTX_SPIN:
474	case MTX_SPIN | MTX_FIRST:
475		if (m->mtx_recurse != 0) {
476			m->mtx_recurse--;
477			return;
478		}
479		MPASS(mtx_owned(m));
480		_release_lock_quick(m);
481		if (type & MTX_FIRST)
482			enable_intr();	/* XXX is this kosher? */
483		else {
			/* 0xbeefface is the marker mtx_enter_hard() stores
			 * (under MUTEX_DEBUG) for non-plain spin types. */
484			MPASS(m->mtx_saveintr != 0xbeefface);
485			restore_intr(m->mtx_saveintr);
486		}
487		break;
488	case MTX_SPIN | MTX_TOPHALF:
489		if (m->mtx_recurse != 0) {
490			m->mtx_recurse--;
491			return;
492		}
493		MPASS(mtx_owned(m));
494		_release_lock_quick(m);
495		break;
496	default:
497		panic("mtx_exit_hard: unsupported type 0x%x\n", type);
498	}
499}
500
/* Values for the `when' argument of mtx_validate(). */
501#define MV_DESTROY	0	/* validate before destroy */
502#define MV_INIT		1	/* validate before init */
503
#ifdef MUTEX_DEBUG

int mtx_validate __P((struct mtx *, int));

/*
 * Consistency check of the all-mutex list, run before a mutex is
 * initialised (MV_INIT) or destroyed (MV_DESTROY).
 *
 * The whole list is walked once to verify its links and that its length
 * equals mtx_cur_cnt.  For MV_DESTROY the mutex must be on the list; for
 * MV_INIT it must not be -- finding it there is reported and makes the
 * function return 1 so that the caller can skip the re-initialisation.
 * Nothing is checked (and 0 is returned) for all_mtx itself or while the
 * system is still cold.
 */
int
mtx_validate(struct mtx *m, int when)
{
	struct mtx *mp;
	int i;
	int already_known = 0;

	if (m == &all_mtx || cold)
		return 0;

	mtx_enter(&all_mtx, MTX_DEF);
/*
 * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
 * we can re-enable the kernacc() checks.
 */
#ifndef __alpha__
	MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t),
	    VM_PROT_READ) == 1);
#endif
	MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx);

	/* Pass 1: every link must be readable and the count must match. */
	i = 0;
	mp = all_mtx.mtx_next;
	while (mp != &all_mtx) {
#ifndef __alpha__
		if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t),
		    VM_PROT_READ) != 1) {
			panic("mtx_validate: mp=%p mp->mtx_next=%p",
			    mp, mp->mtx_next);
		}
#endif
		i++;
		if (i > mtx_cur_cnt) {
			panic("mtx_validate: too many in chain, known=%d\n",
			    mtx_cur_cnt);
		}
		mp = mp->mtx_next;
	}
	MPASS(i == mtx_cur_cnt);

	/* Pass 2: membership test appropriate to the operation. */
	if (when == MV_DESTROY) {
		mp = all_mtx.mtx_next;
		while (mp != &all_mtx && mp != m)
			mp = mp->mtx_next;
		MPASS(mp == m);
	} else if (when == MV_INIT) {
		for (mp = all_mtx.mtx_next; mp != &all_mtx;
		    mp = mp->mtx_next) {
			if (mp != m)
				continue;
			/*
			 * Not good. This mutex already exists.
			 */
			printf("re-initing existing mutex %s\n",
			    m->mtx_description);
			MPASS(m->mtx_lock == MTX_UNOWNED);
			already_known = 1;
		}
	}
	mtx_exit(&all_mtx, MTX_DEF);
	return (already_known);
}
#endif
566
567void
568mtx_init(struct mtx *m, const char *t, int flag)
569{
570#ifdef MUTEX_DEBUG
571	struct mtx_debug *debug;
572#endif
573
574	CTR2(KTR_LOCK, "mtx_init 0x%p (%s)", m, t);
575#ifdef MUTEX_DEBUG
576	if (mtx_validate(m, MV_INIT))	/* diagnostic and error correction */
577		return;
578	if (flag & MTX_COLD)
579		debug = m->mtx_debug;
580	else
581		debug = NULL;
582	if (debug == NULL) {
583#ifdef DIAGNOSTIC
584		if(cold && bootverbose)
585			printf("malloc'ing mtx_debug while cold for %s\n", t);
586#endif
587
588		/* XXX - should not use DEVBUF */
589		debug = malloc(sizeof(struct mtx_debug), M_DEVBUF, M_NOWAIT);
590		MPASS(debug != NULL);
591		bzero(debug, sizeof(struct mtx_debug));
592	}
593#endif
594	bzero((void *)m, sizeof *m);
595	TAILQ_INIT(&m->mtx_blocked);
596#ifdef MUTEX_DEBUG
597	m->mtx_debug = debug;
598#endif
599	m->mtx_description = t;
600	m->mtx_lock = MTX_UNOWNED;
601	/* Put on all mutex queue */
602	mtx_enter(&all_mtx, MTX_DEF);
603	m->mtx_next = &all_mtx;
604	m->mtx_prev = all_mtx.mtx_prev;
605	m->mtx_prev->mtx_next = m;
606	all_mtx.mtx_prev = m;
607	if (++mtx_cur_cnt > mtx_max_cnt)
608		mtx_max_cnt = mtx_cur_cnt;
609	mtx_exit(&all_mtx, MTX_DEF);
610	witness_init(m, flag);
611}
612
613void
614mtx_destroy(struct mtx *m)
615{
616
617	CTR2(KTR_LOCK, "mtx_destroy 0x%p (%s)", m, m->mtx_description);
618#ifdef MUTEX_DEBUG
619	if (m->mtx_next == NULL)
620		panic("mtx_destroy: %p (%s) already destroyed",
621		    m, m->mtx_description);
622
623	if (!mtx_owned(m)) {
624		MPASS(m->mtx_lock == MTX_UNOWNED);
625	} else {
626		MPASS((m->mtx_lock & (MTX_RECURSE|MTX_CONTESTED)) == 0);
627	}
628	mtx_validate(m, MV_DESTROY);		/* diagnostic */
629#endif
630
631#ifdef WITNESS
632	if (m->mtx_witness)
633		witness_destroy(m);
634#endif /* WITNESS */
635
636	/* Remove from the all mutex queue */
637	mtx_enter(&all_mtx, MTX_DEF);
638	m->mtx_next->mtx_prev = m->mtx_prev;
639	m->mtx_prev->mtx_next = m->mtx_next;
640#ifdef MUTEX_DEBUG
641	m->mtx_next = m->mtx_prev = NULL;
642	free(m->mtx_debug, M_DEVBUF);
643	m->mtx_debug = NULL;
644#endif
645	mtx_cur_cnt--;
646	mtx_exit(&all_mtx, MTX_DEF);
647}
648
649/*
650 * The non-inlined versions of the mtx_*() functions are always built (above),
651 * but the witness code depends on the MUTEX_DEBUG and WITNESS kernel options
652 * being specified.
653 */
654#if (defined(MUTEX_DEBUG) && defined(WITNESS))
655
/* Number of entries in the static w_data[] pool of witness structures. */
656#define WITNESS_COUNT 200
/* Number of child slots held directly in each struct witness. */
657#define	WITNESS_NCHILDREN 2
658
659#ifndef SMP
660extern int witness_spin_check;
661#endif
662
/*
 * Master switch: enroll() returns NULL while this is zero, and values
 * greater than 1 additionally let witness_enter() accept an acquisition
 * early when the new witness has a higher w_level than the held one.
 */
663int witness_watch = 1;
664
/*
 * One node of the lock-order graph.  enroll() hands out one witness per
 * distinct mutex description; itismychild() records ordering edges in
 * w_children.
 */
665struct witness {
666	struct witness	*w_next;	/* next entry on the w_all list */
667	const char	*w_description;	/* description this node stands for */
668	const char	*w_file;	/* file of the last recorded acquire */
669	int		 w_line;	/* line of the last recorded acquire */
670	struct witness	*w_morechildren; /* overflow node for more children */
671	u_char		 w_childcnt;	/* used slots in w_children[] */
672	u_char		 w_Giant_squawked:1; /* reversal vs. Giant reported */
673	u_char		 w_other_squawked:1; /* other reversal reported */
674	u_char		 w_same_squawked:1; /* duplicate acquire reported */
675	u_char		 w_sleep:1;
676	u_char		 w_spin:1;	/* this is a spin mutex */
677	u_int		 w_level;	/* spin: bit from spin_order_list */
678	struct witness	*w_children[WITNESS_NCHILDREN];
679};
680
/* A pair of lock names; see the comment on blessed_list. */
681struct witness_blessed {
682	char 	*b_lock1;
683	char	*b_lock2;
684};
685
686#ifdef DDB
687/*
 * When DDB is enabled and witness_ddb is set to 1, it will cause the system to
 * drop into kdebug() when:
 *	- a lock hierarchy violation occurs
 *	- locks are held when going to sleep.
 * The default is 1 when the kernel is built with WITNESS_DDB, 0 otherwise;
 * the value is exported read-write as the debug.witness_ddb sysctl.
 */
693#ifdef WITNESS_DDB
694int	witness_ddb = 1;
695#else
696int	witness_ddb = 0;
697#endif
698SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, "");
699#endif /* DDB */
700
/*
 * When non-zero, enroll() returns NULL for spin mutexes so they are not
 * tracked.  Set at build time through WITNESS_SKIPSPIN and exported
 * read-only as the debug.witness_skipspin sysctl.
 */
701#ifdef WITNESS_SKIPSPIN
702int	witness_skipspin = 1;
703#else
704int	witness_skipspin = 0;
705#endif
706SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0,
707    "");
708
/* Spin mutex taken around accesses to the witness lists and graph. */
709MUTEX_DECLARE(static,w_mtx);
710static struct witness	*w_free;
/* Head of the list of enrolled witnesses, linked through w_next. */
711static struct witness	*w_all;
/* Set to 1 by enroll() once its one-time initialisation has run. */
712static int		 w_inited;
713static int		 witness_dead;	/* fatal error, probably no memory */
714
/* Static pool; enroll() passes every element to witness_free() at init. */
715static struct witness	 w_data[WITNESS_COUNT];
716
/* Forward declarations of the file-local witness helpers. */
717static struct witness	 *enroll __P((const char *description, int flag));
718static int itismychild __P((struct witness *parent, struct witness *child));
719static void removechild __P((struct witness *parent, struct witness *child));
720static int isitmychild __P((struct witness *parent, struct witness *child));
721static int isitmydescendant __P((struct witness *parent, struct witness *child));
722static int dup_ok __P((struct witness *));
723static int blessed __P((struct witness *, struct witness *));
724static void witness_displaydescendants
725    __P((void(*)(const char *fmt, ...), struct witness *));
726static void witness_leveldescendents __P((struct witness *parent, int level));
727static void witness_levelall __P((void));
728static struct witness * witness_get __P((void));
729static void witness_free __P((struct witness *m));
730
731
/* Mutex descriptions for which enroll() returns NULL (never tracked). */
732static char *ignore_list[] = {
733	"witness lock",
734	NULL
735};
736
/*
 * Known spin mutexes.  enroll() gives a spin witness the level
 * (1 << index in this list) and panics for a spin mutex that is not listed.
 */
737static char *spin_order_list[] = {
738	"sched lock",
739	"clk",
740	"sio",
741	/*
742	 * leaf locks
743	 */
744	NULL
745};
746
/*
 * Descriptions enrolled up front by enroll() on its first call; within a
 * NULL-terminated run each entry is made a child of the one before it.
 * NOTE(review): the enroll() loop steps past the NULL that ends a run, so a
 * non-empty list presumably needs an extra trailing NULL -- confirm before
 * adding entries.
 */
747static char *order_list[] = {
748	NULL
749};
750
/* NOTE(review): presumably consulted by dup_ok() (not shown) -- confirm. */
751static char *dup_list[] = {
752	NULL
753};
754
/* Mutexes that witness_sleep() does not report as held across a sleep. */
755static char *sleep_list[] = {
756	"Giant lock",
757	NULL
758};
759
760/*
761 * Pairs of locks which have been blessed
762 * Don't complain about order problems with blessed locks
763 */
764static struct witness_blessed blessed_list[] = {
765};
766static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed);
767
768void
769witness_init(struct mtx *m, int flag)
770{
771	m->mtx_witness = enroll(m->mtx_description, flag);
772}
773
774void
775witness_destroy(struct mtx *m)
776{
777	struct mtx *m1;
778	struct proc *p;
779	p = CURPROC;
780	for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL;
781		m1 = LIST_NEXT(m1, mtx_held)) {
782		if (m1 == m) {
783			LIST_REMOVE(m, mtx_held);
784			break;
785		}
786	}
787	return;
788
789}
790
/*
 * Record that the current process is acquiring mutex m at file:line and
 * check the acquisition against the known lock order.
 *
 * Spin mutexes (MTX_SPIN in flags): panics if m's witness is not a spin
 * witness; recursive acquires are ignored.  Otherwise, under w_mtx, the
 * witness level is compared with witness_spin_check; a non-zero check value
 * greater than the level is an out-of-order acquire and panics.  The level
 * is then OR-ed into witness_spin_check.
 *
 * Other mutexes: panics if the witness is a spin witness or if blocking is
 * not legal; recursive acquires are ignored.  Order checking is skipped
 * when witness is dead, the system is cold or panicking, or no mutex is
 * currently held.  Acquiring a second mutex with the same witness is
 * reported once per witness unless dup_ok() allows it.  Otherwise, if the
 * new witness is not already a descendant of the most recently acquired
 * one, every held mutex is checked: if its witness is a descendant of the
 * new witness, a "lock order reversal" is reported (once per witness,
 * unless the pair is blessed).  If no reversal is found, the new witness is
 * recorded as a child of the most recently acquired mutex's witness.
 *
 * In every non-spin, non-recursive case the acquire site is stored in the
 * witness and the mutex, and m is pushed on the process's held list.
 */
791void
792witness_enter(struct mtx *m, int flags, const char *file, int line)
793{
794	struct witness *w, *w1;
795	struct mtx *m1;
796	struct proc *p;
797	int i;
798#ifdef DDB
799	int go_into_ddb = 0;
800#endif /* DDB */
801
802	w = m->mtx_witness;
803	p = CURPROC;
804
805	if (flags & MTX_SPIN) {
806		if (!w->w_spin)
807			panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @"
808			    " %s:%d", m->mtx_description, file, line);
809		if (m->mtx_recurse != 0)
810			return;
811		mtx_enter(&w_mtx, MTX_SPIN);
812		i = witness_spin_check;
813		if (i != 0 && w->w_level < i) {
814			mtx_exit(&w_mtx, MTX_SPIN);
815			panic("mutex_enter(%s:%x, MTX_SPIN) out of order @"
816			    " %s:%d already holding %s:%x",
817			    m->mtx_description, w->w_level, file, line,
818			    spin_order_list[ffs(i)-1], i);
819		}
820		PCPU_SET(witness_spin_check, i | w->w_level);
821		mtx_exit(&w_mtx, MTX_SPIN);
822		return;
823	}
824	if (w->w_spin)
825		panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
826		    m->mtx_description, file, line);
827
828	if (m->mtx_recurse != 0)
829		return;
830	if (witness_dead)
831		goto out;
832	if (cold || panicstr)
833		goto out;
834
835	if (!mtx_legal2block())
836		panic("blockable mtx_enter() of %s when not legal @ %s:%d",
837			    m->mtx_description, file, line);
838	/*
839	 * Is this the first mutex acquired
840	 */
841	if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL)
842		goto out;
843
844	if ((w1 = m1->mtx_witness) == w) {
845		if (w->w_same_squawked || dup_ok(w))
846			goto out;
847		w->w_same_squawked = 1;
848		printf("acquring duplicate lock of same type: \"%s\"\n",
849			m->mtx_description);
850		printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
851		printf(" 2nd @ %s:%d\n", file, line);
852#ifdef DDB
853		go_into_ddb = 1;
854#endif /* DDB */
855		goto out;
856	}
857	MPASS(!mtx_owned(&w_mtx));
858	mtx_enter(&w_mtx, MTX_SPIN);
859	/*
860	 * If we have a known higher number just say ok
861	 */
862	if (witness_watch > 1 && w->w_level > w1->w_level) {
863		mtx_exit(&w_mtx, MTX_SPIN);
864		goto out;
865	}
866	if (isitmydescendant(m1->mtx_witness, w)) {
867		mtx_exit(&w_mtx, MTX_SPIN);
868		goto out;
869	}
	/*
	 * Walk every held mutex, newest first, looking for one whose witness
	 * is already ordered after the one being acquired.
	 */
870	for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) {
871
872		MPASS(i < 200);
873		w1 = m1->mtx_witness;
874		if (isitmydescendant(w, w1)) {
875			mtx_exit(&w_mtx, MTX_SPIN);
876			if (blessed(w, w1))
877				goto out;
878			if (m1 == &Giant) {
879				if (w1->w_Giant_squawked)
880					goto out;
881				else
882					w1->w_Giant_squawked = 1;
883			} else {
884				if (w1->w_other_squawked)
885					goto out;
886				else
887					w1->w_other_squawked = 1;
888			}
889			printf("lock order reversal\n");
890			printf(" 1st %s last acquired @ %s:%d\n",
891			    w->w_description, w->w_file, w->w_line);
892			printf(" 2nd %p %s @ %s:%d\n",
893			    m1, w1->w_description, w1->w_file, w1->w_line);
894			printf(" 3rd %p %s @ %s:%d\n",
895			    m, w->w_description, file, line);
896#ifdef DDB
897			go_into_ddb = 1;
898#endif /* DDB */
899			goto out;
900		}
901	}
	/*
	 * NOTE(review): w_mtx is only released here when itismychild()
	 * returns 0.  Its non-zero return comes from a failed witness_get();
	 * presumably witness_get() drops w_mtx itself on that path -- confirm.
	 */
902	m1 = LIST_FIRST(&p->p_heldmtx);
903	if (!itismychild(m1->mtx_witness, w))
904		mtx_exit(&w_mtx, MTX_SPIN);
905
906out:
907#ifdef DDB
908	if (witness_ddb && go_into_ddb)
909		Debugger("witness_enter");
910#endif /* DDB */
911	w->w_file = file;
912	w->w_line = line;
913	m->mtx_line = line;
914	m->mtx_file = file;
915
916	/*
917	 * If this pays off it likely means that a mutex being witnessed
918	 * is acquired in hardclock. Put it in the ignore list. It is
919	 * likely not the mutex this assert fails on.
920	 */
921	MPASS(m->mtx_held.le_prev == NULL);
922	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
923}
924
925void
926witness_exit(struct mtx *m, int flags, const char *file, int line)
927{
928	struct witness *w;
929
930	w = m->mtx_witness;
931
932	if (flags & MTX_SPIN) {
933		if (!w->w_spin)
934			panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @"
935			    " %s:%d", m->mtx_description, file, line);
936		if (m->mtx_recurse != 0)
937			return;
938		mtx_enter(&w_mtx, MTX_SPIN);
939		PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level);
940		mtx_exit(&w_mtx, MTX_SPIN);
941		return;
942	}
943	if (w->w_spin)
944		panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
945		    m->mtx_description, file, line);
946
947	if (m->mtx_recurse != 0)
948		return;
949
950	if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold)
951		panic("switchable mtx_exit() of %s when not legal @ %s:%d",
952			    m->mtx_description, file, line);
953	LIST_REMOVE(m, mtx_held);
954	m->mtx_held.le_prev = NULL;
955}
956
957void
958witness_try_enter(struct mtx *m, int flags, const char *file, int line)
959{
960	struct proc *p;
961	struct witness *w = m->mtx_witness;
962
963	if (flags & MTX_SPIN) {
964		if (!w->w_spin)
965			panic("mutex_try_enter: "
966			    "MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
967			    m->mtx_description, file, line);
968		if (m->mtx_recurse != 0)
969			return;
970		mtx_enter(&w_mtx, MTX_SPIN);
971		PCPU_SET(witness_spin_check, witness_spin_check | w->w_level);
972		mtx_exit(&w_mtx, MTX_SPIN);
973		return;
974	}
975
976	if (w->w_spin)
977		panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
978		    m->mtx_description, file, line);
979
980	if (m->mtx_recurse != 0)
981		return;
982
983	w->w_file = file;
984	w->w_line = line;
985	m->mtx_line = line;
986	m->mtx_file = file;
987	p = CURPROC;
988	MPASS(m->mtx_held.le_prev == NULL);
989	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
990}
991
992void
993witness_display(void(*prnt)(const char *fmt, ...))
994{
995	struct witness *w, *w1;
996
997	witness_levelall();
998
999	for (w = w_all; w; w = w->w_next) {
1000		if (w->w_file == NULL)
1001			continue;
1002		for (w1 = w_all; w1; w1 = w1->w_next) {
1003			if (isitmychild(w1, w))
1004				break;
1005		}
1006		if (w1 != NULL)
1007			continue;
1008		/*
1009		 * This lock has no anscestors, display its descendants.
1010		 */
1011		witness_displaydescendants(prnt, w);
1012	}
1013	prnt("\nMutex which were never acquired\n");
1014	for (w = w_all; w; w = w->w_next) {
1015		if (w->w_file != NULL)
1016			continue;
1017		prnt("%s\n", w->w_description);
1018	}
1019}
1020
1021int
1022witness_sleep(int check_only, struct mtx *mtx, const char *file, int line)
1023{
1024	struct mtx *m;
1025	struct proc *p;
1026	char **sleep;
1027	int n = 0;
1028
1029	p = CURPROC;
1030	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
1031	    m = LIST_NEXT(m, mtx_held)) {
1032		if (m == mtx)
1033			continue;
1034		for (sleep = sleep_list; *sleep!= NULL; sleep++)
1035			if (strcmp(m->mtx_description, *sleep) == 0)
1036				goto next;
1037		printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
1038			file, line, check_only ? "could sleep" : "sleeping",
1039			m->mtx_description,
1040			m->mtx_witness->w_file, m->mtx_witness->w_line);
1041		n++;
1042	next:
1043	}
1044#ifdef DDB
1045	if (witness_ddb && n)
1046		Debugger("witness_sleep");
1047#endif /* DDB */
1048	return (n);
1049}
1050
1051static struct witness *
1052enroll(const char *description, int flag)
1053{
1054	int i;
1055	struct witness *w, *w1;
1056	char **ignore;
1057	char **order;
1058
1059	if (!witness_watch)
1060		return (NULL);
1061	for (ignore = ignore_list; *ignore != NULL; ignore++)
1062		if (strcmp(description, *ignore) == 0)
1063			return (NULL);
1064
1065	if (w_inited == 0) {
1066		mtx_init(&w_mtx, "witness lock", MTX_COLD | MTX_DEF);
1067		for (i = 0; i < WITNESS_COUNT; i++) {
1068			w = &w_data[i];
1069			witness_free(w);
1070		}
1071		w_inited = 1;
1072		for (order = order_list; *order != NULL; order++) {
1073			w = enroll(*order, MTX_DEF);
1074			w->w_file = "order list";
1075			for (order++; *order != NULL; order++) {
1076				w1 = enroll(*order, MTX_DEF);
1077				w1->w_file = "order list";
1078				itismychild(w, w1);
1079				w = w1;
1080    	    	    	}
1081		}
1082	}
1083	if ((flag & MTX_SPIN) && witness_skipspin)
1084		return (NULL);
1085	mtx_enter(&w_mtx, MTX_SPIN);
1086	for (w = w_all; w; w = w->w_next) {
1087		if (strcmp(description, w->w_description) == 0) {
1088			mtx_exit(&w_mtx, MTX_SPIN);
1089			return (w);
1090		}
1091	}
1092	if ((w = witness_get()) == NULL)
1093		return (NULL);
1094	w->w_next = w_all;
1095	w_all = w;
1096	w->w_description = description;
1097	mtx_exit(&w_mtx, MTX_SPIN);
1098	if (flag & MTX_SPIN) {
1099		w->w_spin = 1;
1100
1101		i = 1;
1102		for (order = spin_order_list; *order != NULL; order++) {
1103			if (strcmp(description, *order) == 0)
1104				break;
1105			i <<= 1;
1106		}
1107		if (*order == NULL)
1108			panic("spin lock %s not in order list", description);
1109		w->w_level = i;
1110	}
1111	return (w);
1112}
1113
1114static int
1115itismychild(struct witness *parent, struct witness *child)
1116{
1117	static int recursed;
1118
1119	/*
1120	 * Insert "child" after "parent"
1121	 */
1122	while (parent->w_morechildren)
1123		parent = parent->w_morechildren;
1124
1125	if (parent->w_childcnt == WITNESS_NCHILDREN) {
1126		if ((parent->w_morechildren = witness_get()) == NULL)
1127			return (1);
1128		parent = parent->w_morechildren;
1129	}
1130	MPASS(child != NULL);
1131	parent->w_children[parent->w_childcnt++] = child;
1132	/*
1133	 * now prune whole tree
1134	 */
1135	if (recursed)
1136		return (0);
1137	recursed = 1;
1138	for (child = w_all; child != NULL; child = child->w_next) {
1139		for (parent = w_all; parent != NULL;
1140		    parent = parent->w_next) {
1141			if (!isitmychild(parent, child))
1142				continue;
1143			removechild(parent, child);
1144			if (isitmydescendant(parent, child))
1145				continue;
1146			itismychild(parent, child);
1147		}
1148	}
1149	recursed = 0;
1150	witness_levelall();
1151	return (0);
1152}
1153
1154static void
1155removechild(struct witness *parent, struct witness *child)
1156{
1157	struct witness *w, *w1;
1158	int i;
1159
1160	for (w = parent; w != NULL; w = w->w_morechildren)
1161		for (i = 0; i < w->w_childcnt; i++)
1162			if (w->w_children[i] == child)
1163				goto found;
1164	return;
1165found:
1166	for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren)
1167		continue;
1168	w->w_children[i] = w1->w_children[--w1->w_childcnt];
1169	MPASS(w->w_children[i] != NULL);
1170
1171	if (w1->w_childcnt != 0)
1172		return;
1173
1174	if (w1 == parent)
1175		return;
1176	for (w = parent; w->w_morechildren != w1; w = w->w_morechildren)
1177		continue;
1178	w->w_morechildren = 0;
1179	witness_free(w1);
1180}
1181
1182static int
1183isitmychild(struct witness *parent, struct witness *child)
1184{
1185	struct witness *w;
1186	int i;
1187
1188	for (w = parent; w != NULL; w = w->w_morechildren) {
1189		for (i = 0; i < w->w_childcnt; i++) {
1190			if (w->w_children[i] == child)
1191				return (1);
1192		}
1193	}
1194	return (0);
1195}
1196
1197static int
1198isitmydescendant(struct witness *parent, struct witness *child)
1199{
1200	struct witness *w;
1201	int i;
1202	int j;
1203
1204	for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) {
1205		MPASS(j < 1000);
1206		for (i = 0; i < w->w_childcnt; i++) {
1207			if (w->w_children[i] == child)
1208				return (1);
1209		}
1210		for (i = 0; i < w->w_childcnt; i++) {
1211			if (isitmydescendant(w->w_children[i], child))
1212				return (1);
1213		}
1214	}
1215	return (0);
1216}
1217
1218void
1219witness_levelall (void)
1220{
1221	struct witness *w, *w1;
1222
1223	for (w = w_all; w; w = w->w_next)
1224		if (!w->w_spin)
1225			w->w_level = 0;
1226	for (w = w_all; w; w = w->w_next) {
1227		if (w->w_spin)
1228			continue;
1229		for (w1 = w_all; w1; w1 = w1->w_next) {
1230			if (isitmychild(w1, w))
1231				break;
1232		}
1233		if (w1 != NULL)
1234			continue;
1235		witness_leveldescendents(w, 0);
1236	}
1237}
1238
1239static void
1240witness_leveldescendents(struct witness *parent, int level)
1241{
1242	int i;
1243	struct witness *w;
1244
1245	if (parent->w_level < level)
1246		parent->w_level = level;
1247	level++;
1248	for (w = parent; w != NULL; w = w->w_morechildren)
1249		for (i = 0; i < w->w_childcnt; i++)
1250			witness_leveldescendents(w->w_children[i], level);
1251}
1252
1253static void
1254witness_displaydescendants(void(*prnt)(const char *fmt, ...),
1255			   struct witness *parent)
1256{
1257	struct witness *w;
1258	int i;
1259	int level = parent->w_level;
1260
1261	prnt("%d", level);
1262	if (level < 10)
1263		prnt(" ");
1264	for (i = 0; i < level; i++)
1265		prnt(" ");
1266	prnt("%s", parent->w_description);
1267	if (parent->w_file != NULL) {
1268		prnt(" -- last acquired @ %s", parent->w_file);
1269#ifndef W_USE_WHERE
1270		prnt(":%d", parent->w_line);
1271#endif
1272		prnt("\n");
1273	}
1274
1275	for (w = parent; w != NULL; w = w->w_morechildren)
1276		for (i = 0; i < w->w_childcnt; i++)
1277			    witness_displaydescendants(prnt, w->w_children[i]);
1278    }
1279
1280static int
1281dup_ok(struct witness *w)
1282{
1283	char **dup;
1284
1285	for (dup = dup_list; *dup!= NULL; dup++)
1286		if (strcmp(w->w_description, *dup) == 0)
1287			return (1);
1288	return (0);
1289}
1290
1291static int
1292blessed(struct witness *w1, struct witness *w2)
1293{
1294	int i;
1295	struct witness_blessed *b;
1296
1297	for (i = 0; i < blessed_count; i++) {
1298		b = &blessed_list[i];
1299		if (strcmp(w1->w_description, b->b_lock1) == 0) {
1300			if (strcmp(w2->w_description, b->b_lock2) == 0)
1301				return (1);
1302			continue;
1303		}
1304		if (strcmp(w1->w_description, b->b_lock2) == 0)
1305			if (strcmp(w2->w_description, b->b_lock1) == 0)
1306				return (1);
1307	}
1308	return (0);
1309}
1310
1311static struct witness *
1312witness_get()
1313{
1314	struct witness *w;
1315
1316	if ((w = w_free) == NULL) {
1317		witness_dead = 1;
1318		mtx_exit(&w_mtx, MTX_SPIN);
1319		printf("witness exhausted\n");
1320		return (NULL);
1321	}
1322	w_free = w->w_next;
1323	bzero(w, sizeof(*w));
1324	return (w);
1325}
1326
1327static void
1328witness_free(struct witness *w)
1329{
1330	w->w_next = w_free;
1331	w_free = w;
1332}
1333
1334void
1335witness_list(struct proc *p)
1336{
1337	struct mtx *m;
1338
1339	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
1340	    m = LIST_NEXT(m, mtx_held)) {
1341		printf("\t\"%s\" (%p) locked at %s:%d\n",
1342		    m->mtx_description, m,
1343		    m->mtx_witness->w_file, m->mtx_witness->w_line);
1344	}
1345}
1346
1347void
1348witness_save(struct mtx *m, const char **filep, int *linep)
1349{
1350	*filep = m->mtx_witness->w_file;
1351	*linep = m->mtx_witness->w_line;
1352}
1353
1354void
1355witness_restore(struct mtx *m, const char *file, int line)
1356{
1357	m->mtx_witness->w_file = file;
1358	m->mtx_witness->w_line = line;
1359}
1360
1361#endif	/* (defined(MUTEX_DEBUG) && defined(WITNESS)) */
1362