subr_witness.c revision 69879
1/*-
2 * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 * 3. Berkeley Software Design Inc's name may not be used to endorse or
13 *    promote products derived from this software without specific prior
14 *    written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
29 *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
30 * $FreeBSD: head/sys/kern/subr_witness.c 69879 2000-12-12 00:37:18Z jhb $
31 */
32
33/*
34 *	Main Entry: witness
35 *	Pronunciation: 'wit-n&s
36 *	Function: noun
37 *	Etymology: Middle English witnesse, from Old English witnes knowledge,
38 *	    testimony, witness, from 2wit
39 *	Date: before 12th century
40 *	1 : attestation of a fact or event : TESTIMONY
41 *	2 : one that gives evidence; specifically : one who testifies in
42 *	    a cause or before a judicial tribunal
43 *	3 : one asked to be present at a transaction so as to be able to
44 *	    testify to its having taken place
45 *	4 : one who has personal knowledge of something
46 *	5 a : something serving as evidence or proof : SIGN
47 *	  b : public affirmation by word or example of usually
48 *	      religious faith or conviction <the heroic witness to divine
49 *	      life -- Pilot>
50 *	6 capitalized : a member of the Jehovah's Witnesses
51 */
52
53#include "opt_ddb.h"
54#include "opt_witness.h"
55
56/*
57 * Cause non-inlined mtx_*() to be compiled.
58 * Must be defined early because other system headers may include mutex.h.
59 */
60#define _KERN_MUTEX_C_
61
62#include <sys/param.h>
63#include <sys/bus.h>
64#include <sys/kernel.h>
65#include <sys/malloc.h>
66#include <sys/proc.h>
67#include <sys/sysctl.h>
68#include <sys/systm.h>
69#include <sys/vmmeter.h>
70#include <sys/ktr.h>
71
72#include <machine/atomic.h>
73#include <machine/bus.h>
74#include <machine/clock.h>
75#include <machine/cpu.h>
76
77#include <ddb/ddb.h>
78
79#include <vm/vm.h>
80#include <vm/vm_extern.h>
81
82#include <sys/mutex.h>
83
84/*
85 * Machine independent bits of the mutex implementation
86 */
87/* All mutexes in system (used for debug/panic) */
88#ifdef WITNESS
89static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0,
90	"All mutexes queue head" };
91static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, &all_mtx_debug,
92	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
93	{ NULL, NULL }, &all_mtx, &all_mtx };
94#else	/* WITNESS */
95static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, "All mutexes queue head",
96	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
97	{ NULL, NULL }, &all_mtx, &all_mtx };
98#endif	/* WITNESS */
99
100static int	mtx_cur_cnt;
101static int	mtx_max_cnt;
102
103void	_mtx_enter_giant_def(void);
104void	_mtx_exit_giant_def(void);
105static void propagate_priority(struct proc *);
106
107#define	mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
108#define	mtx_owner(m)	(mtx_unowned(m) ? NULL \
109			    : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK))
110
111#define RETIP(x)		*(((uintptr_t *)(&x)) - 1)
112#define	SET_PRIO(p, pri)	(p)->p_priority = (pri)
113
114/*
115 * XXX Temporary, for use from assembly language
116 */
117
118void
119_mtx_enter_giant_def(void)
120{
121
122	mtx_enter(&Giant, MTX_DEF);
123}
124
125void
126_mtx_exit_giant_def(void)
127{
128
129	mtx_exit(&Giant, MTX_DEF);
130}
131
132static void
133propagate_priority(struct proc *p)
134{
135	int pri = p->p_priority;
136	struct mtx *m = p->p_blocked;
137
138	mtx_assert(&sched_lock, MA_OWNED);
139	for (;;) {
140		struct proc *p1;
141
142		p = mtx_owner(m);
143
144		if (p == NULL) {
145			/*
146			 * This really isn't quite right. Really
147			 * ought to bump priority of process that
148			 * next acquires the mutex.
149			 */
150			MPASS(m->mtx_lock == MTX_CONTESTED);
151			return;
152		}
153		MPASS(p->p_magic == P_MAGIC);
154		KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex"));
155		if (p->p_priority <= pri)
156			return;
157
158		/*
159		 * Bump this process' priority.
160		 */
161		SET_PRIO(p, pri);
162
163		/*
164		 * If lock holder is actually running, just bump priority.
165		 */
166#ifdef SMP
167		/*
168		 * For SMP, we can check the p_oncpu field to see if we are
169		 * running.
170		 */
171		if (p->p_oncpu != 0xff) {
172			MPASS(p->p_stat == SRUN || p->p_stat == SZOMB);
173			return;
174		}
175#else
176		/*
177		 * For UP, we check to see if p is curproc (this shouldn't
178		 * ever happen however as it would mean we are in a deadlock.)
179		 */
180		if (p == curproc) {
181			panic("Deadlock detected");
182			return;
183		}
184#endif
185		/*
186		 * If on run queue move to new run queue, and
187		 * quit.
188		 */
189		if (p->p_stat == SRUN) {
190			printf("XXX: moving process %d(%s) to a new run queue\n",
191			       p->p_pid, p->p_comm);
192			MPASS(p->p_blocked == NULL);
193			remrunqueue(p);
194			setrunqueue(p);
195			return;
196		}
197
198		/*
199		 * If we aren't blocked on a mutex, we should be.
200		 */
201		KASSERT(p->p_stat == SMTX, (
202		    "process %d(%s):%d holds %s but isn't blocked on a mutex\n",
203		    p->p_pid, p->p_comm, p->p_stat,
204		    m->mtx_description));
205
206		/*
207		 * Pick up the mutex that p is blocked on.
208		 */
209		m = p->p_blocked;
210		MPASS(m != NULL);
211
212		printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid,
213		    p->p_comm, m->mtx_description);
214		/*
215		 * Check if the proc needs to be moved up on
216		 * the blocked chain
217		 */
218		if (p == TAILQ_FIRST(&m->mtx_blocked)) {
219			printf("XXX: process at head of run queue\n");
220			continue;
221		}
222		p1 = TAILQ_PREV(p, rq, p_procq);
223		if (p1->p_priority <= pri) {
224			printf(
225	"XXX: previous process %d(%s) has higher priority\n",
226	                    p->p_pid, p->p_comm);
227			continue;
228		}
229
230		/*
231		 * Remove proc from blocked chain and determine where
232		 * it should be moved up to.  Since we know that p1 has
233		 * a lower priority than p, we know that at least one
234		 * process in the chain has a lower priority and that
235		 * p1 will thus not be NULL after the loop.
236		 */
237		TAILQ_REMOVE(&m->mtx_blocked, p, p_procq);
238		TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) {
239			MPASS(p1->p_magic == P_MAGIC);
240			if (p1->p_priority > pri)
241				break;
242		}
243		MPASS(p1 != NULL);
244		TAILQ_INSERT_BEFORE(p1, p, p_procq);
245		CTR4(KTR_LOCK,
246		    "propagate_priority: p 0x%p moved before 0x%p on [0x%p] %s",
247		    p, p1, m, m->mtx_description);
248	}
249}
250
251void
252mtx_enter_hard(struct mtx *m, int type, int saveintr)
253{
254	struct proc *p = CURPROC;
255
256	KASSERT(p != NULL, ("curproc is NULL in mutex"));
257
258	switch (type) {
259	case MTX_DEF:
260		if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) {
261			m->mtx_recurse++;
262			atomic_set_ptr(&m->mtx_lock, MTX_RECURSE);
263			CTR1(KTR_LOCK, "mtx_enter: 0x%p recurse", m);
264			return;
265		}
266		CTR3(KTR_LOCK, "mtx_enter: 0x%p contested (lock=%p) [0x%p]",
267		    m, (void *)m->mtx_lock, (void *)RETIP(m));
268
269		/*
270		 * Save our priority.  Even though p_nativepri is protected
271		 * by sched_lock, we don't obtain it here as it can be
272		 * expensive.  Since this is the only place p_nativepri is
273		 * set, and since two CPUs will not be executing the same
274		 * process concurrently, we know that no other CPU is going
275		 * to be messing with this.  Also, p_nativepri is only read
276		 * when we are blocked on a mutex, so that can't be happening
277		 * right now either.
278		 */
279		p->p_nativepri = p->p_priority;
280		while (!_obtain_lock(m, p)) {
281			uintptr_t v;
282			struct proc *p1;
283
284			mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY);
285			/*
286			 * check if the lock has been released while
287			 * waiting for the schedlock.
288			 */
289			if ((v = m->mtx_lock) == MTX_UNOWNED) {
290				mtx_exit(&sched_lock, MTX_SPIN);
291				continue;
292			}
293			/*
294			 * The mutex was marked contested on release. This
295			 * means that there are processes blocked on it.
296			 */
297			if (v == MTX_CONTESTED) {
298				p1 = TAILQ_FIRST(&m->mtx_blocked);
299				KASSERT(p1 != NULL, ("contested mutex has no contesters"));
300				KASSERT(p != NULL, ("curproc is NULL for contested mutex"));
301				m->mtx_lock = (uintptr_t)p | MTX_CONTESTED;
302				if (p1->p_priority < p->p_priority) {
303					SET_PRIO(p, p1->p_priority);
304				}
305				mtx_exit(&sched_lock, MTX_SPIN);
306				return;
307			}
308			/*
309			 * If the mutex isn't already contested and
310			 * a failure occurs setting the contested bit the
311			 * mutex was either release or the
312			 * state of the RECURSION bit changed.
313			 */
314			if ((v & MTX_CONTESTED) == 0 &&
315			    !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
316				               (void *)(v | MTX_CONTESTED))) {
317				mtx_exit(&sched_lock, MTX_SPIN);
318				continue;
319			}
320
321			/* We definitely have to sleep for this lock */
322			mtx_assert(m, MA_NOTOWNED);
323
324#ifdef notyet
325			/*
326			 * If we're borrowing an interrupted thread's VM
327			 * context must clean up before going to sleep.
328			 */
329			if (p->p_flag & (P_ITHD | P_SITHD)) {
330				ithd_t *it = (ithd_t *)p;
331
332				if (it->it_interrupted) {
333					CTR2(KTR_LOCK,
334					    "mtx_enter: 0x%x interrupted 0x%x",
335					    it, it->it_interrupted);
336					intr_thd_fixup(it);
337				}
338			}
339#endif
340
341			/* Put us on the list of procs blocked on this mutex */
342			if (TAILQ_EMPTY(&m->mtx_blocked)) {
343				p1 = (struct proc *)(m->mtx_lock &
344						     MTX_FLAGMASK);
345				LIST_INSERT_HEAD(&p1->p_contested, m,
346						 mtx_contested);
347				TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
348			} else {
349				TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq)
350					if (p1->p_priority > p->p_priority)
351						break;
352				if (p1)
353					TAILQ_INSERT_BEFORE(p1, p, p_procq);
354				else
355					TAILQ_INSERT_TAIL(&m->mtx_blocked, p,
356							  p_procq);
357			}
358
359			p->p_blocked = m;	/* Who we're blocked on */
360			p->p_mtxname = m->mtx_description;
361			p->p_stat = SMTX;
362#if 0
363			propagate_priority(p);
364#endif
365			CTR3(KTR_LOCK, "mtx_enter: p 0x%p blocked on [0x%p] %s",
366			    p, m, m->mtx_description);
367			mi_switch();
368			CTR3(KTR_LOCK,
369			    "mtx_enter: p 0x%p free from blocked on [0x%p] %s",
370			    p, m, m->mtx_description);
371			mtx_exit(&sched_lock, MTX_SPIN);
372		}
373		return;
374	case MTX_SPIN:
375	case MTX_SPIN | MTX_FIRST:
376	case MTX_SPIN | MTX_TOPHALF:
377	    {
378		int i = 0;
379
380		if (m->mtx_lock == (uintptr_t)p) {
381			m->mtx_recurse++;
382			return;
383		}
384		CTR1(KTR_LOCK, "mtx_enter: %p spinning", m);
385		for (;;) {
386			if (_obtain_lock(m, p))
387				break;
388			while (m->mtx_lock != MTX_UNOWNED) {
389				if (i++ < 1000000)
390					continue;
391				if (i++ < 6000000)
392					DELAY (1);
393#ifdef DDB
394				else if (!db_active)
395#else
396				else
397#endif
398					panic(
399				"spin lock %s held by 0x%p for > 5 seconds",
400					    m->mtx_description,
401					    (void *)m->mtx_lock);
402			}
403		}
404
405#ifdef MUTEX_DEBUG
406		if (type != MTX_SPIN)
407			m->mtx_saveintr = 0xbeefface;
408		else
409#endif
410			m->mtx_saveintr = saveintr;
411		CTR1(KTR_LOCK, "mtx_enter: 0x%p spin done", m);
412		return;
413	    }
414	}
415}
416
417void
418mtx_exit_hard(struct mtx *m, int type)
419{
420	struct proc *p, *p1;
421	struct mtx *m1;
422	int pri;
423
424	p = CURPROC;
425	switch (type) {
426	case MTX_DEF:
427	case MTX_DEF | MTX_NOSWITCH:
428		if (m->mtx_recurse != 0) {
429			if (--(m->mtx_recurse) == 0)
430				atomic_clear_ptr(&m->mtx_lock, MTX_RECURSE);
431			CTR1(KTR_LOCK, "mtx_exit: 0x%p unrecurse", m);
432			return;
433		}
434		mtx_enter(&sched_lock, MTX_SPIN);
435		CTR1(KTR_LOCK, "mtx_exit: 0x%p contested", m);
436		p1 = TAILQ_FIRST(&m->mtx_blocked);
437		MPASS(p->p_magic == P_MAGIC);
438		MPASS(p1->p_magic == P_MAGIC);
439		TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq);
440		if (TAILQ_EMPTY(&m->mtx_blocked)) {
441			LIST_REMOVE(m, mtx_contested);
442			_release_lock_quick(m);
443			CTR1(KTR_LOCK, "mtx_exit: 0x%p not held", m);
444		} else
445			atomic_store_rel_ptr(&m->mtx_lock,
446			    (void *)MTX_CONTESTED);
447		pri = MAXPRI;
448		LIST_FOREACH(m1, &p->p_contested, mtx_contested) {
449			int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority;
450			if (cp < pri)
451				pri = cp;
452		}
453		if (pri > p->p_nativepri)
454			pri = p->p_nativepri;
455		SET_PRIO(p, pri);
456		CTR2(KTR_LOCK, "mtx_exit: 0x%p contested setrunqueue 0x%p",
457		    m, p1);
458		p1->p_blocked = NULL;
459		p1->p_mtxname = NULL;
460		p1->p_stat = SRUN;
461		setrunqueue(p1);
462		if ((type & MTX_NOSWITCH) == 0 && p1->p_priority < pri) {
463#ifdef notyet
464			if (p->p_flag & (P_ITHD | P_SITHD)) {
465				ithd_t *it = (ithd_t *)p;
466
467				if (it->it_interrupted) {
468					CTR2(KTR_LOCK,
469					    "mtx_exit: 0x%x interruped 0x%x",
470					    it, it->it_interrupted);
471					intr_thd_fixup(it);
472				}
473			}
474#endif
475			setrunqueue(p);
476			CTR2(KTR_LOCK, "mtx_exit: 0x%p switching out lock=0x%p",
477			    m, (void *)m->mtx_lock);
478			mi_switch();
479			CTR2(KTR_LOCK, "mtx_exit: 0x%p resuming lock=0x%p",
480			    m, (void *)m->mtx_lock);
481		}
482		mtx_exit(&sched_lock, MTX_SPIN);
483		break;
484	case MTX_SPIN:
485	case MTX_SPIN | MTX_FIRST:
486		if (m->mtx_recurse != 0) {
487			m->mtx_recurse--;
488			return;
489		}
490		MPASS(mtx_owned(m));
491		_release_lock_quick(m);
492		if (type & MTX_FIRST)
493			enable_intr();	/* XXX is this kosher? */
494		else {
495			MPASS(m->mtx_saveintr != 0xbeefface);
496			restore_intr(m->mtx_saveintr);
497		}
498		break;
499	case MTX_SPIN | MTX_TOPHALF:
500		if (m->mtx_recurse != 0) {
501			m->mtx_recurse--;
502			return;
503		}
504		MPASS(mtx_owned(m));
505		_release_lock_quick(m);
506		break;
507	default:
508		panic("mtx_exit_hard: unsupported type 0x%x\n", type);
509	}
510}
511
512#define MV_DESTROY	0	/* validate before destory */
513#define MV_INIT		1	/* validate before init */
514
515#ifdef MUTEX_DEBUG
516
517int mtx_validate __P((struct mtx *, int));
518
519int
520mtx_validate(struct mtx *m, int when)
521{
522	struct mtx *mp;
523	int i;
524	int retval = 0;
525
526	if (m == &all_mtx || cold)
527		return 0;
528
529	mtx_enter(&all_mtx, MTX_DEF);
530/*
531 * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
532 * we can re-enable the kernacc() checks.
533 */
534#ifndef __alpha__
535	MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t),
536	    VM_PROT_READ) == 1);
537#endif
538	MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx);
539	for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) {
540#ifndef __alpha__
541		if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t),
542		    VM_PROT_READ) != 1) {
543			panic("mtx_validate: mp=%p mp->mtx_next=%p",
544			    mp, mp->mtx_next);
545		}
546#endif
547		i++;
548		if (i > mtx_cur_cnt) {
549			panic("mtx_validate: too many in chain, known=%d\n",
550			    mtx_cur_cnt);
551		}
552	}
553	MPASS(i == mtx_cur_cnt);
554	switch (when) {
555	case MV_DESTROY:
556		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
557			if (mp == m)
558				break;
559		MPASS(mp == m);
560		break;
561	case MV_INIT:
562		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
563		if (mp == m) {
564			/*
565			 * Not good. This mutex already exists.
566			 */
567			printf("re-initing existing mutex %s\n",
568			    m->mtx_description);
569			MPASS(m->mtx_lock == MTX_UNOWNED);
570			retval = 1;
571		}
572	}
573	mtx_exit(&all_mtx, MTX_DEF);
574	return (retval);
575}
576#endif
577
578void
579mtx_init(struct mtx *m, const char *t, int flag)
580{
581#ifdef WITNESS
582	struct mtx_debug *debug;
583#endif
584
585	CTR2(KTR_LOCK, "mtx_init 0x%p (%s)", m, t);
586#ifdef MUTEX_DEBUG
587	if (mtx_validate(m, MV_INIT))	/* diagnostic and error correction */
588		return;
589#endif
590#ifdef WITNESS
591	if (flag & MTX_COLD)
592		debug = m->mtx_debug;
593	else
594		debug = NULL;
595	if (debug == NULL) {
596#ifdef DIAGNOSTIC
597		if(cold && bootverbose)
598			printf("malloc'ing mtx_debug while cold for %s\n", t);
599#endif
600
601		/* XXX - should not use DEVBUF */
602		debug = malloc(sizeof(struct mtx_debug), M_DEVBUF,
603		    M_NOWAIT | M_ZERO);
604		MPASS(debug != NULL);
605	}
606#endif
607	bzero((void *)m, sizeof *m);
608	TAILQ_INIT(&m->mtx_blocked);
609#ifdef WITNESS
610	m->mtx_debug = debug;
611#endif
612	m->mtx_description = t;
613	m->mtx_lock = MTX_UNOWNED;
614	/* Put on all mutex queue */
615	mtx_enter(&all_mtx, MTX_DEF);
616	m->mtx_next = &all_mtx;
617	m->mtx_prev = all_mtx.mtx_prev;
618	m->mtx_prev->mtx_next = m;
619	all_mtx.mtx_prev = m;
620	if (++mtx_cur_cnt > mtx_max_cnt)
621		mtx_max_cnt = mtx_cur_cnt;
622	mtx_exit(&all_mtx, MTX_DEF);
623	witness_init(m, flag);
624}
625
626void
627mtx_destroy(struct mtx *m)
628{
629
630	CTR2(KTR_LOCK, "mtx_destroy 0x%p (%s)", m, m->mtx_description);
631#ifdef MUTEX_DEBUG
632	if (m->mtx_next == NULL)
633		panic("mtx_destroy: %p (%s) already destroyed",
634		    m, m->mtx_description);
635
636	if (!mtx_owned(m)) {
637		MPASS(m->mtx_lock == MTX_UNOWNED);
638	} else {
639		MPASS((m->mtx_lock & (MTX_RECURSE|MTX_CONTESTED)) == 0);
640	}
641	mtx_validate(m, MV_DESTROY);		/* diagnostic */
642#endif
643
644#ifdef WITNESS
645	if (m->mtx_witness)
646		witness_destroy(m);
647#endif /* WITNESS */
648
649	/* Remove from the all mutex queue */
650	mtx_enter(&all_mtx, MTX_DEF);
651	m->mtx_next->mtx_prev = m->mtx_prev;
652	m->mtx_prev->mtx_next = m->mtx_next;
653#ifdef MUTEX_DEBUG
654	m->mtx_next = m->mtx_prev = NULL;
655#endif
656#ifdef WITNESS
657	free(m->mtx_debug, M_DEVBUF);
658	m->mtx_debug = NULL;
659#endif
660	mtx_cur_cnt--;
661	mtx_exit(&all_mtx, MTX_DEF);
662}
663
664/*
665 * The non-inlined versions of the mtx_*() functions are always built (above),
666 * but the witness code depends on the WITNESS kernel option being specified.
667 */
668#ifdef WITNESS
669
670#define WITNESS_COUNT 200
671#define	WITNESS_NCHILDREN 2
672
673int witness_watch = 1;
674
675struct witness {
676	struct witness	*w_next;
677	const char	*w_description;
678	const char	*w_file;
679	int		 w_line;
680	struct witness	*w_morechildren;
681	u_char		 w_childcnt;
682	u_char		 w_Giant_squawked:1;
683	u_char		 w_other_squawked:1;
684	u_char		 w_same_squawked:1;
685	u_char		 w_sleep:1;
686	u_char		 w_spin:1;	/* this is a spin mutex */
687	u_int		 w_level;
688	struct witness	*w_children[WITNESS_NCHILDREN];
689};
690
691struct witness_blessed {
692	char 	*b_lock1;
693	char	*b_lock2;
694};
695
696#ifdef DDB
697/*
698 * When DDB is enabled and witness_ddb is set to 1, it will cause the system to
699 * drop into kdebug() when:
700 *	- a lock heirarchy violation occurs
701 *	- locks are held when going to sleep.
702 */
703#ifdef WITNESS_DDB
704int	witness_ddb = 1;
705#else
706int	witness_ddb = 0;
707#endif
708SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, "");
709#endif /* DDB */
710
711#ifdef WITNESS_SKIPSPIN
712int	witness_skipspin = 1;
713#else
714int	witness_skipspin = 0;
715#endif
716SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0,
717    "");
718
719MUTEX_DECLARE(static,w_mtx);
720static struct witness	*w_free;
721static struct witness	*w_all;
722static int		 w_inited;
723static int		 witness_dead;	/* fatal error, probably no memory */
724
725static struct witness	 w_data[WITNESS_COUNT];
726
727static struct witness	 *enroll __P((const char *description, int flag));
728static int itismychild __P((struct witness *parent, struct witness *child));
729static void removechild __P((struct witness *parent, struct witness *child));
730static int isitmychild __P((struct witness *parent, struct witness *child));
731static int isitmydescendant __P((struct witness *parent, struct witness *child));
732static int dup_ok __P((struct witness *));
733static int blessed __P((struct witness *, struct witness *));
734static void witness_displaydescendants
735    __P((void(*)(const char *fmt, ...), struct witness *));
736static void witness_leveldescendents __P((struct witness *parent, int level));
737static void witness_levelall __P((void));
738static struct witness * witness_get __P((void));
739static void witness_free __P((struct witness *m));
740
741
742static char *ignore_list[] = {
743	"witness lock",
744	NULL
745};
746
747static char *spin_order_list[] = {
748	"sio",
749	"sched lock",
750#ifdef __i386__
751	"clk",
752#endif
753	"callout",
754	/*
755	 * leaf locks
756	 */
757	NULL
758};
759
760static char *order_list[] = {
761	"uidinfo hash", "uidinfo struct", NULL,
762	NULL
763};
764
765static char *dup_list[] = {
766	NULL
767};
768
769static char *sleep_list[] = {
770	"Giant",
771	NULL
772};
773
774/*
775 * Pairs of locks which have been blessed
776 * Don't complain about order problems with blessed locks
777 */
778static struct witness_blessed blessed_list[] = {
779};
780static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed);
781
782void
783witness_init(struct mtx *m, int flag)
784{
785	m->mtx_witness = enroll(m->mtx_description, flag);
786}
787
788void
789witness_destroy(struct mtx *m)
790{
791	struct mtx *m1;
792	struct proc *p;
793	p = CURPROC;
794	for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL;
795		m1 = LIST_NEXT(m1, mtx_held)) {
796		if (m1 == m) {
797			LIST_REMOVE(m, mtx_held);
798			break;
799		}
800	}
801	return;
802
803}
804
805void
806witness_enter(struct mtx *m, int flags, const char *file, int line)
807{
808	struct witness *w, *w1;
809	struct mtx *m1;
810	struct proc *p;
811	int i;
812#ifdef DDB
813	int go_into_ddb = 0;
814#endif /* DDB */
815
816	w = m->mtx_witness;
817	p = CURPROC;
818
819	if (flags & MTX_SPIN) {
820		if (!w->w_spin)
821			panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @"
822			    " %s:%d", m->mtx_description, file, line);
823		if (m->mtx_recurse != 0)
824			return;
825		mtx_enter(&w_mtx, MTX_SPIN);
826		i = witness_spin_check;
827		if (i != 0 && w->w_level < i) {
828			mtx_exit(&w_mtx, MTX_SPIN);
829			panic("mutex_enter(%s:%x, MTX_SPIN) out of order @"
830			    " %s:%d already holding %s:%x",
831			    m->mtx_description, w->w_level, file, line,
832			    spin_order_list[ffs(i)-1], i);
833		}
834		PCPU_SET(witness_spin_check, i | w->w_level);
835		mtx_exit(&w_mtx, MTX_SPIN);
836		w->w_file = file;
837		w->w_line = line;
838		m->mtx_line = line;
839		m->mtx_file = file;
840		return;
841	}
842	if (w->w_spin)
843		panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
844		    m->mtx_description, file, line);
845
846	if (m->mtx_recurse != 0)
847		return;
848	if (witness_dead)
849		goto out;
850	if (cold || panicstr)
851		goto out;
852
853	if (!mtx_legal2block())
854		panic("blockable mtx_enter() of %s when not legal @ %s:%d",
855			    m->mtx_description, file, line);
856	/*
857	 * Is this the first mutex acquired
858	 */
859	if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL)
860		goto out;
861
862	if ((w1 = m1->mtx_witness) == w) {
863		if (w->w_same_squawked || dup_ok(w))
864			goto out;
865		w->w_same_squawked = 1;
866		printf("acquring duplicate lock of same type: \"%s\"\n",
867			m->mtx_description);
868		printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
869		printf(" 2nd @ %s:%d\n", file, line);
870#ifdef DDB
871		go_into_ddb = 1;
872#endif /* DDB */
873		goto out;
874	}
875	MPASS(!mtx_owned(&w_mtx));
876	mtx_enter(&w_mtx, MTX_SPIN);
877	/*
878	 * If we have a known higher number just say ok
879	 */
880	if (witness_watch > 1 && w->w_level > w1->w_level) {
881		mtx_exit(&w_mtx, MTX_SPIN);
882		goto out;
883	}
884	if (isitmydescendant(m1->mtx_witness, w)) {
885		mtx_exit(&w_mtx, MTX_SPIN);
886		goto out;
887	}
888	for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) {
889
890		MPASS(i < 200);
891		w1 = m1->mtx_witness;
892		if (isitmydescendant(w, w1)) {
893			mtx_exit(&w_mtx, MTX_SPIN);
894			if (blessed(w, w1))
895				goto out;
896			if (m1 == &Giant) {
897				if (w1->w_Giant_squawked)
898					goto out;
899				else
900					w1->w_Giant_squawked = 1;
901			} else {
902				if (w1->w_other_squawked)
903					goto out;
904				else
905					w1->w_other_squawked = 1;
906			}
907			printf("lock order reversal\n");
908			printf(" 1st %s last acquired @ %s:%d\n",
909			    w->w_description, w->w_file, w->w_line);
910			printf(" 2nd %p %s @ %s:%d\n",
911			    m1, w1->w_description, w1->w_file, w1->w_line);
912			printf(" 3rd %p %s @ %s:%d\n",
913			    m, w->w_description, file, line);
914#ifdef DDB
915			go_into_ddb = 1;
916#endif /* DDB */
917			goto out;
918		}
919	}
920	m1 = LIST_FIRST(&p->p_heldmtx);
921	if (!itismychild(m1->mtx_witness, w))
922		mtx_exit(&w_mtx, MTX_SPIN);
923
924out:
925#ifdef DDB
926	if (witness_ddb && go_into_ddb)
927		Debugger("witness_enter");
928#endif /* DDB */
929	w->w_file = file;
930	w->w_line = line;
931	m->mtx_line = line;
932	m->mtx_file = file;
933
934	/*
935	 * If this pays off it likely means that a mutex being witnessed
936	 * is acquired in hardclock. Put it in the ignore list. It is
937	 * likely not the mutex this assert fails on.
938	 */
939	MPASS(m->mtx_held.le_prev == NULL);
940	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
941}
942
943void
944witness_exit(struct mtx *m, int flags, const char *file, int line)
945{
946	struct witness *w;
947
948	w = m->mtx_witness;
949
950	if (flags & MTX_SPIN) {
951		if (!w->w_spin)
952			panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @"
953			    " %s:%d", m->mtx_description, file, line);
954		if (m->mtx_recurse != 0)
955			return;
956		mtx_enter(&w_mtx, MTX_SPIN);
957		PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level);
958		mtx_exit(&w_mtx, MTX_SPIN);
959		return;
960	}
961	if (w->w_spin)
962		panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
963		    m->mtx_description, file, line);
964
965	if (m->mtx_recurse != 0)
966		return;
967
968	if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold)
969		panic("switchable mtx_exit() of %s when not legal @ %s:%d",
970			    m->mtx_description, file, line);
971	LIST_REMOVE(m, mtx_held);
972	m->mtx_held.le_prev = NULL;
973}
974
975void
976witness_try_enter(struct mtx *m, int flags, const char *file, int line)
977{
978	struct proc *p;
979	struct witness *w = m->mtx_witness;
980
981	if (flags & MTX_SPIN) {
982		if (!w->w_spin)
983			panic("mutex_try_enter: "
984			    "MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
985			    m->mtx_description, file, line);
986		if (m->mtx_recurse != 0)
987			return;
988		mtx_enter(&w_mtx, MTX_SPIN);
989		PCPU_SET(witness_spin_check, witness_spin_check | w->w_level);
990		mtx_exit(&w_mtx, MTX_SPIN);
991		w->w_file = file;
992		w->w_line = line;
993		m->mtx_line = line;
994		m->mtx_file = file;
995		return;
996	}
997
998	if (w->w_spin)
999		panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
1000		    m->mtx_description, file, line);
1001
1002	if (m->mtx_recurse != 0)
1003		return;
1004
1005	w->w_file = file;
1006	w->w_line = line;
1007	m->mtx_line = line;
1008	m->mtx_file = file;
1009	p = CURPROC;
1010	MPASS(m->mtx_held.le_prev == NULL);
1011	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
1012}
1013
1014void
1015witness_display(void(*prnt)(const char *fmt, ...))
1016{
1017	struct witness *w, *w1;
1018
1019	witness_levelall();
1020
1021	for (w = w_all; w; w = w->w_next) {
1022		if (w->w_file == NULL)
1023			continue;
1024		for (w1 = w_all; w1; w1 = w1->w_next) {
1025			if (isitmychild(w1, w))
1026				break;
1027		}
1028		if (w1 != NULL)
1029			continue;
1030		/*
1031		 * This lock has no anscestors, display its descendants.
1032		 */
1033		witness_displaydescendants(prnt, w);
1034	}
1035	prnt("\nMutex which were never acquired\n");
1036	for (w = w_all; w; w = w->w_next) {
1037		if (w->w_file != NULL)
1038			continue;
1039		prnt("%s\n", w->w_description);
1040	}
1041}
1042
1043int
1044witness_sleep(int check_only, struct mtx *mtx, const char *file, int line)
1045{
1046	struct mtx *m;
1047	struct proc *p;
1048	char **sleep;
1049	int n = 0;
1050
1051	p = CURPROC;
1052	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
1053	    m = LIST_NEXT(m, mtx_held)) {
1054		if (m == mtx)
1055			continue;
1056		for (sleep = sleep_list; *sleep!= NULL; sleep++)
1057			if (strcmp(m->mtx_description, *sleep) == 0)
1058				goto next;
1059		printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
1060			file, line, check_only ? "could sleep" : "sleeping",
1061			m->mtx_description,
1062			m->mtx_witness->w_file, m->mtx_witness->w_line);
1063		n++;
1064	next:
1065	}
1066#ifdef DDB
1067	if (witness_ddb && n)
1068		Debugger("witness_sleep");
1069#endif /* DDB */
1070	return (n);
1071}
1072
1073static struct witness *
1074enroll(const char *description, int flag)
1075{
1076	int i;
1077	struct witness *w, *w1;
1078	char **ignore;
1079	char **order;
1080
1081	if (!witness_watch)
1082		return (NULL);
1083	for (ignore = ignore_list; *ignore != NULL; ignore++)
1084		if (strcmp(description, *ignore) == 0)
1085			return (NULL);
1086
1087	if (w_inited == 0) {
1088		mtx_init(&w_mtx, "witness lock", MTX_COLD | MTX_SPIN);
1089		for (i = 0; i < WITNESS_COUNT; i++) {
1090			w = &w_data[i];
1091			witness_free(w);
1092		}
1093		w_inited = 1;
1094		for (order = order_list; *order != NULL; order++) {
1095			w = enroll(*order, MTX_DEF);
1096			w->w_file = "order list";
1097			for (order++; *order != NULL; order++) {
1098				w1 = enroll(*order, MTX_DEF);
1099				w1->w_file = "order list";
1100				itismychild(w, w1);
1101				w = w1;
1102    	    	    	}
1103		}
1104	}
1105	if ((flag & MTX_SPIN) && witness_skipspin)
1106		return (NULL);
1107	mtx_enter(&w_mtx, MTX_SPIN);
1108	for (w = w_all; w; w = w->w_next) {
1109		if (strcmp(description, w->w_description) == 0) {
1110			mtx_exit(&w_mtx, MTX_SPIN);
1111			return (w);
1112		}
1113	}
1114	if ((w = witness_get()) == NULL)
1115		return (NULL);
1116	w->w_next = w_all;
1117	w_all = w;
1118	w->w_description = description;
1119	mtx_exit(&w_mtx, MTX_SPIN);
1120	if (flag & MTX_SPIN) {
1121		w->w_spin = 1;
1122
1123		i = 1;
1124		for (order = spin_order_list; *order != NULL; order++) {
1125			if (strcmp(description, *order) == 0)
1126				break;
1127			i <<= 1;
1128		}
1129		if (*order == NULL)
1130			panic("spin lock %s not in order list", description);
1131		w->w_level = i;
1132	}
1133	return (w);
1134}
1135
1136static int
1137itismychild(struct witness *parent, struct witness *child)
1138{
1139	static int recursed;
1140
1141	/*
1142	 * Insert "child" after "parent"
1143	 */
1144	while (parent->w_morechildren)
1145		parent = parent->w_morechildren;
1146
1147	if (parent->w_childcnt == WITNESS_NCHILDREN) {
1148		if ((parent->w_morechildren = witness_get()) == NULL)
1149			return (1);
1150		parent = parent->w_morechildren;
1151	}
1152	MPASS(child != NULL);
1153	parent->w_children[parent->w_childcnt++] = child;
1154	/*
1155	 * now prune whole tree
1156	 */
1157	if (recursed)
1158		return (0);
1159	recursed = 1;
1160	for (child = w_all; child != NULL; child = child->w_next) {
1161		for (parent = w_all; parent != NULL;
1162		    parent = parent->w_next) {
1163			if (!isitmychild(parent, child))
1164				continue;
1165			removechild(parent, child);
1166			if (isitmydescendant(parent, child))
1167				continue;
1168			itismychild(parent, child);
1169		}
1170	}
1171	recursed = 0;
1172	witness_levelall();
1173	return (0);
1174}
1175
1176static void
1177removechild(struct witness *parent, struct witness *child)
1178{
1179	struct witness *w, *w1;
1180	int i;
1181
1182	for (w = parent; w != NULL; w = w->w_morechildren)
1183		for (i = 0; i < w->w_childcnt; i++)
1184			if (w->w_children[i] == child)
1185				goto found;
1186	return;
1187found:
1188	for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren)
1189		continue;
1190	w->w_children[i] = w1->w_children[--w1->w_childcnt];
1191	MPASS(w->w_children[i] != NULL);
1192
1193	if (w1->w_childcnt != 0)
1194		return;
1195
1196	if (w1 == parent)
1197		return;
1198	for (w = parent; w->w_morechildren != w1; w = w->w_morechildren)
1199		continue;
1200	w->w_morechildren = 0;
1201	witness_free(w1);
1202}
1203
1204static int
1205isitmychild(struct witness *parent, struct witness *child)
1206{
1207	struct witness *w;
1208	int i;
1209
1210	for (w = parent; w != NULL; w = w->w_morechildren) {
1211		for (i = 0; i < w->w_childcnt; i++) {
1212			if (w->w_children[i] == child)
1213				return (1);
1214		}
1215	}
1216	return (0);
1217}
1218
1219static int
1220isitmydescendant(struct witness *parent, struct witness *child)
1221{
1222	struct witness *w;
1223	int i;
1224	int j;
1225
1226	for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) {
1227		MPASS(j < 1000);
1228		for (i = 0; i < w->w_childcnt; i++) {
1229			if (w->w_children[i] == child)
1230				return (1);
1231		}
1232		for (i = 0; i < w->w_childcnt; i++) {
1233			if (isitmydescendant(w->w_children[i], child))
1234				return (1);
1235		}
1236	}
1237	return (0);
1238}
1239
1240void
1241witness_levelall (void)
1242{
1243	struct witness *w, *w1;
1244
1245	for (w = w_all; w; w = w->w_next)
1246		if (!w->w_spin)
1247			w->w_level = 0;
1248	for (w = w_all; w; w = w->w_next) {
1249		if (w->w_spin)
1250			continue;
1251		for (w1 = w_all; w1; w1 = w1->w_next) {
1252			if (isitmychild(w1, w))
1253				break;
1254		}
1255		if (w1 != NULL)
1256			continue;
1257		witness_leveldescendents(w, 0);
1258	}
1259}
1260
1261static void
1262witness_leveldescendents(struct witness *parent, int level)
1263{
1264	int i;
1265	struct witness *w;
1266
1267	if (parent->w_level < level)
1268		parent->w_level = level;
1269	level++;
1270	for (w = parent; w != NULL; w = w->w_morechildren)
1271		for (i = 0; i < w->w_childcnt; i++)
1272			witness_leveldescendents(w->w_children[i], level);
1273}
1274
1275static void
1276witness_displaydescendants(void(*prnt)(const char *fmt, ...),
1277			   struct witness *parent)
1278{
1279	struct witness *w;
1280	int i;
1281	int level = parent->w_level;
1282
1283	prnt("%d", level);
1284	if (level < 10)
1285		prnt(" ");
1286	for (i = 0; i < level; i++)
1287		prnt(" ");
1288	prnt("%s", parent->w_description);
1289	if (parent->w_file != NULL) {
1290		prnt(" -- last acquired @ %s", parent->w_file);
1291#ifndef W_USE_WHERE
1292		prnt(":%d", parent->w_line);
1293#endif
1294		prnt("\n");
1295	}
1296
1297	for (w = parent; w != NULL; w = w->w_morechildren)
1298		for (i = 0; i < w->w_childcnt; i++)
1299			    witness_displaydescendants(prnt, w->w_children[i]);
1300    }
1301
1302static int
1303dup_ok(struct witness *w)
1304{
1305	char **dup;
1306
1307	for (dup = dup_list; *dup!= NULL; dup++)
1308		if (strcmp(w->w_description, *dup) == 0)
1309			return (1);
1310	return (0);
1311}
1312
1313static int
1314blessed(struct witness *w1, struct witness *w2)
1315{
1316	int i;
1317	struct witness_blessed *b;
1318
1319	for (i = 0; i < blessed_count; i++) {
1320		b = &blessed_list[i];
1321		if (strcmp(w1->w_description, b->b_lock1) == 0) {
1322			if (strcmp(w2->w_description, b->b_lock2) == 0)
1323				return (1);
1324			continue;
1325		}
1326		if (strcmp(w1->w_description, b->b_lock2) == 0)
1327			if (strcmp(w2->w_description, b->b_lock1) == 0)
1328				return (1);
1329	}
1330	return (0);
1331}
1332
1333static struct witness *
1334witness_get()
1335{
1336	struct witness *w;
1337
1338	if ((w = w_free) == NULL) {
1339		witness_dead = 1;
1340		mtx_exit(&w_mtx, MTX_SPIN);
1341		printf("witness exhausted\n");
1342		return (NULL);
1343	}
1344	w_free = w->w_next;
1345	bzero(w, sizeof(*w));
1346	return (w);
1347}
1348
1349static void
1350witness_free(struct witness *w)
1351{
1352	w->w_next = w_free;
1353	w_free = w;
1354}
1355
1356void
1357witness_list(struct proc *p)
1358{
1359	struct mtx *m;
1360
1361	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
1362	    m = LIST_NEXT(m, mtx_held)) {
1363		printf("\t\"%s\" (%p) locked at %s:%d\n",
1364		    m->mtx_description, m,
1365		    m->mtx_witness->w_file, m->mtx_witness->w_line);
1366	}
1367}
1368
1369void
1370witness_save(struct mtx *m, const char **filep, int *linep)
1371{
1372	*filep = m->mtx_witness->w_file;
1373	*linep = m->mtx_witness->w_line;
1374}
1375
1376void
1377witness_restore(struct mtx *m, const char *file, int line)
1378{
1379	m->mtx_witness->w_file = file;
1380	m->mtx_witness->w_line = line;
1381}
1382
1383#endif	/* WITNESS */
1384