/*-
 * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Berkeley Software Design Inc's name may not be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
 *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
 * $FreeBSD: head/sys/kern/subr_turnstile.c 93609 2002-04-02 00:01:49Z des $
 */

/*
 * Machine independent bits of mutex implementation.
 */

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <machine/atomic.h>
#include <machine/bus.h>
#include <machine/clock.h>
#include <machine/cpu.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>

/*
 * Internal utility macros.
 */
#define mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)

#define mtx_owner(m)	(mtx_unowned((m)) ? NULL \
	: (struct thread *)((m)->mtx_lock & MTX_FLAGMASK))

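/*
 * Note on the layout assumed above: mtx_lock holds either MTX_UNOWNED or
 * the owning thread pointer with its low bits reused for the MTX_RECURSED
 * and MTX_CONTESTED flags, so masking with MTX_FLAGMASK strips the flag
 * bits and recovers the struct thread pointer.
 */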

/*
 * Lock classes for sleep and spin mutexes.
 */
struct lock_class lock_class_mtx_sleep = {
	"sleep mutex",
	LC_SLEEPLOCK | LC_RECURSABLE
};
struct lock_class lock_class_mtx_spin = {
	"spin mutex",
	LC_SPINLOCK | LC_RECURSABLE
};

/*
 * Prototypes for non-exported routines.
 */
static void	propagate_priority(struct thread *);

static void
propagate_priority(struct thread *td)
{
	int pri = td->td_priority;
	struct mtx *m = td->td_blocked;

	mtx_assert(&sched_lock, MA_OWNED);
	for (;;) {
		struct thread *td1;

		td = mtx_owner(m);

		if (td == NULL) {
			/*
			 * This really isn't quite right; we really ought to
			 * bump the priority of the thread that will next
			 * acquire the mutex.
			 */
			MPASS(m->mtx_lock == MTX_CONTESTED);
			return;
		}

		MPASS(td->td_proc->p_magic == P_MAGIC);
		KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex"));
		if (td->td_priority <= pri) /* lower is higher priority */
			return;

		/*
		 * Bump this thread's priority.
		 */
		td->td_priority = pri;

		/*
		 * If lock holder is actually running, just bump priority.
		 */
		 /* XXXKSE this test is not sufficient */
		if (td->td_kse && (td->td_kse->ke_oncpu != NOCPU)) {
			MPASS(td->td_proc->p_stat == SRUN
			|| td->td_proc->p_stat == SZOMB
			|| td->td_proc->p_stat == SSTOP);
			return;
		}

#ifndef SMP
		/*
		 * For UP, we check to see if td is curthread (this should
		 * never happen, however, as it would mean we are deadlocked).
		 */
		KASSERT(td != curthread, ("Deadlock detected"));
#endif

		/*
		 * If td is on a run queue, requeue it on the run queue for
		 * its new priority and quit.
		 * XXXKSE this gets a lot more complicated under threads
		 * but try anyhow.
		 */
		if (td->td_proc->p_stat == SRUN) {
			MPASS(td->td_blocked == NULL);
			remrunqueue(td);
			setrunqueue(td);
			return;
		}

		/*
		 * If td isn't blocked on a mutex, it should be.
		 */
		KASSERT(td->td_proc->p_stat == SMTX, (
		    "process %d(%s):%d holds %s but isn't blocked on a mutex\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat,
		    m->mtx_object.lo_name));

		/*
		 * Pick up the mutex that td is blocked on.
		 */
		m = td->td_blocked;
		MPASS(m != NULL);

		/*
		 * Check if the thread needs to be moved up on
		 * the blocked chain.
		 */
		if (td == TAILQ_FIRST(&m->mtx_blocked)) {
			continue;
		}

		td1 = TAILQ_PREV(td, threadqueue, td_blkq);
		if (td1->td_priority <= pri) {
			continue;
		}

		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved up to.  Since we know that td1 has
		 * a lower priority than td, we know that at least one
		 * thread in the chain has a lower priority and that
		 * td1 will thus not be NULL after the loop.
		 */
		TAILQ_REMOVE(&m->mtx_blocked, td, td_blkq);
		TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq) {
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (td1->td_priority > pri)
				break;
		}

		MPASS(td1 != NULL);
		TAILQ_INSERT_BEFORE(td1, td, td_blkq);
		CTR4(KTR_LOCK,
		    "propagate_priority: p %p moved before %p on [%p] %s",
		    td, td1, m, m->mtx_object.lo_name);
	}
}

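/*
 * Worked example of the walk above (numerically lower td_priority is more
 * important): thread A (pri 40) blocks on mutex M1, owned by B (pri 80),
 * which is itself blocked on M2, owned by C (pri 90).  propagate_priority(A)
 * lends priority 40 to B, re-sorts B in M2's priority-ordered blocked queue
 * if needed, then moves on to M2 and lends priority 40 to C.  The walk stops
 * early when it reaches an owner that is already at least as important, is
 * running on a CPU, or is sitting on a run queue (in which case it is simply
 * requeued).
 */
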
#ifdef MUTEX_PROFILING
SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging");
SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling");
static int mutex_prof_enable = 0;
SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW,
    &mutex_prof_enable, 0, "Enable tracing of mutex holdtime");

struct mutex_prof {
	const char *name;
	const char *file;
	int line;
#define MPROF_MAX 0
#define MPROF_TOT 1
#define MPROF_CNT 2
#define MPROF_AVG 3
	u_int64_t cycles[4];
};

/*
 * mprof_buf is a static pool of profiling records to avoid possible
 * reentrance of the memory allocation functions.
 *
 * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE.
 */
#define NUM_MPROF_BUFFERS 4096
static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS];
static int first_free_mprof_buf;
#define MPROF_HASH_SIZE 32771
static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE];

static int mutex_prof_acquisitions;
SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD,
    &mutex_prof_acquisitions, 0, "Number of mutex acquisitions recorded");
static int mutex_prof_records;
SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD,
    &mutex_prof_records, 0, "Number of profiling records");
static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS;
SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD,
    &mutex_prof_maxrecords, 0, "Maximum number of profiling records");
static int mutex_prof_rejected;
SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD,
    &mutex_prof_rejected, 0, "Number of rejected profiling records");
static int mutex_prof_hashsize = MPROF_HASH_SIZE;
SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD,
    &mutex_prof_hashsize, 0, "Hash size");
static int mutex_prof_collisions = 0;
SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD,
    &mutex_prof_collisions, 0, "Number of hash collisions");

/*
 * mprof_mtx protects the profiling buffers and the hash.
 */
static struct mtx mprof_mtx;

static void
mprof_init(void *arg __unused)
{
	mtx_init(&mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET);
}
SYSINIT(mprofinit, SI_SUB_LOCK, SI_ORDER_ANY, mprof_init, NULL);

static int
dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS)
{
	struct sbuf *sb;
	int error, i;

	if (first_free_mprof_buf == 0)
		return SYSCTL_OUT(req, "No locking recorded",
		    sizeof("No locking recorded"));

	sb = sbuf_new(NULL, NULL, 1024, SBUF_AUTOEXTEND);
	sbuf_printf(sb, "%12s %12s %12s %12s %s\n",
	    "max", "total", "count", "average", "name");
	mtx_lock_spin(&mprof_mtx);
	for (i = 0; i < first_free_mprof_buf; ++i)
		sbuf_printf(sb, "%12llu %12llu %12llu %12llu %s:%d (%s)\n",
		    mprof_buf[i].cycles[MPROF_MAX],
		    mprof_buf[i].cycles[MPROF_TOT],
		    mprof_buf[i].cycles[MPROF_CNT],
		    mprof_buf[i].cycles[MPROF_AVG],
		    mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name);
	mtx_unlock_spin(&mprof_mtx);
	sbuf_finish(sb);
	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics");
#endif

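/*
 * With "options MUTEX_PROFILING" compiled in, the records gathered above can
 * be inspected from userland through the sysctls declared here; a minimal
 * sketch of their use:
 *
 *	sysctl debug.mutex.prof.enable=1	(start recording hold times)
 *	sysctl debug.mutex.prof.stats		(max/total/count/average per file:line)
 */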

/*
 * Function versions of the inlined __mtx_* macros.  These are used by
 * modules and can also be called from assembly language if needed.
 */

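/*
 * Consumer-side sketch of the sleep mutex interface these functions back;
 * "example_lock" is a hypothetical mutex initialized elsewhere with
 * mtx_init():
 *
 *	mtx_lock(&example_lock);
 *	... access the data protected by example_lock ...
 *	mtx_unlock(&example_lock);
 */
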
void
_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line)
{

	MPASS(curthread != NULL);
	_get_sleep_lock(m, curthread, opts, file, line);
	LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
	    line);
	WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
#ifdef MUTEX_PROFILING
	/* don't reset the timer when/if recursing */
	if (m->cycles == 0) {
		m->file = file;
		m->line = line;
		m->cycles = mutex_prof_enable ? get_cyclecount() : 0;
		++mutex_prof_acquisitions;
	}
#endif
}

void
_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line)
{

	MPASS(curthread != NULL);
	mtx_assert(m, MA_OWNED);
#ifdef MUTEX_PROFILING
	if (m->cycles != 0) {
		static const char *unknown = "(unknown)";
		struct mutex_prof *mpp;
		u_int64_t cycles, mcycles;
		const char *p, *q;
		volatile u_int hash, n;

		cycles = get_cyclecount();
		mcycles = m->cycles;
		m->cycles = 0;
		if (cycles <= mcycles)
			goto out;
		for (p = file; strncmp(p, "../", 3) == 0; p += 3)
			/* nothing */ ;
		if (p == NULL || *p == '\0')
			p = unknown;
		for (hash = line, q = p; *q != '\0'; ++q)
			hash = (hash * 2 + *q) % MPROF_HASH_SIZE;
		mtx_lock_spin(&mprof_mtx);
		n = hash;
		while ((mpp = mprof_hash[n]) != NULL) {
			if (mpp->line == line && strcmp(mpp->file, p) == 0)
				break;
			n = (n + 1) % MPROF_HASH_SIZE;
		}
		if (mpp == NULL) {
			/* Just exit if we cannot get a trace buffer */
			if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) {
				++mutex_prof_rejected;
				goto unlock;
			}
			mpp = &mprof_buf[first_free_mprof_buf++];
			mpp->name = mtx_name(m);
			mpp->file = p;
			mpp->line = line;
			mutex_prof_collisions += n - hash;
			++mutex_prof_records;
			mprof_hash[hash] = mpp;
		}
		/*
		 * Record if the mutex has been held longer now than ever
		 * before.
		 */
		if ((cycles - mcycles) > mpp->cycles[MPROF_MAX])
			mpp->cycles[MPROF_MAX] = cycles - mcycles;
		mpp->cycles[MPROF_TOT] += cycles - mcycles;
		mpp->cycles[MPROF_CNT] += 1;
		mpp->cycles[MPROF_AVG] =
		    mpp->cycles[MPROF_TOT] / mpp->cycles[MPROF_CNT];
unlock:
		mtx_unlock_spin(&mprof_mtx);
	}
out:
#endif
	WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
	    line);
	_rel_sleep_lock(m, curthread, opts, file, line);
}

void
_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line)
{

	MPASS(curthread != NULL);
	_get_spin_lock(m, curthread, opts, file, line);
	LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
	    line);
	WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
}

void
_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, int line)
{

	MPASS(curthread != NULL);
	mtx_assert(m, MA_OWNED);
	WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
	    line);
	_rel_spin_lock(m);
}

/*
 * The important part of mtx_trylock{,_flags}()
 * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that
 * if we're called, it's because we know we don't already own this lock.
 */
int
_mtx_trylock(struct mtx *m, int opts, const char *file, int line)
{
	int rval;

	MPASS(curthread != NULL);

	rval = _obtain_lock(m, curthread);

	LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line);
	if (rval) {
		/*
		 * We do not handle recursion in _mtx_trylock; see the
		 * note at the top of the routine.
		 */
		KASSERT(!mtx_recursed(m),
		    ("mtx_trylock() called on a recursed mutex"));
		WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
		    file, line);
	}

	return (rval);
}

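/*
 * Illustrative use of the trylock interface backed by _mtx_trylock(); the
 * "example_lock" mutex is hypothetical:
 *
 *	if (mtx_trylock(&example_lock)) {
 *		... the lock was free and is now owned; do the work ...
 *		mtx_unlock(&example_lock);
 *	} else {
 *		... the lock was contested; fall back or retry later ...
 *	}
 */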

/*
 * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
 *
 * We call this if the lock is either contested (i.e. we need to go to
 * sleep waiting for it), or if we need to recurse on it.
 */
void
_mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
{
	struct thread *td = curthread;

	if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)td) {
		m->mtx_recurse++;
		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
		return;
	}

	if (LOCK_LOG_TEST(&m->mtx_object, opts))
		CTR4(KTR_LOCK,
		    "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
		    m->mtx_object.lo_name, (void *)m->mtx_lock, file, line);

	while (!_obtain_lock(m, td)) {
		uintptr_t v;
		struct thread *td1;

		mtx_lock_spin(&sched_lock);
		/*
		 * Check if the lock has been released while spinning for
		 * the sched_lock.
		 */
		if ((v = m->mtx_lock) == MTX_UNOWNED) {
			mtx_unlock_spin(&sched_lock);
			continue;
		}

		/*
		 * The mutex was marked contested on release. This means that
		 * there are threads blocked on it.
		 */
		if (v == MTX_CONTESTED) {
			td1 = TAILQ_FIRST(&m->mtx_blocked);
			MPASS(td1 != NULL);
			m->mtx_lock = (uintptr_t)td | MTX_CONTESTED;

			if (td1->td_priority < td->td_priority)
				td->td_priority = td1->td_priority;
			mtx_unlock_spin(&sched_lock);
			return;
		}

		/*
		 * If the mutex isn't already contested and a failure occurs
		 * setting the contested bit, the mutex was either released
		 * or the state of the MTX_RECURSED bit changed.
		 */
		if ((v & MTX_CONTESTED) == 0 &&
		    !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
			(void *)(v | MTX_CONTESTED))) {
			mtx_unlock_spin(&sched_lock);
			continue;
		}

		/*
		 * We definitely must sleep for this lock.
		 */
		mtx_assert(m, MA_NOTOWNED);

#ifdef notyet
		/*
		 * If we're borrowing an interrupted thread's VM context, we
		 * must clean up before going to sleep.
		 */
		if (td->td_ithd != NULL) {
			struct ithd *it = td->td_ithd;

			if (it->it_interrupted) {
				if (LOCK_LOG_TEST(&m->mtx_object, opts))
					CTR2(KTR_LOCK,
				    "_mtx_lock_sleep: %p interrupted %p",
					    it, it->it_interrupted);
				intr_thd_fixup(it);
			}
		}
#endif

		/*
		 * Put us on the list of threads blocked on this mutex.
		 */
		if (TAILQ_EMPTY(&m->mtx_blocked)) {
			td1 = mtx_owner(m);
			LIST_INSERT_HEAD(&td1->td_contested, m, mtx_contested);
			TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq);
		} else {
			TAILQ_FOREACH(td1, &m->mtx_blocked, td_blkq)
				if (td1->td_priority > td->td_priority)
					break;
			if (td1)
				TAILQ_INSERT_BEFORE(td1, td, td_blkq);
			else
				TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_blkq);
		}

		/*
		 * Save who we're blocked on.
		 */
		td->td_blocked = m;
		td->td_mtxname = m->mtx_object.lo_name;
		td->td_proc->p_stat = SMTX;
		propagate_priority(td);

		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR3(KTR_LOCK,
			    "_mtx_lock_sleep: p %p blocked on [%p] %s", td, m,
			    m->mtx_object.lo_name);

		td->td_proc->p_stats->p_ru.ru_nvcsw++;
		mi_switch();

		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR3(KTR_LOCK,
			  "_mtx_lock_sleep: p %p free from blocked on [%p] %s",
			  td, m, m->mtx_object.lo_name);

		mtx_unlock_spin(&sched_lock);
	}

	return;
}

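/*
 * Sketch of the contested hand-off between the function above and
 * _mtx_unlock_sleep() below: a releasing owner either frees the lock outright
 * (no waiters remain) or stores the bare MTX_CONTESTED value and makes the
 * highest-priority waiter runnable.  That waiter then fails the fast-path
 * cmpset in the loop above, observes v == MTX_CONTESTED, installs itself as
 * owner while keeping the contested bit set for the remaining waiters, and
 * inherits the priority of the best waiter still queued.
 */
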
/*
 * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock.
 *
 * This is only called if we need to actually spin for the lock. Recursion
 * is handled inline.
 */
void
_mtx_lock_spin(struct mtx *m, int opts, const char *file, int line)
{
	int i = 0;

	if (LOCK_LOG_TEST(&m->mtx_object, opts))
		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);

	for (;;) {
		if (_obtain_lock(m, curthread))
			break;

		/* Give interrupts a chance while we spin. */
		critical_exit();
		while (m->mtx_lock != MTX_UNOWNED) {
			if (i++ < 10000000)
				continue;
			if (i++ < 60000000)
				DELAY(1);
#ifdef DDB
			else if (!db_active)
#else
			else
#endif
			panic("spin lock %s held by %p for > 5 seconds",
			    m->mtx_object.lo_name, (void *)m->mtx_lock);
		}
		critical_enter();
	}

	if (LOCK_LOG_TEST(&m->mtx_object, opts))
		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);

	return;
}

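/*
 * Illustrative spin mutex usage; "example_spin" is hypothetical.  The inline
 * fast path enters a critical section before spinning here, which is what the
 * critical_exit()/critical_enter() pair above temporarily undoes:
 *
 *	mtx_init(&example_spin, "example spin lock", MTX_SPIN);
 *	...
 *	mtx_lock_spin(&example_spin);
 *	... short, non-sleeping critical section ...
 *	mtx_unlock_spin(&example_spin);
 */
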
/*
 * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
 *
 * We are only called here if the lock is recursed or contested (i.e. we
 * need to wake up a blocked thread).
 */
void
_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
{
	struct thread *td, *td1;
	struct mtx *m1;
	int pri;

	td = curthread;

	if (mtx_recursed(m)) {
		if (--(m->mtx_recurse) == 0)
			atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
		return;
	}

	mtx_lock_spin(&sched_lock);
	if (LOCK_LOG_TEST(&m->mtx_object, opts))
		CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);

	td1 = TAILQ_FIRST(&m->mtx_blocked);
	MPASS(td->td_proc->p_magic == P_MAGIC);
	MPASS(td1->td_proc->p_magic == P_MAGIC);

	TAILQ_REMOVE(&m->mtx_blocked, td1, td_blkq);

	if (TAILQ_EMPTY(&m->mtx_blocked)) {
		LIST_REMOVE(m, mtx_contested);
		_release_lock_quick(m);
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m);
	} else
		atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED);

	pri = PRI_MAX;
	LIST_FOREACH(m1, &td->td_contested, mtx_contested) {
		int cp = TAILQ_FIRST(&m1->mtx_blocked)->td_priority;
		if (cp < pri)
			pri = cp;
	}

	if (pri > td->td_base_pri)
		pri = td->td_base_pri;
	td->td_priority = pri;

	if (LOCK_LOG_TEST(&m->mtx_object, opts))
		CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p",
		    m, td1);

	td1->td_blocked = NULL;
	td1->td_proc->p_stat = SRUN;
	setrunqueue(td1);

	if (td->td_critnest == 1 && td1->td_priority < pri) {
#ifdef notyet
		if (td->td_ithd != NULL) {
			struct ithd *it = td->td_ithd;

			if (it->it_interrupted) {
				if (LOCK_LOG_TEST(&m->mtx_object, opts))
					CTR2(KTR_LOCK,
				    "_mtx_unlock_sleep: %p interrupted %p",
					    it, it->it_interrupted);
				intr_thd_fixup(it);
			}
		}
#endif
		setrunqueue(td);
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR2(KTR_LOCK,
			    "_mtx_unlock_sleep: %p switching out lock=%p", m,
			    (void *)m->mtx_lock);

		td->td_proc->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p",
			    m, (void *)m->mtx_lock);
	}

	mtx_unlock_spin(&sched_lock);

	return;
}

/*
 * All the unlocking of MTX_SPIN locks is done inline.
 * See the _rel_spin_lock() macro for the details.
 */

/*
 * The backing function for the INVARIANTS-enabled mtx_assert()
 */
#ifdef INVARIANT_SUPPORT
void
_mtx_assert(struct mtx *m, int what, const char *file, int line)
{

	if (panicstr != NULL)
		return;
	switch (what) {
	case MA_OWNED:
	case MA_OWNED | MA_RECURSED:
	case MA_OWNED | MA_NOTRECURSED:
		if (!mtx_owned(m))
			panic("mutex %s not owned at %s:%d",
			    m->mtx_object.lo_name, file, line);
		if (mtx_recursed(m)) {
			if ((what & MA_NOTRECURSED) != 0)
				panic("mutex %s recursed at %s:%d",
				    m->mtx_object.lo_name, file, line);
		} else if ((what & MA_RECURSED) != 0) {
			panic("mutex %s unrecursed at %s:%d",
			    m->mtx_object.lo_name, file, line);
		}
		break;
	case MA_NOTOWNED:
		if (mtx_owned(m))
			panic("mutex %s owned at %s:%d",
			    m->mtx_object.lo_name, file, line);
		break;
	default:
		panic("unknown mtx_assert at %s:%d", file, line);
	}
}
#endif

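/*
 * Typical use of the assertion interface backed by _mtx_assert() in an
 * INVARIANTS kernel; "example_lock" and example_modify() are hypothetical:
 *
 *	static void
 *	example_modify(void)
 *	{
 *		mtx_assert(&example_lock, MA_OWNED);
 *		... the caller must already hold example_lock here ...
 *	}
 */
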
/*
 * The MUTEX_DEBUG-enabled mtx_validate()
 *
 * Most of these checks have been moved off into the LO_INITIALIZED flag
 * maintained by the witness code.
 */
#ifdef MUTEX_DEBUG

void	mtx_validate(struct mtx *);

void
mtx_validate(struct mtx *m)
{

/*
 * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
 * we can re-enable the kernacc() checks.
 */
#ifndef __alpha__
	/*
	 * Can't call kernacc() from early init386(), especially when
	 * initializing Giant mutex, because some stuff in kernacc()
	 * requires Giant itself.
	 */
	if (!cold)
		if (!kernacc((caddr_t)m, sizeof(*m),
		    VM_PROT_READ | VM_PROT_WRITE))
			panic("Can't read and write to mutex %p", m);
#endif
}
#endif

/*
 * Mutex initialization routine; initialize lock `m' with the type and
 * options contained in `opts' and description `description.'
 */
void
mtx_init(struct mtx *m, const char *description, int opts)
{
	struct lock_object *lock;

	MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
	    MTX_SLEEPABLE | MTX_NOWITNESS | MTX_DUPOK)) == 0);

#ifdef MUTEX_DEBUG
	/* Diagnostic and error correction */
	mtx_validate(m);
#endif

	lock = &m->mtx_object;
	KASSERT((lock->lo_flags & LO_INITIALIZED) == 0,
	    ("mutex %s %p already initialized", description, m));
	bzero(m, sizeof(*m));
	if (opts & MTX_SPIN)
		lock->lo_class = &lock_class_mtx_spin;
	else
		lock->lo_class = &lock_class_mtx_sleep;
	lock->lo_name = description;
	if (opts & MTX_QUIET)
		lock->lo_flags = LO_QUIET;
	if (opts & MTX_RECURSE)
		lock->lo_flags |= LO_RECURSABLE;
	if (opts & MTX_SLEEPABLE)
		lock->lo_flags |= LO_SLEEPABLE;
	if ((opts & MTX_NOWITNESS) == 0)
		lock->lo_flags |= LO_WITNESS;
	if (opts & MTX_DUPOK)
		lock->lo_flags |= LO_DUPOK;

	m->mtx_lock = MTX_UNOWNED;
	TAILQ_INIT(&m->mtx_blocked);

	LOCK_LOG_INIT(lock, opts);

	WITNESS_INIT(lock);
}

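/*
 * Illustrative initialization variants (the mutex names are hypothetical);
 * the option bits accepted are those listed in the MPASS() above, with
 * MTX_DEF selecting an ordinary sleep mutex:
 *
 *	mtx_init(&example_lock, "example lock", MTX_DEF);
 *	mtx_init(&example_rec, "example recursable lock", MTX_DEF | MTX_RECURSE);
 *	mtx_init(&example_spin, "example spin lock", MTX_SPIN);
 *	...
 *	mtx_destroy(&example_lock);
 */
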
/*
 * Destroy lock `m'.  We don't allow MTX_QUIET to be passed in as a flag
 * here because if the corresponding mtx_init() was called with MTX_QUIET
 * set, then it will already be set in the mutex's flags.
 */
void
mtx_destroy(struct mtx *m)
{

	LOCK_LOG_DESTROY(&m->mtx_object, 0);

	if (!mtx_owned(m))
		MPASS(mtx_unowned(m));
	else {
		MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);

		/* Tell witness this isn't locked to make it happy. */
		WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__,
		    __LINE__);
	}

	WITNESS_DESTROY(&m->mtx_object);
}

/*
 * Encapsulated Giant mutex routines.  These routines provide encapsulation
 * control for the Giant mutex, allowing sysctls to be used to turn on and
 * off Giant around certain subsystems.  The default values for the sysctls
 * are set to what developers believe is stable and working in regards to
 * the Giant pushdown.  Developers should not turn off Giant via these
 * sysctls unless they know what they are doing.
 *
 * Callers of mtx_lock_giant() are expected to pass the return value to an
 * accompanying mtx_unlock_giant() later on.  If multiple subsystems are
 * affected by a Giant wrap, all related sysctl variables must be zero for
 * the subsystem call to operate without Giant (as determined by the caller).
 */

SYSCTL_NODE(_kern, OID_AUTO, giant, CTLFLAG_RD, NULL, "Giant mutex manipulation");

static int kern_giant_all = 0;
SYSCTL_INT(_kern_giant, OID_AUTO, all, CTLFLAG_RW, &kern_giant_all, 0, "");

int kern_giant_proc = 1;	/* Giant around PROC locks */
int kern_giant_file = 1;	/* Giant around struct file & filedesc */
int kern_giant_ucred = 1;	/* Giant around ucred */
SYSCTL_INT(_kern_giant, OID_AUTO, proc, CTLFLAG_RW, &kern_giant_proc, 0, "");
SYSCTL_INT(_kern_giant, OID_AUTO, file, CTLFLAG_RW, &kern_giant_file, 0, "");
SYSCTL_INT(_kern_giant, OID_AUTO, ucred, CTLFLAG_RW, &kern_giant_ucred, 0, "");

int
mtx_lock_giant(int sysctlvar)
{
	if (sysctlvar || kern_giant_all) {
		mtx_lock(&Giant);
		return(1);
	}
	return(0);
}

void
mtx_unlock_giant(int s)
{
	if (s)
		mtx_unlock(&Giant);
}
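
/*
 * Illustrative use of the Giant wrappers above, following the calling
 * convention described in the comment block before them; the wrapped
 * subsystem code is hypothetical:
 *
 *	int s;
 *
 *	s = mtx_lock_giant(kern_giant_proc);
 *	... code that may still need Giant during the pushdown ...
 *	mtx_unlock_giant(s);
 */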