1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 *	File:	kern/lock.c
58 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
59 *	Date:	1985
60 *
61 *	Locking primitives implementation
62 */
63
64#include <mach_ldebug.h>
65
66#include <kern/lock.h>
67#include <kern/locks.h>
68#include <kern/kalloc.h>
69#include <kern/misc_protos.h>
70#include <kern/thread.h>
71#include <kern/processor.h>
72#include <kern/cpu_data.h>
73#include <kern/cpu_number.h>
74#include <kern/sched_prim.h>
75#include <kern/xpr.h>
76#include <kern/debug.h>
77#include <string.h>
78
79#include <i386/machine_routines.h> /* machine_timeout_suspended() */
80#include <machine/machine_cpu.h>
81#include <i386/mp.h>
82
83#include <sys/kdebug.h>
84#include <mach/branch_predicates.h>
85
86/*
87 * We need only enough declarations from the BSD-side to be able to
88 * test if our probe is active, and to call __dtrace_probe().  Setting
89 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
90 */
91#if	CONFIG_DTRACE
92#define NEED_DTRACE_DEFS
93#include <../bsd/sys/lockstat.h>
94#endif
95
96#define	LCK_RW_LCK_EXCLUSIVE_CODE	0x100
97#define	LCK_RW_LCK_EXCLUSIVE1_CODE	0x101
98#define	LCK_RW_LCK_SHARED_CODE		0x102
99#define	LCK_RW_LCK_SH_TO_EX_CODE	0x103
100#define	LCK_RW_LCK_SH_TO_EX1_CODE	0x104
101#define	LCK_RW_LCK_EX_TO_SH_CODE	0x105
102
103#define LCK_RW_LCK_EX_WRITER_SPIN_CODE	0x106
104#define LCK_RW_LCK_EX_WRITER_WAIT_CODE	0x107
105#define LCK_RW_LCK_EX_READER_SPIN_CODE	0x108
106#define LCK_RW_LCK_EX_READER_WAIT_CODE	0x109
107#define LCK_RW_LCK_SHARED_SPIN_CODE	0x110
108#define LCK_RW_LCK_SHARED_WAIT_CODE	0x111
109#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE	0x112
110#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE	0x113
111
112
113#define	ANY_LOCK_DEBUG	(USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
114
115unsigned int LcksOpts=0;
116
117/* Forwards */
118
119#if	USLOCK_DEBUG
120/*
121 *	Perform simple lock checks.
122 */
123int	uslock_check = 1;
124int	max_lock_loops	= 100000000;
125decl_simple_lock_data(extern , printf_lock)
126decl_simple_lock_data(extern , panic_lock)
127#endif	/* USLOCK_DEBUG */
128
129
130/*
131 *	We often want to know the addresses of the callers
132 *	of the various lock routines.  However, this information
133 *	is only used for debugging and statistics.
134 */
135typedef void	*pc_t;
136#define	INVALID_PC	((void *) VM_MAX_KERNEL_ADDRESS)
137#define	INVALID_THREAD	((void *) VM_MAX_KERNEL_ADDRESS)
138#if	ANY_LOCK_DEBUG
139#define	OBTAIN_PC(pc)	((pc) = GET_RETURN_PC())
140#define DECL_PC(pc)	pc_t pc;
141#else	/* ANY_LOCK_DEBUG */
142#define DECL_PC(pc)
143#ifdef	lint
144/*
145 *	Eliminate lint complaints about unused local pc variables.
146 */
147#define	OBTAIN_PC(pc)	++pc
148#else	/* lint */
149#define	OBTAIN_PC(pc)
150#endif	/* lint */
151#endif	/* USLOCK_DEBUG */
152
153
154/*
155 *	Portable lock package implementation of usimple_locks.
156 */
157
158#if	USLOCK_DEBUG
159#define	USLDBG(stmt)	stmt
160void		usld_lock_init(usimple_lock_t, unsigned short);
161void		usld_lock_pre(usimple_lock_t, pc_t);
162void		usld_lock_post(usimple_lock_t, pc_t);
163void		usld_unlock(usimple_lock_t, pc_t);
164void		usld_lock_try_pre(usimple_lock_t, pc_t);
165void		usld_lock_try_post(usimple_lock_t, pc_t);
166int		usld_lock_common_checks(usimple_lock_t, char *);
167#else	/* USLOCK_DEBUG */
168#define	USLDBG(stmt)
169#endif	/* USLOCK_DEBUG */
170
171
172extern int lck_rw_grab_want(lck_rw_t *lck);
173extern int lck_rw_grab_shared(lck_rw_t *lck);
174extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
175
176
177/*
178 * Forward definitions
179 */
180
181void lck_rw_lock_shared_gen(
182	lck_rw_t	*lck);
183
184void lck_rw_lock_exclusive_gen(
185	lck_rw_t	*lck);
186
187boolean_t lck_rw_lock_shared_to_exclusive_success(
188	lck_rw_t	*lck);
189
190boolean_t lck_rw_lock_shared_to_exclusive_failure(
191	lck_rw_t	*lck,
192	int		prior_lock_state);
193
194void lck_rw_lock_exclusive_to_shared_gen(
195	lck_rw_t	*lck,
196	int		prior_lock_state);
197
198lck_rw_type_t lck_rw_done_gen(
199	lck_rw_t	*lck,
200	int		prior_lock_state);
201
202void lck_rw_clear_promotions_x86(thread_t thread);
203
204/*
205 *      Routine:        lck_spin_alloc_init
206 */
207lck_spin_t *
208lck_spin_alloc_init(
209	lck_grp_t	*grp,
210	lck_attr_t	*attr)
211{
212	lck_spin_t	*lck;
213
214	if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
215		lck_spin_init(lck, grp, attr);
216
217	return(lck);
218}
219
220/*
221 *      Routine:        lck_spin_free
222 */
223void
224lck_spin_free(
225	lck_spin_t	*lck,
226	lck_grp_t	*grp)
227{
228	lck_spin_destroy(lck, grp);
229	kfree(lck, sizeof(lck_spin_t));
230}
231
232/*
233 *      Routine:        lck_spin_init
234 */
235void
236lck_spin_init(
237	lck_spin_t	*lck,
238	lck_grp_t	*grp,
239	__unused lck_attr_t	*attr)
240{
241	usimple_lock_init((usimple_lock_t) lck, 0);
242	lck_grp_reference(grp);
243	lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
244}
245
246/*
247 *      Routine:        lck_spin_destroy
248 */
249void
250lck_spin_destroy(
251	lck_spin_t	*lck,
252	lck_grp_t	*grp)
253{
254	if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
255		return;
256	lck->interlock = LCK_SPIN_TAG_DESTROYED;
257	lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
258	lck_grp_deallocate(grp);
259	return;
260}
261
262/*
263 *      Routine:        lck_spin_lock
264 */
265void
266lck_spin_lock(
267	lck_spin_t	*lck)
268{
269	usimple_lock((usimple_lock_t) lck);
270}
271
272/*
273 *      Routine:        lck_spin_unlock
274 */
275void
276lck_spin_unlock(
277	lck_spin_t	*lck)
278{
279	usimple_unlock((usimple_lock_t) lck);
280}
281
282
283/*
284 *      Routine:        lck_spin_try_lock
285 */
286boolean_t
287lck_spin_try_lock(
288	lck_spin_t	*lck)
289{
290	return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
291}
292
293/*
294 *	Initialize a usimple_lock.
295 *
296 *	No change in preemption state.
297 */
298void
299usimple_lock_init(
300	usimple_lock_t	l,
301	__unused unsigned short	tag)
302{
303#ifndef	MACHINE_SIMPLE_LOCK
304	USLDBG(usld_lock_init(l, tag));
305	hw_lock_init(&l->interlock);
306#else
307	simple_lock_init((simple_lock_t)l,tag);
308#endif
309}
310
/* CPU number of the thread owning a timed-out spinlock, recorded by
 * spinlock_timeout_NMI(); ~0 means no owning CPU was identified. */
volatile uint32_t spinlock_owner_cpu = ~0;
/* The lock whose acquisition timed out; set just before panicking so
 * it is visible in the panic state. */
volatile usimple_lock_t spinlock_timed_out;

/*
 *	Find the CPU currently running the thread that owns the
 *	timed-out spinlock and send it an NMI, so its backtrace is
 *	captured for the imminent panic.  After sending the NMI we
 *	spin up to (LockTimeOut * 2) absolute-time units waiting for
 *	the target CPU to acknowledge.  If the owner turns out to be
 *	the current CPU, no NMI is sent.  Returns the owner's CPU
 *	number, or ~0 if the owning thread was not found active on
 *	any CPU.
 */
static uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
	uint64_t deadline;
	uint32_t i;

	for (i = 0; i < real_ncpus; i++) {
		if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
			spinlock_owner_cpu = i;
			if ((uint32_t) cpu_number() == i)
				break;	/* owner is us; don't NMI ourselves */
			cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
			cpu_NMI_interrupt(i);
			deadline = mach_absolute_time() + (LockTimeOut * 2);
			while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
				cpu_pause();
			break;
		}
	}

	return spinlock_owner_cpu;
}
334
335/*
336 *	Acquire a usimple_lock.
337 *
338 *	Returns with preemption disabled.  Note
339 *	that the hw_lock routines are responsible for
340 *	maintaining preemption state.
341 */
342void
343usimple_lock(
344	usimple_lock_t	l)
345{
346#ifndef	MACHINE_SIMPLE_LOCK
347	DECL_PC(pc);
348
349	OBTAIN_PC(pc);
350	USLDBG(usld_lock_pre(l, pc));
351
352	if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0))	{
353		boolean_t uslock_acquired = FALSE;
354		while (machine_timeout_suspended()) {
355			enable_preemption();
356			if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
357				break;
358		}
359
360		if (uslock_acquired == FALSE) {
361			uint32_t lock_cpu;
362			uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
363			spinlock_timed_out = l;
364			lock_cpu = spinlock_timeout_NMI(lowner);
365			panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner,  current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
366		}
367	}
368	USLDBG(usld_lock_post(l, pc));
369#else
370	simple_lock((simple_lock_t)l);
371#endif
372}
373
374
375/*
376 *	Release a usimple_lock.
377 *
378 *	Returns with preemption enabled.  Note
379 *	that the hw_lock routines are responsible for
380 *	maintaining preemption state.
381 */
382void
383usimple_unlock(
384	usimple_lock_t	l)
385{
386#ifndef	MACHINE_SIMPLE_LOCK
387	DECL_PC(pc);
388
389	OBTAIN_PC(pc);
390	USLDBG(usld_unlock(l, pc));
391	hw_lock_unlock(&l->interlock);
392#else
393	simple_unlock_rwmb((simple_lock_t)l);
394#endif
395}
396
397
398/*
399 *	Conditionally acquire a usimple_lock.
400 *
401 *	On success, returns with preemption disabled.
402 *	On failure, returns with preemption in the same state
403 *	as when first invoked.  Note that the hw_lock routines
404 *	are responsible for maintaining preemption state.
405 *
406 *	XXX No stats are gathered on a miss; I preserved this
407 *	behavior from the original assembly-language code, but
408 *	doesn't it make sense to log misses?  XXX
409 */
410unsigned int
411usimple_lock_try(
412	usimple_lock_t	l)
413{
414#ifndef	MACHINE_SIMPLE_LOCK
415	unsigned int	success;
416	DECL_PC(pc);
417
418	OBTAIN_PC(pc);
419	USLDBG(usld_lock_try_pre(l, pc));
420	if ((success = hw_lock_try(&l->interlock))) {
421		USLDBG(usld_lock_try_post(l, pc));
422	}
423	return success;
424#else
425	return(simple_lock_try((simple_lock_t)l));
426#endif
427}
428
429#if	USLOCK_DEBUG
430/*
431 *	States of a usimple_lock.  The default when initializing
432 *	a usimple_lock is setting it up for debug checking.
433 */
434#define	USLOCK_CHECKED		0x0001		/* lock is being checked */
435#define	USLOCK_TAKEN		0x0002		/* lock has been taken */
436#define	USLOCK_INIT		0xBAA0		/* lock has been initialized */
437#define	USLOCK_INITIALIZED	(USLOCK_INIT|USLOCK_CHECKED)
438#define	USLOCK_CHECKING(l)	(uslock_check &&			\
439				 ((l)->debug.state & USLOCK_CHECKED))
440
441/*
442 *	Trace activities of a particularly interesting lock.
443 */
444void	usl_trace(usimple_lock_t, int, pc_t, const char *);
445
446
447/*
448 *	Initialize the debugging information contained
449 *	in a usimple_lock.
450 */
451void
452usld_lock_init(
453	usimple_lock_t	l,
454	__unused unsigned short	tag)
455{
456	if (l == USIMPLE_LOCK_NULL)
457		panic("lock initialization:  null lock pointer");
458	l->lock_type = USLOCK_TAG;
459	l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
460	l->debug.lock_cpu = l->debug.unlock_cpu = 0;
461	l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
462	l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
463	l->debug.duration[0] = l->debug.duration[1] = 0;
464	l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
465	l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
466	l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
467}
468
469
470/*
471 *	These checks apply to all usimple_locks, not just
472 *	those with USLOCK_CHECKED turned on.
473 */
474int
475usld_lock_common_checks(
476	usimple_lock_t	l,
477	char		*caller)
478{
479	if (l == USIMPLE_LOCK_NULL)
480		panic("%s:  null lock pointer", caller);
481	if (l->lock_type != USLOCK_TAG)
482		panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
483	if (!(l->debug.state & USLOCK_INIT))
484		panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
485	return USLOCK_CHECKING(l);
486}
487
488
489/*
490 *	Debug checks on a usimple_lock just before attempting
491 *	to acquire it.
492 */
493/* ARGSUSED */
494void
495usld_lock_pre(
496	usimple_lock_t	l,
497	pc_t		pc)
498{
499	char	caller[] = "usimple_lock";
500
501
502	if (!usld_lock_common_checks(l, caller))
503		return;
504
505/*
506 *	Note that we have a weird case where we are getting a lock when we are]
507 *	in the process of putting the system to sleep. We are running with no
508 *	current threads, therefore we can't tell if we are trying to retake a lock
509 *	we have or someone on the other processor has it.  Therefore we just
510 *	ignore this test if the locking thread is 0.
511 */
512
513	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
514	    l->debug.lock_thread == (void *) current_thread()) {
515		printf("%s:  lock %p already locked (at %p) by",
516		      caller, l, l->debug.lock_pc);
517		printf(" current thread %p (new attempt at pc %p)\n",
518		       l->debug.lock_thread, pc);
519		panic("%s", caller);
520	}
521	mp_disable_preemption();
522	usl_trace(l, cpu_number(), pc, caller);
523	mp_enable_preemption();
524}
525
526
527/*
528 *	Debug checks on a usimple_lock just after acquiring it.
529 *
530 *	Pre-emption has been disabled at this point,
531 *	so we are safe in using cpu_number.
532 */
533void
534usld_lock_post(
535	usimple_lock_t	l,
536	pc_t		pc)
537{
538	register int	mycpu;
539	char	caller[] = "successful usimple_lock";
540
541
542	if (!usld_lock_common_checks(l, caller))
543		return;
544
545	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
546		panic("%s:  lock %p became uninitialized",
547		      caller, l);
548	if ((l->debug.state & USLOCK_TAKEN))
549		panic("%s:  lock 0x%p became TAKEN by someone else",
550		      caller, l);
551
552	mycpu = cpu_number();
553	l->debug.lock_thread = (void *)current_thread();
554	l->debug.state |= USLOCK_TAKEN;
555	l->debug.lock_pc = pc;
556	l->debug.lock_cpu = mycpu;
557
558	usl_trace(l, mycpu, pc, caller);
559}
560
561
562/*
563 *	Debug checks on a usimple_lock just before
564 *	releasing it.  Note that the caller has not
565 *	yet released the hardware lock.
566 *
567 *	Preemption is still disabled, so there's
568 *	no problem using cpu_number.
569 */
570void
571usld_unlock(
572	usimple_lock_t	l,
573	pc_t		pc)
574{
575	register int	mycpu;
576	char	caller[] = "usimple_unlock";
577
578
579	if (!usld_lock_common_checks(l, caller))
580		return;
581
582	mycpu = cpu_number();
583
584	if (!(l->debug.state & USLOCK_TAKEN))
585		panic("%s:  lock 0x%p hasn't been taken",
586		      caller, l);
587	if (l->debug.lock_thread != (void *) current_thread())
588		panic("%s:  unlocking lock 0x%p, owned by thread %p",
589		      caller, l, l->debug.lock_thread);
590	if (l->debug.lock_cpu != mycpu) {
591		printf("%s:  unlocking lock 0x%p on cpu 0x%x",
592		       caller, l, mycpu);
593		printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
594		panic("%s", caller);
595	}
596	usl_trace(l, mycpu, pc, caller);
597
598	l->debug.unlock_thread = l->debug.lock_thread;
599	l->debug.lock_thread = INVALID_PC;
600	l->debug.state &= ~USLOCK_TAKEN;
601	l->debug.unlock_pc = pc;
602	l->debug.unlock_cpu = mycpu;
603}
604
605
606/*
607 *	Debug checks on a usimple_lock just before
608 *	attempting to acquire it.
609 *
610 *	Preemption isn't guaranteed to be disabled.
611 */
612void
613usld_lock_try_pre(
614	usimple_lock_t	l,
615	pc_t		pc)
616{
617	char	caller[] = "usimple_lock_try";
618
619	if (!usld_lock_common_checks(l, caller))
620		return;
621	mp_disable_preemption();
622	usl_trace(l, cpu_number(), pc, caller);
623	mp_enable_preemption();
624}
625
626
627/*
628 *	Debug checks on a usimple_lock just after
629 *	successfully attempting to acquire it.
630 *
631 *	Preemption has been disabled by the
632 *	lock acquisition attempt, so it's safe
633 *	to use cpu_number.
634 */
635void
636usld_lock_try_post(
637	usimple_lock_t	l,
638	pc_t		pc)
639{
640	register int	mycpu;
641	char	caller[] = "successful usimple_lock_try";
642
643	if (!usld_lock_common_checks(l, caller))
644		return;
645
646	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
647		panic("%s:  lock 0x%p became uninitialized",
648		      caller, l);
649	if ((l->debug.state & USLOCK_TAKEN))
650		panic("%s:  lock 0x%p became TAKEN by someone else",
651		      caller, l);
652
653	mycpu = cpu_number();
654	l->debug.lock_thread = (void *) current_thread();
655	l->debug.state |= USLOCK_TAKEN;
656	l->debug.lock_pc = pc;
657	l->debug.lock_cpu = mycpu;
658
659	usl_trace(l, mycpu, pc, caller);
660}
661
662
663/*
664 *	For very special cases, set traced_lock to point to a
665 *	specific lock of interest.  The result is a series of
666 *	XPRs showing lock operations on that lock.  The lock_seq
667 *	value is used to show the order of those operations.
668 */
669usimple_lock_t		traced_lock;
670unsigned int		lock_seq;
671
672void
673usl_trace(
674	usimple_lock_t	l,
675	int		mycpu,
676	pc_t		pc,
677	const char *	op_name)
678{
679	if (traced_lock == l) {
680		XPR(XPR_SLOCK,
681		    "seq %d, cpu %d, %s @ %x\n",
682		    (uintptr_t) lock_seq, (uintptr_t) mycpu,
683		    (uintptr_t) op_name, (uintptr_t) pc, 0);
684		lock_seq++;
685	}
686}
687
688
689#endif	/* USLOCK_DEBUG */
690
691/*
692 *	Routine:	lock_alloc
693 *	Function:
694 *		Allocate a lock for external users who cannot
695 *		hard-code the structure definition into their
696 *		objects.
697 *		For now just use kalloc, but a zone is probably
698 *		warranted.
699 */
700lock_t *
701lock_alloc(
702	boolean_t	can_sleep,
703	unsigned short	tag,
704	unsigned short	tag1)
705{
706	lock_t		*l;
707
708	if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0)
709	  lock_init(l, can_sleep, tag, tag1);
710	return(l);
711}
712
713/*
714 *	Routine:	lock_free
715 *	Function:
716 *		Free a lock allocated for external users.
717 *		For now just use kfree, but a zone is probably
718 *		warranted.
719 */
720void
721lock_free(
722	lock_t		*l)
723{
724	kfree(l, sizeof(lock_t));
725}
726
727
728/*
729 *	Routine:	lock_init
730 *	Function:
731 *		Initialize a lock; required before use.
732 *		Note that clients declare the "struct lock"
733 *		variables and then initialize them, rather
734 *		than getting a new one from this module.
735 */
736void
737lock_init(
738	lock_t		*l,
739	boolean_t	can_sleep,
740	__unused unsigned short	tag,
741	__unused unsigned short	tag1)
742{
743	hw_lock_byte_init(&l->lck_rw_interlock);
744	l->lck_rw_want_write = FALSE;
745	l->lck_rw_want_upgrade = FALSE;
746	l->lck_rw_shared_count = 0;
747	l->lck_rw_can_sleep = can_sleep;
748	l->lck_rw_tag = tag;
749	l->lck_rw_priv_excl = 1;
750	l->lck_r_waiting = l->lck_w_waiting = 0;
751}
752
753
754/*
755 *	Sleep locks.  These use the same data structure and algorithm
756 *	as the spin locks, but the process sleeps while it is waiting
757 *	for the lock.  These work on uniprocessor systems.
758 */
759
760#define DECREMENTER_TIMEOUT 1000000
761
762void
763lock_write(
764	register lock_t	* l)
765{
766	lck_rw_lock_exclusive(l);
767}
768
/* Release the lock, whichever mode it is held in; the returned
 * previously-held type from lck_rw_done() is deliberately ignored. */
void
lock_done(
	register lock_t	* l)
{
	(void) lck_rw_done(l);
}
775
/* Acquire the lock for shared (read) access; may block. */
void
lock_read(
	register lock_t	* l)
{
	lck_rw_lock_shared(l);
}
782
783
784/*
785 *	Routine:	lock_read_to_write
786 *	Function:
787 *		Improves a read-only lock to one with
788 *		write permission.  If another reader has
789 *		already requested an upgrade to a write lock,
790 *		no lock is held upon return.
791 *
792 *		Returns FALSE if the upgrade *failed*.
793 */
794
795boolean_t
796lock_read_to_write(
797	register lock_t	* l)
798{
799	return lck_rw_lock_shared_to_exclusive(l);
800}
801
/* Downgrade an exclusively-held lock to shared access; never fails
 * and the lock remains held throughout. */
void
lock_write_to_read(
	register lock_t	* l)
{
	lck_rw_lock_exclusive_to_shared(l);
}
808
809
810
811/*
812 *      Routine:        lck_rw_alloc_init
813 */
814lck_rw_t *
815lck_rw_alloc_init(
816	lck_grp_t	*grp,
817	lck_attr_t	*attr) {
818	lck_rw_t	*lck;
819
820	if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
821		bzero(lck, sizeof(lck_rw_t));
822		lck_rw_init(lck, grp, attr);
823	}
824
825	return(lck);
826}
827
828/*
829 *      Routine:        lck_rw_free
830 */
831void
832lck_rw_free(
833	lck_rw_t	*lck,
834	lck_grp_t	*grp) {
835	lck_rw_destroy(lck, grp);
836	kfree(lck, sizeof(lck_rw_t));
837}
838
839/*
840 *      Routine:        lck_rw_init
841 */
842void
843lck_rw_init(
844	lck_rw_t	*lck,
845	lck_grp_t	*grp,
846	lck_attr_t	*attr)
847{
848	lck_attr_t	*lck_attr = (attr != LCK_ATTR_NULL) ?
849					attr : &LockDefaultLckAttr;
850
851	hw_lock_byte_init(&lck->lck_rw_interlock);
852	lck->lck_rw_want_write = FALSE;
853	lck->lck_rw_want_upgrade = FALSE;
854	lck->lck_rw_shared_count = 0;
855	lck->lck_rw_can_sleep = TRUE;
856	lck->lck_r_waiting = lck->lck_w_waiting = 0;
857	lck->lck_rw_tag = 0;
858	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
859				LCK_ATTR_RW_SHARED_PRIORITY) == 0);
860
861	lck_grp_reference(grp);
862	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
863}
864
865/*
866 *      Routine:        lck_rw_destroy
867 */
868void
869lck_rw_destroy(
870	lck_rw_t	*lck,
871	lck_grp_t	*grp)
872{
873	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
874		return;
875#if MACH_LDEBUG
876	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
877#endif
878	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
879	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
880	lck_grp_deallocate(grp);
881	return;
882}
883
884/*
885 *	Sleep locks.  These use the same data structure and algorithm
886 *	as the spin locks, but the process sleeps while it is waiting
887 *	for the lock.  These work on uniprocessor systems.
888 */
889
890#define DECREMENTER_TIMEOUT 1000000
891
892#define RW_LOCK_READER_EVENT(x)		\
893		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
894
895#define RW_LOCK_WRITER_EVENT(x)		\
896		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
897
898/*
899 * We disable interrupts while holding the RW interlock to prevent an
900 * interrupt from exacerbating hold time.
901 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
902 */
903static boolean_t
904lck_interlock_lock(lck_rw_t *lck)
905{
906	boolean_t	istate;
907
908	istate = ml_set_interrupts_enabled(FALSE);
909	hw_lock_byte_lock(&lck->lck_rw_interlock);
910
911	return istate;
912}
913
/* Release the RW interlock and restore the interrupt-enable state
 * captured by the matching lck_interlock_lock() call. */
static void
lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
{
	hw_lock_byte_unlock(&lck->lck_rw_interlock);
	ml_set_interrupts_enabled(istate);
}
920
921/*
922 * This inline is used when busy-waiting for an rw lock.
923 * If interrupts were disabled when the lock primitive was called,
924 * we poll the IPI handler for pending tlb flushes.
925 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
926 */
927static inline void
928lck_rw_lock_pause(boolean_t interrupts_enabled)
929{
930	if (!interrupts_enabled)
931		handle_pending_TLB_flushes();
932	cpu_pause();
933}
934
935
936/*
937 * compute the deadline to spin against when
938 * waiting for a change of state on a lck_rw_t
939 */
940static inline uint64_t
941lck_rw_deadline_for_spin(lck_rw_t *lck)
942{
943	if (lck->lck_rw_can_sleep) {
944		if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
945			/*
946			 * there are already threads waiting on this lock... this
947			 * implies that they have spun beyond their deadlines waiting for
948			 * the desired state to show up so we will not bother spinning at this time...
949			 *   or
950			 * the current number of threads sharing this lock exceeds our capacity to run them
951			 * concurrently and since all states we're going to spin for require the rw_shared_count
952			 * to be at 0, we'll not bother spinning since the latency for this to happen is
953			 * unpredictable...
954			 */
955			return (mach_absolute_time());
956		}
957		return (mach_absolute_time() + MutexSpin);
958	} else
959		return (mach_absolute_time() + (100000LL * 1000000000LL));
960}
961
962
963/*
964 *      Routine:        lck_rw_lock_exclusive
965 */
966void
967lck_rw_lock_exclusive_gen(
968	lck_rw_t	*lck)
969{
970	uint64_t	deadline = 0;
971	int		slept = 0;
972	int		gotlock = 0;
973	int		lockheld = 0;
974	wait_result_t	res = 0;
975	boolean_t	istate = -1;
976
977#if	CONFIG_DTRACE
978	boolean_t dtrace_ls_initialized = FALSE;
979	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
980	uint64_t wait_interval = 0;
981	int readers_at_sleep = 0;
982#endif
983
984	/*
985	 *	Try to acquire the lck_rw_want_write bit.
986	 */
987	while ( !lck_rw_grab_want(lck)) {
988
989#if	CONFIG_DTRACE
990		if (dtrace_ls_initialized == FALSE) {
991			dtrace_ls_initialized = TRUE;
992			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
993			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
994			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
995			if (dtrace_ls_enabled) {
996				/*
997				 * Either sleeping or spinning is happening,
998				 *  start a timing of our delay interval now.
999				 */
1000				readers_at_sleep = lck->lck_rw_shared_count;
1001				wait_interval = mach_absolute_time();
1002			}
1003		}
1004#endif
1005		if (istate == -1)
1006			istate = ml_get_interrupts_enabled();
1007
1008		deadline = lck_rw_deadline_for_spin(lck);
1009
1010		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
1011
1012		while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
1013			lck_rw_lock_pause(istate);
1014
1015		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0);
1016
1017		if (gotlock)
1018			break;
1019		/*
1020		 * if we get here, the deadline has expired w/o us
1021		 * being able to grab the lock exclusively
1022		 * check to see if we're allowed to do a thread_block
1023		 */
1024		if (lck->lck_rw_can_sleep) {
1025
1026			istate = lck_interlock_lock(lck);
1027
1028			if (lck->lck_rw_want_write) {
1029
1030				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
1031
1032				lck->lck_w_waiting = TRUE;
1033
1034				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1035				lck_interlock_unlock(lck, istate);
1036
1037				if (res == THREAD_WAITING) {
1038					res = thread_block(THREAD_CONTINUE_NULL);
1039					slept++;
1040				}
1041				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
1042			} else {
1043				lck->lck_rw_want_write = TRUE;
1044				lck_interlock_unlock(lck, istate);
1045				break;
1046			}
1047		}
1048	}
1049	/*
1050	 * Wait for readers (and upgrades) to finish...
1051	 * the test for these conditions must be done simultaneously with
1052	 * a check of the interlock not being held since
1053	 * the rw_shared_count will drop to 0 first and then want_upgrade
1054	 * will be set to 1 in the shared_to_exclusive scenario... those
1055	 * adjustments are done behind the interlock and represent an
1056	 * atomic change in state and must be considered as such
1057	 * however, once we see the read count at 0, the want_upgrade not set
1058	 * and the interlock not held, we are safe to proceed
1059	 */
1060	while (lck_rw_held_read_or_upgrade(lck)) {
1061
1062#if	CONFIG_DTRACE
1063		/*
1064		 * Either sleeping or spinning is happening, start
1065		 * a timing of our delay interval now.  If we set it
1066		 * to -1 we don't have accurate data so we cannot later
1067		 * decide to record a dtrace spin or sleep event.
1068		 */
1069		if (dtrace_ls_initialized == FALSE) {
1070			dtrace_ls_initialized = TRUE;
1071			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1072			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1073			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1074			if (dtrace_ls_enabled) {
1075				/*
1076				 * Either sleeping or spinning is happening,
1077				 *  start a timing of our delay interval now.
1078				 */
1079				readers_at_sleep = lck->lck_rw_shared_count;
1080				wait_interval = mach_absolute_time();
1081			}
1082		}
1083#endif
1084		if (istate == -1)
1085			istate = ml_get_interrupts_enabled();
1086
1087		deadline = lck_rw_deadline_for_spin(lck);
1088
1089		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
1090
1091		while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1092			lck_rw_lock_pause(istate);
1093
1094		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0);
1095
1096		if ( !lockheld)
1097			break;
1098		/*
1099		 * if we get here, the deadline has expired w/o us
1100		 * being able to grab the lock exclusively
1101		 * check to see if we're allowed to do a thread_block
1102		 */
1103		if (lck->lck_rw_can_sleep) {
1104
1105			istate = lck_interlock_lock(lck);
1106
1107			if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1108				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
1109
1110				lck->lck_w_waiting = TRUE;
1111
1112				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1113				lck_interlock_unlock(lck, istate);
1114
1115				if (res == THREAD_WAITING) {
1116					res = thread_block(THREAD_CONTINUE_NULL);
1117					slept++;
1118				}
1119				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
1120			} else {
1121				lck_interlock_unlock(lck, istate);
1122				/*
1123				 * must own the lock now, since we checked for
1124				 * readers or upgrade owner behind the interlock
1125				 * no need for a call to 'lck_rw_held_read_or_upgrade'
1126				 */
1127				break;
1128			}
1129		}
1130	}
1131
1132#if	CONFIG_DTRACE
1133	/*
1134	 * Decide what latencies we suffered that are Dtrace events.
1135	 * If we have set wait_interval, then we either spun or slept.
1136	 * At least we get out from under the interlock before we record
1137	 * which is the best we can do here to minimize the impact
1138	 * of the tracing.
1139	 * If we have set wait_interval to -1, then dtrace was not enabled when we
1140	 * started sleeping/spinning so we don't record this event.
1141	 */
1142	if (dtrace_ls_enabled == TRUE) {
1143		if (slept == 0) {
1144			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1145			    mach_absolute_time() - wait_interval, 1);
1146		} else {
1147			/*
1148			 * For the blocking case, we also record if when we blocked
1149			 * it was held for read or write, and how many readers.
1150			 * Notice that above we recorded this before we dropped
1151			 * the interlock so the count is accurate.
1152			 */
1153			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1154			    mach_absolute_time() - wait_interval, 1,
1155			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1156		}
1157	}
1158	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1159#endif
1160}
1161
1162
1163/*
1164 *      Routine:        lck_rw_done_gen
1165 *
1166 *	called from the assembly language wrapper...
1167 *	prior_lock_state is the value in the 1st
1168 * 	word of the lock at the time of a successful
1169 *	atomic compare and exchange with the new value...
1170 * 	it represents the state of the lock before we
1171 *	decremented the rw_shared_count or cleared either
1172 * 	rw_want_upgrade or rw_want_write and
1173 *	the lck_x_waiting bits...  since the wrapper
1174 * 	routine has already changed the state atomically,
1175 *	we just need to decide if we should
1176 *	wake up anyone and what value to return... we do
1177 *	this by examining the state of the lock before
1178 *	we changed it
1179 */
lck_rw_type_t
lck_rw_done_gen(
	lck_rw_t	*lck,
	int		prior_lock_state)
{
	lck_rw_t	*fake_lck;
	lck_rw_type_t	lock_type;
	thread_t	thread = current_thread();
	uint32_t	rwlock_count;

	/* Check if dropping the lock means that we need to unpromote */
	rwlock_count = thread->rwlock_count--;
#if MACH_LDEBUG
	if (rwlock_count == 0) {
		panic("rw lock count underflow for thread %p", thread);
	}
#endif
	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		/* sched_flags checked without lock, but will be rechecked while clearing */
		lck_rw_clear_promotion(thread);
	}

	/*
	 * prior_lock state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond whats defined
	 * in the first word of a lck_rw_t
	 */
	fake_lck = (lck_rw_t *)&prior_lock_state;

	/*
	 * wakeups are only needed when the last reader (prior
	 * shared_count == 1) or an exclusive holder (== 0) let go
	 */
	if (fake_lck->lck_rw_shared_count <= 1) {
		if (fake_lck->lck_w_waiting)
			thread_wakeup(RW_LOCK_WRITER_EVENT(lck));

		/* wake readers unless a waiting writer has priority (priv_excl) */
		if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
			thread_wakeup(RW_LOCK_READER_EVENT(lck));
	}
	/* a non-zero prior shared count means we just released a read hold */
	if (fake_lck->lck_rw_shared_count)
		lock_type = LCK_RW_TYPE_SHARED;
	else
		lock_type = LCK_RW_TYPE_EXCLUSIVE;

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
#endif

	return(lock_type);
}
1228
1229
1230/*
1231 *	Routine:	lck_rw_unlock
1232 */
1233void
1234lck_rw_unlock(
1235	lck_rw_t	*lck,
1236	lck_rw_type_t	lck_rw_type)
1237{
1238	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1239		lck_rw_unlock_shared(lck);
1240	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1241		lck_rw_unlock_exclusive(lck);
1242	else
1243		panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1244}
1245
1246
1247/*
1248 *	Routine:	lck_rw_unlock_shared
1249 */
1250void
1251lck_rw_unlock_shared(
1252	lck_rw_t	*lck)
1253{
1254	lck_rw_type_t	ret;
1255
1256	ret = lck_rw_done(lck);
1257
1258	if (ret != LCK_RW_TYPE_SHARED)
1259		panic("lck_rw_unlock(): lock held in mode: %d\n", ret);
1260}
1261
1262
1263/*
1264 *	Routine:	lck_rw_unlock_exclusive
1265 */
1266void
1267lck_rw_unlock_exclusive(
1268	lck_rw_t	*lck)
1269{
1270	lck_rw_type_t	ret;
1271
1272	ret = lck_rw_done(lck);
1273
1274	if (ret != LCK_RW_TYPE_EXCLUSIVE)
1275		panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1276}
1277
1278
1279/*
1280 *	Routine:	lck_rw_lock
1281 */
1282void
1283lck_rw_lock(
1284	lck_rw_t	*lck,
1285	lck_rw_type_t	lck_rw_type)
1286{
1287	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1288		lck_rw_lock_shared(lck);
1289	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1290		lck_rw_lock_exclusive(lck);
1291	else
1292		panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1293}
1294
1295
1296/*
1297 *	Routine:	lck_rw_lock_shared_gen
1298 *	Function:
1299 *		assembly fast path code has determined that this lock
1300 *		is held exclusively... this is where we spin/block
1301 *		until we can acquire the lock in the shared mode
1302 */
void
lck_rw_lock_shared_gen(
	lck_rw_t	*lck)
{
	uint64_t	deadline = 0;
	int		gotlock = 0;
	int		slept = 0;
	wait_result_t	res = 0;
	boolean_t	istate = -1;

#if	CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif

	/*
	 * keep retrying until the read hold is granted...
	 * each pass spins up to a deadline and then, if
	 * permitted, blocks behind the interlock
	 */
	while ( !lck_rw_grab_shared(lck)) {

#if	CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		/* sample the interrupt state once; lck_rw_lock_pause uses it */
		if (istate == -1)
			istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

		/* spin-retry the grab until it succeeds or the deadline passes */
		while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);

		if (gotlock)
			break;
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock for read
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {

			istate = lck_interlock_lock(lck);

			/*
			 * recheck behind the interlock: only block if an
			 * exclusive owner/upgrader still excludes new readers
			 */
			if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
			    ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {

				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
					     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

				lck->lck_r_waiting = TRUE;

				res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
					     (int)lck, res, slept, 0, 0);
			} else {
				/*
				 * no excluding writer... take the read hold
				 * directly while we still own the interlock
				 */
				lck->lck_rw_shared_count++;
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}

#if	CONFIG_DTRACE
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 0,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif
}
1401
1402
1403/*
1404 *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1405 *	Function:
1406 *		assembly fast path code has already dropped our read
1407 *		count and determined that someone else owns 'lck_rw_want_upgrade'
1408 *		if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1409 *		all we need to do here is determine if a wakeup is needed
1410 */
boolean_t
lck_rw_lock_shared_to_exclusive_failure(
	lck_rw_t	*lck,
	int		prior_lock_state)
{
	lck_rw_t	*fake_lck;
	thread_t	thread = current_thread();
	uint32_t	rwlock_count;

	/* Check if dropping the lock means that we need to unpromote */
	rwlock_count = thread->rwlock_count--;
#if MACH_LDEBUG
	if (rwlock_count == 0) {
		panic("rw lock count underflow for thread %p", thread);
	}
#endif
	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		/* sched_flags checked without lock, but will be rechecked while clearing */
		lck_rw_clear_promotion(thread);
	}

	/*
	 * prior_lock state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond whats defined
	 * in the first word of a lck_rw_t
	 */
	fake_lck = (lck_rw_t *)&prior_lock_state;

	/* shared_count == 1 means we were the last reader to drop out */
	if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
		/*
		 *	Someone else has requested upgrade.
		 *	Since we've released the read lock, wake
		 *	him up if he's blocked waiting
		 */
		thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
	}
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
		     (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);

	/* FALSE tells the caller the upgrade failed (and the read hold is gone) */
	return (FALSE);
}
1453
1454
1455/*
 *	Routine:	lck_rw_lock_shared_to_exclusive_success
1457 *	Function:
1458 *		assembly fast path code has already dropped our read
1459 *		count and successfully acquired 'lck_rw_want_upgrade'
1460 *		we just need to wait for the rest of the readers to drain
1461 *		and then we can return as the exclusive holder of this lock
1462 */
boolean_t
lck_rw_lock_shared_to_exclusive_success(
	lck_rw_t	*lck)
{
	uint64_t	deadline = 0;
	int		slept = 0;
	int		still_shared = 0;
	wait_result_t	res;
	boolean_t	istate = -1;

#if	CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
#endif

	/*
	 * we already own lck_rw_want_upgrade... spin/block until
	 * the remaining readers drain, at which point the lock is
	 * held exclusively by us
	 */
	while (lck->lck_rw_shared_count != 0) {

#if	CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		/* sample the interrupt state once; lck_rw_lock_pause uses it */
		if (istate == -1)
			istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
			     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);

		/* spin until the reader count drains or the deadline passes */
		while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
			     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);

		if ( !still_shared)
			break;
		/*
		 * if we get here, the deadline has expired w/o
		 * the rw_shared_count having drained to 0
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {

			istate = lck_interlock_lock(lck);

			/* recheck behind the interlock before committing to sleep */
			if (lck->lck_rw_shared_count != 0) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
					     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
					     (int)lck, res, slept, 0, 0);
			} else {
				/* the readers drained while we took the interlock */
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}
#if	CONFIG_DTRACE
	/*
	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
#endif
	return (TRUE);
}
1561
1562
1563/*
 *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1565 * 	Function:
1566 *		assembly fast path has already dropped
1567 *		our exclusive state and bumped lck_rw_shared_count
1568 *		all we need to do here is determine if anyone
1569 *		needs to be awakened.
1570 */
void
lck_rw_lock_exclusive_to_shared_gen(
	lck_rw_t	*lck,
	int		prior_lock_state)
{
	lck_rw_t	*fake_lck;

	/*
	 * prior_lock state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond whats defined
	 * in the first word of a lck_rw_t
	 */
	fake_lck = (lck_rw_t *)&prior_lock_state;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
			     (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);

	/*
	 * don't wake up anyone waiting to take the lock exclusively
	 * since we hold a read count... when the read count drops to 0,
	 * the writers will be woken.
	 *
	 * wake up any waiting readers if we don't have any writers waiting,
	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
	 */
	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
		thread_wakeup(RW_LOCK_READER_EVENT(lck));

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
#endif
}
1607
1608
1609/*
1610 *      Routine:        lck_rw_try_lock
1611 */
1612boolean_t
1613lck_rw_try_lock(
1614	lck_rw_t	*lck,
1615	lck_rw_type_t	lck_rw_type)
1616{
1617	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1618		return(lck_rw_try_lock_shared(lck));
1619	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1620		return(lck_rw_try_lock_exclusive(lck));
1621	else
1622		panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1623	return(FALSE);
1624}
1625
1626
1627void
1628lck_rw_assert(
1629	lck_rw_t	*lck,
1630	unsigned int	type)
1631{
1632	switch (type) {
1633	case LCK_RW_ASSERT_SHARED:
1634		if (lck->lck_rw_shared_count != 0) {
1635			return;
1636		}
1637		break;
1638	case LCK_RW_ASSERT_EXCLUSIVE:
1639		if ((lck->lck_rw_want_write ||
1640		     lck->lck_rw_want_upgrade) &&
1641		    lck->lck_rw_shared_count == 0) {
1642			return;
1643		}
1644		break;
1645	case LCK_RW_ASSERT_HELD:
1646		if (lck->lck_rw_want_write ||
1647		    lck->lck_rw_want_upgrade ||
1648		    lck->lck_rw_shared_count != 0) {
1649			return;
1650		}
1651		break;
1652	case LCK_RW_ASSERT_NOTHELD:
1653		if (!(lck->lck_rw_want_write ||
1654			  lck->lck_rw_want_upgrade ||
1655			  lck->lck_rw_shared_count != 0)) {
1656			return;
1657		}
1658		break;
1659	default:
1660		break;
1661	}
1662
1663	panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1664}
1665
1666/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
void
lck_rw_clear_promotions_x86(thread_t thread)
{
#if MACH_LDEBUG
	/* It's fatal to leave a RW lock locked and return to userspace */
	panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
#else
	/* Paper over the issue */
	/* release builds: reset the count and drop any lingering RW promotion */
	thread->rwlock_count = 0;
	lck_rw_clear_promotion(thread);
#endif
}
1679
1680
1681#ifdef	MUTEX_ZONE
1682extern zone_t lck_mtx_zone;
1683#endif
1684/*
1685 *      Routine:        lck_mtx_alloc_init
1686 */
1687lck_mtx_t *
1688lck_mtx_alloc_init(
1689	lck_grp_t	*grp,
1690	lck_attr_t	*attr)
1691{
1692	lck_mtx_t	*lck;
1693#ifdef	MUTEX_ZONE
1694	if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
1695		lck_mtx_init(lck, grp, attr);
1696#else
1697	if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1698		lck_mtx_init(lck, grp, attr);
1699#endif
1700	return(lck);
1701}
1702
1703/*
1704 *      Routine:        lck_mtx_free
1705 */
void
lck_mtx_free(
	lck_mtx_t	*lck,
	lck_grp_t	*grp)
{
	/*
	 * tear down group accounting (and any indirect ext storage)
	 * before the backing memory is released
	 */
	lck_mtx_destroy(lck, grp);
#ifdef	MUTEX_ZONE
	zfree(lck_mtx_zone, lck);
#else
	kfree(lck, sizeof(lck_mtx_t));
#endif
}
1718
1719/*
1720 *      Routine:        lck_mtx_ext_init
1721 */
static void
lck_mtx_ext_init(
	lck_mtx_ext_t	*lck,
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	/* start from a clean slate; unset attributes default to 0 */
	bzero((void *)lck, sizeof(lck_mtx_ext_t));

	if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		lck->lck_mtx_deb.type = MUTEX_TAG;
		lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
	}

	/* remember the owning group so stats/accounting can find it */
	lck->lck_mtx_grp = grp;

	if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
		lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;

	lck->lck_mtx.lck_mtx_is_ext = 1;
	lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
}
1743
1744/*
1745 *      Routine:        lck_mtx_init
1746 */
void
lck_mtx_init(
	lck_mtx_t	*lck,
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	lck_mtx_ext_t	*lck_ext;
	lck_attr_t	*lck_attr;

	/* fall back to the system default attributes when none are supplied */
	if (attr != LCK_ATTR_NULL)
		lck_attr = attr;
	else
		lck_attr = &LockDefaultLckAttr;

	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		/*
		 * debug mutexes are indirect: the embedded lock points at a
		 * separately allocated lck_mtx_ext_t
		 * NOTE(review): if this kalloc fails the mutex is left
		 * uninitialized - presumably callers never hit this; confirm
		 */
		if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
			lck_mtx_ext_init(lck_ext, grp, lck_attr);
			lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
			lck->lck_mtx_ptr = lck_ext;
		}
	} else {
		lck->lck_mtx_owner = 0;
		lck->lck_mtx_state = 0;
	}
	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}
1775
1776/*
1777 *      Routine:        lck_mtx_init_ext
1778 */
void
lck_mtx_init_ext(
	lck_mtx_t	*lck,
	lck_mtx_ext_t	*lck_ext,
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	lck_attr_t	*lck_attr;

	/* fall back to the system default attributes when none are supplied */
	if (attr != LCK_ATTR_NULL)
		lck_attr = attr;
	else
		lck_attr = &LockDefaultLckAttr;

	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		/* caller supplied the ext storage, so no allocation is needed here */
		lck_mtx_ext_init(lck_ext, grp, lck_attr);
		lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
		lck->lck_mtx_ptr = lck_ext;
	} else {
		lck->lck_mtx_owner = 0;
		lck->lck_mtx_state = 0;
	}
	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;

	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}
1806
1807/*
1808 *      Routine:        lck_mtx_destroy
1809 */
void
lck_mtx_destroy(
	lck_mtx_t	*lck,
	lck_grp_t	*grp)
{
	boolean_t lck_is_indirect;

	/* destroying an already-destroyed mutex is a no-op */
	if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
		return;
#if MACH_LDEBUG
	lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
#endif
	/* capture before lck_mtx_lock_mark_destroyed overwrites the tag */
	lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);

	lck_mtx_lock_mark_destroyed(lck);

	if (lck_is_indirect)
		kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
	lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
	lck_grp_deallocate(grp);
	return;
}
1832
1833
1834#define	LCK_MTX_LCK_WAIT_CODE		0x20
1835#define	LCK_MTX_LCK_WAKEUP_CODE		0x21
1836#define	LCK_MTX_LCK_SPIN_CODE		0x22
1837#define	LCK_MTX_LCK_ACQUIRE_CODE	0x23
1838#define LCK_MTX_LCK_DEMOTE_CODE		0x24
1839
1840
1841/*
1842 * Routine: 	lck_mtx_unlock_wakeup_x86
1843 *
1844 * Invoked on unlock when there is
 * contention (i.e. the assembly routine sees
 * that mutex->lck_mtx_waiters != 0 or
 * that mutex->lck_mtx_promoted != 0)...
 *
 * neither the mutex nor the interlock is held
1850 */
void
lck_mtx_unlock_wakeup_x86 (
	lck_mtx_t	*mutex,
	int		prior_lock_state)
{
	lck_mtx_t	fake_lck;

	/*
	 * prior_lock state is a snapshot of the 2nd word of the
	 * lock in question... we'll fake up a lock with the bits
	 * copied into place and carefully not access anything
	 * beyond whats defined in the second word of a lck_mtx_t
	 */
	fake_lck.lck_mtx_state = prior_lock_state;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
		     mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);

	if (__probable(fake_lck.lck_mtx_waiters)) {
		/* the wait event is the last unsigned int inside the lck_mtx_t */
		if (fake_lck.lck_mtx_waiters > 1)
			thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri);
		else
			thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
	}

	/* this mutex carried a priority promotion... undo our share of it */
	if (__improbable(fake_lck.lck_mtx_promoted)) {
		thread_t	thread = current_thread();


		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
			     thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);

		if (thread->promotions > 0) {
			spl_t	s = splsched();

			thread_lock(thread);

			/* only demote when the last outstanding promotion is released */
			if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {

				thread->sched_flags &= ~TH_SFLAG_PROMOTED;

				if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
					/* thread is also depressed... drop to the depressed priority */
					KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
							      thread->sched_pri, DEPRESSPRI, 0, mutex, 0);

					set_sched_pri(thread, DEPRESSPRI);
				}
				else {
					if (thread->priority < thread->sched_pri) {
						/* recompute the scheduled priority now that the promotion is gone */
						KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
								      thread->sched_pri, thread->priority, 0, mutex, 0);

						SCHED(compute_priority)(thread, FALSE);
					}
				}
			}
			thread_unlock(thread);
			splx(s);
		}
	}
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
		     mutex, 0, mutex->lck_mtx_waiters, 0, 0);
}
1914
1915
1916/*
1917 * Routine: 	lck_mtx_lock_acquire_x86
1918 *
1919 * Invoked on acquiring the mutex when there is
 * contention (i.e. the assembly routine sees
 * that mutex->lck_mtx_waiters != 0 or
1922 * thread->was_promoted_on_wakeup != 0)...
1923 *
1924 * mutex is owned...  interlock is held... preemption is disabled
1925 */
1926void
1927lck_mtx_lock_acquire_x86(
1928	lck_mtx_t	*mutex)
1929{
1930	thread_t	thread;
1931	integer_t	priority;
1932	spl_t		s;
1933
1934	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
1935		     mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
1936
1937	if (mutex->lck_mtx_waiters)
1938		priority = mutex->lck_mtx_pri;
1939	else
1940		priority = 0;
1941
1942	thread = (thread_t)mutex->lck_mtx_owner;	/* faster then current_thread() */
1943
1944	if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
1945
1946		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
1947				      thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0);
1948
1949		s = splsched();
1950		thread_lock(thread);
1951
1952		if (thread->sched_pri < priority) {
1953			/* Do not promote into the realtime priority band */
1954			assert(priority <= MAXPRI_KERNEL);
1955			set_sched_pri(thread, priority);
1956		}
1957		if (mutex->lck_mtx_promoted == 0) {
1958			mutex->lck_mtx_promoted = 1;
1959
1960			thread->promotions++;
1961			thread->sched_flags |= TH_SFLAG_PROMOTED;
1962		}
1963		thread->was_promoted_on_wakeup = 0;
1964
1965		thread_unlock(thread);
1966		splx(s);
1967	}
1968	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
1969		     mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1970}
1971
1972
1973
1974/*
1975 * Routine: 	lck_mtx_lock_spinwait_x86
1976 *
1977 * Invoked trying to acquire a mutex when there is contention but
1978 * the holder is running on another processor. We spin for up to a maximum
1979 * time waiting for the lock to be released.
1980 *
1981 * Called with the interlock unlocked.
1982 * returns 0 if mutex acquired
1983 * returns 1 if we spun
1984 * returns 2 if we didn't spin due to the holder not running
1985 */
int
lck_mtx_lock_spinwait_x86(
	lck_mtx_t	*mutex)
{
	thread_t	holder;
	uint64_t	deadline;
	int		retval = 1;
	int		loopcount = 0;


	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);

	deadline = mach_absolute_time() + MutexSpin;

	/*
	 * Spin while:
	 *   - mutex is locked, and
	 *   - its locked as a spin lock, and
	 *   - owner is running on another processor, and
	 *   - owner (processor) is not idling, and
	 *   - we haven't spun for long enough.
	 */
	do {
		if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
			/* mutex acquired during the spin */
			retval = 0;
			break;
		}
		if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {

			/* give up early if the holder can't make progress */
			if ( !(holder->machine.specFlags & OnProc) ||
			     (holder->state & TH_IDLE)) {
				/* retval 2 means we never spun at all */
				if (loopcount == 0)
					retval = 2;
				break;
			}
		}
		cpu_pause();

		loopcount++;

	} while (mach_absolute_time() < deadline);


#if	CONFIG_DTRACE
	/*
	 * We've already kept a count via deadline of how long we spun.
	 * If dtrace is active, then we compute backwards to decide how
	 * long we spun.
	 *
	 * Note that we record a different probe id depending on whether
	 * this is a direct or indirect mutex.  This allows us to
	 * penalize only lock groups that have debug/stats enabled
	 * with dtrace processing if desired.
	 */
	if (__probable(mutex->lck_mtx_is_ext == 0)) {
		LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
		    mach_absolute_time() - (deadline - MutexSpin));
	} else {
		LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
		    mach_absolute_time() - (deadline - MutexSpin));
	}
	/* The lockstat acquire event is recorded by the assembly code beneath us. */
#endif

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);

	return retval;
}
2056
2057
2058
2059/*
2060 * Routine: 	lck_mtx_lock_wait_x86
2061 *
2062 * Invoked in order to wait on contention.
2063 *
2064 * Called with the interlock locked and
2065 * preemption disabled...
2066 * returns it unlocked and with preemption enabled
2067 */
void
lck_mtx_lock_wait_x86 (
	lck_mtx_t	*mutex)
{
	thread_t	self = current_thread();
	thread_t	holder;
	integer_t	priority;
	spl_t		s;
#if	CONFIG_DTRACE
	uint64_t	sleep_start = 0;

	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
		sleep_start = mach_absolute_time();
	}
#endif
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

	/* bid with the higher of our scheduled/base priority, floored at BASEPRI_DEFAULT */
	priority = self->sched_pri;

	if (priority < self->priority)
		priority = self->priority;
	if (priority < BASEPRI_DEFAULT)
		priority = BASEPRI_DEFAULT;

	/* Do not promote into the realtime priority band */
	priority = MIN(priority, MAXPRI_KERNEL);

	/* record the highest-priority waiter on the mutex */
	if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
		mutex->lck_mtx_pri = priority;
	mutex->lck_mtx_waiters++;

	/* promote the current holder if it runs below the waiter ceiling */
	if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
	     holder->sched_pri < mutex->lck_mtx_pri ) {
		/* Assert that we're not altering the priority of a
		 * MAXPRI_KERNEL or RT prio band thread
		 */
		assert(holder->sched_pri < MAXPRI_KERNEL);
		s = splsched();
		thread_lock(holder);

		/* recheck now that the holder's thread lock is held */
		if (holder->sched_pri < mutex->lck_mtx_pri) {
			KERNEL_DEBUG_CONSTANT(
				MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
				holder->sched_pri, priority, thread_tid(holder), mutex, 0);

			set_sched_pri(holder, priority);

			if (mutex->lck_mtx_promoted == 0) {
				holder->promotions++;
				holder->sched_flags |= TH_SFLAG_PROMOTED;

				mutex->lck_mtx_promoted = 1;
			}
		}
		thread_unlock(holder);
		splx(s);
	}
	/* the wait event is the last unsigned int inside the lck_mtx_t */
	assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);

	lck_mtx_ilk_unlock(mutex);

	thread_block(THREAD_CONTINUE_NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

#if	CONFIG_DTRACE
	/*
	 * Record the Dtrace lockstat probe for blocking, block time
	 * measured from when we were entered.
	 */
	if (sleep_start) {
		if (mutex->lck_mtx_is_ext == 0) {
			LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
			    mach_absolute_time() - sleep_start);
		} else {
			LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
			    mach_absolute_time() - sleep_start);
		}
	}
#endif
}
2151