1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 *	File:	kern/lock.c
58 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
59 *	Date:	1985
60 *
61 *	Locking primitives implementation
62 */
63
64#include <mach_ldebug.h>
65
66#include <kern/locks.h>
67#include <kern/kalloc.h>
68#include <kern/misc_protos.h>
69#include <kern/thread.h>
70#include <kern/processor.h>
71#include <kern/cpu_data.h>
72#include <kern/cpu_number.h>
73#include <kern/sched_prim.h>
74#include <kern/xpr.h>
75#include <kern/debug.h>
76#include <string.h>
77
78#include <i386/machine_routines.h> /* machine_timeout_suspended() */
79#include <machine/machine_cpu.h>
80#include <i386/mp.h>
81
82#include <sys/kdebug.h>
83#include <mach/branch_predicates.h>
84
85/*
86 * We need only enough declarations from the BSD-side to be able to
87 * test if our probe is active, and to call __dtrace_probe().  Setting
88 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
89 */
90#if	CONFIG_DTRACE
91#define NEED_DTRACE_DEFS
92#include <../bsd/sys/lockstat.h>
93#endif
94
95#define	LCK_RW_LCK_EXCLUSIVE_CODE	0x100
96#define	LCK_RW_LCK_EXCLUSIVE1_CODE	0x101
97#define	LCK_RW_LCK_SHARED_CODE		0x102
98#define	LCK_RW_LCK_SH_TO_EX_CODE	0x103
99#define	LCK_RW_LCK_SH_TO_EX1_CODE	0x104
100#define	LCK_RW_LCK_EX_TO_SH_CODE	0x105
101
102#define LCK_RW_LCK_EX_WRITER_SPIN_CODE	0x106
103#define LCK_RW_LCK_EX_WRITER_WAIT_CODE	0x107
104#define LCK_RW_LCK_EX_READER_SPIN_CODE	0x108
105#define LCK_RW_LCK_EX_READER_WAIT_CODE	0x109
106#define LCK_RW_LCK_SHARED_SPIN_CODE	0x110
107#define LCK_RW_LCK_SHARED_WAIT_CODE	0x111
108#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE	0x112
109#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE	0x113
110
111
112#define	ANY_LOCK_DEBUG	(USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
113
114unsigned int LcksOpts=0;
115
116/* Forwards */
117
118#if	USLOCK_DEBUG
119/*
120 *	Perform simple lock checks.
121 */
122int	uslock_check = 1;
123int	max_lock_loops	= 100000000;
124decl_simple_lock_data(extern , printf_lock)
125decl_simple_lock_data(extern , panic_lock)
126#endif	/* USLOCK_DEBUG */
127
128extern unsigned int not_in_kdp;
129
130/*
131 *	We often want to know the addresses of the callers
132 *	of the various lock routines.  However, this information
133 *	is only used for debugging and statistics.
134 */
135typedef void	*pc_t;
136#define	INVALID_PC	((void *) VM_MAX_KERNEL_ADDRESS)
137#define	INVALID_THREAD	((void *) VM_MAX_KERNEL_ADDRESS)
138#if	ANY_LOCK_DEBUG
139#define	OBTAIN_PC(pc)	((pc) = GET_RETURN_PC())
140#define DECL_PC(pc)	pc_t pc;
141#else	/* ANY_LOCK_DEBUG */
142#define DECL_PC(pc)
143#ifdef	lint
144/*
145 *	Eliminate lint complaints about unused local pc variables.
146 */
147#define	OBTAIN_PC(pc)	++pc
148#else	/* lint */
149#define	OBTAIN_PC(pc)
150#endif	/* lint */
#endif	/* ANY_LOCK_DEBUG */
152
153
154/*
155 *	Portable lock package implementation of usimple_locks.
156 */
157
158#if	USLOCK_DEBUG
159#define	USLDBG(stmt)	stmt
160void		usld_lock_init(usimple_lock_t, unsigned short);
161void		usld_lock_pre(usimple_lock_t, pc_t);
162void		usld_lock_post(usimple_lock_t, pc_t);
163void		usld_unlock(usimple_lock_t, pc_t);
164void		usld_lock_try_pre(usimple_lock_t, pc_t);
165void		usld_lock_try_post(usimple_lock_t, pc_t);
166int		usld_lock_common_checks(usimple_lock_t, char *);
167#else	/* USLOCK_DEBUG */
168#define	USLDBG(stmt)
169#endif	/* USLOCK_DEBUG */
170
171
172extern int lck_rw_grab_want(lck_rw_t *lck);
173extern int lck_rw_grab_shared(lck_rw_t *lck);
174extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
175
176
177/*
178 * Forward definitions
179 */
180
181void lck_rw_lock_shared_gen(
182	lck_rw_t	*lck);
183
184void lck_rw_lock_exclusive_gen(
185	lck_rw_t	*lck);
186
187boolean_t lck_rw_lock_shared_to_exclusive_success(
188	lck_rw_t	*lck);
189
190boolean_t lck_rw_lock_shared_to_exclusive_failure(
191	lck_rw_t	*lck,
192	int		prior_lock_state);
193
194void lck_rw_lock_exclusive_to_shared_gen(
195	lck_rw_t	*lck,
196	int		prior_lock_state);
197
198lck_rw_type_t lck_rw_done_gen(
199	lck_rw_t	*lck,
200	int		prior_lock_state);
201
202void lck_rw_clear_promotions_x86(thread_t thread);
203
204/*
205 *      Routine:        lck_spin_alloc_init
206 */
207lck_spin_t *
208lck_spin_alloc_init(
209	lck_grp_t	*grp,
210	lck_attr_t	*attr)
211{
212	lck_spin_t	*lck;
213
214	if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
215		lck_spin_init(lck, grp, attr);
216
217	return(lck);
218}
219
220/*
221 *      Routine:        lck_spin_free
222 */
223void
224lck_spin_free(
225	lck_spin_t	*lck,
226	lck_grp_t	*grp)
227{
228	lck_spin_destroy(lck, grp);
229	kfree(lck, sizeof(lck_spin_t));
230}
231
232/*
233 *      Routine:        lck_spin_init
234 */
235void
236lck_spin_init(
237	lck_spin_t	*lck,
238	lck_grp_t	*grp,
239	__unused lck_attr_t	*attr)
240{
241	usimple_lock_init((usimple_lock_t) lck, 0);
242	lck_grp_reference(grp);
243	lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
244}
245
246/*
247 *      Routine:        lck_spin_destroy
248 */
249void
250lck_spin_destroy(
251	lck_spin_t	*lck,
252	lck_grp_t	*grp)
253{
254	if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
255		return;
256	lck->interlock = LCK_SPIN_TAG_DESTROYED;
257	lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
258	lck_grp_deallocate(grp);
259	return;
260}
261
262/*
263 *      Routine:        lck_spin_lock
264 */
265void
266lck_spin_lock(
267	lck_spin_t	*lck)
268{
269	usimple_lock((usimple_lock_t) lck);
270}
271
272/*
273 *      Routine:        lck_spin_unlock
274 */
275void
276lck_spin_unlock(
277	lck_spin_t	*lck)
278{
279	usimple_unlock((usimple_lock_t) lck);
280}
281
282
283/*
284 *      Routine:        lck_spin_try_lock
285 */
286boolean_t
287lck_spin_try_lock(
288	lck_spin_t	*lck)
289{
290	return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
291}
292
293/*
294 *      Routine: lck_spin_is_acquired
295 *      NOT SAFE: To be used only by kernel debugger to avoid deadlock.
296 *      Returns: TRUE if lock is acquired.
297 */
298boolean_t
299lck_spin_is_acquired(lck_spin_t *lck) {
300	if (not_in_kdp) {
		panic("Spinlock acquired check done outside of kernel debugger");
302	}
303	return (lck->interlock != 0)? TRUE : FALSE;
304}
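
/*
 *	Example (illustrative sketch only, not compiled here): a hypothetical
 *	client pairing the lck_spin_* routines above.  The names my_grp and
 *	my_lock are invented, and the group/attr setup assumes the standard
 *	lck_grp_alloc_init()/LCK_GRP_ATTR_NULL/LCK_ATTR_NULL helpers from
 *	kern/locks.c.
 *
 *		lck_grp_t	*my_grp  = lck_grp_alloc_init("my_driver", LCK_GRP_ATTR_NULL);
 *		lck_spin_t	*my_lock = lck_spin_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *		lck_spin_lock(my_lock);		// preemption stays disabled while held
 *		// ... short critical section ...
 *		lck_spin_unlock(my_lock);
 *
 *		lck_spin_free(my_lock, my_grp);
 */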
305
306/*
307 *	Initialize a usimple_lock.
308 *
309 *	No change in preemption state.
310 */
311void
312usimple_lock_init(
313	usimple_lock_t	l,
314	__unused unsigned short	tag)
315{
316#ifndef	MACHINE_SIMPLE_LOCK
317	USLDBG(usld_lock_init(l, tag));
318	hw_lock_init(&l->interlock);
319#else
320	simple_lock_init((simple_lock_t)l,tag);
321#endif
322}
323
324volatile uint32_t spinlock_owner_cpu = ~0;
325volatile usimple_lock_t spinlock_timed_out;
326
327uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
328	uint64_t deadline;
329	uint32_t i;
330
331	for (i = 0; i < real_ncpus; i++) {
332		if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
333			spinlock_owner_cpu = i;
334			if ((uint32_t) cpu_number() == i)
335				break;
336			cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
337			cpu_NMI_interrupt(i);
338			deadline = mach_absolute_time() + (LockTimeOut * 2);
339			while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
340				cpu_pause();
341			break;
342		}
343	}
344
345	return spinlock_owner_cpu;
346}
347
348/*
349 *	Acquire a usimple_lock.
350 *
351 *	Returns with preemption disabled.  Note
352 *	that the hw_lock routines are responsible for
353 *	maintaining preemption state.
354 */
355void
356usimple_lock(
357	usimple_lock_t	l)
358{
359#ifndef	MACHINE_SIMPLE_LOCK
360	DECL_PC(pc);
361
362	OBTAIN_PC(pc);
363	USLDBG(usld_lock_pre(l, pc));
364
365	if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0))	{
366		boolean_t uslock_acquired = FALSE;
367		while (machine_timeout_suspended()) {
368			enable_preemption();
369			if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
370				break;
371		}
372
373		if (uslock_acquired == FALSE) {
374			uint32_t lock_cpu;
375			uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
376			spinlock_timed_out = l;
377			lock_cpu = spinlock_timeout_NMI(lowner);
378			panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner,  current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
379		}
380	}
381	USLDBG(usld_lock_post(l, pc));
382#else
383	simple_lock((simple_lock_t)l);
384#endif
385}
386
387
388/*
389 *	Release a usimple_lock.
390 *
391 *	Returns with preemption enabled.  Note
392 *	that the hw_lock routines are responsible for
393 *	maintaining preemption state.
394 */
395void
396usimple_unlock(
397	usimple_lock_t	l)
398{
399#ifndef	MACHINE_SIMPLE_LOCK
400	DECL_PC(pc);
401
402	OBTAIN_PC(pc);
403	USLDBG(usld_unlock(l, pc));
404	hw_lock_unlock(&l->interlock);
405#else
406	simple_unlock_rwmb((simple_lock_t)l);
407#endif
408}
409
410
411/*
412 *	Conditionally acquire a usimple_lock.
413 *
414 *	On success, returns with preemption disabled.
415 *	On failure, returns with preemption in the same state
416 *	as when first invoked.  Note that the hw_lock routines
417 *	are responsible for maintaining preemption state.
418 *
419 *	XXX No stats are gathered on a miss; I preserved this
420 *	behavior from the original assembly-language code, but
421 *	doesn't it make sense to log misses?  XXX
422 */
423unsigned int
424usimple_lock_try(
425	usimple_lock_t	l)
426{
427#ifndef	MACHINE_SIMPLE_LOCK
428	unsigned int	success;
429	DECL_PC(pc);
430
431	OBTAIN_PC(pc);
432	USLDBG(usld_lock_try_pre(l, pc));
433	if ((success = hw_lock_try(&l->interlock))) {
434		USLDBG(usld_lock_try_post(l, pc));
435	}
436	return success;
437#else
438	return(simple_lock_try((simple_lock_t)l));
439#endif
440}
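
/*
 *	Usage sketch (illustrative only): a hypothetical helper that avoids
 *	blocking by bailing out when the try fails.  On success the try
 *	returns with preemption disabled, so the release path is identical
 *	to the unconditional acquire.
 *
 *		static void
 *		sample_counters(usimple_lock_t l)
 *		{
 *			if (usimple_lock_try(l)) {
 *				// ... update counters ...
 *				usimple_unlock(l);
 *			}
 *			// else: skip this sample rather than spin
 *		}
 */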
441
442#if	USLOCK_DEBUG
443/*
444 *	States of a usimple_lock.  The default when initializing
445 *	a usimple_lock is setting it up for debug checking.
446 */
447#define	USLOCK_CHECKED		0x0001		/* lock is being checked */
448#define	USLOCK_TAKEN		0x0002		/* lock has been taken */
449#define	USLOCK_INIT		0xBAA0		/* lock has been initialized */
450#define	USLOCK_INITIALIZED	(USLOCK_INIT|USLOCK_CHECKED)
451#define	USLOCK_CHECKING(l)	(uslock_check &&			\
452				 ((l)->debug.state & USLOCK_CHECKED))
453
454/*
455 *	Trace activities of a particularly interesting lock.
456 */
457void	usl_trace(usimple_lock_t, int, pc_t, const char *);
458
459
460/*
461 *	Initialize the debugging information contained
462 *	in a usimple_lock.
463 */
464void
465usld_lock_init(
466	usimple_lock_t	l,
467	__unused unsigned short	tag)
468{
469	if (l == USIMPLE_LOCK_NULL)
470		panic("lock initialization:  null lock pointer");
471	l->lock_type = USLOCK_TAG;
472	l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
473	l->debug.lock_cpu = l->debug.unlock_cpu = 0;
474	l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
475	l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
476	l->debug.duration[0] = l->debug.duration[1] = 0;
480}
481
482
483/*
484 *	These checks apply to all usimple_locks, not just
485 *	those with USLOCK_CHECKED turned on.
486 */
487int
488usld_lock_common_checks(
489	usimple_lock_t	l,
490	char		*caller)
491{
492	if (l == USIMPLE_LOCK_NULL)
493		panic("%s:  null lock pointer", caller);
494	if (l->lock_type != USLOCK_TAG)
495		panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
496	if (!(l->debug.state & USLOCK_INIT))
497		panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
498	return USLOCK_CHECKING(l);
499}
500
501
502/*
503 *	Debug checks on a usimple_lock just before attempting
504 *	to acquire it.
505 */
506/* ARGSUSED */
507void
508usld_lock_pre(
509	usimple_lock_t	l,
510	pc_t		pc)
511{
512	char	caller[] = "usimple_lock";
513
514
515	if (!usld_lock_common_checks(l, caller))
516		return;
517
/*
 *	Note that we have a weird case where we are getting a lock when we are
 *	in the process of putting the system to sleep. We are running with no
 *	current threads, so we can't tell whether we are trying to retake a lock
 *	we already hold or a thread on another processor holds it.  Therefore we
 *	just skip this check when the locking thread is 0.
 */
525
526	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
527	    l->debug.lock_thread == (void *) current_thread()) {
528		printf("%s:  lock %p already locked (at %p) by",
529		      caller, l, l->debug.lock_pc);
530		printf(" current thread %p (new attempt at pc %p)\n",
531		       l->debug.lock_thread, pc);
532		panic("%s", caller);
533	}
534	mp_disable_preemption();
535	usl_trace(l, cpu_number(), pc, caller);
536	mp_enable_preemption();
537}
538
539
540/*
541 *	Debug checks on a usimple_lock just after acquiring it.
542 *
543 *	Pre-emption has been disabled at this point,
544 *	so we are safe in using cpu_number.
545 */
546void
547usld_lock_post(
548	usimple_lock_t	l,
549	pc_t		pc)
550{
551	register int	mycpu;
552	char	caller[] = "successful usimple_lock";
553
554
555	if (!usld_lock_common_checks(l, caller))
556		return;
557
558	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
559		panic("%s:  lock %p became uninitialized",
560		      caller, l);
561	if ((l->debug.state & USLOCK_TAKEN))
		panic("%s:  lock %p became TAKEN by someone else",
563		      caller, l);
564
565	mycpu = cpu_number();
566	l->debug.lock_thread = (void *)current_thread();
567	l->debug.state |= USLOCK_TAKEN;
568	l->debug.lock_pc = pc;
569	l->debug.lock_cpu = mycpu;
570
571	usl_trace(l, mycpu, pc, caller);
572}
573
574
575/*
576 *	Debug checks on a usimple_lock just before
577 *	releasing it.  Note that the caller has not
578 *	yet released the hardware lock.
579 *
580 *	Preemption is still disabled, so there's
581 *	no problem using cpu_number.
582 */
583void
584usld_unlock(
585	usimple_lock_t	l,
586	pc_t		pc)
587{
588	register int	mycpu;
589	char	caller[] = "usimple_unlock";
590
591
592	if (!usld_lock_common_checks(l, caller))
593		return;
594
595	mycpu = cpu_number();
596
597	if (!(l->debug.state & USLOCK_TAKEN))
		panic("%s:  lock %p hasn't been taken",
		      caller, l);
	if (l->debug.lock_thread != (void *) current_thread())
		panic("%s:  unlocking lock %p, owned by thread %p",
		      caller, l, l->debug.lock_thread);
	if (l->debug.lock_cpu != mycpu) {
		printf("%s:  unlocking lock %p on cpu 0x%x",
605		       caller, l, mycpu);
606		printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
607		panic("%s", caller);
608	}
609	usl_trace(l, mycpu, pc, caller);
610
611	l->debug.unlock_thread = l->debug.lock_thread;
	l->debug.lock_thread = INVALID_THREAD;
613	l->debug.state &= ~USLOCK_TAKEN;
614	l->debug.unlock_pc = pc;
615	l->debug.unlock_cpu = mycpu;
616}
617
618
619/*
620 *	Debug checks on a usimple_lock just before
621 *	attempting to acquire it.
622 *
623 *	Preemption isn't guaranteed to be disabled.
624 */
625void
626usld_lock_try_pre(
627	usimple_lock_t	l,
628	pc_t		pc)
629{
630	char	caller[] = "usimple_lock_try";
631
632	if (!usld_lock_common_checks(l, caller))
633		return;
634	mp_disable_preemption();
635	usl_trace(l, cpu_number(), pc, caller);
636	mp_enable_preemption();
637}
638
639
640/*
641 *	Debug checks on a usimple_lock just after
642 *	successfully attempting to acquire it.
643 *
644 *	Preemption has been disabled by the
645 *	lock acquisition attempt, so it's safe
646 *	to use cpu_number.
647 */
648void
649usld_lock_try_post(
650	usimple_lock_t	l,
651	pc_t		pc)
652{
653	register int	mycpu;
654	char	caller[] = "successful usimple_lock_try";
655
656	if (!usld_lock_common_checks(l, caller))
657		return;
658
659	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
		panic("%s:  lock %p became uninitialized",
		      caller, l);
	if ((l->debug.state & USLOCK_TAKEN))
		panic("%s:  lock %p became TAKEN by someone else",
664		      caller, l);
665
666	mycpu = cpu_number();
667	l->debug.lock_thread = (void *) current_thread();
668	l->debug.state |= USLOCK_TAKEN;
669	l->debug.lock_pc = pc;
670	l->debug.lock_cpu = mycpu;
671
672	usl_trace(l, mycpu, pc, caller);
673}
674
675
676/*
677 *	For very special cases, set traced_lock to point to a
678 *	specific lock of interest.  The result is a series of
679 *	XPRs showing lock operations on that lock.  The lock_seq
680 *	value is used to show the order of those operations.
681 */
682usimple_lock_t		traced_lock;
683unsigned int		lock_seq;
684
685void
686usl_trace(
687	usimple_lock_t	l,
688	int		mycpu,
689	pc_t		pc,
690	const char *	op_name)
691{
692	if (traced_lock == l) {
693		XPR(XPR_SLOCK,
694		    "seq %d, cpu %d, %s @ %x\n",
695		    (uintptr_t) lock_seq, (uintptr_t) mycpu,
696		    (uintptr_t) op_name, (uintptr_t) pc, 0);
697		lock_seq++;
698	}
699}
700
701
702#endif	/* USLOCK_DEBUG */
703
704/*
705 *      Routine:        lck_rw_alloc_init
706 */
707lck_rw_t *
708lck_rw_alloc_init(
709	lck_grp_t	*grp,
710	lck_attr_t	*attr) {
711	lck_rw_t	*lck;
712
713	if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
714		bzero(lck, sizeof(lck_rw_t));
715		lck_rw_init(lck, grp, attr);
716	}
717
718	return(lck);
719}
720
721/*
722 *      Routine:        lck_rw_free
723 */
724void
725lck_rw_free(
726	lck_rw_t	*lck,
727	lck_grp_t	*grp) {
728	lck_rw_destroy(lck, grp);
729	kfree(lck, sizeof(lck_rw_t));
730}
731
732/*
733 *      Routine:        lck_rw_init
734 */
735void
736lck_rw_init(
737	lck_rw_t	*lck,
738	lck_grp_t	*grp,
739	lck_attr_t	*attr)
740{
741	lck_attr_t	*lck_attr = (attr != LCK_ATTR_NULL) ?
742					attr : &LockDefaultLckAttr;
743
744	hw_lock_byte_init(&lck->lck_rw_interlock);
745	lck->lck_rw_want_write = FALSE;
746	lck->lck_rw_want_upgrade = FALSE;
747	lck->lck_rw_shared_count = 0;
748	lck->lck_rw_can_sleep = TRUE;
749	lck->lck_r_waiting = lck->lck_w_waiting = 0;
750	lck->lck_rw_tag = 0;
751	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
752				LCK_ATTR_RW_SHARED_PRIORITY) == 0);
753
754	lck_grp_reference(grp);
755	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
756}
757
758/*
759 *      Routine:        lck_rw_destroy
760 */
761void
762lck_rw_destroy(
763	lck_rw_t	*lck,
764	lck_grp_t	*grp)
765{
766	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
767		return;
768#if MACH_LDEBUG
769	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
770#endif
771	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
772	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
773	lck_grp_deallocate(grp);
774	return;
775}
776
777/*
778 *	Sleep locks.  These use the same data structure and algorithm
779 *	as the spin locks, but the process sleeps while it is waiting
780 *	for the lock.  These work on uniprocessor systems.
781 */
782
783#define DECREMENTER_TIMEOUT 1000000
784
785#define RW_LOCK_READER_EVENT(x)		\
786		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
787
788#define RW_LOCK_WRITER_EVENT(x)		\
789		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
790
791/*
792 * We disable interrupts while holding the RW interlock to prevent an
793 * interrupt from exacerbating hold time.
794 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
795 */
796static boolean_t
797lck_interlock_lock(lck_rw_t *lck)
798{
799	boolean_t	istate;
800
801	istate = ml_set_interrupts_enabled(FALSE);
802	hw_lock_byte_lock(&lck->lck_rw_interlock);
803
804	return istate;
805}
806
807static void
808lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
809{
810	hw_lock_byte_unlock(&lck->lck_rw_interlock);
811	ml_set_interrupts_enabled(istate);
812}
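
/*
 *	Usage pattern (sketch of how the routines in this file use the pair):
 *	bracket any inspection/update of the lck_rw_t bitfields with these
 *	helpers and hand the saved interrupt state back on the way out.
 *
 *		boolean_t istate;
 *
 *		istate = lck_interlock_lock(lck);
 *		// ... examine/modify lck_rw_want_write, lck_w_waiting, etc. ...
 *		lck_interlock_unlock(lck, istate);
 */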
813
814/*
815 * This inline is used when busy-waiting for an rw lock.
816 * If interrupts were disabled when the lock primitive was called,
817 * we poll the IPI handler for pending tlb flushes.
818 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
819 */
820static inline void
821lck_rw_lock_pause(boolean_t interrupts_enabled)
822{
823	if (!interrupts_enabled)
824		handle_pending_TLB_flushes();
825	cpu_pause();
826}
827
828
829/*
830 * compute the deadline to spin against when
831 * waiting for a change of state on a lck_rw_t
832 */
833static inline uint64_t
834lck_rw_deadline_for_spin(lck_rw_t *lck)
835{
836	if (lck->lck_rw_can_sleep) {
837		if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
838			/*
839			 * there are already threads waiting on this lock... this
840			 * implies that they have spun beyond their deadlines waiting for
841			 * the desired state to show up so we will not bother spinning at this time...
842			 *   or
843			 * the current number of threads sharing this lock exceeds our capacity to run them
844			 * concurrently and since all states we're going to spin for require the rw_shared_count
845			 * to be at 0, we'll not bother spinning since the latency for this to happen is
846			 * unpredictable...
847			 */
848			return (mach_absolute_time());
849		}
850		return (mach_absolute_time() + MutexSpin);
851	} else
852		return (mach_absolute_time() + (100000LL * 1000000000LL));
853}
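
/*
 *	The slow paths below use this deadline in a spin-then-block pattern.
 *	Sketch of the shape (state_we_want() is a stand-in for the particular
 *	bit test each caller performs, not a real function):
 *
 *		deadline = lck_rw_deadline_for_spin(lck);
 *
 *		while ( !state_we_want(lck) && mach_absolute_time() < deadline)
 *			lck_rw_lock_pause(istate);
 *
 *		if ( !state_we_want(lck) && lck->lck_rw_can_sleep) {
 *			// take the interlock, set the waiter bit,
 *			// assert_wait() on the reader/writer event, thread_block()
 *		}
 */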
854
855
856/*
857 *      Routine:        lck_rw_lock_exclusive
858 */
859void
860lck_rw_lock_exclusive_gen(
861	lck_rw_t	*lck)
862{
863	uint64_t	deadline = 0;
864	int		slept = 0;
865	int		gotlock = 0;
866	int		lockheld = 0;
867	wait_result_t	res = 0;
868	boolean_t	istate = -1;
869
870#if	CONFIG_DTRACE
871	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
873	uint64_t wait_interval = 0;
874	int readers_at_sleep = 0;
875#endif
876
877	/*
878	 *	Try to acquire the lck_rw_want_write bit.
879	 */
880	while ( !lck_rw_grab_want(lck)) {
881
882#if	CONFIG_DTRACE
883		if (dtrace_ls_initialized == FALSE) {
884			dtrace_ls_initialized = TRUE;
885			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
886			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
887			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
888			if (dtrace_ls_enabled) {
889				/*
890				 * Either sleeping or spinning is happening,
891				 *  start a timing of our delay interval now.
892				 */
893				readers_at_sleep = lck->lck_rw_shared_count;
894				wait_interval = mach_absolute_time();
895			}
896		}
897#endif
898		if (istate == -1)
899			istate = ml_get_interrupts_enabled();
900
901		deadline = lck_rw_deadline_for_spin(lck);
902
903		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
904
905		while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
906			lck_rw_lock_pause(istate);
907
908		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0);
909
910		if (gotlock)
911			break;
912		/*
913		 * if we get here, the deadline has expired w/o us
914		 * being able to grab the lock exclusively
915		 * check to see if we're allowed to do a thread_block
916		 */
917		if (lck->lck_rw_can_sleep) {
918
919			istate = lck_interlock_lock(lck);
920
921			if (lck->lck_rw_want_write) {
922
923				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
924
925				lck->lck_w_waiting = TRUE;
926
927				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
928				lck_interlock_unlock(lck, istate);
929
930				if (res == THREAD_WAITING) {
931					res = thread_block(THREAD_CONTINUE_NULL);
932					slept++;
933				}
934				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
935			} else {
936				lck->lck_rw_want_write = TRUE;
937				lck_interlock_unlock(lck, istate);
938				break;
939			}
940		}
941	}
942	/*
943	 * Wait for readers (and upgrades) to finish...
944	 * the test for these conditions must be done simultaneously with
945	 * a check of the interlock not being held since
946	 * the rw_shared_count will drop to 0 first and then want_upgrade
947	 * will be set to 1 in the shared_to_exclusive scenario... those
948	 * adjustments are done behind the interlock and represent an
949	 * atomic change in state and must be considered as such
950	 * however, once we see the read count at 0, the want_upgrade not set
951	 * and the interlock not held, we are safe to proceed
952	 */
953	while (lck_rw_held_read_or_upgrade(lck)) {
954
955#if	CONFIG_DTRACE
956		/*
957		 * Either sleeping or spinning is happening, start
958		 * a timing of our delay interval now.  If we set it
959		 * to -1 we don't have accurate data so we cannot later
960		 * decide to record a dtrace spin or sleep event.
961		 */
962		if (dtrace_ls_initialized == FALSE) {
963			dtrace_ls_initialized = TRUE;
964			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
965			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
966			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
967			if (dtrace_ls_enabled) {
968				/*
969				 * Either sleeping or spinning is happening,
970				 *  start a timing of our delay interval now.
971				 */
972				readers_at_sleep = lck->lck_rw_shared_count;
973				wait_interval = mach_absolute_time();
974			}
975		}
976#endif
977		if (istate == -1)
978			istate = ml_get_interrupts_enabled();
979
980		deadline = lck_rw_deadline_for_spin(lck);
981
982		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
983
984		while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
985			lck_rw_lock_pause(istate);
986
987		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0);
988
989		if ( !lockheld)
990			break;
991		/*
992		 * if we get here, the deadline has expired w/o us
993		 * being able to grab the lock exclusively
994		 * check to see if we're allowed to do a thread_block
995		 */
996		if (lck->lck_rw_can_sleep) {
997
998			istate = lck_interlock_lock(lck);
999
1000			if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1001				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);
1002
1003				lck->lck_w_waiting = TRUE;
1004
1005				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1006				lck_interlock_unlock(lck, istate);
1007
1008				if (res == THREAD_WAITING) {
1009					res = thread_block(THREAD_CONTINUE_NULL);
1010					slept++;
1011				}
1012				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
1013			} else {
1014				lck_interlock_unlock(lck, istate);
1015				/*
1016				 * must own the lock now, since we checked for
1017				 * readers or upgrade owner behind the interlock
1018				 * no need for a call to 'lck_rw_held_read_or_upgrade'
1019				 */
1020				break;
1021			}
1022		}
1023	}
1024
1025#if	CONFIG_DTRACE
1026	/*
1027	 * Decide what latencies we suffered that are Dtrace events.
1028	 * If we have set wait_interval, then we either spun or slept.
1029	 * At least we get out from under the interlock before we record
1030	 * which is the best we can do here to minimize the impact
1031	 * of the tracing.
1032	 * If we have set wait_interval to -1, then dtrace was not enabled when we
1033	 * started sleeping/spinning so we don't record this event.
1034	 */
1035	if (dtrace_ls_enabled == TRUE) {
1036		if (slept == 0) {
1037			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1038			    mach_absolute_time() - wait_interval, 1);
1039		} else {
1040			/*
1041			 * For the blocking case, we also record if when we blocked
1042			 * it was held for read or write, and how many readers.
1043			 * Notice that above we recorded this before we dropped
1044			 * the interlock so the count is accurate.
1045			 */
1046			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1047			    mach_absolute_time() - wait_interval, 1,
1048			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1049		}
1050	}
1051	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1052#endif
1053}
1054
1055
1056/*
1057 *      Routine:        lck_rw_done_gen
1058 *
1059 *	called from the assembly language wrapper...
1060 *	prior_lock_state is the value in the 1st
1061 * 	word of the lock at the time of a successful
1062 *	atomic compare and exchange with the new value...
1063 * 	it represents the state of the lock before we
1064 *	decremented the rw_shared_count or cleared either
1065 * 	rw_want_upgrade or rw_want_write and
1066 *	the lck_x_waiting bits...  since the wrapper
1067 * 	routine has already changed the state atomically,
1068 *	we just need to decide if we should
1069 *	wake up anyone and what value to return... we do
1070 *	this by examining the state of the lock before
1071 *	we changed it
1072 */
1073lck_rw_type_t
1074lck_rw_done_gen(
1075	lck_rw_t	*lck,
1076	int		prior_lock_state)
1077{
1078	lck_rw_t	*fake_lck;
1079	lck_rw_type_t	lock_type;
1080	thread_t	thread;
1081	uint32_t	rwlock_count;
1082
	/*
	 * prior_lock_state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond what's defined
	 * in the first word of a lck_rw_t
	 */
1089	fake_lck = (lck_rw_t *)&prior_lock_state;
1090
1091	if (fake_lck->lck_rw_shared_count <= 1) {
1092		if (fake_lck->lck_w_waiting)
1093			thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1094
1095		if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1096			thread_wakeup(RW_LOCK_READER_EVENT(lck));
1097	}
1098	if (fake_lck->lck_rw_shared_count)
1099		lock_type = LCK_RW_TYPE_SHARED;
1100	else
1101		lock_type = LCK_RW_TYPE_EXCLUSIVE;
1102
1103	/* Check if dropping the lock means that we need to unpromote */
1104	thread = current_thread();
1105	rwlock_count = thread->rwlock_count--;
1106#if MACH_LDEBUG
1107	if (rwlock_count == 0) {
1108		panic("rw lock count underflow for thread %p", thread);
1109	}
1110#endif
1111	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1112		/* sched_flags checked without lock, but will be rechecked while clearing */
1113		lck_rw_clear_promotion(thread);
1114	}
1115
1116#if CONFIG_DTRACE
1117	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1118#endif
1119
1120	return(lock_type);
1121}
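
/*
 *	Illustration of the 'fake_lck' technique used above (sketch only):
 *	overlaying the lck_rw_t layout on the 32-bit snapshot lets C code test
 *	the pre-release bitfields by name instead of with hand-built masks.
 *
 *		int		snapshot;	// value captured by the atomic exchange
 *		lck_rw_t	*fake;
 *
 *		fake = (lck_rw_t *)&snapshot;
 *
 *		if (fake->lck_w_waiting)
 *			;	// a writer was parked before we released
 */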
1122
1123
1124/*
1125 *	Routine:	lck_rw_unlock
1126 */
1127void
1128lck_rw_unlock(
1129	lck_rw_t	*lck,
1130	lck_rw_type_t	lck_rw_type)
1131{
1132	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1133		lck_rw_unlock_shared(lck);
1134	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1135		lck_rw_unlock_exclusive(lck);
1136	else
1137		panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1138}
1139
1140
1141/*
1142 *	Routine:	lck_rw_unlock_shared
1143 */
1144void
1145lck_rw_unlock_shared(
1146	lck_rw_t	*lck)
1147{
1148	lck_rw_type_t	ret;
1149
1150	ret = lck_rw_done(lck);
1151
1152	if (ret != LCK_RW_TYPE_SHARED)
		panic("lck_rw_unlock_shared(): lock held in mode: %d\n", ret);
1154}
1155
1156
1157/*
1158 *	Routine:	lck_rw_unlock_exclusive
1159 */
1160void
1161lck_rw_unlock_exclusive(
1162	lck_rw_t	*lck)
1163{
1164	lck_rw_type_t	ret;
1165
1166	ret = lck_rw_done(lck);
1167
1168	if (ret != LCK_RW_TYPE_EXCLUSIVE)
1169		panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1170}
1171
1172
1173/*
1174 *	Routine:	lck_rw_lock
1175 */
1176void
1177lck_rw_lock(
1178	lck_rw_t	*lck,
1179	lck_rw_type_t	lck_rw_type)
1180{
1181	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1182		lck_rw_lock_shared(lck);
1183	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1184		lck_rw_lock_exclusive(lck);
1185	else
1186		panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1187}
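
/*
 *	Typical lifecycle (illustrative sketch; my_grp/my_rwlock are invented
 *	names, and lck_grp_alloc_init()/lck_rw_lock_exclusive() live outside
 *	this file):
 *
 *		lck_grp_t	*my_grp    = lck_grp_alloc_init("my_subsystem", LCK_GRP_ATTR_NULL);
 *		lck_rw_t	*my_rwlock = lck_rw_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *		lck_rw_lock(my_rwlock, LCK_RW_TYPE_SHARED);	// readers may run concurrently
 *		// ... read shared data ...
 *		lck_rw_unlock(my_rwlock, LCK_RW_TYPE_SHARED);
 *
 *		lck_rw_lock_exclusive(my_rwlock);		// single writer
 *		// ... modify shared data ...
 *		lck_rw_unlock_exclusive(my_rwlock);
 *
 *		lck_rw_free(my_rwlock, my_grp);
 */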
1188
1189
1190/*
1191 *	Routine:	lck_rw_lock_shared_gen
1192 *	Function:
1193 *		assembly fast path code has determined that this lock
1194 *		is held exclusively... this is where we spin/block
1195 *		until we can acquire the lock in the shared mode
1196 */
1197void
1198lck_rw_lock_shared_gen(
1199	lck_rw_t	*lck)
1200{
1201	uint64_t	deadline = 0;
1202	int		gotlock = 0;
1203	int		slept = 0;
1204	wait_result_t	res = 0;
1205	boolean_t	istate = -1;
1206
1207#if	CONFIG_DTRACE
1208	uint64_t wait_interval = 0;
1209	int readers_at_sleep = 0;
1210	boolean_t dtrace_ls_initialized = FALSE;
1211	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1212#endif
1213
1214	while ( !lck_rw_grab_shared(lck)) {
1215
1216#if	CONFIG_DTRACE
1217		if (dtrace_ls_initialized == FALSE) {
1218			dtrace_ls_initialized = TRUE;
1219			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1220			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1221			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1222			if (dtrace_ls_enabled) {
1223				/*
1224				 * Either sleeping or spinning is happening,
1225				 *  start a timing of our delay interval now.
1226				 */
1227				readers_at_sleep = lck->lck_rw_shared_count;
1228				wait_interval = mach_absolute_time();
1229			}
1230		}
1231#endif
1232		if (istate == -1)
1233			istate = ml_get_interrupts_enabled();
1234
1235		deadline = lck_rw_deadline_for_spin(lck);
1236
1237		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1238			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1239
1240		while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1241			lck_rw_lock_pause(istate);
1242
1243		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1244			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1245
1246		if (gotlock)
1247			break;
1248		/*
1249		 * if we get here, the deadline has expired w/o us
1250		 * being able to grab the lock for read
1251		 * check to see if we're allowed to do a thread_block
1252		 */
1253		if (lck->lck_rw_can_sleep) {
1254
1255			istate = lck_interlock_lock(lck);
1256
1257			if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1258			    ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1259
1260				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1261					     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1262
1263				lck->lck_r_waiting = TRUE;
1264
1265				res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
1266				lck_interlock_unlock(lck, istate);
1267
1268				if (res == THREAD_WAITING) {
1269					res = thread_block(THREAD_CONTINUE_NULL);
1270					slept++;
1271				}
1272				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1273					     (int)lck, res, slept, 0, 0);
1274			} else {
1275				lck->lck_rw_shared_count++;
1276				lck_interlock_unlock(lck, istate);
1277				break;
1278			}
1279		}
1280	}
1281
1282#if	CONFIG_DTRACE
1283	if (dtrace_ls_enabled == TRUE) {
1284		if (slept == 0) {
1285			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1286		} else {
1287			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1288			    mach_absolute_time() - wait_interval, 0,
1289			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1290		}
1291	}
1292	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1293#endif
1294}
1295
1296
1297/*
1298 *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1299 *	Function:
1300 *		assembly fast path code has already dropped our read
1301 *		count and determined that someone else owns 'lck_rw_want_upgrade'
 *		if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'
1303 *		all we need to do here is determine if a wakeup is needed
1304 */
1305boolean_t
1306lck_rw_lock_shared_to_exclusive_failure(
1307	lck_rw_t	*lck,
1308	int		prior_lock_state)
1309{
1310	lck_rw_t	*fake_lck;
1311	thread_t	thread = current_thread();
1312	uint32_t	rwlock_count;
1313
1314	/* Check if dropping the lock means that we need to unpromote */
1315	rwlock_count = thread->rwlock_count--;
1316#if MACH_LDEBUG
1317	if (rwlock_count == 0) {
1318		panic("rw lock count underflow for thread %p", thread);
1319	}
1320#endif
1321	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1322		/* sched_flags checked without lock, but will be rechecked while clearing */
1323		lck_rw_clear_promotion(thread);
1324	}
1325
	/*
	 * prior_lock_state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond what's defined
	 * in the first word of a lck_rw_t
	 */
1332	fake_lck = (lck_rw_t *)&prior_lock_state;
1333
1334	if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1335		/*
1336		 *	Someone else has requested upgrade.
1337		 *	Since we've released the read lock, wake
1338		 *	him up if he's blocked waiting
1339		 */
1340		thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1341	}
1342	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1343		     (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1344
1345	return (FALSE);
1346}
1347
1348
1349/*
 *	Routine:	lck_rw_lock_shared_to_exclusive_success
1351 *	Function:
1352 *		assembly fast path code has already dropped our read
1353 *		count and successfully acquired 'lck_rw_want_upgrade'
1354 *		we just need to wait for the rest of the readers to drain
1355 *		and then we can return as the exclusive holder of this lock
1356 */
1357boolean_t
1358lck_rw_lock_shared_to_exclusive_success(
1359	lck_rw_t	*lck)
1360{
1361	uint64_t	deadline = 0;
1362	int		slept = 0;
1363	int		still_shared = 0;
1364	wait_result_t	res;
1365	boolean_t	istate = -1;
1366
1367#if	CONFIG_DTRACE
1368	uint64_t wait_interval = 0;
1369	int readers_at_sleep = 0;
1370	boolean_t dtrace_ls_initialized = FALSE;
1371	boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1372#endif
1373
1374	while (lck->lck_rw_shared_count != 0) {
1375
1376#if	CONFIG_DTRACE
1377		if (dtrace_ls_initialized == FALSE) {
1378			dtrace_ls_initialized = TRUE;
1379			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1380			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1381			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1382			if (dtrace_ls_enabled) {
1383				/*
1384				 * Either sleeping or spinning is happening,
1385				 *  start a timing of our delay interval now.
1386				 */
1387				readers_at_sleep = lck->lck_rw_shared_count;
1388				wait_interval = mach_absolute_time();
1389			}
1390		}
1391#endif
1392		if (istate == -1)
1393			istate = ml_get_interrupts_enabled();
1394
1395		deadline = lck_rw_deadline_for_spin(lck);
1396
1397		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1398			     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1399
1400		while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1401			lck_rw_lock_pause(istate);
1402
1403		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1404			     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1405
1406		if ( !still_shared)
1407			break;
1408		/*
1409		 * if we get here, the deadline has expired w/o
1410		 * the rw_shared_count having drained to 0
1411		 * check to see if we're allowed to do a thread_block
1412		 */
1413		if (lck->lck_rw_can_sleep) {
1414
1415			istate = lck_interlock_lock(lck);
1416
1417			if (lck->lck_rw_shared_count != 0) {
1418				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1419					     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1420
1421				lck->lck_w_waiting = TRUE;
1422
1423				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1424				lck_interlock_unlock(lck, istate);
1425
1426				if (res == THREAD_WAITING) {
1427					res = thread_block(THREAD_CONTINUE_NULL);
1428					slept++;
1429				}
1430				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1431					     (int)lck, res, slept, 0, 0);
1432			} else {
1433				lck_interlock_unlock(lck, istate);
1434				break;
1435			}
1436		}
1437	}
1438#if	CONFIG_DTRACE
	/*
	 * Record a spin or block event only if dtrace was enabled when we
	 * started waiting above; 'slept' tells us which path we took.
	 */
1442	if (dtrace_ls_enabled == TRUE) {
1443		if (slept == 0) {
1444			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1445		} else {
1446			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1447			    mach_absolute_time() - wait_interval, 1,
1448			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1449		}
1450	}
1451	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1452#endif
1453	return (TRUE);
1454}
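
/*
 *	Caller-side pattern for the upgrade path (sketch, not from this file):
 *	lck_rw_lock_shared_to_exclusive() returns FALSE when the failure case
 *	above ran, in which case the shared hold has already been dropped and
 *	the caller must re-acquire and re-validate.
 *
 *		lck_rw_lock_shared(lck);
 *		// ... decide an update is needed ...
 *		if ( !lck_rw_lock_shared_to_exclusive(lck)) {
 *			lck_rw_lock_exclusive(lck);	// lost the race; start over
 *			// ... re-validate the state examined under the shared hold ...
 *		}
 *		// ... perform the update ...
 *		lck_rw_unlock_exclusive(lck);
 */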
1455
1456
1457/*
1458 *      Routine:        lck_rw_lock_exclusive_to_shared
1459 * 	Function:
1460 *		assembly fast path has already dropped
1461 *		our exclusive state and bumped lck_rw_shared_count
1462 *		all we need to do here is determine if anyone
1463 *		needs to be awakened.
1464 */
1465void
1466lck_rw_lock_exclusive_to_shared_gen(
1467	lck_rw_t	*lck,
1468	int		prior_lock_state)
1469{
1470	lck_rw_t	*fake_lck;
1471
	/*
	 * prior_lock_state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond what's defined
	 * in the first word of a lck_rw_t
	 */
1478	fake_lck = (lck_rw_t *)&prior_lock_state;
1479
1480	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1481			     (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1482
1483	/*
1484	 * don't wake up anyone waiting to take the lock exclusively
1485	 * since we hold a read count... when the read count drops to 0,
1486	 * the writers will be woken.
1487	 *
1488	 * wake up any waiting readers if we don't have any writers waiting,
1489	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1490	 */
1491	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1492		thread_wakeup(RW_LOCK_READER_EVENT(lck));
1493
1494	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1495			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1496
1497#if CONFIG_DTRACE
1498	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1499#endif
1500}
1501
1502
1503/*
1504 *      Routine:        lck_rw_try_lock
1505 */
1506boolean_t
1507lck_rw_try_lock(
1508	lck_rw_t	*lck,
1509	lck_rw_type_t	lck_rw_type)
1510{
1511	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1512		return(lck_rw_try_lock_shared(lck));
1513	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1514		return(lck_rw_try_lock_exclusive(lck));
1515	else
1516		panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1517	return(FALSE);
1518}
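
/*
 *	Example (sketch): the try variants return FALSE instead of blocking,
 *	so a hypothetical caller that must not sleep can defer its work.
 *
 *		if (lck_rw_try_lock(lck, LCK_RW_TYPE_EXCLUSIVE)) {
 *			// ... brief update ...
 *			lck_rw_unlock(lck, LCK_RW_TYPE_EXCLUSIVE);
 *		} else {
 *			// ... queue the work for later ...
 *		}
 */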
1519
1520
1521void
1522lck_rw_assert(
1523	lck_rw_t	*lck,
1524	unsigned int	type)
1525{
1526	switch (type) {
1527	case LCK_RW_ASSERT_SHARED:
1528		if (lck->lck_rw_shared_count != 0) {
1529			return;
1530		}
1531		break;
1532	case LCK_RW_ASSERT_EXCLUSIVE:
1533		if ((lck->lck_rw_want_write ||
1534		     lck->lck_rw_want_upgrade) &&
1535		    lck->lck_rw_shared_count == 0) {
1536			return;
1537		}
1538		break;
1539	case LCK_RW_ASSERT_HELD:
1540		if (lck->lck_rw_want_write ||
1541		    lck->lck_rw_want_upgrade ||
1542		    lck->lck_rw_shared_count != 0) {
1543			return;
1544		}
1545		break;
1546	case LCK_RW_ASSERT_NOTHELD:
1547		if (!(lck->lck_rw_want_write ||
1548			  lck->lck_rw_want_upgrade ||
1549			  lck->lck_rw_shared_count != 0)) {
1550			return;
1551		}
1552		break;
1553	default:
1554		break;
1555	}
1556
1557	panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1558}
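
/*
 *	Example (sketch): callers typically assert their locking contract on
 *	entry to routines that require it.  'my_object_t' and its rw_lock
 *	field are hypothetical.
 *
 *		static void
 *		my_object_update(my_object_t *obj)
 *		{
 *			lck_rw_assert(&obj->rw_lock, LCK_RW_ASSERT_EXCLUSIVE);
 *			// ... safe to mutate obj here ...
 *		}
 */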
1559
1560/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1561void
1562lck_rw_clear_promotions_x86(thread_t thread)
1563{
1564#if MACH_LDEBUG
1565	/* It's fatal to leave a RW lock locked and return to userspace */
1566	panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1567#else
1568	/* Paper over the issue */
1569	thread->rwlock_count = 0;
1570	lck_rw_clear_promotion(thread);
1571#endif
1572}
1573
1574
1575#ifdef	MUTEX_ZONE
1576extern zone_t lck_mtx_zone;
1577#endif
1578/*
1579 *      Routine:        lck_mtx_alloc_init
1580 */
1581lck_mtx_t *
1582lck_mtx_alloc_init(
1583	lck_grp_t	*grp,
1584	lck_attr_t	*attr)
1585{
1586	lck_mtx_t	*lck;
1587#ifdef	MUTEX_ZONE
1588	if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
1589		lck_mtx_init(lck, grp, attr);
1590#else
1591	if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1592		lck_mtx_init(lck, grp, attr);
1593#endif
1594	return(lck);
1595}
1596
1597/*
1598 *      Routine:        lck_mtx_free
1599 */
1600void
1601lck_mtx_free(
1602	lck_mtx_t	*lck,
1603	lck_grp_t	*grp)
1604{
1605	lck_mtx_destroy(lck, grp);
1606#ifdef	MUTEX_ZONE
1607	zfree(lck_mtx_zone, lck);
1608#else
1609	kfree(lck, sizeof(lck_mtx_t));
1610#endif
1611}
1612
1613/*
1614 *      Routine:        lck_mtx_ext_init
1615 */
1616static void
1617lck_mtx_ext_init(
1618	lck_mtx_ext_t	*lck,
1619	lck_grp_t	*grp,
1620	lck_attr_t	*attr)
1621{
1622	bzero((void *)lck, sizeof(lck_mtx_ext_t));
1623
1624	if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1625		lck->lck_mtx_deb.type = MUTEX_TAG;
1626		lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
1627	}
1628
1629	lck->lck_mtx_grp = grp;
1630
1631	if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
1632		lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
1633
1634	lck->lck_mtx.lck_mtx_is_ext = 1;
1635	lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
1636}
1637
1638/*
1639 *      Routine:        lck_mtx_init
1640 */
1641void
1642lck_mtx_init(
1643	lck_mtx_t	*lck,
1644	lck_grp_t	*grp,
1645	lck_attr_t	*attr)
1646{
1647	lck_mtx_ext_t	*lck_ext;
1648	lck_attr_t	*lck_attr;
1649
1650	if (attr != LCK_ATTR_NULL)
1651		lck_attr = attr;
1652	else
1653		lck_attr = &LockDefaultLckAttr;
1654
1655	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1656		if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
1657			lck_mtx_ext_init(lck_ext, grp, lck_attr);
1658			lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1659			lck->lck_mtx_ptr = lck_ext;
1660		}
1661	} else {
1662		lck->lck_mtx_owner = 0;
1663		lck->lck_mtx_state = 0;
1664	}
1665	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
1666	lck_grp_reference(grp);
1667	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1668}
1669
1670/*
1671 *      Routine:        lck_mtx_init_ext
1672 */
1673void
1674lck_mtx_init_ext(
1675	lck_mtx_t	*lck,
1676	lck_mtx_ext_t	*lck_ext,
1677	lck_grp_t	*grp,
1678	lck_attr_t	*attr)
1679{
1680	lck_attr_t	*lck_attr;
1681
1682	if (attr != LCK_ATTR_NULL)
1683		lck_attr = attr;
1684	else
1685		lck_attr = &LockDefaultLckAttr;
1686
1687	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1688		lck_mtx_ext_init(lck_ext, grp, lck_attr);
1689		lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1690		lck->lck_mtx_ptr = lck_ext;
1691	} else {
1692		lck->lck_mtx_owner = 0;
1693		lck->lck_mtx_state = 0;
1694	}
1695	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
1696
1697	lck_grp_reference(grp);
1698	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1699}
1700
1701/*
1702 *      Routine:        lck_mtx_destroy
1703 */
1704void
1705lck_mtx_destroy(
1706	lck_mtx_t	*lck,
1707	lck_grp_t	*grp)
1708{
1709	boolean_t lck_is_indirect;
1710
1711	if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
1712		return;
1713#if MACH_LDEBUG
1714	lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
1715#endif
1716	lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
1717
1718	lck_mtx_lock_mark_destroyed(lck);
1719
1720	if (lck_is_indirect)
1721		kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
1722	lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1723	lck_grp_deallocate(grp);
1724	return;
1725}
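
/*
 *	Lifecycle sketch (illustrative; my_grp/my_mtx are invented names, and
 *	lck_grp_alloc_init()/lck_mtx_lock()/lck_grp_free() live outside this
 *	file):
 *
 *		lck_grp_t	*my_grp = lck_grp_alloc_init("my_subsystem", LCK_GRP_ATTR_NULL);
 *		lck_mtx_t	*my_mtx = lck_mtx_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *		lck_mtx_lock(my_mtx);		// may block; fast path is in assembly
 *		// ... critical section ...
 *		lck_mtx_unlock(my_mtx);
 *
 *		lck_mtx_free(my_mtx, my_grp);	// calls lck_mtx_destroy() internally
 *		lck_grp_free(my_grp);
 */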
1726
1727
1728#define	LCK_MTX_LCK_WAIT_CODE		0x20
1729#define	LCK_MTX_LCK_WAKEUP_CODE		0x21
1730#define	LCK_MTX_LCK_SPIN_CODE		0x22
1731#define	LCK_MTX_LCK_ACQUIRE_CODE	0x23
1732#define LCK_MTX_LCK_DEMOTE_CODE		0x24
1733
1734
1735/*
1736 * Routine: 	lck_mtx_unlock_wakeup_x86
1737 *
 * Invoked on unlock when there is
 * contention (i.e. the assembly routine sees that
 * mutex->lck_mtx_waiters != 0 or
 * mutex->lck_mtx_promoted != 0)...
 *
 * neither the mutex nor the interlock is held
1744 */
1745void
1746lck_mtx_unlock_wakeup_x86 (
1747	lck_mtx_t	*mutex,
1748	int		prior_lock_state)
1749{
1750	lck_mtx_t	fake_lck;
1751
	/*
	 * prior_lock_state is a snapshot of the 2nd word of the
	 * lock in question... we'll fake up a lock with the bits
	 * copied into place and carefully not access anything
	 * beyond what's defined in the second word of a lck_mtx_t
	 */
1758	fake_lck.lck_mtx_state = prior_lock_state;
1759
1760	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
1761		     mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
1762
1763	if (__probable(fake_lck.lck_mtx_waiters)) {
1764		if (fake_lck.lck_mtx_waiters > 1)
1765			thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri);
1766		else
1767			thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
1768	}
1769
1770	if (__improbable(fake_lck.lck_mtx_promoted)) {
1771		thread_t	thread = current_thread();
1772
1773
1774		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
1775			     thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
1776
1777		if (thread->promotions > 0) {
1778			spl_t	s = splsched();
1779
1780			thread_lock(thread);
1781
1782			if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
1783
1784				thread->sched_flags &= ~TH_SFLAG_PROMOTED;
1785
1786				if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
1787					/* Thread still has a RW lock promotion */
1788				} else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1789					KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1790							      thread->sched_pri, DEPRESSPRI, 0, mutex, 0);
1791
1792					set_sched_pri(thread, DEPRESSPRI);
1793				}
1794				else {
1795					if (thread->priority < thread->sched_pri) {
1796						KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1797								      thread->sched_pri, thread->priority, 0, mutex, 0);
1798
1799						SCHED(compute_priority)(thread, FALSE);
1800					}
1801				}
1802			}
1803			thread_unlock(thread);
1804			splx(s);
1805		}
1806	}
1807	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
1808		     mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1809}
1810
1811
1812/*
1813 * Routine: 	lck_mtx_lock_acquire_x86
1814 *
1815 * Invoked on acquiring the mutex when there is
 * contention (i.e. the assembly routine sees that
 * mutex->lck_mtx_waiters != 0 or
1818 * thread->was_promoted_on_wakeup != 0)...
1819 *
1820 * mutex is owned...  interlock is held... preemption is disabled
1821 */
1822void
1823lck_mtx_lock_acquire_x86(
1824	lck_mtx_t	*mutex)
1825{
	thread_t	thread;
	integer_t	priority;
	spl_t		s;

	thread = (thread_t)mutex->lck_mtx_owner;	/* faster than current_thread() */

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
		     mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

	if (mutex->lck_mtx_waiters)
		priority = mutex->lck_mtx_pri;
	else
		priority = 0;
1839
1840	if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
1841
1842		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
1843				      thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0);
1844
1845		s = splsched();
1846		thread_lock(thread);
1847
1848		if (thread->sched_pri < priority) {
1849			/* Do not promote past promotion ceiling */
1850			assert(priority <= MAXPRI_PROMOTE);
1851			set_sched_pri(thread, priority);
1852		}
1853		if (mutex->lck_mtx_promoted == 0) {
1854			mutex->lck_mtx_promoted = 1;
1855
1856			thread->promotions++;
1857			thread->sched_flags |= TH_SFLAG_PROMOTED;
1858		}
1859		thread->was_promoted_on_wakeup = 0;
1860
1861		thread_unlock(thread);
1862		splx(s);
1863	}
1864	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
1865		     mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1866}
1867
1868
1869
1870/*
1871 * Routine: 	lck_mtx_lock_spinwait_x86
1872 *
1873 * Invoked trying to acquire a mutex when there is contention but
1874 * the holder is running on another processor. We spin for up to a maximum
1875 * time waiting for the lock to be released.
1876 *
1877 * Called with the interlock unlocked.
1878 * returns 0 if mutex acquired
1879 * returns 1 if we spun
1880 * returns 2 if we didn't spin due to the holder not running
1881 */
1882int
1883lck_mtx_lock_spinwait_x86(
1884	lck_mtx_t	*mutex)
1885{
1886	thread_t	holder;
1887	uint64_t	deadline;
1888	int		retval = 1;
1889	int		loopcount = 0;
1890
1891
1892	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
1893		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
1894
1895	deadline = mach_absolute_time() + MutexSpin;
1896
1897	/*
1898	 * Spin while:
1899	 *   - mutex is locked, and
	 *   - it's locked as a spin lock, and
1901	 *   - owner is running on another processor, and
1902	 *   - owner (processor) is not idling, and
1903	 *   - we haven't spun for long enough.
1904	 */
1905	do {
1906		if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
1907			retval = 0;
1908			break;
1909		}
1910		if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
1911
1912			if ( !(holder->machine.specFlags & OnProc) ||
1913			     (holder->state & TH_IDLE)) {
1914				if (loopcount == 0)
1915					retval = 2;
1916				break;
1917			}
1918		}
1919		cpu_pause();
1920
1921		loopcount++;
1922
1923	} while (mach_absolute_time() < deadline);
1924
1925
1926#if	CONFIG_DTRACE
1927	/*
1928	 * We've already kept a count via deadline of how long we spun.
1929	 * If dtrace is active, then we compute backwards to decide how
1930	 * long we spun.
1931	 *
1932	 * Note that we record a different probe id depending on whether
1933	 * this is a direct or indirect mutex.  This allows us to
1934	 * penalize only lock groups that have debug/stats enabled
1935	 * with dtrace processing if desired.
1936	 */
1937	if (__probable(mutex->lck_mtx_is_ext == 0)) {
1938		LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
1939		    mach_absolute_time() - (deadline - MutexSpin));
1940	} else {
1941		LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
1942		    mach_absolute_time() - (deadline - MutexSpin));
1943	}
1944	/* The lockstat acquire event is recorded by the assembly code beneath us. */
1945#endif
1946
1947	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
1948		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);
1949
1950	return retval;
1951}
1952
1953
1954
1955/*
1956 * Routine: 	lck_mtx_lock_wait_x86
1957 *
1958 * Invoked in order to wait on contention.
1959 *
1960 * Called with the interlock locked and
1961 * preemption disabled...
1962 * returns it unlocked and with preemption enabled
1963 */
1964void
1965lck_mtx_lock_wait_x86 (
1966	lck_mtx_t	*mutex)
1967{
1968	thread_t	self = current_thread();
1969	thread_t	holder;
1970	integer_t	priority;
1971	spl_t		s;
1972#if	CONFIG_DTRACE
1973	uint64_t	sleep_start = 0;
1974
1975	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
1976		sleep_start = mach_absolute_time();
1977	}
1978#endif
1979	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
1980		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
1981
1982	priority = self->sched_pri;
1983
1984	if (priority < self->priority)
1985		priority = self->priority;
1986	if (priority < BASEPRI_DEFAULT)
1987		priority = BASEPRI_DEFAULT;
1988
1989	/* Do not promote past promotion ceiling */
1990	priority = MIN(priority, MAXPRI_PROMOTE);
1991
1992	if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
1993		mutex->lck_mtx_pri = priority;
1994	mutex->lck_mtx_waiters++;
1995
1996	if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
1997	     holder->sched_pri < mutex->lck_mtx_pri ) {
1998		s = splsched();
1999		thread_lock(holder);
2000
2001		/* holder priority may have been bumped by another thread
2002		 * before thread_lock was taken
2003		 */
2004		if (holder->sched_pri < mutex->lck_mtx_pri) {
2005			KERNEL_DEBUG_CONSTANT(
2006				MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
2007				holder->sched_pri, priority, thread_tid(holder), mutex, 0);
2008			/* Assert that we're not altering the priority of a
2009			 * thread above the MAXPRI_PROMOTE band
2010			 */
2011			assert(holder->sched_pri < MAXPRI_PROMOTE);
2012			set_sched_pri(holder, priority);
2013
2014			if (mutex->lck_mtx_promoted == 0) {
2015				holder->promotions++;
2016				holder->sched_flags |= TH_SFLAG_PROMOTED;
2017
2018				mutex->lck_mtx_promoted = 1;
2019			}
2020		}
2021		thread_unlock(holder);
2022		splx(s);
2023	}
2024	assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);
2025
2026	lck_mtx_ilk_unlock(mutex);
2027
2028	thread_block(THREAD_CONTINUE_NULL);
2029
2030	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
2031		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2032
2033#if	CONFIG_DTRACE
2034	/*
2035	 * Record the Dtrace lockstat probe for blocking, block time
2036	 * measured from when we were entered.
2037	 */
2038	if (sleep_start) {
2039		if (mutex->lck_mtx_is_ext == 0) {
2040			LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
2041			    mach_absolute_time() - sleep_start);
2042		} else {
2043			LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
2044			    mach_absolute_time() - sleep_start);
2045		}
2046	}
2047#endif
2048}
2049