1/*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 *	File:	kern/lock.c
58 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
59 *	Date:	1985
60 *
61 *	Locking primitives implementation
62 */
63
64#include <mach_ldebug.h>
65
66#include <kern/lock.h>
67#include <kern/locks.h>
68#include <kern/kalloc.h>
69#include <kern/misc_protos.h>
70#include <kern/thread.h>
71#include <kern/processor.h>
72#include <kern/cpu_data.h>
73#include <kern/cpu_number.h>
74#include <kern/sched_prim.h>
75#include <kern/xpr.h>
76#include <kern/debug.h>
77#include <string.h>
78
79#include <i386/machine_routines.h> /* machine_timeout_suspended() */
80#include <machine/machine_cpu.h>
81#include <i386/mp.h>
82
83#include <sys/kdebug.h>
84#include <mach/branch_predicates.h>
85
86/*
87 * We need only enough declarations from the BSD-side to be able to
88 * test if our probe is active, and to call __dtrace_probe().  Setting
89 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
90 */
91#if	CONFIG_DTRACE
92#define NEED_DTRACE_DEFS
93#include <../bsd/sys/lockstat.h>
94#endif
95
96#define	LCK_RW_LCK_EXCLUSIVE_CODE	0x100
97#define	LCK_RW_LCK_EXCLUSIVE1_CODE	0x101
98#define	LCK_RW_LCK_SHARED_CODE		0x102
99#define	LCK_RW_LCK_SH_TO_EX_CODE	0x103
100#define	LCK_RW_LCK_SH_TO_EX1_CODE	0x104
101#define	LCK_RW_LCK_EX_TO_SH_CODE	0x105
102
103#define LCK_RW_LCK_EX_WRITER_SPIN_CODE	0x106
104#define LCK_RW_LCK_EX_WRITER_WAIT_CODE	0x107
105#define LCK_RW_LCK_EX_READER_SPIN_CODE	0x108
106#define LCK_RW_LCK_EX_READER_WAIT_CODE	0x109
107#define LCK_RW_LCK_SHARED_SPIN_CODE	0x110
108#define LCK_RW_LCK_SHARED_WAIT_CODE	0x111
109#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE	0x112
110#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE	0x113
111
112
113#define	ANY_LOCK_DEBUG	(USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
114
unsigned int LcksOpts=0;	/* global lock-package option flags (presumably set from a boot-arg; 0 = defaults — verify against caller) */
116
117/* Forwards */
118
119#if	USLOCK_DEBUG
120/*
121 *	Perform simple lock checks.
122 */
int	uslock_check = 1;		/* nonzero => perform usimple_lock debug checking */
int	max_lock_loops	= 100000000;	/* presumably a spin-iteration cap before declaring a hang — not referenced in this chunk */
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
127#endif	/* USLOCK_DEBUG */
128
129
130/*
131 *	We often want to know the addresses of the callers
132 *	of the various lock routines.  However, this information
133 *	is only used for debugging and statistics.
134 */
135typedef void	*pc_t;
136#define	INVALID_PC	((void *) VM_MAX_KERNEL_ADDRESS)
137#define	INVALID_THREAD	((void *) VM_MAX_KERNEL_ADDRESS)
138#if	ANY_LOCK_DEBUG
139#define	OBTAIN_PC(pc)	((pc) = GET_RETURN_PC())
140#define DECL_PC(pc)	pc_t pc;
141#else	/* ANY_LOCK_DEBUG */
142#define DECL_PC(pc)
143#ifdef	lint
144/*
145 *	Eliminate lint complaints about unused local pc variables.
146 */
147#define	OBTAIN_PC(pc)	++pc
148#else	/* lint */
149#define	OBTAIN_PC(pc)
150#endif	/* lint */
151#endif	/* USLOCK_DEBUG */
152
153
154/*
155 *	Portable lock package implementation of usimple_locks.
156 */
157
158#if	USLOCK_DEBUG
159#define	USLDBG(stmt)	stmt
160void		usld_lock_init(usimple_lock_t, unsigned short);
161void		usld_lock_pre(usimple_lock_t, pc_t);
162void		usld_lock_post(usimple_lock_t, pc_t);
163void		usld_unlock(usimple_lock_t, pc_t);
164void		usld_lock_try_pre(usimple_lock_t, pc_t);
165void		usld_lock_try_post(usimple_lock_t, pc_t);
166int		usld_lock_common_checks(usimple_lock_t, char *);
167#else	/* USLOCK_DEBUG */
168#define	USLDBG(stmt)
169#endif	/* USLOCK_DEBUG */
170
171
172extern int lck_rw_grab_want(lck_rw_t *lck);
173extern int lck_rw_grab_shared(lck_rw_t *lck);
174extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
175
176
177/*
178 * Forward definitions
179 */
180
181void lck_rw_lock_shared_gen(
182	lck_rw_t	*lck);
183
184void lck_rw_lock_exclusive_gen(
185	lck_rw_t	*lck);
186
187boolean_t lck_rw_lock_shared_to_exclusive_success(
188	lck_rw_t	*lck);
189
190boolean_t lck_rw_lock_shared_to_exclusive_failure(
191	lck_rw_t	*lck,
192	int		prior_lock_state);
193
194void lck_rw_lock_exclusive_to_shared_gen(
195	lck_rw_t	*lck,
196	int		prior_lock_state);
197
198lck_rw_type_t lck_rw_done_gen(
199	lck_rw_t	*lck,
200	int		prior_lock_state);
201
202/*
203 *      Routine:        lck_spin_alloc_init
204 */
205lck_spin_t *
206lck_spin_alloc_init(
207	lck_grp_t	*grp,
208	lck_attr_t	*attr)
209{
210	lck_spin_t	*lck;
211
212	if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
213		lck_spin_init(lck, grp, attr);
214
215	return(lck);
216}
217
218/*
219 *      Routine:        lck_spin_free
220 */
void
lck_spin_free(
	lck_spin_t	*lck,
	lck_grp_t	*grp)
{
	/* Tear down group accounting first, then release the storage. */
	lck_spin_destroy(lck, grp);
	kfree(lck, sizeof(lck_spin_t));
}
229
230/*
231 *      Routine:        lck_spin_init
232 */
void
lck_spin_init(
	lck_spin_t	*lck,
	lck_grp_t	*grp,
	__unused lck_attr_t	*attr)
{
	/* Spin locks are a thin veneer over usimple locks; attrs are ignored. */
	usimple_lock_init((usimple_lock_t) lck, 0);
	lck_grp_reference(grp);			/* account this lock against its group */
	lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
}
243
244/*
245 *      Routine:        lck_spin_destroy
246 */
247void
248lck_spin_destroy(
249	lck_spin_t	*lck,
250	lck_grp_t	*grp)
251{
252	if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
253		return;
254	lck->interlock = LCK_SPIN_TAG_DESTROYED;
255	lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
256	lck_grp_deallocate(grp);
257	return;
258}
259
260/*
261 *      Routine:        lck_spin_lock
262 */
void
lck_spin_lock(
	lck_spin_t	*lck)
{
	/* Delegate to the usimple lock layer (returns with preemption disabled). */
	usimple_lock((usimple_lock_t) lck);
}
269
270/*
271 *      Routine:        lck_spin_unlock
272 */
void
lck_spin_unlock(
	lck_spin_t	*lck)
{
	/* Delegate to the usimple lock layer (re-enables preemption). */
	usimple_unlock((usimple_lock_t) lck);
}
279
280
281/*
282 *      Routine:        lck_spin_try_lock
283 */
boolean_t
lck_spin_try_lock(
	lck_spin_t	*lck)
{
	/* Non-blocking acquire; TRUE on success, FALSE if the lock was held. */
	return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
}
290
291/*
292 *	Initialize a usimple_lock.
293 *
294 *	No change in preemption state.
295 */
void
usimple_lock_init(
	usimple_lock_t	l,
	__unused unsigned short	tag)
{
#ifndef	MACHINE_SIMPLE_LOCK
	/* Set up debug bookkeeping (no-op unless USLOCK_DEBUG), then the HW lock. */
	USLDBG(usld_lock_init(l, tag));
	hw_lock_init(&l->interlock);
#else
	simple_lock_init((simple_lock_t)l,tag);
#endif
}
308
/* CPU last found running the apparent owner of a timed-out spinlock (~0 = none yet). */
volatile uint32_t spinlock_owner_cpu = ~0;
/* The lock involved in the most recent spinlock timeout, published for debugging. */
volatile usimple_lock_t spinlock_timed_out;

/*
 * Locate the CPU whose active thread matches thread_addr (the apparent
 * owner of a timed-out spinlock).  If the owner is running on a remote
 * CPU, send it an NMI so its state gets captured, and wait — bounded by
 * 2 * LockTimeOut — for the NMI to be acknowledged.  Returns the CPU
 * number found, or the prior value of spinlock_owner_cpu (initially ~0)
 * if no CPU's active thread matched.
 */
static uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
	uint64_t deadline;
	uint32_t i;

	for (i = 0; i < real_ncpus; i++) {
		if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
			spinlock_owner_cpu = i;
			if ((uint32_t) cpu_number() == i)
				break;	/* owner is this CPU; nothing to NMI */
			cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
			cpu_NMI_interrupt(i);
			deadline = mach_absolute_time() + (LockTimeOut * 2);
			while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
				cpu_pause();
			break;
		}
	}

	return spinlock_owner_cpu;
}
332
333/*
334 *	Acquire a usimple_lock.
335 *
336 *	Returns with preemption disabled.  Note
337 *	that the hw_lock routines are responsible for
338 *	maintaining preemption state.
339 */
void
usimple_lock(
	usimple_lock_t	l)
{
#ifndef	MACHINE_SIMPLE_LOCK
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_pre(l, pc));

	/*
	 * Try the hardware lock with a TSC-based timeout.  If it times out,
	 * keep retrying for as long as machine timeouts are suspended
	 * (e.g. while under a debugger); preemption is briefly re-enabled
	 * between attempts so this CPU does not wedge.
	 */
	if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0))	{
		boolean_t uslock_acquired = FALSE;
		while (machine_timeout_suspended()) {
			enable_preemption();
			if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
				break;
		}

		if (uslock_acquired == FALSE) {
			uint32_t lock_cpu;
			uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
			spinlock_timed_out = l;	/* publish the culprit for the NMI/debug path */
			lock_cpu = spinlock_timeout_NMI(lowner);
			panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner,  current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
		}
	}
	USLDBG(usld_lock_post(l, pc));
#else
	simple_lock((simple_lock_t)l);
#endif
}
371
372
373/*
374 *	Release a usimple_lock.
375 *
376 *	Returns with preemption enabled.  Note
377 *	that the hw_lock routines are responsible for
378 *	maintaining preemption state.
379 */
void
usimple_unlock(
	usimple_lock_t	l)
{
#ifndef	MACHINE_SIMPLE_LOCK
	DECL_PC(pc);

	OBTAIN_PC(pc);
	/* Debug checks run before the hardware release, while we still own it. */
	USLDBG(usld_unlock(l, pc));
	hw_lock_unlock(&l->interlock);
#else
	simple_unlock_rwmb((simple_lock_t)l);
#endif
}
394
395
396/*
397 *	Conditionally acquire a usimple_lock.
398 *
399 *	On success, returns with preemption disabled.
400 *	On failure, returns with preemption in the same state
401 *	as when first invoked.  Note that the hw_lock routines
402 *	are responsible for maintaining preemption state.
403 *
404 *	XXX No stats are gathered on a miss; I preserved this
405 *	behavior from the original assembly-language code, but
406 *	doesn't it make sense to log misses?  XXX
407 */
unsigned int
usimple_lock_try(
	usimple_lock_t	l)
{
#ifndef	MACHINE_SIMPLE_LOCK
	unsigned int	success;
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_try_pre(l, pc));
	/* Post-acquisition debug bookkeeping only happens on success. */
	if ((success = hw_lock_try(&l->interlock))) {
		USLDBG(usld_lock_try_post(l, pc));
	}
	return success;
#else
	return(simple_lock_try((simple_lock_t)l));
#endif
}
426
427#if	USLOCK_DEBUG
428/*
429 *	States of a usimple_lock.  The default when initializing
430 *	a usimple_lock is setting it up for debug checking.
431 */
432#define	USLOCK_CHECKED		0x0001		/* lock is being checked */
433#define	USLOCK_TAKEN		0x0002		/* lock has been taken */
434#define	USLOCK_INIT		0xBAA0		/* lock has been initialized */
435#define	USLOCK_INITIALIZED	(USLOCK_INIT|USLOCK_CHECKED)
436#define	USLOCK_CHECKING(l)	(uslock_check &&			\
437				 ((l)->debug.state & USLOCK_CHECKED))
438
439/*
440 *	Trace activities of a particularly interesting lock.
441 */
442void	usl_trace(usimple_lock_t, int, pc_t, const char *);
443
444
445/*
446 *	Initialize the debugging information contained
447 *	in a usimple_lock.
448 */
449void
450usld_lock_init(
451	usimple_lock_t	l,
452	__unused unsigned short	tag)
453{
454	if (l == USIMPLE_LOCK_NULL)
455		panic("lock initialization:  null lock pointer");
456	l->lock_type = USLOCK_TAG;
457	l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
458	l->debug.lock_cpu = l->debug.unlock_cpu = 0;
459	l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
460	l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
461	l->debug.duration[0] = l->debug.duration[1] = 0;
462	l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
463	l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
464	l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
465}
466
467
468/*
469 *	These checks apply to all usimple_locks, not just
470 *	those with USLOCK_CHECKED turned on.
471 */
int
usld_lock_common_checks(
	usimple_lock_t	l,
	char		*caller)
{
	/* Hard failures: a bad pointer or corrupted/uninitialized lock panics. */
	if (l == USIMPLE_LOCK_NULL)
		panic("%s:  null lock pointer", caller);
	if (l->lock_type != USLOCK_TAG)
		panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
	if (!(l->debug.state & USLOCK_INIT))
		panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
	/* Soft result: whether full debug checking is enabled for this lock. */
	return USLOCK_CHECKING(l);
}
485
486
487/*
488 *	Debug checks on a usimple_lock just before attempting
489 *	to acquire it.
490 */
491/* ARGSUSED */
492void
493usld_lock_pre(
494	usimple_lock_t	l,
495	pc_t		pc)
496{
497	char	caller[] = "usimple_lock";
498
499
500	if (!usld_lock_common_checks(l, caller))
501		return;
502
503/*
504 *	Note that we have a weird case where we are getting a lock when we are]
505 *	in the process of putting the system to sleep. We are running with no
506 *	current threads, therefore we can't tell if we are trying to retake a lock
507 *	we have or someone on the other processor has it.  Therefore we just
508 *	ignore this test if the locking thread is 0.
509 */
510
511	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
512	    l->debug.lock_thread == (void *) current_thread()) {
513		printf("%s:  lock %p already locked (at %p) by",
514		      caller, l, l->debug.lock_pc);
515		printf(" current thread %p (new attempt at pc %p)\n",
516		       l->debug.lock_thread, pc);
517		panic("%s", caller);
518	}
519	mp_disable_preemption();
520	usl_trace(l, cpu_number(), pc, caller);
521	mp_enable_preemption();
522}
523
524
525/*
526 *	Debug checks on a usimple_lock just after acquiring it.
527 *
528 *	Pre-emption has been disabled at this point,
529 *	so we are safe in using cpu_number.
530 */
void
usld_lock_post(
	usimple_lock_t	l,
	pc_t		pc)
{
	register int	mycpu;
	char	caller[] = "successful usimple_lock";


	if (!usld_lock_common_checks(l, caller))
		return;

	/* The state must still read as initialized (modulo the TAKEN bit)... */
	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
		panic("%s:  lock %p became uninitialized",
		      caller, l);
	/* ...and nobody else may have marked it TAKEN while we held the HW lock. */
	if ((l->debug.state & USLOCK_TAKEN))
		panic("%s:  lock 0x%p became TAKEN by someone else",
		      caller, l);

	/* Record the new owner: thread, acquisition PC and CPU. */
	mycpu = cpu_number();
	l->debug.lock_thread = (void *)current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = mycpu;

	usl_trace(l, mycpu, pc, caller);
}
558
559
560/*
561 *	Debug checks on a usimple_lock just before
562 *	releasing it.  Note that the caller has not
563 *	yet released the hardware lock.
564 *
565 *	Preemption is still disabled, so there's
566 *	no problem using cpu_number.
567 */
568void
569usld_unlock(
570	usimple_lock_t	l,
571	pc_t		pc)
572{
573	register int	mycpu;
574	char	caller[] = "usimple_unlock";
575
576
577	if (!usld_lock_common_checks(l, caller))
578		return;
579
580	mycpu = cpu_number();
581
582	if (!(l->debug.state & USLOCK_TAKEN))
583		panic("%s:  lock 0x%p hasn't been taken",
584		      caller, l);
585	if (l->debug.lock_thread != (void *) current_thread())
586		panic("%s:  unlocking lock 0x%p, owned by thread %p",
587		      caller, l, l->debug.lock_thread);
588	if (l->debug.lock_cpu != mycpu) {
589		printf("%s:  unlocking lock 0x%p on cpu 0x%x",
590		       caller, l, mycpu);
591		printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
592		panic("%s", caller);
593	}
594	usl_trace(l, mycpu, pc, caller);
595
596	l->debug.unlock_thread = l->debug.lock_thread;
597	l->debug.lock_thread = INVALID_PC;
598	l->debug.state &= ~USLOCK_TAKEN;
599	l->debug.unlock_pc = pc;
600	l->debug.unlock_cpu = mycpu;
601}
602
603
604/*
605 *	Debug checks on a usimple_lock just before
606 *	attempting to acquire it.
607 *
608 *	Preemption isn't guaranteed to be disabled.
609 */
610void
611usld_lock_try_pre(
612	usimple_lock_t	l,
613	pc_t		pc)
614{
615	char	caller[] = "usimple_lock_try";
616
617	if (!usld_lock_common_checks(l, caller))
618		return;
619	mp_disable_preemption();
620	usl_trace(l, cpu_number(), pc, caller);
621	mp_enable_preemption();
622}
623
624
625/*
626 *	Debug checks on a usimple_lock just after
627 *	successfully attempting to acquire it.
628 *
629 *	Preemption has been disabled by the
630 *	lock acquisition attempt, so it's safe
631 *	to use cpu_number.
632 */
void
usld_lock_try_post(
	usimple_lock_t	l,
	pc_t		pc)
{
	register int	mycpu;
	char	caller[] = "successful usimple_lock_try";

	if (!usld_lock_common_checks(l, caller))
		return;

	/* Same invariants as usld_lock_post: still initialized, not stolen. */
	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
		panic("%s:  lock 0x%p became uninitialized",
		      caller, l);
	if ((l->debug.state & USLOCK_TAKEN))
		panic("%s:  lock 0x%p became TAKEN by someone else",
		      caller, l);

	/* Record the new owner: thread, acquisition PC and CPU. */
	mycpu = cpu_number();
	l->debug.lock_thread = (void *) current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = mycpu;

	usl_trace(l, mycpu, pc, caller);
}
659
660
661/*
662 *	For very special cases, set traced_lock to point to a
663 *	specific lock of interest.  The result is a series of
664 *	XPRs showing lock operations on that lock.  The lock_seq
665 *	value is used to show the order of those operations.
666 */
usimple_lock_t		traced_lock;	/* the one lock being traced (set by hand) */
unsigned int		lock_seq;	/* monotonically increasing trace sequence number */

void
usl_trace(
	usimple_lock_t	l,
	int		mycpu,
	pc_t		pc,
	const char *	op_name)
{
	/* Emit an XPR record only for the single lock under observation. */
	if (traced_lock == l) {
		XPR(XPR_SLOCK,
		    "seq %d, cpu %d, %s @ %x\n",
		    (uintptr_t) lock_seq, (uintptr_t) mycpu,
		    (uintptr_t) op_name, (uintptr_t) pc, 0);
		lock_seq++;
	}
}
685
686
687#endif	/* USLOCK_DEBUG */
688
689/*
690 *	Routine:	lock_alloc
691 *	Function:
692 *		Allocate a lock for external users who cannot
693 *		hard-code the structure definition into their
694 *		objects.
695 *		For now just use kalloc, but a zone is probably
696 *		warranted.
697 */
698lock_t *
699lock_alloc(
700	boolean_t	can_sleep,
701	unsigned short	tag,
702	unsigned short	tag1)
703{
704	lock_t		*l;
705
706	if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0)
707	  lock_init(l, can_sleep, tag, tag1);
708	return(l);
709}
710
711/*
712 *	Routine:	lock_free
713 *	Function:
714 *		Free a lock allocated for external users.
715 *		For now just use kfree, but a zone is probably
716 *		warranted.
717 */
void
lock_free(
	lock_t		*l)
{
	/* Counterpart of lock_alloc(); simply returns the storage to the heap. */
	kfree(l, sizeof(lock_t));
}
724
725
726/*
727 *	Routine:	lock_init
728 *	Function:
729 *		Initialize a lock; required before use.
730 *		Note that clients declare the "struct lock"
731 *		variables and then initialize them, rather
732 *		than getting a new one from this module.
733 */
void
lock_init(
	lock_t		*l,
	boolean_t	can_sleep,
	__unused unsigned short	tag,
	__unused unsigned short	tag1)
{
	/* lock_t is laid out as an rw lock; set every field to its idle state. */
	hw_lock_byte_init(&l->lck_rw_interlock);
	l->lck_rw_want_write = FALSE;
	l->lck_rw_want_upgrade = FALSE;
	l->lck_rw_shared_count = 0;
	l->lck_rw_can_sleep = can_sleep;
	l->lck_rw_tag = tag;
	l->lck_rw_priv_excl = 1;	/* writers take priority over new readers */
	l->lck_r_waiting = l->lck_w_waiting = 0;
}
750
751
752/*
753 *	Sleep locks.  These use the same data structure and algorithm
754 *	as the spin locks, but the process sleeps while it is waiting
755 *	for the lock.  These work on uniprocessor systems.
756 */
757
758#define DECREMENTER_TIMEOUT 1000000
759
void
lock_write(
	register lock_t	* l)
{
	/* Legacy wrapper: exclusive (write) acquisition. */
	lck_rw_lock_exclusive(l);
}
766
void
lock_done(
	register lock_t	* l)
{
	/* Legacy wrapper: release; the held-type result is not needed here. */
	(void) lck_rw_done(l);
}
773
void
lock_read(
	register lock_t	* l)
{
	/* Legacy wrapper: shared (read) acquisition. */
	lck_rw_lock_shared(l);
}
780
781
782/*
783 *	Routine:	lock_read_to_write
784 *	Function:
785 *		Improves a read-only lock to one with
786 *		write permission.  If another reader has
787 *		already requested an upgrade to a write lock,
788 *		no lock is held upon return.
789 *
790 *		Returns FALSE if the upgrade *failed*.
791 */
792
boolean_t
lock_read_to_write(
	register lock_t	* l)
{
	/* Legacy wrapper; FALSE means the upgrade failed and NO lock is held. */
	return lck_rw_lock_shared_to_exclusive(l);
}
799
void
lock_write_to_read(
	register lock_t	* l)
{
	/* Legacy wrapper: downgrade an exclusive hold to shared (cannot fail). */
	lck_rw_lock_exclusive_to_shared(l);
}
806
807
808
809/*
810 *      Routine:        lck_rw_alloc_init
811 */
812lck_rw_t *
813lck_rw_alloc_init(
814	lck_grp_t	*grp,
815	lck_attr_t	*attr) {
816	lck_rw_t	*lck;
817
818	if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
819		bzero(lck, sizeof(lck_rw_t));
820		lck_rw_init(lck, grp, attr);
821	}
822
823	return(lck);
824}
825
826/*
827 *      Routine:        lck_rw_free
828 */
void
lck_rw_free(
	lck_rw_t	*lck,
	lck_grp_t	*grp) {
	/* Tear down group accounting first, then release the storage. */
	lck_rw_destroy(lck, grp);
	kfree(lck, sizeof(lck_rw_t));
}
836
837/*
838 *      Routine:        lck_rw_init
839 */
void
lck_rw_init(
	lck_rw_t	*lck,
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	/* Fall back to the system-wide default attributes when none are given. */
	lck_attr_t	*lck_attr = (attr != LCK_ATTR_NULL) ?
					attr : &LockDefaultLckAttr;

	hw_lock_byte_init(&lck->lck_rw_interlock);
	lck->lck_rw_want_write = FALSE;
	lck->lck_rw_want_upgrade = FALSE;
	lck->lck_rw_shared_count = 0;
	lck->lck_rw_can_sleep = TRUE;
	lck->lck_r_waiting = lck->lck_w_waiting = 0;
	lck->lck_rw_tag = 0;
	/* priv_excl (writer priority) is on unless shared-priority was requested. */
	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
				LCK_ATTR_RW_SHARED_PRIORITY) == 0);

	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
}
862
863/*
864 *      Routine:        lck_rw_destroy
865 */
866void
867lck_rw_destroy(
868	lck_rw_t	*lck,
869	lck_grp_t	*grp)
870{
871	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
872		return;
873	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
874	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
875	lck_grp_deallocate(grp);
876	return;
877}
878
879/*
880 *	Sleep locks.  These use the same data structure and algorithm
881 *	as the spin locks, but the process sleeps while it is waiting
882 *	for the lock.  These work on uniprocessor systems.
883 */
884
885#define DECREMENTER_TIMEOUT 1000000
886
887#define RW_LOCK_READER_EVENT(x)		\
888		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
889
890#define RW_LOCK_WRITER_EVENT(x)		\
891		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
892
893/*
894 * We disable interrupts while holding the RW interlock to prevent an
895 * interrupt from exacerbating hold time.
896 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
897 */
static boolean_t
lck_interlock_lock(lck_rw_t *lck)
{
	boolean_t	istate;

	/* Interrupts must go off BEFORE taking the interlock (see comment above). */
	istate = ml_set_interrupts_enabled(FALSE);
	hw_lock_byte_lock(&lck->lck_rw_interlock);

	/* Caller passes the returned previous interrupt state to lck_interlock_unlock(). */
	return istate;
}
908
static void
lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
{
	/* Release the interlock first, then restore the saved interrupt state. */
	hw_lock_byte_unlock(&lck->lck_rw_interlock);
	ml_set_interrupts_enabled(istate);
}
915
916/*
917 * This inline is used when busy-waiting for an rw lock.
918 * If interrupts were disabled when the lock primitive was called,
919 * we poll the IPI handler for pending tlb flushes.
920 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
921 */
static inline void
lck_rw_lock_pause(boolean_t interrupts_enabled)
{
	/* With interrupts off we must poll for TLB-flush IPIs ourselves. */
	if (!interrupts_enabled)
		handle_pending_TLB_flushes();
	cpu_pause();
}
929
930
931/*
932 * compute the deadline to spin against when
933 * waiting for a change of state on a lck_rw_t
934 */
935static inline uint64_t
936lck_rw_deadline_for_spin(lck_rw_t *lck)
937{
938	if (lck->lck_rw_can_sleep) {
939		if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
940			/*
941			 * there are already threads waiting on this lock... this
942			 * implies that they have spun beyond their deadlines waiting for
943			 * the desired state to show up so we will not bother spinning at this time...
944			 *   or
945			 * the current number of threads sharing this lock exceeds our capacity to run them
946			 * concurrently and since all states we're going to spin for require the rw_shared_count
947			 * to be at 0, we'll not bother spinning since the latency for this to happen is
948			 * unpredictable...
949			 */
950			return (mach_absolute_time());
951		}
952		return (mach_absolute_time() + MutexSpin);
953	} else
954		return (mach_absolute_time() + (100000LL * 1000000000LL));
955}
956
957
958/*
959 *      Routine:        lck_rw_lock_exclusive
960 */
/*
 * Slow path for exclusive acquisition of an rw lock (presumably reached
 * from a fast-path wrapper once the uncontended case has failed — compare
 * the lck_rw_done_gen comment below).  Two phases:
 *   1) spin (then optionally block) until we own the want_write bit;
 *   2) spin (then optionally block) until all readers and any pending
 *      upgrade have drained.
 * DTrace lockstat spin/block statistics are gathered along the way.
 */
void
lck_rw_lock_exclusive_gen(
	lck_rw_t	*lck)
{
	uint64_t	deadline = 0;
	int		slept = 0;
	int		gotlock = 0;
	int		lockheld = 0;
	wait_result_t	res = 0;
	boolean_t	istate = -1;	/* -1 = interrupt state not yet sampled */

#if	CONFIG_DTRACE
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
#endif

	/*
	 *	Try to acquire the lck_rw_want_write bit.
	 */
	while ( !lck_rw_grab_want(lck)) {

#if	CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		/* Sample the interrupt state once; lck_rw_lock_pause() needs it. */
		if (istate == -1)
			istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);

		while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0);

		if (gotlock)
			break;
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {

			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_want_write) {
				/* Another writer owns want_write: wait for a wakeup. */
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
			} else {
				/* want_write is free behind the interlock: claim it. */
				lck->lck_rw_want_write = TRUE;
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}
	/*
	 * Wait for readers (and upgrades) to finish...
	 * the test for these conditions must be done simultaneously with
	 * a check of the interlock not being held since
	 * the rw_shared_count will drop to 0 first and then want_upgrade
	 * will be set to 1 in the shared_to_exclusive scenario... those
	 * adjustments are done behind the interlock and represent an
	 * atomic change in state and must be considered as such
	 * however, once we see the read count at 0, the want_upgrade not set
	 * and the interlock not held, we are safe to proceed
	 */
	while (lck_rw_held_read_or_upgrade(lck)) {

#if	CONFIG_DTRACE
		/*
		 * Either sleeping or spinning is happening, start
		 * a timing of our delay interval now.  If we set it
		 * to -1 we don't have accurate data so we cannot later
		 * decide to record a dtrace spin or sleep event.
		 */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1)
			istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);

		while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0);

		if ( !lockheld)
			break;
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {

			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lck, istate);
				/*
				 * must own the lock now, since we checked for
				 * readers or upgrade owner behind the interlock
				 * no need for a call to 'lck_rw_held_read_or_upgrade'
				 */
				break;
			}
		}
	}

#if	CONFIG_DTRACE
	/*
	 * Decide what latencies we suffered that are Dtrace events.
	 * If we have set wait_interval, then we either spun or slept.
	 * At least we get out from under the interlock before we record
	 * which is the best we can do here to minimize the impact
	 * of the tracing.
	 * If we have set wait_interval to -1, then dtrace was not enabled when we
	 * started sleeping/spinning so we don't record this event.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
			    mach_absolute_time() - wait_interval, 1);
		} else {
			/*
			 * For the blocking case, we also record if when we blocked
			 * it was held for read or write, and how many readers.
			 * Notice that above we recorded this before we dropped
			 * the interlock so the count is accurate.
			 */
			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
#endif
}
1156
1157
1158/*
1159 *      Routine:        lck_rw_done_gen
1160 *
1161 *	called from the assembly language wrapper...
1162 *	prior_lock_state is the value in the 1st
1163 * 	word of the lock at the time of a successful
1164 *	atomic compare and exchange with the new value...
1165 * 	it represents the state of the lock before we
1166 *	decremented the rw_shared_count or cleared either
1167 * 	rw_want_upgrade or rw_want_write and
1168 *	the lck_x_waiting bits...  since the wrapper
1169 * 	routine has already changed the state atomically,
1170 *	we just need to decide if we should
1171 *	wake up anyone and what value to return... we do
1172 *	this by examining the state of the lock before
1173 *	we changed it
1174 */
1175lck_rw_type_t
1176lck_rw_done_gen(
1177	lck_rw_t	*lck,
1178	int		prior_lock_state)
1179{
1180	lck_rw_t	*fake_lck;
1181	lck_rw_type_t	lock_type;
1182
1183	/*
1184	 * prior_lock state is a snapshot of the 1st word of the
1185	 * lock in question... we'll fake up a pointer to it
1186	 * and carefully not access anything beyond whats defined
1187	 * in the first word of a lck_rw_t
1188	 */
1189	fake_lck = (lck_rw_t *)&prior_lock_state;
1190
1191	if (fake_lck->lck_rw_shared_count <= 1) {
1192		if (fake_lck->lck_w_waiting)
1193			thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1194
1195		if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1196			thread_wakeup(RW_LOCK_READER_EVENT(lck));
1197	}
1198	if (fake_lck->lck_rw_shared_count)
1199		lock_type = LCK_RW_TYPE_SHARED;
1200	else
1201		lock_type = LCK_RW_TYPE_EXCLUSIVE;
1202
1203#if CONFIG_DTRACE
1204	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1205#endif
1206
1207	return(lock_type);
1208}
1209
1210
1211/*
1212 *	Routine:	lck_rw_unlock
1213 */
1214void
1215lck_rw_unlock(
1216	lck_rw_t	*lck,
1217	lck_rw_type_t	lck_rw_type)
1218{
1219	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1220		lck_rw_unlock_shared(lck);
1221	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1222		lck_rw_unlock_exclusive(lck);
1223	else
1224		panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1225}
1226
1227
1228/*
1229 *	Routine:	lck_rw_unlock_shared
1230 */
1231void
1232lck_rw_unlock_shared(
1233	lck_rw_t	*lck)
1234{
1235	lck_rw_type_t	ret;
1236
1237	ret = lck_rw_done(lck);
1238
1239	if (ret != LCK_RW_TYPE_SHARED)
1240		panic("lck_rw_unlock(): lock held in mode: %d\n", ret);
1241}
1242
1243
1244/*
1245 *	Routine:	lck_rw_unlock_exclusive
1246 */
1247void
1248lck_rw_unlock_exclusive(
1249	lck_rw_t	*lck)
1250{
1251	lck_rw_type_t	ret;
1252
1253	ret = lck_rw_done(lck);
1254
1255	if (ret != LCK_RW_TYPE_EXCLUSIVE)
1256		panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1257}
1258
1259
1260/*
1261 *	Routine:	lck_rw_lock
1262 */
1263void
1264lck_rw_lock(
1265	lck_rw_t	*lck,
1266	lck_rw_type_t	lck_rw_type)
1267{
1268	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1269		lck_rw_lock_shared(lck);
1270	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1271		lck_rw_lock_exclusive(lck);
1272	else
1273		panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1274}
1275
1276
1277/*
1278 *	Routine:	lck_rw_lock_shared_gen
1279 *	Function:
1280 *		assembly fast path code has determined that this lock
1281 *		is held exclusively... this is where we spin/block
1282 *		until we can acquire the lock in the shared mode
1283 */
1284void
1285lck_rw_lock_shared_gen(
1286	lck_rw_t	*lck)
1287{
1288	uint64_t	deadline = 0;
1289	int		gotlock = 0;
1290	int		slept = 0;
1291	wait_result_t	res = 0;
1292	boolean_t	istate = -1;
1293
1294#if	CONFIG_DTRACE
1295	uint64_t wait_interval = 0;
1296	int readers_at_sleep = 0;
1297	boolean_t dtrace_ls_initialized = FALSE;
1298	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1299#endif
1300
1301	while ( !lck_rw_grab_shared(lck)) {
1302
1303#if	CONFIG_DTRACE
1304		if (dtrace_ls_initialized == FALSE) {
1305			dtrace_ls_initialized = TRUE;
1306			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1307			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1308			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1309			if (dtrace_ls_enabled) {
1310				/*
1311				 * Either sleeping or spinning is happening,
1312				 *  start a timing of our delay interval now.
1313				 */
1314				readers_at_sleep = lck->lck_rw_shared_count;
1315				wait_interval = mach_absolute_time();
1316			}
1317		}
1318#endif
1319		if (istate == -1)
1320			istate = ml_get_interrupts_enabled();
1321
1322		deadline = lck_rw_deadline_for_spin(lck);
1323
1324		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1325			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1326
1327		while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1328			lck_rw_lock_pause(istate);
1329
1330		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1331			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1332
1333		if (gotlock)
1334			break;
1335		/*
1336		 * if we get here, the deadline has expired w/o us
1337		 * being able to grab the lock for read
1338		 * check to see if we're allowed to do a thread_block
1339		 */
1340		if (lck->lck_rw_can_sleep) {
1341
1342			istate = lck_interlock_lock(lck);
1343
1344			if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1345			    ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1346
1347				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1348					     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1349
1350				lck->lck_r_waiting = TRUE;
1351
1352				res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
1353				lck_interlock_unlock(lck, istate);
1354
1355				if (res == THREAD_WAITING) {
1356					res = thread_block(THREAD_CONTINUE_NULL);
1357					slept++;
1358				}
1359				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1360					     (int)lck, res, slept, 0, 0);
1361			} else {
1362				lck->lck_rw_shared_count++;
1363				lck_interlock_unlock(lck, istate);
1364				break;
1365			}
1366		}
1367	}
1368
1369#if	CONFIG_DTRACE
1370	if (dtrace_ls_enabled == TRUE) {
1371		if (slept == 0) {
1372			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1373		} else {
1374			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1375			    mach_absolute_time() - wait_interval, 0,
1376			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1377		}
1378	}
1379	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1380#endif
1381}
1382
1383
1384/*
1385 *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1386 *	Function:
1387 *		assembly fast path code has already dropped our read
1388 *		count and determined that someone else owns 'lck_rw_want_upgrade'
1389 *		if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1390 *		all we need to do here is determine if a wakeup is needed
1391 */
1392boolean_t
1393lck_rw_lock_shared_to_exclusive_failure(
1394	lck_rw_t	*lck,
1395	int		prior_lock_state)
1396{
1397	lck_rw_t	*fake_lck;
1398
1399	/*
1400	 * prior_lock state is a snapshot of the 1st word of the
1401	 * lock in question... we'll fake up a pointer to it
1402	 * and carefully not access anything beyond whats defined
1403	 * in the first word of a lck_rw_t
1404	 */
1405	fake_lck = (lck_rw_t *)&prior_lock_state;
1406
1407	if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1408		/*
1409		 *	Someone else has requested upgrade.
1410		 *	Since we've released the read lock, wake
1411		 *	him up if he's blocked waiting
1412		 */
1413		thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1414	}
1415	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1416		     (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1417
1418	return (FALSE);
1419}
1420
1421
1422/*
1423 *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1424 *	Function:
1425 *		assembly fast path code has already dropped our read
1426 *		count and successfully acquired 'lck_rw_want_upgrade'
1427 *		we just need to wait for the rest of the readers to drain
1428 *		and then we can return as the exclusive holder of this lock
1429 */
1430boolean_t
1431lck_rw_lock_shared_to_exclusive_success(
1432	lck_rw_t	*lck)
1433{
1434	uint64_t	deadline = 0;
1435	int		slept = 0;
1436	int		still_shared = 0;
1437	wait_result_t	res;
1438	boolean_t	istate = -1;
1439
1440#if	CONFIG_DTRACE
1441	uint64_t wait_interval = 0;
1442	int readers_at_sleep = 0;
1443	boolean_t dtrace_ls_initialized = FALSE;
1444	boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1445#endif
1446
1447	while (lck->lck_rw_shared_count != 0) {
1448
1449#if	CONFIG_DTRACE
1450		if (dtrace_ls_initialized == FALSE) {
1451			dtrace_ls_initialized = TRUE;
1452			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1453			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1454			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1455			if (dtrace_ls_enabled) {
1456				/*
1457				 * Either sleeping or spinning is happening,
1458				 *  start a timing of our delay interval now.
1459				 */
1460				readers_at_sleep = lck->lck_rw_shared_count;
1461				wait_interval = mach_absolute_time();
1462			}
1463		}
1464#endif
1465		if (istate == -1)
1466			istate = ml_get_interrupts_enabled();
1467
1468		deadline = lck_rw_deadline_for_spin(lck);
1469
1470		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1471			     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1472
1473		while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1474			lck_rw_lock_pause(istate);
1475
1476		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1477			     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1478
1479		if ( !still_shared)
1480			break;
1481		/*
1482		 * if we get here, the deadline has expired w/o
1483		 * the rw_shared_count having drained to 0
1484		 * check to see if we're allowed to do a thread_block
1485		 */
1486		if (lck->lck_rw_can_sleep) {
1487
1488			istate = lck_interlock_lock(lck);
1489
1490			if (lck->lck_rw_shared_count != 0) {
1491				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1492					     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);
1493
1494				lck->lck_w_waiting = TRUE;
1495
1496				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1497				lck_interlock_unlock(lck, istate);
1498
1499				if (res == THREAD_WAITING) {
1500					res = thread_block(THREAD_CONTINUE_NULL);
1501					slept++;
1502				}
1503				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1504					     (int)lck, res, slept, 0, 0);
1505			} else {
1506				lck_interlock_unlock(lck, istate);
1507				break;
1508			}
1509		}
1510	}
1511#if	CONFIG_DTRACE
1512	/*
1513	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1514	 */
1515	if (dtrace_ls_enabled == TRUE) {
1516		if (slept == 0) {
1517			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1518		} else {
1519			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1520			    mach_absolute_time() - wait_interval, 1,
1521			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1522		}
1523	}
1524	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1525#endif
1526	return (TRUE);
1527}
1528
1529
1530/*
1531 *      Routine:        lck_rw_lock_exclusive_to_shared
1532 * 	Function:
1533 *		assembly fast path has already dropped
1534 *		our exclusive state and bumped lck_rw_shared_count
1535 *		all we need to do here is determine if anyone
1536 *		needs to be awakened.
1537 */
1538void
1539lck_rw_lock_exclusive_to_shared_gen(
1540	lck_rw_t	*lck,
1541	int		prior_lock_state)
1542{
1543	lck_rw_t	*fake_lck;
1544
1545	/*
1546	 * prior_lock state is a snapshot of the 1st word of the
1547	 * lock in question... we'll fake up a pointer to it
1548	 * and carefully not access anything beyond whats defined
1549	 * in the first word of a lck_rw_t
1550	 */
1551	fake_lck = (lck_rw_t *)&prior_lock_state;
1552
1553	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1554			     (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1555
1556	/*
1557	 * don't wake up anyone waiting to take the lock exclusively
1558	 * since we hold a read count... when the read count drops to 0,
1559	 * the writers will be woken.
1560	 *
1561	 * wake up any waiting readers if we don't have any writers waiting,
1562	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1563	 */
1564	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1565		thread_wakeup(RW_LOCK_READER_EVENT(lck));
1566
1567	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1568			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1569
1570#if CONFIG_DTRACE
1571	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1572#endif
1573}
1574
1575
1576/*
1577 *      Routine:        lck_rw_try_lock
1578 */
1579boolean_t
1580lck_rw_try_lock(
1581	lck_rw_t	*lck,
1582	lck_rw_type_t	lck_rw_type)
1583{
1584	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1585		return(lck_rw_try_lock_shared(lck));
1586	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1587		return(lck_rw_try_lock_exclusive(lck));
1588	else
1589		panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1590	return(FALSE);
1591}
1592
1593
1594void
1595lck_rw_assert(
1596	lck_rw_t	*lck,
1597	unsigned int	type)
1598{
1599	switch (type) {
1600	case LCK_RW_ASSERT_SHARED:
1601		if (lck->lck_rw_shared_count != 0) {
1602			return;
1603		}
1604		break;
1605	case LCK_RW_ASSERT_EXCLUSIVE:
1606		if ((lck->lck_rw_want_write ||
1607		     lck->lck_rw_want_upgrade) &&
1608		    lck->lck_rw_shared_count == 0) {
1609			return;
1610		}
1611		break;
1612	case LCK_RW_ASSERT_HELD:
1613		if (lck->lck_rw_want_write ||
1614		    lck->lck_rw_want_upgrade ||
1615		    lck->lck_rw_shared_count != 0) {
1616			return;
1617		}
1618		break;
1619	default:
1620		break;
1621	}
1622
1623	panic("rw lock (%p) not held (mode=%u), first word %08x\n", lck, type, *(uint32_t *)lck);
1624}
1625
1626#ifdef	MUTEX_ZONE
1627extern zone_t lck_mtx_zone;
1628#endif
1629/*
1630 *      Routine:        lck_mtx_alloc_init
1631 */
1632lck_mtx_t *
1633lck_mtx_alloc_init(
1634	lck_grp_t	*grp,
1635	lck_attr_t	*attr)
1636{
1637	lck_mtx_t	*lck;
1638#ifdef	MUTEX_ZONE
1639	if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
1640		lck_mtx_init(lck, grp, attr);
1641#else
1642	if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1643		lck_mtx_init(lck, grp, attr);
1644#endif
1645	return(lck);
1646}
1647
1648/*
1649 *      Routine:        lck_mtx_free
1650 */
1651void
1652lck_mtx_free(
1653	lck_mtx_t	*lck,
1654	lck_grp_t	*grp)
1655{
1656	lck_mtx_destroy(lck, grp);
1657#ifdef	MUTEX_ZONE
1658	zfree(lck_mtx_zone, lck);
1659#else
1660	kfree(lck, sizeof(lck_mtx_t));
1661#endif
1662}
1663
1664/*
1665 *      Routine:        lck_mtx_ext_init
1666 */
1667static void
1668lck_mtx_ext_init(
1669	lck_mtx_ext_t	*lck,
1670	lck_grp_t	*grp,
1671	lck_attr_t	*attr)
1672{
1673	bzero((void *)lck, sizeof(lck_mtx_ext_t));
1674
1675	if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1676		lck->lck_mtx_deb.type = MUTEX_TAG;
1677		lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
1678	}
1679
1680	lck->lck_mtx_grp = grp;
1681
1682	if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
1683		lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
1684
1685	lck->lck_mtx.lck_mtx_is_ext = 1;
1686#if	defined(__x86_64__)
1687	lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
1688#endif
1689}
1690
1691/*
1692 *      Routine:        lck_mtx_init
1693 */
1694void
1695lck_mtx_init(
1696	lck_mtx_t	*lck,
1697	lck_grp_t	*grp,
1698	lck_attr_t	*attr)
1699{
1700	lck_mtx_ext_t	*lck_ext;
1701	lck_attr_t	*lck_attr;
1702
1703	if (attr != LCK_ATTR_NULL)
1704		lck_attr = attr;
1705	else
1706		lck_attr = &LockDefaultLckAttr;
1707
1708	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1709		if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
1710			lck_mtx_ext_init(lck_ext, grp, lck_attr);
1711			lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1712			lck->lck_mtx_ptr = lck_ext;
1713		}
1714	} else {
1715		lck->lck_mtx_owner = 0;
1716		lck->lck_mtx_state = 0;
1717	}
1718#if	defined(__x86_64__)
1719	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
1720#endif
1721	lck_grp_reference(grp);
1722	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1723}
1724
1725/*
1726 *      Routine:        lck_mtx_init_ext
1727 */
1728void
1729lck_mtx_init_ext(
1730	lck_mtx_t	*lck,
1731	lck_mtx_ext_t	*lck_ext,
1732	lck_grp_t	*grp,
1733	lck_attr_t	*attr)
1734{
1735	lck_attr_t	*lck_attr;
1736
1737	if (attr != LCK_ATTR_NULL)
1738		lck_attr = attr;
1739	else
1740		lck_attr = &LockDefaultLckAttr;
1741
1742	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1743		lck_mtx_ext_init(lck_ext, grp, lck_attr);
1744		lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1745		lck->lck_mtx_ptr = lck_ext;
1746	} else {
1747		lck->lck_mtx_owner = 0;
1748		lck->lck_mtx_state = 0;
1749	}
1750#if	defined(__x86_64__)
1751	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
1752#endif
1753
1754	lck_grp_reference(grp);
1755	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1756}
1757
1758/*
1759 *      Routine:        lck_mtx_destroy
1760 */
1761void
1762lck_mtx_destroy(
1763	lck_mtx_t	*lck,
1764	lck_grp_t	*grp)
1765{
1766	boolean_t lck_is_indirect;
1767
1768	if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
1769		return;
1770	lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
1771
1772	lck_mtx_lock_mark_destroyed(lck);
1773
1774	if (lck_is_indirect)
1775		kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
1776	lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1777	lck_grp_deallocate(grp);
1778	return;
1779}
1780
1781
1782#define	LCK_MTX_LCK_WAIT_CODE		0x20
1783#define	LCK_MTX_LCK_WAKEUP_CODE		0x21
1784#define	LCK_MTX_LCK_SPIN_CODE		0x22
1785#define	LCK_MTX_LCK_ACQUIRE_CODE	0x23
1786#define LCK_MTX_LCK_DEMOTE_CODE		0x24
1787
1788
1789/*
1790 * Routine: 	lck_mtx_unlock_wakeup_x86
1791 *
1792 * Invoked on unlock when there is
1793 * contention (i.e. the assembly routine sees that
1794 * that mutex->lck_mtx_waiters != 0 or
1795 * that mutex->lck_mtx_promoted != 0...
1796 *
1797 * neither the mutex or interlock is held
1798 */
1799void
1800lck_mtx_unlock_wakeup_x86 (
1801	lck_mtx_t	*mutex,
1802	int		prior_lock_state)
1803{
1804	lck_mtx_t	fake_lck;
1805
1806	/*
1807	 * prior_lock state is a snapshot of the 2nd word of the
1808	 * lock in question... we'll fake up a lock with the bits
1809	 * copied into place and carefully not access anything
1810	 * beyond whats defined in the second word of a lck_mtx_t
1811	 */
1812	fake_lck.lck_mtx_state = prior_lock_state;
1813
1814	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
1815		     mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
1816
1817	if (__probable(fake_lck.lck_mtx_waiters)) {
1818
1819		if (fake_lck.lck_mtx_waiters > 1)
1820			thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri);
1821		else
1822			thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
1823	}
1824
1825	if (__improbable(fake_lck.lck_mtx_promoted)) {
1826		thread_t	thread = current_thread();
1827
1828
1829		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
1830			     thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
1831
1832		if (thread->promotions > 0) {
1833			spl_t	s = splsched();
1834
1835			thread_lock(thread);
1836
1837			if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
1838
1839				thread->sched_flags &= ~TH_SFLAG_PROMOTED;
1840
1841				if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1842					KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1843							      thread->sched_pri, DEPRESSPRI, 0, mutex, 0);
1844
1845					set_sched_pri(thread, DEPRESSPRI);
1846				}
1847				else {
1848					if (thread->priority < thread->sched_pri) {
1849						KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1850								      thread->sched_pri, thread->priority, 0, mutex, 0);
1851
1852						SCHED(compute_priority)(thread, FALSE);
1853					}
1854				}
1855			}
1856			thread_unlock(thread);
1857			splx(s);
1858		}
1859	}
1860	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
1861		     mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1862}
1863
1864
1865/*
1866 * Routine: 	lck_mtx_lock_acquire_x86
1867 *
1868 * Invoked on acquiring the mutex when there is
1869 * contention (i.e. the assembly routine sees that
1870 * that mutex->lck_mtx_waiters != 0 or
1871 * thread->was_promoted_on_wakeup != 0)...
1872 *
1873 * mutex is owned...  interlock is held... preemption is disabled
1874 */
1875void
1876lck_mtx_lock_acquire_x86(
1877	lck_mtx_t	*mutex)
1878{
1879	thread_t	thread;
1880	integer_t	priority;
1881	spl_t		s;
1882
1883	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
1884		     mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
1885
1886	if (mutex->lck_mtx_waiters)
1887		priority = mutex->lck_mtx_pri;
1888	else
1889		priority = 0;
1890
1891	thread = (thread_t)mutex->lck_mtx_owner;	/* faster then current_thread() */
1892
1893	if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
1894
1895		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
1896				      thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0);
1897
1898		s = splsched();
1899		thread_lock(thread);
1900
1901		if (thread->sched_pri < priority)
1902			set_sched_pri(thread, priority);
1903
1904		if (mutex->lck_mtx_promoted == 0) {
1905			mutex->lck_mtx_promoted = 1;
1906
1907			thread->promotions++;
1908			thread->sched_flags |= TH_SFLAG_PROMOTED;
1909		}
1910		thread->was_promoted_on_wakeup = 0;
1911
1912		thread_unlock(thread);
1913		splx(s);
1914	}
1915	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
1916		     mutex, 0, mutex->lck_mtx_waiters, 0, 0);
1917}
1918
1919
1920
1921/*
1922 * Routine: 	lck_mtx_lock_spinwait_x86
1923 *
1924 * Invoked trying to acquire a mutex when there is contention but
1925 * the holder is running on another processor. We spin for up to a maximum
1926 * time waiting for the lock to be released.
1927 *
1928 * Called with the interlock unlocked.
1929 * returns 0 if mutex acquired
1930 * returns 1 if we spun
1931 * returns 2 if we didn't spin due to the holder not running
1932 */
1933int
1934lck_mtx_lock_spinwait_x86(
1935	lck_mtx_t	*mutex)
1936{
1937	thread_t	holder;
1938	uint64_t	deadline;
1939	int		retval = 1;
1940	int		loopcount = 0;
1941
1942
1943	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
1944		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);
1945
1946	deadline = mach_absolute_time() + MutexSpin;
1947
1948	/*
1949	 * Spin while:
1950	 *   - mutex is locked, and
1951	 *   - its locked as a spin lock, and
1952	 *   - owner is running on another processor, and
1953	 *   - owner (processor) is not idling, and
1954	 *   - we haven't spun for long enough.
1955	 */
1956	do {
1957		if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
1958			retval = 0;
1959			break;
1960		}
1961		if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
1962
1963			if ( !(holder->machine.specFlags & OnProc) ||
1964			     (holder->state & TH_IDLE)) {
1965				if (loopcount == 0)
1966					retval = 2;
1967				break;
1968			}
1969		}
1970		cpu_pause();
1971
1972		loopcount++;
1973
1974	} while (mach_absolute_time() < deadline);
1975
1976
1977#if	CONFIG_DTRACE
1978	/*
1979	 * We've already kept a count via deadline of how long we spun.
1980	 * If dtrace is active, then we compute backwards to decide how
1981	 * long we spun.
1982	 *
1983	 * Note that we record a different probe id depending on whether
1984	 * this is a direct or indirect mutex.  This allows us to
1985	 * penalize only lock groups that have debug/stats enabled
1986	 * with dtrace processing if desired.
1987	 */
1988	if (__probable(mutex->lck_mtx_is_ext == 0)) {
1989		LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
1990		    mach_absolute_time() - (deadline - MutexSpin));
1991	} else {
1992		LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
1993		    mach_absolute_time() - (deadline - MutexSpin));
1994	}
1995	/* The lockstat acquire event is recorded by the assembly code beneath us. */
1996#endif
1997
1998	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
1999		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);
2000
2001	return retval;
2002}
2003
2004
2005
2006/*
2007 * Routine: 	lck_mtx_lock_wait_x86
2008 *
2009 * Invoked in order to wait on contention.
2010 *
2011 * Called with the interlock locked and
2012 * preemption disabled...
2013 * returns it unlocked and with preemption enabled
2014 */
2015void
2016lck_mtx_lock_wait_x86 (
2017	lck_mtx_t	*mutex)
2018{
2019	thread_t	self = current_thread();
2020	thread_t	holder;
2021	integer_t	priority;
2022	spl_t		s;
2023#if	CONFIG_DTRACE
2024	uint64_t	sleep_start = 0;
2025
2026	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
2027		sleep_start = mach_absolute_time();
2028	}
2029#endif
2030	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
2031		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2032
2033	priority = self->sched_pri;
2034
2035	if (priority < self->priority)
2036		priority = self->priority;
2037	if (priority < BASEPRI_DEFAULT)
2038		priority = BASEPRI_DEFAULT;
2039
2040	if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
2041		mutex->lck_mtx_pri = priority;
2042	mutex->lck_mtx_waiters++;
2043
2044	if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
2045	     holder->sched_pri < mutex->lck_mtx_pri ) {
2046
2047		s = splsched();
2048		thread_lock(holder);
2049
2050		if (holder->sched_pri < mutex->lck_mtx_pri) {
2051			KERNEL_DEBUG_CONSTANT(
2052				MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
2053				holder->sched_pri, priority, thread_tid(holder), mutex, 0);
2054
2055			set_sched_pri(holder, priority);
2056
2057			if (mutex->lck_mtx_promoted == 0) {
2058				holder->promotions++;
2059				holder->sched_flags |= TH_SFLAG_PROMOTED;
2060
2061				mutex->lck_mtx_promoted = 1;
2062			}
2063		}
2064		thread_unlock(holder);
2065		splx(s);
2066	}
2067	assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);
2068
2069	lck_mtx_ilk_unlock(mutex);
2070
2071	thread_block(THREAD_CONTINUE_NULL);
2072
2073	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
2074		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2075
2076#if	CONFIG_DTRACE
2077	/*
2078	 * Record the Dtrace lockstat probe for blocking, block time
2079	 * measured from when we were entered.
2080	 */
2081	if (sleep_start) {
2082		if (mutex->lck_mtx_is_ext == 0) {
2083			LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
2084			    mach_absolute_time() - sleep_start);
2085		} else {
2086			LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
2087			    mach_absolute_time() - sleep_start);
2088		}
2089	}
2090#endif
2091}
2092