1/*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56#include <mach_ldebug.h>
57#include <debug.h>
58
59#include <mach/kern_return.h>
60#include <mach/mach_host_server.h>
61#include <mach_debug/lockgroup_info.h>
62
63#include <kern/locks.h>
64#include <kern/misc_protos.h>
65#include <kern/kalloc.h>
66#include <kern/thread.h>
67#include <kern/processor.h>
68#include <kern/sched_prim.h>
69#include <kern/debug.h>
70#include <string.h>
71
72
73#include <sys/kdebug.h>
74
75#if	CONFIG_DTRACE
76/*
77 * We need only enough declarations from the BSD-side to be able to
78 * test if our probe is active, and to call __dtrace_probe().  Setting
79 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
80 */
81#define NEED_DTRACE_DEFS
82#include <../bsd/sys/lockstat.h>
83#endif
84
/*
 * Sub-codes for the lock trace points in this file; each is combined
 * with DBG_MACH_LOCKS through MACHDBG_CODE() at the KERNEL_DEBUG sites.
 */
#define	LCK_MTX_SLEEP_CODE		0
#define	LCK_MTX_SLEEP_DEADLINE_CODE	1
#define	LCK_MTX_LCK_WAIT_CODE		2
#define	LCK_MTX_UNLCK_WAKEUP_CODE	3


/*
 * Queue of every registered lock group and the number of entries on it.
 * After lck_mod_init() has run, both are modified with lck_grp_lock held.
 */
static queue_head_t	lck_grp_queue;
static unsigned int	lck_grp_cnt;

/* Mutex serializing access to lck_grp_queue/lck_grp_cnt, and its extended-mutex storage. */
decl_lck_mtx_data(static,lck_grp_lock)
static lck_mtx_ext_t lck_grp_lock_ext;

/* Default group attribute, bootstrap lock group, and default lock attribute; all set up by lck_mod_init(). */
lck_grp_attr_t	LockDefaultGroupAttr;
lck_grp_t		LockCompatGroup;
lck_attr_t		LockDefaultLckAttr;
100
101/*
102 * Routine:	lck_mod_init
103 */
104
105void
106lck_mod_init(
107	void)
108{
109	/*
110	 * Obtain "lcks" options:this currently controls lock statistics
111	 */
112	if (!PE_parse_boot_argn("lcks", &LcksOpts, sizeof (LcksOpts)))
113		LcksOpts = 0;
114
115	queue_init(&lck_grp_queue);
116
117	/*
118	 * Need to bootstrap the LockCompatGroup instead of calling lck_grp_init() here. This avoids
119	 * grabbing the lck_grp_lock before it is initialized.
120	 */
121
122	bzero(&LockCompatGroup, sizeof(lck_grp_t));
123	(void) strncpy(LockCompatGroup.lck_grp_name, "Compatibility APIs", LCK_GRP_MAX_NAME);
124
125	if (LcksOpts & enaLkStat)
126		LockCompatGroup.lck_grp_attr = LCK_GRP_ATTR_STAT;
127    else
128		LockCompatGroup.lck_grp_attr = LCK_ATTR_NONE;
129
130	LockCompatGroup.lck_grp_refcnt = 1;
131
132	enqueue_tail(&lck_grp_queue, (queue_entry_t)&LockCompatGroup);
133	lck_grp_cnt = 1;
134
135	lck_grp_attr_setdefault(&LockDefaultGroupAttr);
136	lck_attr_setdefault(&LockDefaultLckAttr);
137
138	lck_mtx_init_ext(&lck_grp_lock, &lck_grp_lock_ext, &LockCompatGroup, &LockDefaultLckAttr);
139
140}
141
142/*
143 * Routine:	lck_grp_attr_alloc_init
144 */
145
146lck_grp_attr_t	*
147lck_grp_attr_alloc_init(
148	void)
149{
150	lck_grp_attr_t	*attr;
151
152	if ((attr = (lck_grp_attr_t *)kalloc(sizeof(lck_grp_attr_t))) != 0)
153		lck_grp_attr_setdefault(attr);
154
155	return(attr);
156}
157
158
159/*
160 * Routine:	lck_grp_attr_setdefault
161 */
162
163void
164lck_grp_attr_setdefault(
165	lck_grp_attr_t	*attr)
166{
167	if (LcksOpts & enaLkStat)
168		attr->grp_attr_val = LCK_GRP_ATTR_STAT;
169	else
170		attr->grp_attr_val = 0;
171}
172
173
174/*
175 * Routine: 	lck_grp_attr_setstat
176 */
177
178void
179lck_grp_attr_setstat(
180	lck_grp_attr_t	*attr)
181{
182	(void)hw_atomic_or(&attr->grp_attr_val, LCK_GRP_ATTR_STAT);
183}
184
185
186/*
187 * Routine: 	lck_grp_attr_free
188 */
189
190void
191lck_grp_attr_free(
192	lck_grp_attr_t	*attr)
193{
194	kfree(attr, sizeof(lck_grp_attr_t));
195}
196
197
198/*
199 * Routine: 	lck_grp_alloc_init
200 */
201
202lck_grp_t *
203lck_grp_alloc_init(
204	const char*	grp_name,
205	lck_grp_attr_t	*attr)
206{
207	lck_grp_t	*grp;
208
209	if ((grp = (lck_grp_t *)kalloc(sizeof(lck_grp_t))) != 0)
210		lck_grp_init(grp, grp_name, attr);
211
212	return(grp);
213}
214
215
216/*
217 * Routine: 	lck_grp_init
218 */
219
220void
221lck_grp_init(
222	lck_grp_t		*grp,
223	const char*		grp_name,
224	lck_grp_attr_t	*attr)
225{
226	bzero((void *)grp, sizeof(lck_grp_t));
227
228	(void) strncpy(grp->lck_grp_name, grp_name, LCK_GRP_MAX_NAME);
229
230	if (attr != LCK_GRP_ATTR_NULL)
231		grp->lck_grp_attr = attr->grp_attr_val;
232	else if (LcksOpts & enaLkStat)
233                grp->lck_grp_attr = LCK_GRP_ATTR_STAT;
234        else
235                grp->lck_grp_attr = LCK_ATTR_NONE;
236
237	grp->lck_grp_refcnt = 1;
238
239	lck_mtx_lock(&lck_grp_lock);
240	enqueue_tail(&lck_grp_queue, (queue_entry_t)grp);
241	lck_grp_cnt++;
242	lck_mtx_unlock(&lck_grp_lock);
243
244}
245
246
247/*
248 * Routine: 	lck_grp_free
249 */
250
251void
252lck_grp_free(
253	lck_grp_t	*grp)
254{
255	lck_mtx_lock(&lck_grp_lock);
256	lck_grp_cnt--;
257	(void)remque((queue_entry_t)grp);
258	lck_mtx_unlock(&lck_grp_lock);
259	lck_grp_deallocate(grp);
260}
261
262
263/*
264 * Routine: 	lck_grp_reference
265 */
266
267void
268lck_grp_reference(
269	lck_grp_t	*grp)
270{
271	(void)hw_atomic_add(&grp->lck_grp_refcnt, 1);
272}
273
274
275/*
276 * Routine: 	lck_grp_deallocate
277 */
278
279void
280lck_grp_deallocate(
281	lck_grp_t	*grp)
282{
283	if (hw_atomic_sub(&grp->lck_grp_refcnt, 1) == 0)
284	 	kfree(grp, sizeof(lck_grp_t));
285}
286
287/*
288 * Routine:	lck_grp_lckcnt_incr
289 */
290
291void
292lck_grp_lckcnt_incr(
293	lck_grp_t	*grp,
294	lck_type_t	lck_type)
295{
296	unsigned int	*lckcnt;
297
298	switch (lck_type) {
299	case LCK_TYPE_SPIN:
300		lckcnt = &grp->lck_grp_spincnt;
301		break;
302	case LCK_TYPE_MTX:
303		lckcnt = &grp->lck_grp_mtxcnt;
304		break;
305	case LCK_TYPE_RW:
306		lckcnt = &grp->lck_grp_rwcnt;
307		break;
308	default:
309		return panic("lck_grp_lckcnt_incr(): invalid lock type: %d\n", lck_type);
310	}
311
312	(void)hw_atomic_add(lckcnt, 1);
313}
314
315/*
316 * Routine:	lck_grp_lckcnt_decr
317 */
318
319void
320lck_grp_lckcnt_decr(
321	lck_grp_t	*grp,
322	lck_type_t	lck_type)
323{
324	unsigned int	*lckcnt;
325
326	switch (lck_type) {
327	case LCK_TYPE_SPIN:
328		lckcnt = &grp->lck_grp_spincnt;
329		break;
330	case LCK_TYPE_MTX:
331		lckcnt = &grp->lck_grp_mtxcnt;
332		break;
333	case LCK_TYPE_RW:
334		lckcnt = &grp->lck_grp_rwcnt;
335		break;
336	default:
337		return panic("lck_grp_lckcnt_decr(): invalid lock type: %d\n", lck_type);
338	}
339
340	(void)hw_atomic_sub(lckcnt, 1);
341}
342
343/*
344 * Routine:	lck_attr_alloc_init
345 */
346
347lck_attr_t *
348lck_attr_alloc_init(
349	void)
350{
351	lck_attr_t	*attr;
352
353	if ((attr = (lck_attr_t *)kalloc(sizeof(lck_attr_t))) != 0)
354		lck_attr_setdefault(attr);
355
356	return(attr);
357}
358
359
360/*
361 * Routine:	lck_attr_setdefault
362 */
363
364void
365lck_attr_setdefault(
366	lck_attr_t	*attr)
367{
368#if   __i386__ || __x86_64__
369#if     !DEBUG
370 	if (LcksOpts & enaLkDeb)
371 		attr->lck_attr_val =  LCK_ATTR_DEBUG;
372 	else
373 		attr->lck_attr_val =  LCK_ATTR_NONE;
374#else
375 	attr->lck_attr_val =  LCK_ATTR_DEBUG;
376#endif	/* !DEBUG */
377#else
378#error Unknown architecture.
379#endif	/* __arm__ */
380}
381
382
383/*
384 * Routine:	lck_attr_setdebug
385 */
386void
387lck_attr_setdebug(
388	lck_attr_t	*attr)
389{
390	(void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_DEBUG);
391}
392
393/*
394 * Routine:	lck_attr_setdebug
395 */
396void
397lck_attr_cleardebug(
398	lck_attr_t	*attr)
399{
400	(void)hw_atomic_and(&attr->lck_attr_val, ~LCK_ATTR_DEBUG);
401}
402
403
404/*
405 * Routine:	lck_attr_rw_shared_priority
406 */
407void
408lck_attr_rw_shared_priority(
409	lck_attr_t	*attr)
410{
411	(void)hw_atomic_or(&attr->lck_attr_val, LCK_ATTR_RW_SHARED_PRIORITY);
412}
413
414
415/*
416 * Routine:	lck_attr_free
417 */
418void
419lck_attr_free(
420	lck_attr_t	*attr)
421{
422	kfree(attr, sizeof(lck_attr_t));
423}
424
425
426/*
427 * Routine:	lck_spin_sleep
428 */
429wait_result_t
430lck_spin_sleep(
431        lck_spin_t		*lck,
432	lck_sleep_action_t	lck_sleep_action,
433	event_t			event,
434	wait_interrupt_t	interruptible)
435{
436	wait_result_t	res;
437
438	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
439		panic("Invalid lock sleep action %x\n", lck_sleep_action);
440
441	res = assert_wait(event, interruptible);
442	if (res == THREAD_WAITING) {
443		lck_spin_unlock(lck);
444		res = thread_block(THREAD_CONTINUE_NULL);
445		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK))
446			lck_spin_lock(lck);
447	}
448	else
449	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
450		lck_spin_unlock(lck);
451
452	return res;
453}
454
455
456/*
457 * Routine:	lck_spin_sleep_deadline
458 */
459wait_result_t
460lck_spin_sleep_deadline(
461        lck_spin_t		*lck,
462	lck_sleep_action_t	lck_sleep_action,
463	event_t			event,
464	wait_interrupt_t	interruptible,
465	uint64_t		deadline)
466{
467	wait_result_t   res;
468
469	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
470		panic("Invalid lock sleep action %x\n", lck_sleep_action);
471
472	res = assert_wait_deadline(event, interruptible, deadline);
473	if (res == THREAD_WAITING) {
474		lck_spin_unlock(lck);
475		res = thread_block(THREAD_CONTINUE_NULL);
476		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK))
477			lck_spin_lock(lck);
478	}
479	else
480	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
481		lck_spin_unlock(lck);
482
483	return res;
484}
485
486
487/*
488 * Routine:	lck_mtx_sleep
489 */
490wait_result_t
491lck_mtx_sleep(
492        lck_mtx_t		*lck,
493	lck_sleep_action_t	lck_sleep_action,
494	event_t			event,
495	wait_interrupt_t	interruptible)
496{
497	wait_result_t	res;
498	thread_t		thread = current_thread();
499
500	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_START,
501		     (int)lck, (int)lck_sleep_action, (int)event, (int)interruptible, 0);
502
503	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
504		panic("Invalid lock sleep action %x\n", lck_sleep_action);
505
506	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
507		/*
508		 * We overload the RW lock promotion to give us a priority ceiling
509		 * during the time that this thread is asleep, so that when it
510		 * is re-awakened (and not yet contending on the mutex), it is
511		 * runnable at a reasonably high priority.
512		 */
513		thread->rwlock_count++;
514	}
515
516	res = assert_wait(event, interruptible);
517	if (res == THREAD_WAITING) {
518		lck_mtx_unlock(lck);
519		res = thread_block(THREAD_CONTINUE_NULL);
520		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
521			if ((lck_sleep_action & LCK_SLEEP_SPIN))
522				lck_mtx_lock_spin(lck);
523			else
524				lck_mtx_lock(lck);
525		}
526	}
527	else
528	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
529		lck_mtx_unlock(lck);
530
531	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
532		if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
533			/* sched_flags checked without lock, but will be rechecked while clearing */
534			lck_rw_clear_promotion(thread);
535		}
536	}
537
538	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0);
539
540	return res;
541}
542
543
544/*
545 * Routine:	lck_mtx_sleep_deadline
546 */
547wait_result_t
548lck_mtx_sleep_deadline(
549        lck_mtx_t		*lck,
550	lck_sleep_action_t	lck_sleep_action,
551	event_t			event,
552	wait_interrupt_t	interruptible,
553	uint64_t		deadline)
554{
555	wait_result_t   res;
556	thread_t		thread = current_thread();
557
558	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_START,
559		     (int)lck, (int)lck_sleep_action, (int)event, (int)interruptible, 0);
560
561	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
562		panic("Invalid lock sleep action %x\n", lck_sleep_action);
563
564	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
565		/*
566		 * See lck_mtx_sleep().
567		 */
568		thread->rwlock_count++;
569	}
570
571	res = assert_wait_deadline(event, interruptible, deadline);
572	if (res == THREAD_WAITING) {
573		lck_mtx_unlock(lck);
574		res = thread_block(THREAD_CONTINUE_NULL);
575		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
576			if ((lck_sleep_action & LCK_SLEEP_SPIN))
577				lck_mtx_lock_spin(lck);
578			else
579				lck_mtx_lock(lck);
580		}
581	}
582	else
583	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
584		lck_mtx_unlock(lck);
585
586	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
587		if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
588			/* sched_flags checked without lock, but will be rechecked while clearing */
589			lck_rw_clear_promotion(thread);
590		}
591	}
592
593	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0);
594
595	return res;
596}
597
/*
 * Routine: 	lck_mtx_lock_wait
 *
 * Invoked in order to wait on contention.
 *
 * Promotes `holder` to (at least) the waiter's priority, records the
 * waiter on the mutex, then blocks uninterruptibly until woken.
 *
 *   lck     the contended mutex (possibly an indirect one).
 *   holder  the thread that is promoted on behalf of the waiter.
 *
 * Called with the interlock locked and
 * returns it unlocked.
 */
void
lck_mtx_lock_wait (
	lck_mtx_t			*lck,
	thread_t			holder)
{
	thread_t		self = current_thread();
	lck_mtx_t		*mutex;
	integer_t		priority;
	spl_t			s = splsched();
#if	CONFIG_DTRACE
	uint64_t		sleep_start = 0;

	/* Only take a timestamp when one of the block probes is enabled. */
	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
		sleep_start = mach_absolute_time();
	}
#endif

	/* For an indirect lock, operate on the mutex embedded in the structure lck points at. */
	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
		mutex = lck;
	else
		mutex = &lck->lck_mtx_ptr->lck_mtx;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, (int)lck, (int)holder, 0, 0, 0);

	/* Promotion priority: the highest of sched_pri, priority and BASEPRI_DEFAULT. */
	priority = self->sched_pri;
	if (priority < self->priority)
		priority = self->priority;
	if (priority < BASEPRI_DEFAULT)
		priority = BASEPRI_DEFAULT;

	/* Do not promote past promotion ceiling */
	priority = MIN(priority, MAXPRI_PROMOTE);

	thread_lock(holder);
	/* A mutex priority of 0 means this mutex is not yet counted in the holder's promotions. */
	if (mutex->lck_mtx_pri == 0)
		holder->promotions++;
	holder->sched_flags |= TH_SFLAG_PROMOTED;
	/* Raise the holder only if both the mutex's recorded priority and the holder's are lower. */
	if (		mutex->lck_mtx_pri < priority	&&
				holder->sched_pri < priority		) {
		KERNEL_DEBUG_CONSTANT(
			MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
					holder->sched_pri, priority, holder, lck, 0);
		set_sched_pri(holder, priority);
	}
	thread_unlock(holder);
	splx(s);

	/* Remember the highest priority any waiter has brought to this mutex. */
	if (mutex->lck_mtx_pri < priority)
		mutex->lck_mtx_pri = priority;
	/*
	 * Record this mutex in the thread's pending_promoter slot and count
	 * the thread as a waiter: reuse the current slot if it is empty, or
	 * advance to the next slot if the current one names another mutex.
	 * If the slot already names this mutex, nothing is counted again.
	 */
	if (self->pending_promoter[self->pending_promoter_index] == NULL) {
		self->pending_promoter[self->pending_promoter_index] = mutex;
		mutex->lck_mtx_waiters++;
	}
	else
	if (self->pending_promoter[self->pending_promoter_index] != mutex) {
		self->pending_promoter[++self->pending_promoter_index] = mutex;
		mutex->lck_mtx_waiters++;
	}

	/* The wait event is the address of the last unsigned-int-sized word of *lck. */
	assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);
	/* The interlock is released only after the wait has been asserted. */
	lck_mtx_ilk_unlock(mutex);

	thread_block(THREAD_CONTINUE_NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
#if	CONFIG_DTRACE
	/*
	 * Record the Dtrace lockstat probe for blocking, block time
	 * measured from when we were entered.
	 */
	if (sleep_start) {
		if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
			LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, lck,
			    mach_absolute_time() - sleep_start);
		} else {
			LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, lck,
			    mach_absolute_time() - sleep_start);
		}
	}
#endif
}
687
/*
 * Routine: 	lck_mtx_lock_acquire
 *
 * Invoked on acquiring the mutex when there is
 * contention.
 *
 * Removes the calling thread from the mutex's waiter count if it was
 * recorded as a pending waiter, and, if other waiters remain, promotes
 * the calling thread to the priority recorded on the mutex.
 *
 * Returns the current number of waiters.
 *
 * Called with the interlock locked.
 */
int
lck_mtx_lock_acquire(
	lck_mtx_t		*lck)
{
	thread_t		thread = current_thread();
	lck_mtx_t		*mutex;

	/* For an indirect lock, operate on the mutex embedded in the structure lck points at. */
	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
		mutex = lck;
	else
		mutex = &lck->lck_mtx_ptr->lck_mtx;

	/*
	 * If the thread's current pending_promoter slot names this mutex
	 * (as set up by lck_mtx_lock_wait()), clear the slot, step the index
	 * back when it is above 0, and stop counting the thread as a waiter.
	 */
	if (thread->pending_promoter[thread->pending_promoter_index] == mutex) {
		thread->pending_promoter[thread->pending_promoter_index] = NULL;
		if (thread->pending_promoter_index > 0)
			thread->pending_promoter_index--;
		mutex->lck_mtx_waiters--;
	}

	if (mutex->lck_mtx_waiters > 0) {
		/* Others still wait: the new owner takes on a promotion at the mutex's recorded priority. */
		integer_t		priority = mutex->lck_mtx_pri;
		spl_t			s = splsched();

		thread_lock(thread);
		thread->promotions++;
		thread->sched_flags |= TH_SFLAG_PROMOTED;
		if (thread->sched_pri < priority) {
			KERNEL_DEBUG_CONSTANT(
				MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
						thread->sched_pri, priority, 0, lck, 0);
			/* Do not promote past promotion ceiling */
			assert(priority <= MAXPRI_PROMOTE);
			set_sched_pri(thread, priority);
		}
		thread_unlock(thread);
		splx(s);
	}
	else
		/* No waiters left: forget the recorded promotion priority. */
		mutex->lck_mtx_pri = 0;

#if CONFIG_DTRACE
	if (lockstat_probemap[LS_LCK_MTX_LOCK_ACQUIRE] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_ACQUIRE]) {
		if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
			LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lck, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, lck, 0);
		}
	}
#endif
	return (mutex->lck_mtx_waiters);
}
749
/*
 * Routine: 	lck_mtx_unlock_wakeup
 *
 * Invoked on unlock when there is contention.
 *
 * Wakes one thread waiting on the mutex's event and drops one of the
 * calling thread's promotions; when the last promotion goes away the
 * thread's scheduling priority is restored.
 *
 *   lck     the mutex being unlocked (possibly an indirect one).
 *   holder  must be the calling thread, otherwise this panics.
 *
 * Called with the interlock locked.
 */
void
lck_mtx_unlock_wakeup (
	lck_mtx_t			*lck,
	thread_t			holder)
{
	thread_t		thread = current_thread();
	lck_mtx_t		*mutex;

	/* For an indirect lock, operate on the mutex embedded in the structure lck points at. */
	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
		mutex = lck;
	else
		mutex = &lck->lck_mtx_ptr->lck_mtx;

	if (thread != holder)
		panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, (int)lck, (int)holder, 0, 0, 0);

	assert(mutex->lck_mtx_waiters > 0);
	/* Same event address that lck_mtx_lock_wait() sleeps on: the last unsigned-int-sized word of *lck. */
	thread_wakeup_one((event_t)(((unsigned int*)lck)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));

	if (thread->promotions > 0) {
		spl_t		s = splsched();

		thread_lock(thread);
		/* Only when the last promotion is dropped is the promoted state undone. */
		if (	--thread->promotions == 0				&&
				(thread->sched_flags & TH_SFLAG_PROMOTED)		) {
			thread->sched_flags &= ~TH_SFLAG_PROMOTED;

			if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
				/* Thread still has a RW lock promotion */
			} else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
				/* A depressed thread goes back to DEPRESSPRI. */
				KERNEL_DEBUG_CONSTANT(
					MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
						  thread->sched_pri, DEPRESSPRI, 0, lck, 0);

				set_sched_pri(thread, DEPRESSPRI);
			}
			else {
				/* The trace point fires only when sched_pri is above the thread's priority field. */
				if (thread->priority < thread->sched_pri) {
					KERNEL_DEBUG_CONSTANT(
						MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) |
															DBG_FUNC_NONE,
							thread->sched_pri, thread->priority,
									0, lck, 0);
				}

				/* Let the scheduler recompute the thread's priority. */
				SCHED(compute_priority)(thread, FALSE);
			}
		}
		thread_unlock(thread);
		splx(s);
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
813
814void
815lck_mtx_unlockspin_wakeup (
816	lck_mtx_t			*lck)
817{
818	assert(lck->lck_mtx_waiters > 0);
819	thread_wakeup_one((event_t)(((unsigned int*)lck)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
820
821	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_NONE, (int)lck, 0, 0, 1, 0);
822#if CONFIG_DTRACE
823	/*
824	 * When there are waiters, we skip the hot-patch spot in the
825	 * fastpath, so we record it here.
826	 */
827	LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lck, 0);
828#endif
829}
830
831
832/*
833 * Routine: 	mutex_pause
834 *
835 * Called by former callers of simple_lock_pause().
836 */
837#define MAX_COLLISION_COUNTS	32
838#define MAX_COLLISION 	8
839
840unsigned int max_collision_count[MAX_COLLISION_COUNTS];
841
842uint32_t collision_backoffs[MAX_COLLISION] = {
843        10, 50, 100, 200, 400, 600, 800, 1000
844};
845
846
847void
848mutex_pause(uint32_t collisions)
849{
850	wait_result_t wait_result;
851	uint32_t	back_off;
852
853	if (collisions >= MAX_COLLISION_COUNTS)
854	        collisions = MAX_COLLISION_COUNTS - 1;
855	max_collision_count[collisions]++;
856
857	if (collisions >= MAX_COLLISION)
858	        collisions = MAX_COLLISION - 1;
859	back_off = collision_backoffs[collisions];
860
861	wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
862	assert(wait_result == THREAD_WAITING);
863
864	wait_result = thread_block(THREAD_CONTINUE_NULL);
865	assert(wait_result == THREAD_TIMED_OUT);
866}
867
868
869unsigned int mutex_yield_wait = 0;
870unsigned int mutex_yield_no_wait = 0;
871
872void
873lck_mtx_yield(
874	    lck_mtx_t	*lck)
875{
876	int	waiters;
877
878#if DEBUG
879	lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
880#endif /* DEBUG */
881
882	if (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT)
883	        waiters = lck->lck_mtx_ptr->lck_mtx.lck_mtx_waiters;
884	else
885	        waiters = lck->lck_mtx_waiters;
886
887	if ( !waiters) {
888	        mutex_yield_no_wait++;
889	} else {
890	        mutex_yield_wait++;
891		lck_mtx_unlock(lck);
892		mutex_pause(0);
893		lck_mtx_lock(lck);
894	}
895}
896
897
898/*
899 * Routine:	lck_rw_sleep
900 */
901wait_result_t
902lck_rw_sleep(
903        lck_rw_t		*lck,
904	lck_sleep_action_t	lck_sleep_action,
905	event_t			event,
906	wait_interrupt_t	interruptible)
907{
908	wait_result_t	res;
909	lck_rw_type_t	lck_rw_type;
910	thread_t		thread = current_thread();
911
912	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
913		panic("Invalid lock sleep action %x\n", lck_sleep_action);
914
915	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
916		/*
917		 * Although we are dropping the RW lock, the intent in most cases
918		 * is that this thread remains as an observer, since it may hold
919		 * some secondary resource, but must yield to avoid deadlock. In
920		 * this situation, make sure that the thread is boosted to the
921		 * RW lock ceiling while blocked, so that it can re-acquire the
922		 * RW lock at that priority.
923		 */
924		thread->rwlock_count++;
925	}
926
927	res = assert_wait(event, interruptible);
928	if (res == THREAD_WAITING) {
929		lck_rw_type = lck_rw_done(lck);
930		res = thread_block(THREAD_CONTINUE_NULL);
931		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
932			if (!(lck_sleep_action & (LCK_SLEEP_SHARED|LCK_SLEEP_EXCLUSIVE)))
933				lck_rw_lock(lck, lck_rw_type);
934			else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE)
935				lck_rw_lock_exclusive(lck);
936			else
937				lck_rw_lock_shared(lck);
938		}
939	}
940	else
941	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
942		(void)lck_rw_done(lck);
943
944	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
945		if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
946			/* sched_flags checked without lock, but will be rechecked while clearing */
947
948			/* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
949			assert(lck_sleep_action & LCK_SLEEP_UNLOCK);
950
951			lck_rw_clear_promotion(thread);
952		}
953	}
954
955	return res;
956}
957
958
959/*
960 * Routine:	lck_rw_sleep_deadline
961 */
962wait_result_t
963lck_rw_sleep_deadline(
964	lck_rw_t		*lck,
965	lck_sleep_action_t	lck_sleep_action,
966	event_t			event,
967	wait_interrupt_t	interruptible,
968	uint64_t		deadline)
969{
970	wait_result_t   res;
971	lck_rw_type_t	lck_rw_type;
972	thread_t		thread = current_thread();
973
974	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
975		panic("Invalid lock sleep action %x\n", lck_sleep_action);
976
977	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
978		thread->rwlock_count++;
979	}
980
981	res = assert_wait_deadline(event, interruptible, deadline);
982	if (res == THREAD_WAITING) {
983		lck_rw_type = lck_rw_done(lck);
984		res = thread_block(THREAD_CONTINUE_NULL);
985		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
986			if (!(lck_sleep_action & (LCK_SLEEP_SHARED|LCK_SLEEP_EXCLUSIVE)))
987				lck_rw_lock(lck, lck_rw_type);
988			else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE)
989				lck_rw_lock_exclusive(lck);
990			else
991				lck_rw_lock_shared(lck);
992		}
993	}
994	else
995	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
996		(void)lck_rw_done(lck);
997
998	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
999		if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1000			/* sched_flags checked without lock, but will be rechecked while clearing */
1001
1002			/* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
1003			assert(lck_sleep_action & LCK_SLEEP_UNLOCK);
1004
1005			lck_rw_clear_promotion(thread);
1006		}
1007	}
1008
1009	return res;
1010}
1011
1012/*
1013 * Reader-writer lock promotion
1014 *
1015 * We support a limited form of reader-writer
1016 * lock promotion whose effects are:
1017 *
1018 *   * Qualifying threads have decay disabled
 *   * Scheduler priority is reset to a floor of
 *     their statically assigned priority
 *     or BASEPRI_BACKGROUND
1022 *
1023 * The rationale is that lck_rw_ts do not have
1024 * a single owner, so we cannot apply a directed
1025 * priority boost from all waiting threads
1026 * to all holding threads without maintaining
1027 * lists of all shared owners and all waiting
1028 * threads for every lock.
1029 *
1030 * Instead (and to preserve the uncontended fast-
1031 * path), acquiring (or attempting to acquire)
1032 * a RW lock in shared or exclusive lock increments
1033 * a per-thread counter. Only if that thread stops
1034 * making forward progress (for instance blocking
1035 * on a mutex, or being preempted) do we consult
1036 * the counter and apply the priority floor.
1037 * When the thread becomes runnable again (or in
1038 * the case of preemption it never stopped being
1039 * runnable), it has the priority boost and should
1040 * be in a good position to run on the CPU and
1041 * release all RW locks (at which point the priority
1042 * boost is cleared).
1043 *
1044 * Care must be taken to ensure that priority
1045 * boosts are not retained indefinitely, since unlike
1046 * mutex priority boosts (where the boost is tied
1047 * to the mutex lifecycle), the boost is tied
1048 * to the thread and independent of any particular
1049 * lck_rw_t. Assertions are in place on return
1050 * to userspace so that the boost is not held
1051 * indefinitely.
1052 *
1053 * The routines that increment/decrement the
1054 * per-thread counter should err on the side of
1055 * incrementing any time a preemption is possible
1056 * and the lock would be visible to the rest of the
1057 * system as held (so it should be incremented before
1058 * interlocks are dropped/preemption is enabled, or
1059 * before a CAS is executed to acquire the lock).
1060 *
1061 */
1062
1063/*
1064 * lck_rw_clear_promotion: Undo priority promotions when the last RW
1065 * lock is released by a thread (if a promotion was active)
1066 */
1067void lck_rw_clear_promotion(thread_t thread)
1068{
1069	assert(thread->rwlock_count == 0);
1070
1071	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
1072	spl_t s = splsched();
1073
1074	thread_lock(thread);
1075
1076	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
1077		thread->sched_flags &= ~TH_SFLAG_RW_PROMOTED;
1078
1079		if (thread->sched_flags & TH_SFLAG_PROMOTED) {
1080			/* Thread still has a mutex promotion */
1081		} else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1082			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
1083							      thread->sched_pri, DEPRESSPRI, 0, 0, 0);
1084
1085			set_sched_pri(thread, DEPRESSPRI);
1086		} else {
1087			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
1088								  thread->sched_pri, thread->priority, 0, 0, 0);
1089
1090			SCHED(compute_priority)(thread, FALSE);
1091		}
1092	}
1093
1094	thread_unlock(thread);
1095	splx(s);
1096}
1097
/*
 * Routine:	host_lockgroup_info
 *
 * Snapshot the counters of every registered lock group into a freshly
 * allocated buffer and hand it back as a VM copy object.
 *
 *   host                 must not be HOST_NULL.
 *   lockgroup_infop      out: the copy object holding the array, cast
 *                        to lockgroup_info_t *.
 *   lockgroup_infoCntp   out: number of entries written (lck_grp_cnt).
 *
 * Returns KERN_INVALID_HOST for a null host, the kmem_alloc_pageable()
 * error if the buffer cannot be allocated, and KERN_SUCCESS otherwise.
 */
kern_return_t
host_lockgroup_info(
	host_t					host,
	lockgroup_info_array_t	*lockgroup_infop,
	mach_msg_type_number_t	*lockgroup_infoCntp)
{
	lockgroup_info_t	*lockgroup_info_base;
	lockgroup_info_t	*lockgroup_info;
	vm_offset_t			lockgroup_info_addr;
	vm_size_t			lockgroup_info_size;
	lck_grp_t			*lck_grp;
	unsigned int		i;
	vm_size_t			used;
	vm_map_copy_t		copy;
	kern_return_t		kr;

	if (host == HOST_NULL)
		return KERN_INVALID_HOST;

	/* Hold the group lock so the queue and lck_grp_cnt stay consistent while sizing and walking. */
	lck_mtx_lock(&lck_grp_lock);

	/* One entry per group, rounded up to whole pages. */
	lockgroup_info_size = round_page(lck_grp_cnt * sizeof *lockgroup_info);
	kr = kmem_alloc_pageable(ipc_kernel_map,
						 &lockgroup_info_addr, lockgroup_info_size);
	if (kr != KERN_SUCCESS) {
		lck_mtx_unlock(&lck_grp_lock);
		return(kr);
	}

	lockgroup_info_base = (lockgroup_info_t *) lockgroup_info_addr;
	lck_grp = (lck_grp_t *)queue_first(&lck_grp_queue);
	lockgroup_info = lockgroup_info_base;

	/* Walk the queue in order, copying each group's counters into the next array slot. */
	for (i = 0; i < lck_grp_cnt; i++) {

		/* Spin lock counters. */
		lockgroup_info->lock_spin_cnt = lck_grp->lck_grp_spincnt;
		lockgroup_info->lock_spin_util_cnt = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_util_cnt;
		lockgroup_info->lock_spin_held_cnt = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_held_cnt;
		lockgroup_info->lock_spin_miss_cnt = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_miss_cnt;
		lockgroup_info->lock_spin_held_max = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_held_max;
		lockgroup_info->lock_spin_held_cum = lck_grp->lck_grp_stat.lck_grp_spin_stat.lck_grp_spin_held_cum;

		/* Mutex counters. */
		lockgroup_info->lock_mtx_cnt = lck_grp->lck_grp_mtxcnt;
		lockgroup_info->lock_mtx_util_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_util_cnt;
		lockgroup_info->lock_mtx_held_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cnt;
		lockgroup_info->lock_mtx_miss_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_miss_cnt;
		lockgroup_info->lock_mtx_wait_cnt = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_cnt;
		lockgroup_info->lock_mtx_held_max = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_max;
		lockgroup_info->lock_mtx_held_cum = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cum;
		lockgroup_info->lock_mtx_wait_max = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_max;
		lockgroup_info->lock_mtx_wait_cum = lck_grp->lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_cum;

		/* Reader-writer lock counters. */
		lockgroup_info->lock_rw_cnt = lck_grp->lck_grp_rwcnt;
		lockgroup_info->lock_rw_util_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_util_cnt;
		lockgroup_info->lock_rw_held_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_held_cnt;
		lockgroup_info->lock_rw_miss_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_miss_cnt;
		lockgroup_info->lock_rw_wait_cnt = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cnt;
		lockgroup_info->lock_rw_held_max = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_held_max;
		lockgroup_info->lock_rw_held_cum = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_held_cum;
		lockgroup_info->lock_rw_wait_max = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_max;
		lockgroup_info->lock_rw_wait_cum = lck_grp->lck_grp_stat.lck_grp_rw_stat.lck_grp_rw_wait_cum;

		(void) strncpy(lockgroup_info->lockgroup_name,lck_grp->lck_grp_name, LOCKGROUP_MAX_NAME);

		lck_grp = (lck_grp_t *)(queue_next((queue_entry_t)(lck_grp)));
		lockgroup_info++;
	}

	*lockgroup_infoCntp = lck_grp_cnt;
	lck_mtx_unlock(&lck_grp_lock);

	used = (*lockgroup_infoCntp) * sizeof *lockgroup_info;

	/* lockgroup_info now points just past the last entry: zero the unused tail of the buffer. */
	if (used != lockgroup_info_size)
		bzero((char *) lockgroup_info, lockgroup_info_size - used);

	/*
	 * Turn the buffer into a VM copy object for the caller.
	 * NOTE(review): the TRUE argument is presumably vm_map_copyin()'s
	 * "destroy source" flag -- confirm. A failure here is only caught
	 * by the assert; without asserts `copy` would be returned
	 * uninitialized.
	 */
	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)lockgroup_info_addr,
			   (vm_map_size_t)lockgroup_info_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*lockgroup_infop = (lockgroup_info_t *) copy;

	return(KERN_SUCCESS);
}
1182
1183