1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include "lint.h"
28#include "thr_uberdata.h"
29#include <sys/rtpriocntl.h>
30#include <sys/sdt.h>
31#include <atomic.h>
32
33#if defined(THREAD_DEBUG)
34#define	INCR32(x)	(((x) != UINT32_MAX)? (x)++ : 0)
35#define	INCR(x)		((x)++)
36#define	DECR(x)		((x)--)
37#define	MAXINCR(m, x)	((m < ++x)? (m = x) : 0)
38#else
39#define	INCR32(x)
40#define	INCR(x)
41#define	DECR(x)
42#define	MAXINCR(m, x)
43#endif
44
45/*
46 * This mutex is initialized to be held by lwp#1.
47 * It is used to block a thread that has returned from a mutex_lock()
48 * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
49 */
50mutex_t	stall_mutex = DEFAULTMUTEX;
51
52static int shared_mutex_held(mutex_t *);
53static int mutex_queuelock_adaptive(mutex_t *);
54static void mutex_wakeup_all(mutex_t *);
55
56/*
57 * Lock statistics support functions.
58 */
59void
60record_begin_hold(tdb_mutex_stats_t *msp)
61{
62	tdb_incr(msp->mutex_lock);
63	msp->mutex_begin_hold = gethrtime();
64}
65
66hrtime_t
67record_hold_time(tdb_mutex_stats_t *msp)
68{
69	hrtime_t now = gethrtime();
70
71	if (msp->mutex_begin_hold)
72		msp->mutex_hold_time += now - msp->mutex_begin_hold;
73	msp->mutex_begin_hold = 0;
74	return (now);
75}
76
77/*
78 * Called once at library initialization.
79 */
80void
81mutex_setup(void)
82{
83	if (set_lock_byte(&stall_mutex.mutex_lockw))
84		thr_panic("mutex_setup() cannot acquire stall_mutex");
85	stall_mutex.mutex_owner = (uintptr_t)curthread;
86}
87
88/*
89 * The default spin count of 1000 is experimentally determined.
90 * On sun4u machines with any number of processors it could be raised
91 * to 10,000 but that (experimentally) makes almost no difference.
92 * The environment variable:
93 *	_THREAD_ADAPTIVE_SPIN=count
94 * can be used to override and set the count in the range [0 .. 1,000,000].
95 */
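/*
 * Illustrative usage: a hypothetical application "myprog" that benefits
 * from longer adaptive spinning could be started with the override in
 * its environment, for example:
 *	_THREAD_ADAPTIVE_SPIN=5000 ./myprog
 */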
96int	thread_adaptive_spin = 1000;
97uint_t	thread_max_spinners = 100;
98int	thread_queue_verify = 0;
99static	int	ncpus;
100
101/*
102 * Distinguish spinning for queue locks from spinning for regular locks.
103 * We try harder to acquire queue locks by spinning.
104 * The environment variable:
105 *	_THREAD_QUEUE_SPIN=count
106 * can be used to override and set the count in the range [0 .. 1,000,000].
107 */
108int	thread_queue_spin = 10000;
109
110#define	ALL_ATTRIBUTES				\
111	(LOCK_RECURSIVE | LOCK_ERRORCHECK |	\
112	LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT |	\
113	LOCK_ROBUST)
114
115/*
116 * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
117 * augmented by zero or more of the flags:
118 *	LOCK_RECURSIVE
119 *	LOCK_ERRORCHECK
120 *	LOCK_PRIO_INHERIT
121 *	LOCK_PRIO_PROTECT
122 *	LOCK_ROBUST
123 */
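/*
 * Illustrative sketch: an application typically arrives here with
 * (USYNC_PROCESS | LOCK_ROBUST) via the POSIX attribute interfaces,
 * roughly as follows ('m' and 'attr' are hypothetical caller variables;
 * older releases spell the robust call pthread_mutexattr_setrobust_np()):
 *	pthread_mutexattr_init(&attr);
 *	pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
 *	pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST);
 *	pthread_mutex_init(&m, &attr);
 */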
124#pragma weak _mutex_init = mutex_init
125/* ARGSUSED2 */
126int
127mutex_init(mutex_t *mp, int type, void *arg)
128{
129	int basetype = (type & ~ALL_ATTRIBUTES);
130	const pcclass_t *pccp;
131	int error = 0;
132	int ceil;
133
134	if (basetype == USYNC_PROCESS_ROBUST) {
135		/*
136		 * USYNC_PROCESS_ROBUST is a deprecated historical type.
137		 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
138		 * retain the USYNC_PROCESS_ROBUST flag so we can return
139		 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
140		 * mutexes will ever draw ELOCKUNMAPPED).
141		 */
142		type |= (USYNC_PROCESS | LOCK_ROBUST);
143		basetype = USYNC_PROCESS;
144	}
145
146	if (type & LOCK_PRIO_PROTECT)
147		pccp = get_info_by_policy(SCHED_FIFO);
148	if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
149	    (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
150	    == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
151	    ((type & LOCK_PRIO_PROTECT) &&
152	    ((ceil = *(int *)arg) < pccp->pcc_primin ||
153	    ceil > pccp->pcc_primax))) {
154		error = EINVAL;
155	} else if (type & LOCK_ROBUST) {
156		/*
157		 * Callers of mutex_init() with the LOCK_ROBUST attribute
158		 * are required to pass an initially all-zero mutex.
159		 * Multiple calls to mutex_init() are allowed; all but
160		 * the first return EBUSY.  A call to mutex_init() is
161		 * allowed to make an inconsistent robust lock consistent
162		 * (for historical usage, even though the proper interface
163		 * for this is mutex_consistent()).  Note that we use
164		 * atomic_or_16() to set the LOCK_INITED flag so as
165		 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
166		 */
167		if (!(mp->mutex_flag & LOCK_INITED)) {
168			mp->mutex_type = (uint8_t)type;
169			atomic_or_16(&mp->mutex_flag, LOCK_INITED);
170			mp->mutex_magic = MUTEX_MAGIC;
171		} else if (type != mp->mutex_type ||
172		    ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
173			error = EINVAL;
174		} else if (mutex_consistent(mp) != 0) {
175			error = EBUSY;
176		}
177		/* register a process robust mutex with the kernel */
178		if (basetype == USYNC_PROCESS)
179			register_lock(mp);
180	} else {
181		(void) memset(mp, 0, sizeof (*mp));
182		mp->mutex_type = (uint8_t)type;
183		mp->mutex_flag = LOCK_INITED;
184		mp->mutex_magic = MUTEX_MAGIC;
185	}
186
187	if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
188		mp->mutex_ceiling = ceil;
189	}
190
191	/*
192	 * This should be at the beginning of the function,
193	 * but for the sake of old broken applications that
194	 * do not have proper alignment for their mutexes
195	 * (and don't check the return code from mutex_init),
196	 * we put it here, after initializing the mutex regardless.
197	 */
198	if (error == 0 &&
199	    ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
200	    curthread->ul_misaligned == 0)
201		error = EINVAL;
202
203	return (error);
204}
205
206/*
207 * Delete mp from list of ceiling mutexes owned by curthread.
208 * Return 1 if the head of the chain was updated.
209 */
210int
211_ceil_mylist_del(mutex_t *mp)
212{
213	ulwp_t *self = curthread;
214	mxchain_t **mcpp;
215	mxchain_t *mcp;
216
217	for (mcpp = &self->ul_mxchain;
218	    (mcp = *mcpp) != NULL;
219	    mcpp = &mcp->mxchain_next) {
220		if (mcp->mxchain_mx == mp) {
221			*mcpp = mcp->mxchain_next;
222			lfree(mcp, sizeof (*mcp));
223			return (mcpp == &self->ul_mxchain);
224		}
225	}
226	return (0);
227}
228
229/*
230 * Add mp to the list of ceiling mutexes owned by curthread.
231 * Return ENOMEM if no memory could be allocated.
232 */
233int
234_ceil_mylist_add(mutex_t *mp)
235{
236	ulwp_t *self = curthread;
237	mxchain_t *mcp;
238
239	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
240		return (ENOMEM);
241	mcp->mxchain_mx = mp;
242	mcp->mxchain_next = self->ul_mxchain;
243	self->ul_mxchain = mcp;
244	return (0);
245}
246
247/*
248 * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
249 */
250static void
251set_rt_priority(ulwp_t *self, int prio)
252{
253	pcparms_t pcparm;
254
255	pcparm.pc_cid = self->ul_rtclassid;
256	((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
257	((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
258	(void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
259}
260
261/*
262 * Inherit priority from ceiling.
263 * This changes the effective priority, not the assigned priority.
264 */
265void
266_ceil_prio_inherit(int prio)
267{
268	ulwp_t *self = curthread;
269
270	self->ul_epri = prio;
271	set_rt_priority(self, prio);
272}
273
274/*
275 * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
276 * if holding at least one ceiling lock.  If no ceiling locks are held at this
277 * point, disinherit completely, reverting back to assigned priority.
278 */
279void
280_ceil_prio_waive(void)
281{
282	ulwp_t *self = curthread;
283	mxchain_t *mcp = self->ul_mxchain;
284	int prio;
285
286	if (mcp == NULL) {
287		prio = self->ul_pri;
288		self->ul_epri = 0;
289	} else {
290		prio = mcp->mxchain_mx->mutex_ceiling;
291		self->ul_epri = prio;
292	}
293	set_rt_priority(self, prio);
294}
295
296/*
297 * Clear the lock byte.  Retain the waiters byte and the spinners byte.
298 * Return the old value of the lock word.
299 */
300static uint32_t
301clear_lockbyte(volatile uint32_t *lockword)
302{
303	uint32_t old;
304	uint32_t new;
305
306	do {
307		old = *lockword;
308		new = old & ~LOCKMASK;
309	} while (atomic_cas_32(lockword, old, new) != old);
310
311	return (old);
312}
313
314/*
315 * Same as clear_lockbyte(), but operates on mutex_lockword64.
316 * The mutex_ownerpid field is cleared along with the lock byte.
317 */
318static uint64_t
319clear_lockbyte64(volatile uint64_t *lockword64)
320{
321	uint64_t old;
322	uint64_t new;
323
324	do {
325		old = *lockword64;
326		new = old & ~LOCKMASK64;
327	} while (atomic_cas_64(lockword64, old, new) != old);
328
329	return (old);
330}
331
332/*
333 * Similar to set_lock_byte(), which only tries to set the lock byte.
334 * Here, we attempt to set the lock byte AND the mutex_ownerpid, keeping
335 * the remaining bytes constant.  This atomic operation is required for the
336 * correctness of process-shared robust locks, otherwise there would be
337 * a window of vulnerability in which the lock byte had been set but the
338 * mutex_ownerpid had not yet been set.  If the process were to die in
339 * this window of vulnerability (due to some other thread calling exit()
340 * or the process receiving a fatal signal), the mutex would be left locked
341 * but without a process-ID to determine which process was holding the lock.
342 * The kernel would then be unable to mark the robust mutex as LOCK_OWNERDEAD
343 * when the process died.  For all other cases of process-shared locks, this
344 * operation is just a convenience, for the sake of common code.
345 *
346 * This operation requires process-shared robust locks to be properly
347 * aligned on an 8-byte boundary, at least on sparc machines, lest the
348 * operation incur an alignment fault.  This is automatic when locks
349 * are declared properly using the mutex_t or pthread_mutex_t data types
350 * and the application does not allocate dynamic memory on less than an
351 * 8-byte boundary.  See the 'horrible hack' comments below for cases
352 * dealing with such broken applications.
353 */
354static int
355set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
356{
357	uint64_t old;
358	uint64_t new;
359
360	old = *lockword64 & ~LOCKMASK64;
361	new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
362	if (atomic_cas_64(lockword64, old, new) == old)
363		return (LOCKCLEAR);
364
365	return (LOCKSET);
366}
367
368/*
369 * Increment the spinners count in the mutex lock word.
370 * Return 0 on success.  Return -1 if the count would overflow.
371 */
372static int
373spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
374{
375	uint32_t old;
376	uint32_t new;
377
378	do {
379		old = *lockword;
380		if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
381			return (-1);
382		new = old + (1 << SPINNERSHIFT);
383	} while (atomic_cas_32(lockword, old, new) != old);
384
385	return (0);
386}
387
388/*
389 * Decrement the spinners count in the mutex lock word.
390 * Return the new value of the lock word.
391 */
392static uint32_t
393spinners_decr(volatile uint32_t *lockword)
394{
395	uint32_t old;
396	uint32_t new;
397
398	do {
399		new = old = *lockword;
400		if (new & SPINNERMASK)
401			new -= (1 << SPINNERSHIFT);
402	} while (atomic_cas_32(lockword, old, new) != old);
403
404	return (new);
405}
406
407/*
408 * Non-preemptive spin locks.  Used by queue_lock().
409 * No lock statistics are gathered for these locks.
410 * No DTrace probes are provided for these locks.
411 */
412void
413spin_lock_set(mutex_t *mp)
414{
415	ulwp_t *self = curthread;
416
417	no_preempt(self);
418	if (set_lock_byte(&mp->mutex_lockw) == 0) {
419		mp->mutex_owner = (uintptr_t)self;
420		return;
421	}
422	/*
423	 * Spin for a while, attempting to acquire the lock.
424	 */
425	INCR32(self->ul_spin_lock_spin);
426	if (mutex_queuelock_adaptive(mp) == 0 ||
427	    set_lock_byte(&mp->mutex_lockw) == 0) {
428		mp->mutex_owner = (uintptr_t)self;
429		return;
430	}
431	/*
432	 * Try harder if we were previously at a no preemption level.
433	 */
434	if (self->ul_preempt > 1) {
435		INCR32(self->ul_spin_lock_spin2);
436		if (mutex_queuelock_adaptive(mp) == 0 ||
437		    set_lock_byte(&mp->mutex_lockw) == 0) {
438			mp->mutex_owner = (uintptr_t)self;
439			return;
440		}
441	}
442	/*
443	 * Give up and block in the kernel for the mutex.
444	 */
445	INCR32(self->ul_spin_lock_sleep);
446	(void) ___lwp_mutex_timedlock(mp, NULL, self);
447}
448
449void
450spin_lock_clear(mutex_t *mp)
451{
452	ulwp_t *self = curthread;
453
454	mp->mutex_owner = 0;
455	if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
456		(void) ___lwp_mutex_wakeup(mp, 0);
457		INCR32(self->ul_spin_lock_wakeup);
458	}
459	preempt(self);
460}
461
462/*
463 * Allocate the sleep queue hash table.
464 */
465void
466queue_alloc(void)
467{
468	ulwp_t *self = curthread;
469	uberdata_t *udp = self->ul_uberdata;
470	queue_head_t *qp;
471	void *data;
472	int i;
473
474	/*
475	 * No locks are needed; we call here only when single-threaded.
476	 */
477	ASSERT(self == udp->ulwp_one);
478	ASSERT(!udp->uberflags.uf_mt);
479	if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
480	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
481	    == MAP_FAILED)
482		thr_panic("cannot allocate thread queue_head table");
483	udp->queue_head = qp = (queue_head_t *)data;
484	for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
485		qp->qh_type = (i < QHASHSIZE)? MX : CV;
486		qp->qh_lock.mutex_flag = LOCK_INITED;
487		qp->qh_lock.mutex_magic = MUTEX_MAGIC;
488		qp->qh_hlist = &qp->qh_def_root;
489#if defined(THREAD_DEBUG)
490		qp->qh_hlen = 1;
491		qp->qh_hmax = 1;
492#endif
493	}
494}
495
496#if defined(THREAD_DEBUG)
497
498/*
499 * Debugging: verify correctness of a sleep queue.
500 */
501void
502QVERIFY(queue_head_t *qp)
503{
504	ulwp_t *self = curthread;
505	uberdata_t *udp = self->ul_uberdata;
506	queue_root_t *qrp;
507	ulwp_t *ulwp;
508	ulwp_t *prev;
509	uint_t index;
510	uint32_t cnt;
511	char qtype;
512	void *wchan;
513
514	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
515	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
516	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
517		cnt++;
518		ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
519		    (qrp->qr_head == NULL && qrp->qr_tail == NULL));
520	}
521	ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);
522	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
523	ASSERT(qp->qh_type == qtype);
524	if (!thread_queue_verify)
525		return;
526	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
527	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
528		for (prev = NULL, ulwp = qrp->qr_head; ulwp != NULL;
529		    prev = ulwp, ulwp = ulwp->ul_link) {
530			cnt++;
531			if (ulwp->ul_writer)
532				ASSERT(prev == NULL || prev->ul_writer);
533			ASSERT(ulwp->ul_qtype == qtype);
534			ASSERT(ulwp->ul_wchan != NULL);
535			ASSERT(ulwp->ul_sleepq == qp);
536			wchan = ulwp->ul_wchan;
537			ASSERT(qrp->qr_wchan == wchan);
538			index = QUEUE_HASH(wchan, qtype);
539			ASSERT(&udp->queue_head[index] == qp);
540		}
541		ASSERT(qrp->qr_tail == prev);
542	}
543	ASSERT(qp->qh_qlen == cnt);
544}
545
546#else	/* THREAD_DEBUG */
547
548#define	QVERIFY(qp)
549
550#endif	/* THREAD_DEBUG */
551
552/*
553 * Acquire a queue head.
554 */
555queue_head_t *
556queue_lock(void *wchan, int qtype)
557{
558	uberdata_t *udp = curthread->ul_uberdata;
559	queue_head_t *qp;
560	queue_root_t *qrp;
561
562	ASSERT(qtype == MX || qtype == CV);
563
564	/*
565	 * It is possible that we could be called while still single-threaded.
566	 * If so, we call queue_alloc() to allocate the queue_head[] array.
567	 */
568	if ((qp = udp->queue_head) == NULL) {
569		queue_alloc();
570		qp = udp->queue_head;
571	}
572	qp += QUEUE_HASH(wchan, qtype);
573	spin_lock_set(&qp->qh_lock);
574	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
575		if (qrp->qr_wchan == wchan)
576			break;
577	if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
578		/* the default queue root is available; use it */
579		qrp = &qp->qh_def_root;
580		qrp->qr_wchan = wchan;
581		ASSERT(qrp->qr_next == NULL);
582		ASSERT(qrp->qr_tail == NULL &&
583		    qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
584	}
585	qp->qh_wchan = wchan;	/* valid until queue_unlock() is called */
586	qp->qh_root = qrp;	/* valid until queue_unlock() is called */
587	INCR32(qp->qh_lockcount);
588	QVERIFY(qp);
589	return (qp);
590}
591
592/*
593 * Release a queue head.
594 */
595void
596queue_unlock(queue_head_t *qp)
597{
598	QVERIFY(qp);
599	spin_lock_clear(&qp->qh_lock);
600}
601
602/*
603 * For rwlock queueing, we must queue writers ahead of readers of the
604 * same priority.  We do this by making writers appear to have a half
605 * point higher priority for purposes of priority comparisons below.
606 */
607#define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
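/*
 * Worked example: with real_priority() == 10 for both a reader and a
 * writer, the reader compares as (10 << 1) + 0 == 20 and the writer as
 * (10 << 1) + 1 == 21, so the writer is queued ahead of the reader but
 * still behind any thread whose real priority is 11 or more.
 */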
608
609void
610enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
611{
612	queue_root_t *qrp;
613	ulwp_t **ulwpp;
614	ulwp_t *next;
615	int pri = CMP_PRIO(ulwp);
616
617	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
618	ASSERT(ulwp->ul_sleepq != qp);
619
620	if ((qrp = qp->qh_root) == NULL) {
621		/* use the thread's queue root for the linkage */
622		qrp = &ulwp->ul_queue_root;
623		qrp->qr_next = qp->qh_hlist;
624		qrp->qr_prev = NULL;
625		qrp->qr_head = NULL;
626		qrp->qr_tail = NULL;
627		qrp->qr_wchan = qp->qh_wchan;
628		qrp->qr_rtcount = 0;
629		qrp->qr_qlen = 0;
630		qrp->qr_qmax = 0;
631		qp->qh_hlist->qr_prev = qrp;
632		qp->qh_hlist = qrp;
633		qp->qh_root = qrp;
634		MAXINCR(qp->qh_hmax, qp->qh_hlen);
635	}
636
637	/*
638	 * LIFO queue ordering is unfair and can lead to starvation,
639	 * but it gives better performance for heavily contended locks.
640	 * We use thread_queue_fifo (range is 0..8) to determine
641	 * the frequency of FIFO vs LIFO queuing:
642	 *	0 : every 256th time	(almost always LIFO)
643	 *	1 : every 128th time
644	 *	2 : every 64th  time
645	 *	3 : every 32nd  time
646	 *	4 : every 16th  time	(the default value, mostly LIFO)
647	 *	5 : every 8th   time
648	 *	6 : every 4th   time
649	 *	7 : every 2nd   time
650	 *	8 : every time		(never LIFO, always FIFO)
651	 * Note that there is always some degree of FIFO ordering.
652	 * This breaks live lock conditions that occur in applications
653	 * that are written assuming (incorrectly) that threads acquire
654	 * locks fairly, that is, in roughly round-robin order.
655	 * In any event, the queue is maintained in kernel priority order.
656	 *
657	 * If force_fifo is non-zero, fifo queueing is forced.
658	 * SUSV3 requires this for semaphores.
659	 */
660	if (qrp->qr_head == NULL) {
661		/*
662		 * The queue is empty.  LIFO/FIFO doesn't matter.
663		 */
664		ASSERT(qrp->qr_tail == NULL);
665		ulwpp = &qrp->qr_head;
666	} else if (force_fifo |
667	    (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
668		/*
669		 * Enqueue after the last thread whose priority is greater
670		 * than or equal to the priority of the thread being queued.
671		 * Attempt first to go directly onto the tail of the queue.
672		 */
673		if (pri <= CMP_PRIO(qrp->qr_tail))
674			ulwpp = &qrp->qr_tail->ul_link;
675		else {
676			for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
677			    ulwpp = &next->ul_link)
678				if (pri > CMP_PRIO(next))
679					break;
680		}
681	} else {
682		/*
683		 * Enqueue before the first thread whose priority is less
684		 * than or equal to the priority of the thread being queued.
685		 * Hopefully we can go directly onto the head of the queue.
686		 */
687		for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
688		    ulwpp = &next->ul_link)
689			if (pri >= CMP_PRIO(next))
690				break;
691	}
692	if ((ulwp->ul_link = *ulwpp) == NULL)
693		qrp->qr_tail = ulwp;
694	*ulwpp = ulwp;
695
696	ulwp->ul_sleepq = qp;
697	ulwp->ul_wchan = qp->qh_wchan;
698	ulwp->ul_qtype = qp->qh_type;
699	if ((ulwp->ul_schedctl != NULL &&
700	    ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
701	    ulwp->ul_pilocks) {
702		ulwp->ul_rtqueued = 1;
703		qrp->qr_rtcount++;
704	}
705	MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
706	MAXINCR(qp->qh_qmax, qp->qh_qlen);
707}
708
709/*
710 * Helper function for queue_slot() and queue_slot_rt().
711 * Try to find a non-suspended thread on the queue.
712 */
713static ulwp_t **
714queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
715{
716	ulwp_t *ulwp;
717	ulwp_t **foundpp = NULL;
718	int priority = -1;
719	ulwp_t *prev;
720	int tpri;
721
722	for (prev = NULL;
723	    (ulwp = *ulwpp) != NULL;
724	    prev = ulwp, ulwpp = &ulwp->ul_link) {
725		if (ulwp->ul_stop)	/* skip suspended threads */
726			continue;
727		tpri = rt? CMP_PRIO(ulwp) : 0;
728		if (tpri > priority) {
729			foundpp = ulwpp;
730			*prevp = prev;
731			priority = tpri;
732			if (!rt)
733				break;
734		}
735	}
736	return (foundpp);
737}
738
739/*
740 * For real-time, we search the entire queue because the dispatch
741 * (kernel) priorities may have changed since enqueueing.
742 */
743static ulwp_t **
744queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
745{
746	ulwp_t **ulwpp = ulwpp_org;
747	ulwp_t *ulwp = *ulwpp;
748	ulwp_t **foundpp = ulwpp;
749	int priority = CMP_PRIO(ulwp);
750	ulwp_t *prev;
751	int tpri;
752
753	for (prev = ulwp, ulwpp = &ulwp->ul_link;
754	    (ulwp = *ulwpp) != NULL;
755	    prev = ulwp, ulwpp = &ulwp->ul_link) {
756		tpri = CMP_PRIO(ulwp);
757		if (tpri > priority) {
758			foundpp = ulwpp;
759			*prevp = prev;
760			priority = tpri;
761		}
762	}
763	ulwp = *foundpp;
764
765	/*
766	 * Try not to return a suspended thread.
767	 * This mimics the old libthread's behavior.
768	 */
769	if (ulwp->ul_stop &&
770	    (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
771		foundpp = ulwpp;
772		ulwp = *foundpp;
773	}
774	ulwp->ul_rt = 1;
775	return (foundpp);
776}
777
778ulwp_t **
779queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
780{
781	queue_root_t *qrp;
782	ulwp_t **ulwpp;
783	ulwp_t *ulwp;
784	int rt;
785
786	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
787
788	if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
789		*more = 0;
790		return (NULL);		/* no lwps on the queue */
791	}
792	rt = (qrp->qr_rtcount != 0);
793	*prevp = NULL;
794	if (ulwp->ul_link == NULL) {	/* only one lwp on the queue */
795		*more = 0;
796		ulwp->ul_rt = rt;
797		return (&qrp->qr_head);
798	}
799	*more = 1;
800
801	if (rt)		/* real-time queue */
802		return (queue_slot_rt(&qrp->qr_head, prevp));
803	/*
804	 * Try not to return a suspended thread.
805	 * This mimics the old libthread's behavior.
806	 */
807	if (ulwp->ul_stop &&
808	    (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
809		ulwp = *ulwpp;
810		ulwp->ul_rt = 0;
811		return (ulwpp);
812	}
813	/*
814	 * The common case; just pick the first thread on the queue.
815	 */
816	ulwp->ul_rt = 0;
817	return (&qrp->qr_head);
818}
819
820/*
821 * Common code for unlinking an lwp from a user-level sleep queue.
822 */
823void
824queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
825{
826	queue_root_t *qrp = qp->qh_root;
827	queue_root_t *nqrp;
828	ulwp_t *ulwp = *ulwpp;
829	ulwp_t *next;
830
831	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
832	ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);
833
834	DECR(qp->qh_qlen);
835	DECR(qrp->qr_qlen);
836	if (ulwp->ul_rtqueued) {
837		ulwp->ul_rtqueued = 0;
838		qrp->qr_rtcount--;
839	}
840	next = ulwp->ul_link;
841	*ulwpp = next;
842	ulwp->ul_link = NULL;
843	if (qrp->qr_tail == ulwp)
844		qrp->qr_tail = prev;
845	if (qrp == &ulwp->ul_queue_root) {
846		/*
847		 * We can't continue to use the unlinked thread's
848		 * queue root for the linkage.
849		 */
850		queue_root_t *qr_next = qrp->qr_next;
851		queue_root_t *qr_prev = qrp->qr_prev;
852
853		if (qrp->qr_tail) {
854			/* switch to using the last thread's queue root */
855			ASSERT(qrp->qr_qlen != 0);
856			nqrp = &qrp->qr_tail->ul_queue_root;
857			*nqrp = *qrp;
858			if (qr_next)
859				qr_next->qr_prev = nqrp;
860			if (qr_prev)
861				qr_prev->qr_next = nqrp;
862			else
863				qp->qh_hlist = nqrp;
864			qp->qh_root = nqrp;
865		} else {
866			/* empty queue root; just delete from the hash list */
867			ASSERT(qrp->qr_qlen == 0);
868			if (qr_next)
869				qr_next->qr_prev = qr_prev;
870			if (qr_prev)
871				qr_prev->qr_next = qr_next;
872			else
873				qp->qh_hlist = qr_next;
874			qp->qh_root = NULL;
875			DECR(qp->qh_hlen);
876		}
877	}
878}
879
880ulwp_t *
881dequeue(queue_head_t *qp, int *more)
882{
883	ulwp_t **ulwpp;
884	ulwp_t *ulwp;
885	ulwp_t *prev;
886
887	if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
888		return (NULL);
889	ulwp = *ulwpp;
890	queue_unlink(qp, ulwpp, prev);
891	ulwp->ul_sleepq = NULL;
892	ulwp->ul_wchan = NULL;
893	return (ulwp);
894}
895
896/*
897 * Return a pointer to the highest priority thread sleeping on wchan.
898 */
899ulwp_t *
900queue_waiter(queue_head_t *qp)
901{
902	ulwp_t **ulwpp;
903	ulwp_t *prev;
904	int more;
905
906	if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
907		return (NULL);
908	return (*ulwpp);
909}
910
911int
912dequeue_self(queue_head_t *qp)
913{
914	ulwp_t *self = curthread;
915	queue_root_t *qrp;
916	ulwp_t **ulwpp;
917	ulwp_t *ulwp;
918	ulwp_t *prev;
919	int found = 0;
920
921	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
922
923	/* find self on the sleep queue */
924	if ((qrp = qp->qh_root) != NULL) {
925		for (prev = NULL, ulwpp = &qrp->qr_head;
926		    (ulwp = *ulwpp) != NULL;
927		    prev = ulwp, ulwpp = &ulwp->ul_link) {
928			if (ulwp == self) {
929				queue_unlink(qp, ulwpp, prev);
930				self->ul_cvmutex = NULL;
931				self->ul_sleepq = NULL;
932				self->ul_wchan = NULL;
933				found = 1;
934				break;
935			}
936		}
937	}
938
939	if (!found)
940		thr_panic("dequeue_self(): curthread not found on queue");
941
942	return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
943}
944
945/*
946 * Called from call_user_handler() and _thrp_suspend() to take
947 * ourself off of our sleep queue so we can grab locks.
948 */
949void
950unsleep_self(void)
951{
952	ulwp_t *self = curthread;
953	queue_head_t *qp;
954
955	/*
956	 * Calling enter_critical()/exit_critical() here would lead
957	 * to recursion.  Just manipulate self->ul_critical directly.
958	 */
959	self->ul_critical++;
960	while (self->ul_sleepq != NULL) {
961		qp = queue_lock(self->ul_wchan, self->ul_qtype);
962		/*
963		 * We may have been moved from a CV queue to a
964		 * mutex queue while we were attempting queue_lock().
965		 * If so, just loop around and try again.
966		 * dequeue_self() clears self->ul_sleepq.
967		 */
968		if (qp == self->ul_sleepq)
969			(void) dequeue_self(qp);
970		queue_unlock(qp);
971	}
972	self->ul_writer = 0;
973	self->ul_critical--;
974}
975
976/*
977 * Common code for calling the ___lwp_mutex_timedlock() system call.
978 * Returns with mutex_owner and mutex_ownerpid set correctly.
979 */
980static int
981mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
982{
983	ulwp_t *self = curthread;
984	uberdata_t *udp = self->ul_uberdata;
985	int mtype = mp->mutex_type;
986	hrtime_t begin_sleep;
987	int acquired;
988	int error;
989
990	self->ul_sp = stkptr();
991	self->ul_wchan = mp;
992	if (__td_event_report(self, TD_SLEEP, udp)) {
993		self->ul_td_evbuf.eventnum = TD_SLEEP;
994		self->ul_td_evbuf.eventdata = mp;
995		tdb_event(TD_SLEEP, udp);
996	}
997	if (msp) {
998		tdb_incr(msp->mutex_sleep);
999		begin_sleep = gethrtime();
1000	}
1001
1002	DTRACE_PROBE1(plockstat, mutex__block, mp);
1003
1004	for (;;) {
1005		/*
1006		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1007		 * means we successfully acquired the lock.
1008		 */
1009		if ((error = ___lwp_mutex_timedlock(mp, tsp, self)) != 0 &&
1010		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1011			acquired = 0;
1012			break;
1013		}
1014
1015		if (mtype & USYNC_PROCESS) {
1016			/*
1017			 * Defend against forkall().  We may be the child,
1018			 * in which case we don't actually own the mutex.
1019			 */
1020			enter_critical(self);
1021			if (mp->mutex_ownerpid == udp->pid) {
1022				exit_critical(self);
1023				acquired = 1;
1024				break;
1025			}
1026			exit_critical(self);
1027		} else {
1028			acquired = 1;
1029			break;
1030		}
1031	}
1032
1033	if (msp)
1034		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1035	self->ul_wchan = NULL;
1036	self->ul_sp = 0;
1037
1038	if (acquired) {
1039		ASSERT(mp->mutex_owner == (uintptr_t)self);
1040		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1041		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1042	} else {
1043		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1044		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1045	}
1046
1047	return (error);
1048}
1049
1050/*
1051 * Common code for calling the ___lwp_mutex_trylock() system call.
1052 * Returns with mutex_owner and mutex_ownerpid set correctly.
1053 */
1054int
1055mutex_trylock_kernel(mutex_t *mp)
1056{
1057	ulwp_t *self = curthread;
1058	uberdata_t *udp = self->ul_uberdata;
1059	int mtype = mp->mutex_type;
1060	int error;
1061	int acquired;
1062
1063	for (;;) {
1064		/*
1065		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1066		 * means we successfully acquired the lock.
1067		 */
1068		if ((error = ___lwp_mutex_trylock(mp, self)) != 0 &&
1069		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1070			acquired = 0;
1071			break;
1072		}
1073
1074		if (mtype & USYNC_PROCESS) {
1075			/*
1076			 * Defend against forkall().  We may be the child,
1077			 * in which case we don't actually own the mutex.
1078			 */
1079			enter_critical(self);
1080			if (mp->mutex_ownerpid == udp->pid) {
1081				exit_critical(self);
1082				acquired = 1;
1083				break;
1084			}
1085			exit_critical(self);
1086		} else {
1087			acquired = 1;
1088			break;
1089		}
1090	}
1091
1092	if (acquired) {
1093		ASSERT(mp->mutex_owner == (uintptr_t)self);
1094		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1095	} else if (error != EBUSY) {
1096		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1097	}
1098
1099	return (error);
1100}
1101
1102volatile sc_shared_t *
1103setup_schedctl(void)
1104{
1105	ulwp_t *self = curthread;
1106	volatile sc_shared_t *scp;
1107	sc_shared_t *tmp;
1108
1109	if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
1110	    !self->ul_vfork &&			/* not a child of vfork() */
1111	    !self->ul_schedctl_called) {	/* haven't been called before */
1112		enter_critical(self);
1113		self->ul_schedctl_called = &self->ul_uberdata->uberflags;
1114		if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
1115			self->ul_schedctl = scp = tmp;
1116		exit_critical(self);
1117	}
1118	/*
1119	 * Unless the call to setup_schedctl() is surrounded
1120	 * by enter_critical()/exit_critical(), the address
1121	 * we are returning could be invalid due to a forkall()
1122	 * having occurred in another thread.
1123	 */
1124	return (scp);
1125}
1126
1127/*
1128 * Interfaces from libsched, incorporated into libc.
1129 * libsched.so.1 is now a filter library onto libc.
1130 */
1131#pragma weak schedctl_lookup = schedctl_init
1132schedctl_t *
1133schedctl_init(void)
1134{
1135	volatile sc_shared_t *scp = setup_schedctl();
1136	return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
1137}
1138
1139void
1140schedctl_exit(void)
1141{
1142}
1143
1144/*
1145 * Contract private interface for java.
1146 * Set up the schedctl data if it doesn't exist yet.
1147 * Return a pointer to the pointer to the schedctl data.
1148 */
1149volatile sc_shared_t *volatile *
1150_thr_schedctl(void)
1151{
1152	ulwp_t *self = curthread;
1153	volatile sc_shared_t *volatile *ptr;
1154
1155	if (self->ul_vfork)
1156		return (NULL);
1157	if (*(ptr = &self->ul_schedctl) == NULL)
1158		(void) setup_schedctl();
1159	return (ptr);
1160}
1161
1162/*
1163 * Block signals and attempt to block preemption.
1164 * no_preempt()/preempt() must be used in pairs but can be nested.
1165 */
1166void
1167no_preempt(ulwp_t *self)
1168{
1169	volatile sc_shared_t *scp;
1170
1171	if (self->ul_preempt++ == 0) {
1172		enter_critical(self);
1173		if ((scp = self->ul_schedctl) != NULL ||
1174		    (scp = setup_schedctl()) != NULL) {
1175			/*
1176			 * Save the pre-existing preempt value.
1177			 */
1178			self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
1179			scp->sc_preemptctl.sc_nopreempt = 1;
1180		}
1181	}
1182}
1183
1184/*
1185 * Undo the effects of no_preempt().
1186 */
1187void
1188preempt(ulwp_t *self)
1189{
1190	volatile sc_shared_t *scp;
1191
1192	ASSERT(self->ul_preempt > 0);
1193	if (--self->ul_preempt == 0) {
1194		if ((scp = self->ul_schedctl) != NULL) {
1195			/*
1196			 * Restore the pre-existing preempt value.
1197			 */
1198			scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
1199			if (scp->sc_preemptctl.sc_yield &&
1200			    scp->sc_preemptctl.sc_nopreempt == 0) {
1201				yield();
1202				if (scp->sc_preemptctl.sc_yield) {
1203					/*
1204					 * Shouldn't happen.  This is either
1205					 * a race condition or the thread
1206					 * just entered the real-time class.
1207					 */
1208					yield();
1209					scp->sc_preemptctl.sc_yield = 0;
1210				}
1211			}
1212		}
1213		exit_critical(self);
1214	}
1215}
1216
1217/*
1218 * If a call to preempt() would cause the current thread to yield or to
1219 * take deferred actions in exit_critical(), then unpark the specified
1220 * lwp so it can run while we delay.  Return the original lwpid if the
1221 * unpark was not performed, else return zero.  The tests are a repeat
1222 * of some of the tests in preempt(), above.  This is a statistical
1223 * optimization solely for cond_sleep_queue(), below.
1224 */
1225static lwpid_t
1226preempt_unpark(ulwp_t *self, lwpid_t lwpid)
1227{
1228	volatile sc_shared_t *scp = self->ul_schedctl;
1229
1230	ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
1231	if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
1232	    (self->ul_curplease && self->ul_critical == 1)) {
1233		(void) __lwp_unpark(lwpid);
1234		lwpid = 0;
1235	}
1236	return (lwpid);
1237}
1238
1239/*
1240 * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1241 * If this fails, return EBUSY and let the caller deal with it.
1242 * If this succeeds, return 0 with mutex_owner set to curthread.
1243 */
1244static int
1245mutex_trylock_adaptive(mutex_t *mp, int tryhard)
1246{
1247	ulwp_t *self = curthread;
1248	int error = EBUSY;
1249	ulwp_t *ulwp;
1250	volatile sc_shared_t *scp;
1251	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
1252	volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
1253	uint32_t new_lockword;
1254	int count = 0;
1255	int max_count;
1256	uint8_t max_spinners;
1257
1258	ASSERT(!(mp->mutex_type & USYNC_PROCESS));
1259
1260	if (MUTEX_OWNED(mp, self))
1261		return (EBUSY);
1262
1263	enter_critical(self);
1264
1265	/* short-cut, not definitive (see below) */
1266	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1267		ASSERT(mp->mutex_type & LOCK_ROBUST);
1268		error = ENOTRECOVERABLE;
1269		goto done;
1270	}
1271
1272	/*
1273	 * Make one attempt to acquire the lock before
1274	 * incurring the overhead of the spin loop.
1275	 */
1276	if (set_lock_byte(lockp) == 0) {
1277		*ownerp = (uintptr_t)self;
1278		error = 0;
1279		goto done;
1280	}
1281	if (!tryhard)
1282		goto done;
1283	if (ncpus == 0)
1284		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1285	if ((max_spinners = self->ul_max_spinners) >= ncpus)
1286		max_spinners = ncpus - 1;
1287	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1288	if (max_count == 0)
1289		goto done;
1290
1291	/*
1292	 * This spin loop is unfair to lwps that have already dropped into
1293	 * the kernel to sleep.  They will starve on a highly-contended mutex.
1294	 * This is just too bad.  The adaptive spin algorithm is intended
1295	 * to allow programs with highly-contended locks (that is, broken
1296	 * programs) to execute with reasonable speed despite their contention.
1297	 * Being fair would reduce the speed of such programs and well-written
1298	 * programs will not suffer in any case.
1299	 */
1300	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1301		goto done;
1302	DTRACE_PROBE1(plockstat, mutex__spin, mp);
1303	for (count = 1; ; count++) {
1304		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
1305			*ownerp = (uintptr_t)self;
1306			error = 0;
1307			break;
1308		}
1309		if (count == max_count)
1310			break;
1311		SMT_PAUSE();
1312		/*
1313		 * Stop spinning if the mutex owner is not running on
1314		 * a processor; it will not drop the lock any time soon
1315		 * and we would just be wasting time to keep spinning.
1316		 *
1317		 * Note that we are looking at another thread (ulwp_t)
1318		 * without ensuring that the other thread does not exit.
1319		 * The scheme relies on ulwp_t structures never being
1320		 * deallocated by the library (the library employs a free
1321		 * list of ulwp_t structs that are reused when new threads
1322		 * are created) and on schedctl shared memory never being
1323		 * deallocated once created via __schedctl().
1324		 *
1325		 * Thus, the worst that can happen when the spinning thread
1326		 * looks at the owner's schedctl data is that it is looking
1327		 * at some other thread's schedctl data.  This almost never
1328		 * happens and is benign when it does.
1329		 */
1330		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1331		    ((scp = ulwp->ul_schedctl) == NULL ||
1332		    scp->sc_state != SC_ONPROC))
1333			break;
1334	}
1335	new_lockword = spinners_decr(&mp->mutex_lockword);
1336	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1337		/*
1338		 * We haven't yet acquired the lock, the lock
1339		 * is free, and there are no other spinners.
1340		 * Make one final attempt to acquire the lock.
1341		 *
1342		 * This isn't strictly necessary since mutex_lock_queue()
1343		 * (the next action this thread will take if it doesn't
1344		 * acquire the lock here) makes one attempt to acquire
1345		 * the lock before putting the thread to sleep.
1346		 *
1347		 * If the next action for this thread (on failure here)
1348		 * were not to call mutex_lock_queue(), this would be
1349		 * necessary for correctness, to avoid ending up with an
1350		 * unheld mutex with waiters but no one to wake them up.
1351		 */
1352		if (set_lock_byte(lockp) == 0) {
1353			*ownerp = (uintptr_t)self;
1354			error = 0;
1355		}
1356		count++;
1357	}
1358
1359done:
1360	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1361		ASSERT(mp->mutex_type & LOCK_ROBUST);
1362		/*
1363		 * We shouldn't own the mutex.
1364		 * Just clear the lock; everyone has already been waked up.
1365		 */
1366		*ownerp = 0;
1367		(void) clear_lockbyte(&mp->mutex_lockword);
1368		error = ENOTRECOVERABLE;
1369	}
1370
1371	exit_critical(self);
1372
1373	if (error) {
1374		if (count) {
1375			DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1376		}
1377		if (error != EBUSY) {
1378			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1379		}
1380	} else {
1381		if (count) {
1382			DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1383		}
1384		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1385		if (mp->mutex_flag & LOCK_OWNERDEAD) {
1386			ASSERT(mp->mutex_type & LOCK_ROBUST);
1387			error = EOWNERDEAD;
1388		}
1389	}
1390
1391	return (error);
1392}
1393
1394/*
1395 * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1396 * The owner field is not set here; the caller (spin_lock_set()) sets it.
1397 */
1398static int
1399mutex_queuelock_adaptive(mutex_t *mp)
1400{
1401	ulwp_t *ulwp;
1402	volatile sc_shared_t *scp;
1403	volatile uint8_t *lockp;
1404	volatile uint64_t *ownerp;
1405	int count = curthread->ul_queue_spin;
1406
1407	ASSERT(mp->mutex_type == USYNC_THREAD);
1408
1409	if (count == 0)
1410		return (EBUSY);
1411
1412	lockp = (volatile uint8_t *)&mp->mutex_lockw;
1413	ownerp = (volatile uint64_t *)&mp->mutex_owner;
1414	while (--count >= 0) {
1415		if (*lockp == 0 && set_lock_byte(lockp) == 0)
1416			return (0);
1417		SMT_PAUSE();
1418		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1419		    ((scp = ulwp->ul_schedctl) == NULL ||
1420		    scp->sc_state != SC_ONPROC))
1421			break;
1422	}
1423
1424	return (EBUSY);
1425}
1426
1427/*
1428 * Like mutex_trylock_adaptive(), but for process-shared mutexes.
1429 * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1430 * If this fails, return EBUSY and let the caller deal with it.
1431 * If this succeeds, return 0 with mutex_owner set to curthread
1432 * and mutex_ownerpid set to the current pid.
1433 */
1434static int
1435mutex_trylock_process(mutex_t *mp, int tryhard)
1436{
1437	ulwp_t *self = curthread;
1438	uberdata_t *udp = self->ul_uberdata;
1439	int error = EBUSY;
1440	volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
1441	uint32_t new_lockword;
1442	int count = 0;
1443	int max_count;
1444	uint8_t max_spinners;
1445
1446#if defined(__sparc) && !defined(_LP64)
1447	/* horrible hack, necessary only on 32-bit sparc */
1448	int fix_alignment_problem =
1449	    (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1450	    self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST));
1451#endif
1452
1453	ASSERT(mp->mutex_type & USYNC_PROCESS);
1454
1455	if (shared_mutex_held(mp))
1456		return (EBUSY);
1457
1458	enter_critical(self);
1459
1460	/* short-cut, not definitive (see below) */
1461	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1462		ASSERT(mp->mutex_type & LOCK_ROBUST);
1463		error = ENOTRECOVERABLE;
1464		goto done;
1465	}
1466
1467	/*
1468	 * Make one attempt to acquire the lock before
1469	 * incurring the overhead of the spin loop.
1470	 */
1471#if defined(__sparc) && !defined(_LP64)
1472	/* horrible hack, necessary only on 32-bit sparc */
1473	if (fix_alignment_problem) {
1474		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1475			mp->mutex_ownerpid = udp->pid;
1476			mp->mutex_owner = (uintptr_t)self;
1477			error = 0;
1478			goto done;
1479		}
1480	} else
1481#endif
1482	if (set_lock_byte64(lockp, udp->pid) == 0) {
1483		mp->mutex_owner = (uintptr_t)self;
1484		/* mp->mutex_ownerpid was set by set_lock_byte64() */
1485		error = 0;
1486		goto done;
1487	}
1488	if (!tryhard)
1489		goto done;
1490	if (ncpus == 0)
1491		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1492	if ((max_spinners = self->ul_max_spinners) >= ncpus)
1493		max_spinners = ncpus - 1;
1494	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1495	if (max_count == 0)
1496		goto done;
1497
1498	/*
1499	 * This is a process-shared mutex.
1500	 * We cannot know if the owner is running on a processor.
1501	 * We just spin and hope that it is on a processor.
1502	 */
1503	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1504		goto done;
1505	DTRACE_PROBE1(plockstat, mutex__spin, mp);
1506	for (count = 1; ; count++) {
1507#if defined(__sparc) && !defined(_LP64)
1508		/* horrible hack, necessary only on 32-bit sparc */
1509		if (fix_alignment_problem) {
1510			if ((*lockp & LOCKMASK64) == 0 &&
1511			    set_lock_byte(&mp->mutex_lockw) == 0) {
1512				mp->mutex_ownerpid = udp->pid;
1513				mp->mutex_owner = (uintptr_t)self;
1514				error = 0;
1515				break;
1516			}
1517		} else
1518#endif
1519		if ((*lockp & LOCKMASK64) == 0 &&
1520		    set_lock_byte64(lockp, udp->pid) == 0) {
1521			mp->mutex_owner = (uintptr_t)self;
1522			/* mp->mutex_ownerpid was set by set_lock_byte64() */
1523			error = 0;
1524			break;
1525		}
1526		if (count == max_count)
1527			break;
1528		SMT_PAUSE();
1529	}
1530	new_lockword = spinners_decr(&mp->mutex_lockword);
1531	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1532		/*
1533		 * We haven't yet acquired the lock, the lock
1534		 * is free, and there are no other spinners.
1535		 * Make one final attempt to acquire the lock.
1536		 *
1537		 * This isn't strictly necessary since mutex_lock_kernel()
1538		 * (the next action this thread will take if it doesn't
1539		 * acquire the lock here) makes one attempt to acquire
1540		 * the lock before putting the thread to sleep.
1541		 *
1542		 * If the next action for this thread (on failure here)
1543		 * were not to call mutex_lock_kernel(), this would be
1544		 * necessary for correctness, to avoid ending up with an
1545		 * unheld mutex with waiters but no one to wake them up.
1546		 */
1547#if defined(__sparc) && !defined(_LP64)
1548		/* horrible hack, necessary only on 32-bit sparc */
1549		if (fix_alignment_problem) {
1550			if (set_lock_byte(&mp->mutex_lockw) == 0) {
1551				mp->mutex_ownerpid = udp->pid;
1552				mp->mutex_owner = (uintptr_t)self;
1553				error = 0;
1554			}
1555		} else
1556#endif
1557		if (set_lock_byte64(lockp, udp->pid) == 0) {
1558			mp->mutex_owner = (uintptr_t)self;
1559			/* mp->mutex_ownerpid was set by set_lock_byte64() */
1560			error = 0;
1561		}
1562		count++;
1563	}
1564
1565done:
1566	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1567		ASSERT(mp->mutex_type & LOCK_ROBUST);
1568		/*
1569		 * We shouldn't own the mutex.
1570		 * Just clear the lock; everyone has already been waked up.
1571		 */
1572		mp->mutex_owner = 0;
1573		/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1574		(void) clear_lockbyte64(&mp->mutex_lockword64);
1575		error = ENOTRECOVERABLE;
1576	}
1577
1578	exit_critical(self);
1579
1580	if (error) {
1581		if (count) {
1582			DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1583		}
1584		if (error != EBUSY) {
1585			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1586		}
1587	} else {
1588		if (count) {
1589			DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1590		}
1591		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1592		if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1593			ASSERT(mp->mutex_type & LOCK_ROBUST);
1594			if (mp->mutex_flag & LOCK_OWNERDEAD)
1595				error = EOWNERDEAD;
1596			else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
1597				error = ELOCKUNMAPPED;
1598			else
1599				error = EOWNERDEAD;
1600		}
1601	}
1602
1603	return (error);
1604}
1605
1606/*
1607 * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1608 * Returns the lwpid of the thread that was dequeued, if any.
1609 * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1610 * to wake up the specified lwp.
1611 */
1612static lwpid_t
1613mutex_wakeup(mutex_t *mp)
1614{
1615	lwpid_t lwpid = 0;
1616	int more;
1617	queue_head_t *qp;
1618	ulwp_t *ulwp;
1619
1620	/*
1621	 * Dequeue a waiter from the sleep queue.  Don't touch the mutex
1622	 * waiters bit if no one was found on the queue because the mutex
1623	 * might have been deallocated or reallocated for another purpose.
1624	 */
1625	qp = queue_lock(mp, MX);
1626	if ((ulwp = dequeue(qp, &more)) != NULL) {
1627		lwpid = ulwp->ul_lwpid;
1628		mp->mutex_waiters = more;
1629	}
1630	queue_unlock(qp);
1631	return (lwpid);
1632}
1633
1634/*
1635 * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
1636 */
1637static void
1638mutex_wakeup_all(mutex_t *mp)
1639{
1640	queue_head_t *qp;
1641	queue_root_t *qrp;
1642	int nlwpid = 0;
1643	int maxlwps = MAXLWPS;
1644	ulwp_t *ulwp;
1645	lwpid_t buffer[MAXLWPS];
1646	lwpid_t *lwpid = buffer;
1647
1648	/*
1649	 * Walk the list of waiters and prepare to wake up all of them.
1650	 * The waiters flag has already been cleared from the mutex.
1651	 *
1652	 * We keep track of lwpids that are to be unparked in lwpid[].
1653	 * __lwp_unpark_all() is called to unpark all of them after
1654	 * they have been removed from the sleep queue and the sleep
1655	 * queue lock has been dropped.  If we run out of space in our
1656	 * on-stack buffer, we need to allocate more but we can't call
1657	 * lmalloc() because we are holding a queue lock when the overflow
1658	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
1659	 * either because the application may have allocated a small
1660	 * stack and we don't want to overrun the stack.  So we call
1661	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
1662	 * system call directly since that path acquires no locks.
1663	 */
1664	qp = queue_lock(mp, MX);
1665	for (;;) {
1666		if ((qrp = qp->qh_root) == NULL ||
1667		    (ulwp = qrp->qr_head) == NULL)
1668			break;
1669		ASSERT(ulwp->ul_wchan == mp);
1670		queue_unlink(qp, &qrp->qr_head, NULL);
1671		ulwp->ul_sleepq = NULL;
1672		ulwp->ul_wchan = NULL;
1673		if (nlwpid == maxlwps)
1674			lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
1675		lwpid[nlwpid++] = ulwp->ul_lwpid;
1676	}
1677
1678	if (nlwpid == 0) {
1679		queue_unlock(qp);
1680	} else {
1681		mp->mutex_waiters = 0;
1682		no_preempt(curthread);
1683		queue_unlock(qp);
1684		if (nlwpid == 1)
1685			(void) __lwp_unpark(lwpid[0]);
1686		else
1687			(void) __lwp_unpark_all(lwpid, nlwpid);
1688		preempt(curthread);
1689	}
1690
1691	if (lwpid != buffer)
1692		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
1693}
1694
1695/*
1696 * Release a process-private mutex.
1697 * As an optimization, if there are waiters but there are also spinners
1698 * attempting to acquire the mutex, then don't bother waking up a waiter;
1699 * one of the spinners will acquire the mutex soon and it would be a waste
1700 * of resources to wake up some thread just to have it spin for a while
1701 * and then possibly go back to sleep.  See mutex_trylock_adaptive().
1702 */
1703static lwpid_t
1704mutex_unlock_queue(mutex_t *mp, int release_all)
1705{
1706	ulwp_t *self = curthread;
1707	lwpid_t lwpid = 0;
1708	uint32_t old_lockword;
1709
1710	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1711	sigoff(self);
1712	mp->mutex_owner = 0;
1713	old_lockword = clear_lockbyte(&mp->mutex_lockword);
1714	if ((old_lockword & WAITERMASK) &&
1715	    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1716		no_preempt(self);	/* ensure a prompt wakeup */
1717		if (release_all)
1718			mutex_wakeup_all(mp);
1719		else
1720			lwpid = mutex_wakeup(mp);
1721		if (lwpid == 0)
1722			preempt(self);
1723	}
1724	sigon(self);
1725	return (lwpid);
1726}
1727
1728/*
1729 * Like mutex_unlock_queue(), but for process-shared mutexes.
1730 */
1731static void
1732mutex_unlock_process(mutex_t *mp, int release_all)
1733{
1734	ulwp_t *self = curthread;
1735	uint64_t old_lockword64;
1736
1737	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1738	sigoff(self);
1739	mp->mutex_owner = 0;
1740#if defined(__sparc) && !defined(_LP64)
1741	/* horrible hack, necessary only on 32-bit sparc */
1742	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1743	    self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST)) {
1744		uint32_t old_lockword;
1745		mp->mutex_ownerpid = 0;
1746		old_lockword = clear_lockbyte(&mp->mutex_lockword);
1747		if ((old_lockword & WAITERMASK) &&
1748		    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1749			no_preempt(self);	/* ensure a prompt wakeup */
1750			(void) ___lwp_mutex_wakeup(mp, release_all);
1751			preempt(self);
1752		}
1753		sigon(self);
1754		return;
1755	}
1756#endif
1757	/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1758	old_lockword64 = clear_lockbyte64(&mp->mutex_lockword64);
1759	if ((old_lockword64 & WAITERMASK64) &&
1760	    (release_all || (old_lockword64 & SPINNERMASK64) == 0)) {
1761		no_preempt(self);	/* ensure a prompt wakeup */
1762		(void) ___lwp_mutex_wakeup(mp, release_all);
1763		preempt(self);
1764	}
1765	sigon(self);
1766}
1767
1768void
1769stall(void)
1770{
1771	for (;;)
1772		(void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
1773}
1774
1775/*
1776 * Acquire a USYNC_THREAD mutex via user-level sleep queues.
1777 * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
1778 * If successful, returns with mutex_owner set correctly.
1779 */
1780int
1781mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
1782	timespec_t *tsp)
1783{
1784	uberdata_t *udp = curthread->ul_uberdata;
1785	queue_head_t *qp;
1786	hrtime_t begin_sleep;
1787	int error = 0;
1788
1789	self->ul_sp = stkptr();
1790	if (__td_event_report(self, TD_SLEEP, udp)) {
1791		self->ul_wchan = mp;
1792		self->ul_td_evbuf.eventnum = TD_SLEEP;
1793		self->ul_td_evbuf.eventdata = mp;
1794		tdb_event(TD_SLEEP, udp);
1795	}
1796	if (msp) {
1797		tdb_incr(msp->mutex_sleep);
1798		begin_sleep = gethrtime();
1799	}
1800
1801	DTRACE_PROBE1(plockstat, mutex__block, mp);
1802
1803	/*
1804	 * Put ourself on the sleep queue, and while we are
1805	 * unable to grab the lock, go park in the kernel.
1806	 * Take ourself off the sleep queue after we acquire the lock.
1807	 * The waiter bit can be set/cleared only while holding the queue lock.
1808	 */
1809	qp = queue_lock(mp, MX);
1810	enqueue(qp, self, 0);
1811	mp->mutex_waiters = 1;
1812	for (;;) {
1813		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1814			mp->mutex_owner = (uintptr_t)self;
1815			mp->mutex_waiters = dequeue_self(qp);
1816			break;
1817		}
1818		set_parking_flag(self, 1);
1819		queue_unlock(qp);
1820		/*
1821		 * __lwp_park() will return the residual time in tsp
1822		 * if we are unparked before the timeout expires.
1823		 */
1824		error = __lwp_park(tsp, 0);
1825		set_parking_flag(self, 0);
1826		/*
1827		 * We could have taken a signal or suspended ourself.
1828		 * If we did, then we removed ourself from the queue.
1829		 * Someone else may have removed us from the queue
1830		 * as a consequence of mutex_unlock().  We may have
1831		 * gotten a timeout from __lwp_park().  Or we may still
1832		 * be on the queue and this is just a spurious wakeup.
1833		 */
1834		qp = queue_lock(mp, MX);
1835		if (self->ul_sleepq == NULL) {
1836			if (error) {
1837				mp->mutex_waiters = queue_waiter(qp)? 1 : 0;
1838				if (error != EINTR)
1839					break;
1840				error = 0;
1841			}
1842			if (set_lock_byte(&mp->mutex_lockw) == 0) {
1843				mp->mutex_owner = (uintptr_t)self;
1844				break;
1845			}
1846			enqueue(qp, self, 0);
1847			mp->mutex_waiters = 1;
1848		}
1849		ASSERT(self->ul_sleepq == qp &&
1850		    self->ul_qtype == MX &&
1851		    self->ul_wchan == mp);
1852		if (error) {
1853			if (error != EINTR) {
1854				mp->mutex_waiters = dequeue_self(qp);
1855				break;
1856			}
1857			error = 0;
1858		}
1859	}
1860	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
1861	    self->ul_wchan == NULL);
1862	self->ul_sp = 0;
1863
1864	ASSERT(error == 0 || error == EINVAL || error == ETIME);
1865
1866	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1867		ASSERT(mp->mutex_type & LOCK_ROBUST);
1868		/*
1869		 * We shouldn't own the mutex.
1870		 * Just clear the lock; everyone has already been waked up.
1871		 */
1872		mp->mutex_owner = 0;
1873		(void) clear_lockbyte(&mp->mutex_lockword);
1874		error = ENOTRECOVERABLE;
1875	}
1876
1877	queue_unlock(qp);
1878
1879	if (msp)
1880		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1881
1882	if (error) {
1883		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1884		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1885	} else {
1886		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1887		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1888		if (mp->mutex_flag & LOCK_OWNERDEAD) {
1889			ASSERT(mp->mutex_type & LOCK_ROBUST);
1890			error = EOWNERDEAD;
1891		}
1892	}
1893
1894	return (error);
1895}
1896
1897static int
1898mutex_recursion(mutex_t *mp, int mtype, int try)
1899{
1900	ASSERT(mutex_held(mp));
1901	ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
1902	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1903
1904	if (mtype & LOCK_RECURSIVE) {
1905		if (mp->mutex_rcount == RECURSION_MAX) {
1906			DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
1907			return (EAGAIN);
1908		}
1909		mp->mutex_rcount++;
1910		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1911		return (0);
1912	}
1913	if (try == MUTEX_LOCK) {
1914		DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1915		return (EDEADLK);
1916	}
1917	return (EBUSY);
1918}
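
/*
 * Behavior as seen by an application, for reference.  A minimal sketch
 * (hypothetical caller, not part of libc), assuming the standard
 * pthread_mutexattr_settype() interface:
 *
 *	pthread_mutexattr_t a;
 *	pthread_mutex_t m;
 *
 *	(void) pthread_mutexattr_init(&a);
 *	(void) pthread_mutexattr_settype(&a, PTHREAD_MUTEX_RECURSIVE);
 *	(void) pthread_mutex_init(&m, &a);
 *	(void) pthread_mutex_lock(&m);
 *	(void) pthread_mutex_lock(&m);
 *	(void) pthread_mutex_unlock(&m);
 *	(void) pthread_mutex_unlock(&m);
 *
 * The second lock only increments mutex_rcount and the first unlock
 * only decrements it; the mutex is actually released by the final
 * unlock.  With PTHREAD_MUTEX_ERRORCHECK instead, the second lock
 * returns EDEADLK and pthread_mutex_trylock() on a mutex we already
 * own returns EBUSY, matching the mutex_recursion() logic above.
 */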
1919
1920/*
1921 * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
1922 * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
1923 * We use tdb_hash_lock here and in the synch object tracking code in
1924 * the tdb_agent.c file.  There is no conflict between these two usages.
1925 */
1926void
1927register_lock(mutex_t *mp)
1928{
1929	uberdata_t *udp = curthread->ul_uberdata;
1930	uint_t hash = LOCK_HASH(mp);
1931	robust_t *rlp;
1932	robust_t *invalid;
1933	robust_t **rlpp;
1934	robust_t **table;
1935
1936	if ((table = udp->robustlocks) == NULL) {
1937		lmutex_lock(&udp->tdb_hash_lock);
1938		if ((table = udp->robustlocks) == NULL) {
1939			table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
1940			membar_producer();
1941			udp->robustlocks = table;
1942		}
1943		lmutex_unlock(&udp->tdb_hash_lock);
1944	}
1945	membar_consumer();
1946
1947	/*
1948	 * First search the registered table with no locks held.
1949	 * This is safe because the table never shrinks
1950	 * and we can only get a false negative.
1951	 */
1952	for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
1953		if (rlp->robust_lock == mp)	/* already registered */
1954			return;
1955	}
1956
1957	/*
1958	 * The lock was not found.
1959	 * Repeat the operation with tdb_hash_lock held.
1960	 */
1961	lmutex_lock(&udp->tdb_hash_lock);
1962
1963	invalid = NULL;
1964	for (rlpp = &table[hash];
1965	    (rlp = *rlpp) != NULL;
1966	    rlpp = &rlp->robust_next) {
1967		if (rlp->robust_lock == mp) {	/* already registered */
1968			lmutex_unlock(&udp->tdb_hash_lock);
1969			return;
1970		}
1971		/* remember the first invalid entry, if any */
1972		if (rlp->robust_lock == INVALID_ADDR && invalid == NULL)
1973			invalid = rlp;
1974	}
1975
1976	/*
1977	 * The lock has never been registered.
1978	 * Add it to the table and register it now.
1979	 */
1980	if ((rlp = invalid) != NULL) {
1981		/*
1982		 * Reuse the invalid entry we found above.
1983		 * The linkages are still correct.
1984		 */
1985		rlp->robust_lock = mp;
1986		membar_producer();
1987	} else {
1988		/*
1989		 * Allocate a new entry and add it to
1990		 * the hash table and to the global list.
1991		 */
1992		rlp = lmalloc(sizeof (*rlp));
1993		rlp->robust_lock = mp;
1994		rlp->robust_next = NULL;
1995		rlp->robust_list = udp->robustlist;
1996		udp->robustlist = rlp;
1997		membar_producer();
1998		*rlpp = rlp;
1999	}
2000
2001	lmutex_unlock(&udp->tdb_hash_lock);
2002
2003	(void) ___lwp_mutex_register(mp, &rlp->robust_lock);
2004}
2005
2006/*
2007 * This is called in the child of fork()/forkall() to start over
2008 * with a clean slate.  (Each process must register its own locks.)
2009 * No locks are needed because all other threads are suspended or gone.
2010 */
2011void
2012unregister_locks(void)
2013{
2014	uberdata_t *udp = curthread->ul_uberdata;
2015	robust_t **table;
2016	robust_t *rlp;
2017	robust_t *next;
2018
2019	/*
2020	 * Do this first, before calling lfree().
2021	 */
2022	table = udp->robustlocks;
2023	udp->robustlocks = NULL;
2024	rlp = udp->robustlist;
2025	udp->robustlist = NULL;
2026
2027	/*
2028	 * Do this by traversing the global list, not the hash table.
2029	 */
2030	while (rlp != NULL) {
2031		next = rlp->robust_list;
2032		lfree(rlp, sizeof (*rlp));
2033		rlp = next;
2034	}
2035	if (table != NULL)
2036		lfree(table, LOCKHASHSZ * sizeof (robust_t *));
2037}
2038
2039/*
2040 * Returns with mutex_owner set correctly.
2041 */
2042int
2043mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
2044{
2045	ulwp_t *self = curthread;
2046	uberdata_t *udp = self->ul_uberdata;
2047	int mtype = mp->mutex_type;
2048	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2049	int error = 0;
2050	int noceil = try & MUTEX_NOCEIL;
2051	uint8_t ceil;
2052	int myprio;
2053
2054	try &= ~MUTEX_NOCEIL;
2055	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
2056
2057	if (!self->ul_schedctl_called)
2058		(void) setup_schedctl();
2059
2060	if (msp && try == MUTEX_TRY)
2061		tdb_incr(msp->mutex_try);
2062
2063	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_held(mp))
2064		return (mutex_recursion(mp, mtype, try));
2065
2066	if (self->ul_error_detection && try == MUTEX_LOCK &&
2067	    tsp == NULL && mutex_held(mp))
2068		lock_error(mp, "mutex_lock", NULL, NULL);
2069
2070	if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2071		update_sched(self);
2072		if (self->ul_cid != self->ul_rtclassid) {
2073			DTRACE_PROBE2(plockstat, mutex__error, mp, EPERM);
2074			return (EPERM);
2075		}
2076		ceil = mp->mutex_ceiling;
2077		myprio = self->ul_epri? self->ul_epri : self->ul_pri;
2078		if (myprio > ceil) {
2079			DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
2080			return (EINVAL);
2081		}
2082		if ((error = _ceil_mylist_add(mp)) != 0) {
2083			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
2084			return (error);
2085		}
2086		if (myprio < ceil)
2087			_ceil_prio_inherit(ceil);
2088	}
2089
2090	if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
2091	    == (USYNC_PROCESS | LOCK_ROBUST))
2092		register_lock(mp);
2093
2094	if (mtype & LOCK_PRIO_INHERIT) {
2095		/* go straight to the kernel */
2096		if (try == MUTEX_TRY)
2097			error = mutex_trylock_kernel(mp);
2098		else	/* MUTEX_LOCK */
2099			error = mutex_lock_kernel(mp, tsp, msp);
2100		/*
2101		 * The kernel never sets or clears the lock byte
2102		 * for LOCK_PRIO_INHERIT mutexes.
2103		 * Set it here for consistency.
2104		 */
2105		switch (error) {
2106		case 0:
2107			self->ul_pilocks++;
2108			mp->mutex_lockw = LOCKSET;
2109			break;
2110		case EOWNERDEAD:
2111		case ELOCKUNMAPPED:
2112			self->ul_pilocks++;
2113			mp->mutex_lockw = LOCKSET;
2114			/* FALLTHROUGH */
2115		case ENOTRECOVERABLE:
2116			ASSERT(mtype & LOCK_ROBUST);
2117			break;
2118		case EDEADLK:
2119			if (try == MUTEX_TRY) {
2120				error = EBUSY;
2121			} else if (tsp != NULL) {	/* simulate a timeout */
2122				/*
2123				 * Note: mutex_timedlock() never returns EINTR.
2124				 */
2125				timespec_t ts = *tsp;
2126				timespec_t rts;
2127
2128				while (__nanosleep(&ts, &rts) == EINTR)
2129					ts = rts;
2130				error = ETIME;
2131			} else {		/* simulate a deadlock */
2132				stall();
2133			}
2134			break;
2135		}
2136	} else if (mtype & USYNC_PROCESS) {
2137		error = mutex_trylock_process(mp, try == MUTEX_LOCK);
2138		if (error == EBUSY && try == MUTEX_LOCK)
2139			error = mutex_lock_kernel(mp, tsp, msp);
2140	} else {	/* USYNC_THREAD */
2141		error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
2142		if (error == EBUSY && try == MUTEX_LOCK)
2143			error = mutex_lock_queue(self, msp, mp, tsp);
2144	}
2145
2146	switch (error) {
2147	case 0:
2148	case EOWNERDEAD:
2149	case ELOCKUNMAPPED:
2150		if (mtype & LOCK_ROBUST)
2151			remember_lock(mp);
2152		if (msp)
2153			record_begin_hold(msp);
2154		break;
2155	default:
2156		if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2157			(void) _ceil_mylist_del(mp);
2158			if (myprio < ceil)
2159				_ceil_prio_waive();
2160		}
2161		if (try == MUTEX_TRY) {
2162			if (msp)
2163				tdb_incr(msp->mutex_try_fail);
2164			if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2165				self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2166				tdb_event(TD_LOCK_TRY, udp);
2167			}
2168		}
2169		break;
2170	}
2171
2172	return (error);
2173}
2174
2175int
2176fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
2177{
2178	ulwp_t *self = curthread;
2179	uberdata_t *udp = self->ul_uberdata;
2180
2181	/*
2182	 * We know that USYNC_PROCESS is set in mtype and that
2183	 * zero, one, or both of the flags LOCK_RECURSIVE and
2184	 * LOCK_ERRORCHECK are set, and that no other flags are set.
2185	 */
2186	ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
2187	enter_critical(self);
2188#if defined(__sparc) && !defined(_LP64)
2189	/* horrible hack, necessary only on 32-bit sparc */
2190	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2191	    self->ul_misaligned) {
2192		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2193			mp->mutex_ownerpid = udp->pid;
2194			mp->mutex_owner = (uintptr_t)self;
2195			exit_critical(self);
2196			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2197			return (0);
2198		}
2199	} else
2200#endif
2201	if (set_lock_byte64(&mp->mutex_lockword64, udp->pid) == 0) {
2202		mp->mutex_owner = (uintptr_t)self;
2203		/* mp->mutex_ownerpid was set by set_lock_byte64() */
2204		exit_critical(self);
2205		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2206		return (0);
2207	}
2208	exit_critical(self);
2209
2210	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
2211		return (mutex_recursion(mp, mtype, try));
2212
2213	if (try == MUTEX_LOCK) {
2214		if (mutex_trylock_process(mp, 1) == 0)
2215			return (0);
2216		return (mutex_lock_kernel(mp, tsp, NULL));
2217	}
2218
2219	if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2220		self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2221		tdb_event(TD_LOCK_TRY, udp);
2222	}
2223	return (EBUSY);
2224}
2225
2226static int
2227mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
2228{
2229	ulwp_t *self = curthread;
2230	int mtype = mp->mutex_type;
2231	uberflags_t *gflags;
2232
2233	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2234	    self->ul_error_detection && self->ul_misaligned == 0)
2235		lock_error(mp, "mutex_lock", NULL, "mutex is misaligned");
2236
2237	/*
2238	 * Optimize the case of USYNC_THREAD, including
2239	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2240	 * no error detection, no lock statistics,
2241	 * and the process has only a single thread.
2242	 * (Most likely a traditional single-threaded application.)
2243	 */
2244	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2245	    self->ul_uberdata->uberflags.uf_all) == 0) {
2246		/*
2247		 * Only one thread exists so we don't need an atomic operation.
2248		 * We do, however, need to protect against signals.
2249		 */
2250		if (mp->mutex_lockw == 0) {
2251			sigoff(self);
2252			mp->mutex_lockw = LOCKSET;
2253			mp->mutex_owner = (uintptr_t)self;
2254			sigon(self);
2255			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2256			return (0);
2257		}
2258		if (mtype && MUTEX_OWNER(mp) == self)
2259			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2260		/*
2261		 * We have reached a deadlock, probably because the
2262		 * process is executing non-async-signal-safe code in
2263		 * a signal handler and is attempting to acquire a lock
2264		 * that it already owns.  This is not surprising, given
2265		 * bad programming practices over the years that have
2266		 * resulted in applications calling printf() and such
2267		 * in their signal handlers.  Unless the user has told
2268		 * us that the signal handlers are safe by setting:
2269		 *	export _THREAD_ASYNC_SAFE=1
2270		 * we return EDEADLK rather than actually deadlocking.
2271		 */
2272		if (tsp == NULL &&
2273		    MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
2274			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
2275			return (EDEADLK);
2276		}
2277	}
2278
2279	/*
2280	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2281	 * no error detection, and no lock statistics.
2282	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2283	 */
2284	if ((gflags = self->ul_schedctl_called) != NULL &&
2285	    (gflags->uf_trs_ted |
2286	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2287		if (mtype & USYNC_PROCESS)
2288			return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
2289		sigoff(self);
2290		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2291			mp->mutex_owner = (uintptr_t)self;
2292			sigon(self);
2293			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2294			return (0);
2295		}
2296		sigon(self);
2297		if (mtype && MUTEX_OWNER(mp) == self)
2298			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2299		if (mutex_trylock_adaptive(mp, 1) != 0)
2300			return (mutex_lock_queue(self, NULL, mp, tsp));
2301		return (0);
2302	}
2303
2304	/* else do it the long way */
2305	return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
2306}
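
/*
 * To illustrate the EDEADLK behavior described in the single-threaded
 * fast path above, consider this hypothetical application sequence
 * (not from this library), using a default mutex in a process with
 * only one thread:
 *
 *	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
 *
 *	(void) pthread_mutex_lock(&m);
 *	... a signal arrives and its handler, unsafely, calls ...
 *	error = pthread_mutex_lock(&m);
 *
 * By default the inner call returns EDEADLK rather than blocking
 * forever.  Setting _THREAD_ASYNC_SAFE=1 in the environment restores
 * the historical behavior: the inner call simply deadlocks, which is
 * what POSIX permits for a default mutex.
 */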
2307
2308#pragma weak pthread_mutex_lock = mutex_lock
2309#pragma weak _mutex_lock = mutex_lock
2310int
2311mutex_lock(mutex_t *mp)
2312{
2313	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2314	return (mutex_lock_impl(mp, NULL));
2315}
2316
2317int
2318pthread_mutex_timedlock(pthread_mutex_t *_RESTRICT_KYWD mp,
2319	const struct timespec *_RESTRICT_KYWD abstime)
2320{
2321	timespec_t tslocal;
2322	int error;
2323
2324	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2325	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
2326	error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2327	if (error == ETIME)
2328		error = ETIMEDOUT;
2329	return (error);
2330}
2331
2332int
2333pthread_mutex_reltimedlock_np(pthread_mutex_t *_RESTRICT_KYWD mp,
2334	const struct timespec *_RESTRICT_KYWD reltime)
2335{
2336	timespec_t tslocal;
2337	int error;
2338
2339	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2340	tslocal = *reltime;
2341	error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2342	if (error == ETIME)
2343		error = ETIMEDOUT;
2344	return (error);
2345}
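
/*
 * Usage sketch for the two timed-lock entry points above (hypothetical
 * caller, not part of libc; 'm' and 'error' are assumed declarations).
 * pthread_mutex_timedlock() takes an absolute CLOCK_REALTIME deadline;
 * the _np variant takes a relative timeout:
 *
 *	struct timespec deadline;
 *	(void) clock_gettime(CLOCK_REALTIME, &deadline);
 *	deadline.tv_sec += 5;
 *	error = pthread_mutex_timedlock(&m, &deadline);
 *
 *	struct timespec rel = { 5, 0 };
 *	error = pthread_mutex_reltimedlock_np(&m, &rel);
 *
 * In both cases error is 0 on acquisition and ETIMEDOUT if the timeout
 * expires first (the ETIME from mutex_lock_impl() is translated above).
 */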
2346
2347#pragma weak pthread_mutex_trylock = mutex_trylock
2348int
2349mutex_trylock(mutex_t *mp)
2350{
2351	ulwp_t *self = curthread;
2352	uberdata_t *udp = self->ul_uberdata;
2353	int mtype = mp->mutex_type;
2354	uberflags_t *gflags;
2355
2356	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2357
2358	/*
2359	 * Optimize the case of USYNC_THREAD, including
2360	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2361	 * no error detection, no lock statistics,
2362	 * and the process has only a single thread.
2363	 * (Most likely a traditional single-threaded application.)
2364	 */
2365	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2366	    udp->uberflags.uf_all) == 0) {
2367		/*
2368		 * Only one thread exists so we don't need an atomic operation.
2369		 * We do, however, need to protect against signals.
2370		 */
2371		if (mp->mutex_lockw == 0) {
2372			sigoff(self);
2373			mp->mutex_lockw = LOCKSET;
2374			mp->mutex_owner = (uintptr_t)self;
2375			sigon(self);
2376			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2377			return (0);
2378		}
2379		if (mtype && MUTEX_OWNER(mp) == self)
2380			return (mutex_recursion(mp, mtype, MUTEX_TRY));
2381		return (EBUSY);
2382	}
2383
2384	/*
2385	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2386	 * no error detection, and no lock statistics.
2387	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2388	 */
2389	if ((gflags = self->ul_schedctl_called) != NULL &&
2390	    (gflags->uf_trs_ted |
2391	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2392		if (mtype & USYNC_PROCESS)
2393			return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
2394		sigoff(self);
2395		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2396			mp->mutex_owner = (uintptr_t)self;
2397			sigon(self);
2398			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2399			return (0);
2400		}
2401		sigon(self);
2402		if (mtype && MUTEX_OWNER(mp) == self)
2403			return (mutex_recursion(mp, mtype, MUTEX_TRY));
2404		if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2405			self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2406			tdb_event(TD_LOCK_TRY, udp);
2407		}
2408		return (EBUSY);
2409	}
2410
2411	/* else do it the long way */
2412	return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
2413}
2414
2415int
2416mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
2417{
2418	ulwp_t *self = curthread;
2419	uberdata_t *udp = self->ul_uberdata;
2420	int mtype = mp->mutex_type;
2421	tdb_mutex_stats_t *msp;
2422	int error = 0;
2423	int release_all;
2424	lwpid_t lwpid;
2425
2426	if ((mtype & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
2427	    !mutex_held(mp))
2428		return (EPERM);
2429
2430	if (self->ul_error_detection && !mutex_held(mp))
2431		lock_error(mp, "mutex_unlock", NULL, NULL);
2432
2433	if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2434		mp->mutex_rcount--;
2435		DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2436		return (0);
2437	}
2438
2439	if ((msp = MUTEX_STATS(mp, udp)) != NULL)
2440		(void) record_hold_time(msp);
2441
2442	if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
2443	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2444		ASSERT(mtype & LOCK_ROBUST);
2445		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2446		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
2447	}
2448	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
2449
2450	if (mtype & LOCK_PRIO_INHERIT) {
2451		no_preempt(self);
2452		mp->mutex_owner = 0;
2453		/* mp->mutex_ownerpid is cleared by ___lwp_mutex_unlock() */
2454		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2455		mp->mutex_lockw = LOCKCLEAR;
2456		self->ul_pilocks--;
2457		error = ___lwp_mutex_unlock(mp);
2458		preempt(self);
2459	} else if (mtype & USYNC_PROCESS) {
2460		mutex_unlock_process(mp, release_all);
2461	} else {	/* USYNC_THREAD */
2462		if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
2463			(void) __lwp_unpark(lwpid);
2464			preempt(self);
2465		}
2466	}
2467
2468	if (mtype & LOCK_ROBUST)
2469		forget_lock(mp);
2470
2471	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
2472		_ceil_prio_waive();
2473
2474	return (error);
2475}
2476
2477#pragma weak pthread_mutex_unlock = mutex_unlock
2478#pragma weak _mutex_unlock = mutex_unlock
2479int
2480mutex_unlock(mutex_t *mp)
2481{
2482	ulwp_t *self = curthread;
2483	int mtype = mp->mutex_type;
2484	uberflags_t *gflags;
2485	lwpid_t lwpid;
2486	short el;
2487
2488	/*
2489	 * Optimize the case of USYNC_THREAD, including
2490	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2491	 * no error detection, no lock statistics,
2492	 * and the process has only a single thread.
2493	 * (Most likely a traditional single-threaded application.)
2494	 */
2495	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2496	    self->ul_uberdata->uberflags.uf_all) == 0) {
2497		if (mtype) {
2498			/*
2499			 * At this point we know that one or both of the
2500			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2501			 */
2502			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2503				return (EPERM);
2504			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2505				mp->mutex_rcount--;
2506				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2507				return (0);
2508			}
2509		}
2510		/*
2511		 * Only one thread exists so we don't need an atomic operation.
2512		 * Also, there can be no waiters.
2513		 */
2514		sigoff(self);
2515		mp->mutex_owner = 0;
2516		mp->mutex_lockword = 0;
2517		sigon(self);
2518		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2519		return (0);
2520	}
2521
2522	/*
2523	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2524	 * no error detection, and no lock statistics.
2525	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2526	 */
2527	if ((gflags = self->ul_schedctl_called) != NULL) {
2528		if (((el = gflags->uf_trs_ted) | mtype) == 0) {
2529fast_unlock:
2530			if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2531				(void) __lwp_unpark(lwpid);
2532				preempt(self);
2533			}
2534			return (0);
2535		}
2536		if (el)		/* error detection or lock statistics */
2537			goto slow_unlock;
2538		if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2539			/*
2540			 * At this point we know that one or both of the
2541			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2542			 */
2543			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2544				return (EPERM);
2545			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2546				mp->mutex_rcount--;
2547				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2548				return (0);
2549			}
2550			goto fast_unlock;
2551		}
2552		if ((mtype &
2553		    ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2554			/*
2555			 * At this point we know that zero, one, or both of the
2556			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
2557			 * that the USYNC_PROCESS flag is set.
2558			 */
2559			if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
2560				return (EPERM);
2561			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2562				mp->mutex_rcount--;
2563				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2564				return (0);
2565			}
2566			mutex_unlock_process(mp, 0);
2567			return (0);
2568		}
2569	}
2570
2571	/* else do it the long way */
2572slow_unlock:
2573	return (mutex_unlock_internal(mp, 0));
2574}
2575
2576/*
2577 * Internally to the library, almost all mutex lock/unlock actions
2578 * go through these lmutex_ functions, to protect critical regions.
2579 * We replicate a bit of code from mutex_lock() and mutex_unlock()
2580 * to make these functions faster since we know that the mutex type
2581 * of all internal locks is USYNC_THREAD.  We also know that internal
2582 * locking can never fail, so we panic if it does.
2583 */
2584void
2585lmutex_lock(mutex_t *mp)
2586{
2587	ulwp_t *self = curthread;
2588	uberdata_t *udp = self->ul_uberdata;
2589
2590	ASSERT(mp->mutex_type == USYNC_THREAD);
2591
2592	enter_critical(self);
2593	/*
2594	 * Optimize the case of no lock statistics and only a single thread.
2595	 * (Most likely a traditional single-threaded application.)
2596	 */
2597	if (udp->uberflags.uf_all == 0) {
2598		/*
2599		 * Only one thread exists; the mutex must be free.
2600		 */
2601		ASSERT(mp->mutex_lockw == 0);
2602		mp->mutex_lockw = LOCKSET;
2603		mp->mutex_owner = (uintptr_t)self;
2604		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2605	} else {
2606		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2607
2608		if (!self->ul_schedctl_called)
2609			(void) setup_schedctl();
2610
2611		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2612			mp->mutex_owner = (uintptr_t)self;
2613			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2614		} else if (mutex_trylock_adaptive(mp, 1) != 0) {
2615			(void) mutex_lock_queue(self, msp, mp, NULL);
2616		}
2617
2618		if (msp)
2619			record_begin_hold(msp);
2620	}
2621}
2622
2623void
2624lmutex_unlock(mutex_t *mp)
2625{
2626	ulwp_t *self = curthread;
2627	uberdata_t *udp = self->ul_uberdata;
2628
2629	ASSERT(mp->mutex_type == USYNC_THREAD);
2630
2631	/*
2632	 * Optimize the case of no lock statistics and only a single thread.
2633	 * (Most likely a traditional single-threaded application.)
2634	 */
2635	if (udp->uberflags.uf_all == 0) {
2636		/*
2637		 * Only one thread exists so there can be no waiters.
2638		 */
2639		mp->mutex_owner = 0;
2640		mp->mutex_lockword = 0;
2641		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2642	} else {
2643		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2644		lwpid_t lwpid;
2645
2646		if (msp)
2647			(void) record_hold_time(msp);
2648		if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2649			(void) __lwp_unpark(lwpid);
2650			preempt(self);
2651		}
2652	}
2653	exit_critical(self);
2654}
2655
2656/*
2657 * For specialized code in libc, like the asynchronous i/o code,
2658 * the following sig_*() locking primitives are used in order
2659 * to make the code asynchronous signal safe.  Signals are
2660 * deferred while locks acquired by these functions are held.
2661 */
2662void
2663sig_mutex_lock(mutex_t *mp)
2664{
2665	ulwp_t *self = curthread;
2666
2667	sigoff(self);
2668	(void) mutex_lock(mp);
2669}
2670
2671void
2672sig_mutex_unlock(mutex_t *mp)
2673{
2674	ulwp_t *self = curthread;
2675
2676	(void) mutex_unlock(mp);
2677	sigon(self);
2678}
2679
2680int
2681sig_mutex_trylock(mutex_t *mp)
2682{
2683	ulwp_t *self = curthread;
2684	int error;
2685
2686	sigoff(self);
2687	if ((error = mutex_trylock(mp)) != 0)
2688		sigon(self);
2689	return (error);
2690}
2691
2692/*
2693 * sig_cond_wait() is a cancellation point.
2694 */
2695int
2696sig_cond_wait(cond_t *cv, mutex_t *mp)
2697{
2698	int error;
2699
2700	ASSERT(curthread->ul_sigdefer != 0);
2701	pthread_testcancel();
2702	error = __cond_wait(cv, mp);
2703	if (error == EINTR && curthread->ul_cursig) {
2704		sig_mutex_unlock(mp);
2705		/* take the deferred signal here */
2706		sig_mutex_lock(mp);
2707	}
2708	pthread_testcancel();
2709	return (error);
2710}
2711
2712/*
2713 * sig_cond_reltimedwait() is a cancellation point.
2714 */
2715int
2716sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
2717{
2718	int error;
2719
2720	ASSERT(curthread->ul_sigdefer != 0);
2721	pthread_testcancel();
2722	error = __cond_reltimedwait(cv, mp, ts);
2723	if (error == EINTR && curthread->ul_cursig) {
2724		sig_mutex_unlock(mp);
2725		/* take the deferred signal here */
2726		sig_mutex_lock(mp);
2727	}
2728	pthread_testcancel();
2729	return (error);
2730}
2731
2732/*
2733 * For specialized code in libc, like the stdio code,
2734 * the following cancel_safe_*() locking primitives are used in
2735 * order to make the code cancellation-safe.  Cancellation is
2736 * deferred while locks acquired by these functions are held.
2737 */
2738void
2739cancel_safe_mutex_lock(mutex_t *mp)
2740{
2741	(void) mutex_lock(mp);
2742	curthread->ul_libc_locks++;
2743}
2744
2745int
2746cancel_safe_mutex_trylock(mutex_t *mp)
2747{
2748	int error;
2749
2750	if ((error = mutex_trylock(mp)) == 0)
2751		curthread->ul_libc_locks++;
2752	return (error);
2753}
2754
2755void
2756cancel_safe_mutex_unlock(mutex_t *mp)
2757{
2758	ulwp_t *self = curthread;
2759
2760	ASSERT(self->ul_libc_locks != 0);
2761
2762	(void) mutex_unlock(mp);
2763
2764	/*
2765	 * Decrement the count of locks held by cancel_safe_mutex_lock().
2766	 * If we are then in a position to terminate cleanly and
2767	 * if there is a pending cancellation and cancellation
2768	 * is not disabled and we received EINTR from a recent
2769	 * system call then perform the cancellation action now.
2770	 */
2771	if (--self->ul_libc_locks == 0 &&
2772	    !(self->ul_vfork | self->ul_nocancel |
2773	    self->ul_critical | self->ul_sigdefer) &&
2774	    cancel_active())
2775		pthread_exit(PTHREAD_CANCELED);
2776}
2777
2778static int
2779shared_mutex_held(mutex_t *mparg)
2780{
2781	/*
2782	 * The 'volatile' is necessary to make sure the compiler doesn't
2783	 * reorder the tests of the various components of the mutex.
2784	 * They must be tested in this order:
2785	 *	mutex_lockw
2786	 *	mutex_owner
2787	 *	mutex_ownerpid
2788	 * This relies on the fact that everywhere mutex_lockw is cleared,
2789	 * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
2790	 * is cleared, and that everywhere mutex_lockw is set, mutex_owner
2791	 * and mutex_ownerpid are set after mutex_lockw is set, and that
2792	 * mutex_lockw is set or cleared with a memory barrier.
2793	 */
2794	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2795	ulwp_t *self = curthread;
2796	uberdata_t *udp = self->ul_uberdata;
2797
2798	return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
2799}
2800
2801#pragma weak _mutex_held = mutex_held
2802int
2803mutex_held(mutex_t *mparg)
2804{
2805	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2806
2807	if (mparg->mutex_type & USYNC_PROCESS)
2808		return (shared_mutex_held(mparg));
2809	return (MUTEX_OWNED(mp, curthread));
2810}
2811
2812#pragma weak pthread_mutex_destroy = mutex_destroy
2813#pragma weak _mutex_destroy = mutex_destroy
2814int
2815mutex_destroy(mutex_t *mp)
2816{
2817	if (mp->mutex_type & USYNC_PROCESS)
2818		forget_lock(mp);
2819	(void) memset(mp, 0, sizeof (*mp));
2820	tdb_sync_obj_deregister(mp);
2821	return (0);
2822}
2823
2824#pragma weak pthread_mutex_consistent_np = mutex_consistent
2825#pragma weak pthread_mutex_consistent = mutex_consistent
2826int
2827mutex_consistent(mutex_t *mp)
2828{
2829	/*
2830	 * Do this only for an inconsistent, initialized robust lock
2831	 * that we hold.  For all other cases, return EINVAL.
2832	 */
2833	if (mutex_held(mp) &&
2834	    (mp->mutex_type & LOCK_ROBUST) &&
2835	    (mp->mutex_flag & LOCK_INITED) &&
2836	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2837		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2838		mp->mutex_rcount = 0;
2839		return (0);
2840	}
2841	return (EINVAL);
2842}
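
/*
 * A robust-lock recovery sketch (hypothetical application code, not
 * part of libc), showing where mutex_consistent() fits.  The attribute
 * calls use the POSIX names; older releases spell them with an _np
 * suffix.  The mutex 'm' would live in memory shared by the
 * cooperating processes:
 *
 *	pthread_mutexattr_t a;
 *
 *	(void) pthread_mutexattr_init(&a);
 *	(void) pthread_mutexattr_setpshared(&a, PTHREAD_PROCESS_SHARED);
 *	(void) pthread_mutexattr_setrobust(&a, PTHREAD_MUTEX_ROBUST);
 *	(void) pthread_mutex_init(&m, &a);
 *	...
 *	error = pthread_mutex_lock(&m);
 *	if (error == EOWNERDEAD) {
 *		... repair the state the dead owner was protecting ...
 *		(void) pthread_mutex_consistent(&m);
 *		error = 0;
 *	}
 *	if (error == 0)
 *		(void) pthread_mutex_unlock(&m);
 *
 * If the holder of an EOWNERDEAD lock unlocks without calling
 * pthread_mutex_consistent(), the lock becomes LOCK_NOTRECOVERABLE and
 * subsequent lock attempts return ENOTRECOVERABLE (see
 * mutex_unlock_internal() above).
 */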
2843
2844/*
2845 * Spin locks are separate from ordinary mutexes,
2846 * but we use the same data structure for them.
2847 */
2848
2849int
2850pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2851{
2852	mutex_t *mp = (mutex_t *)lock;
2853
2854	(void) memset(mp, 0, sizeof (*mp));
2855	if (pshared == PTHREAD_PROCESS_SHARED)
2856		mp->mutex_type = USYNC_PROCESS;
2857	else
2858		mp->mutex_type = USYNC_THREAD;
2859	mp->mutex_flag = LOCK_INITED;
2860	mp->mutex_magic = MUTEX_MAGIC;
2861
2862	/*
2863	 * This should be at the beginning of the function,
2864	 * but for the sake of old broken applications that
2865	 * do not have proper alignment for their mutexes
2866	 * (and don't check the return code from pthread_spin_init),
2867	 * we put it here, after initializing the mutex regardless.
2868	 */
2869	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2870	    curthread->ul_misaligned == 0)
2871		return (EINVAL);
2872
2873	return (0);
2874}
2875
2876int
2877pthread_spin_destroy(pthread_spinlock_t *lock)
2878{
2879	(void) memset(lock, 0, sizeof (*lock));
2880	return (0);
2881}
2882
2883int
2884pthread_spin_trylock(pthread_spinlock_t *lock)
2885{
2886	mutex_t *mp = (mutex_t *)lock;
2887	ulwp_t *self = curthread;
2888	int error = 0;
2889
2890	no_preempt(self);
2891	if (set_lock_byte(&mp->mutex_lockw) != 0)
2892		error = EBUSY;
2893	else {
2894		mp->mutex_owner = (uintptr_t)self;
2895		if (mp->mutex_type == USYNC_PROCESS)
2896			mp->mutex_ownerpid = self->ul_uberdata->pid;
2897		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2898	}
2899	preempt(self);
2900	return (error);
2901}
2902
2903int
2904pthread_spin_lock(pthread_spinlock_t *lock)
2905{
2906	mutex_t *mp = (mutex_t *)lock;
2907	ulwp_t *self = curthread;
2908	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
2909	int count = 0;
2910
2911	ASSERT(!self->ul_critical || self->ul_bindflags);
2912
2913	DTRACE_PROBE1(plockstat, mutex__spin, mp);
2914
2915	/*
2916	 * We don't care whether the owner is running on a processor.
2917	 * We just spin because that's what this interface requires.
2918	 */
2919	for (;;) {
2920		if (*lockp == 0) {	/* lock byte appears to be clear */
2921			no_preempt(self);
2922			if (set_lock_byte(lockp) == 0)
2923				break;
2924			preempt(self);
2925		}
2926		if (count < INT_MAX)
2927			count++;
2928		SMT_PAUSE();
2929	}
2930	mp->mutex_owner = (uintptr_t)self;
2931	if (mp->mutex_type == USYNC_PROCESS)
2932		mp->mutex_ownerpid = self->ul_uberdata->pid;
2933	preempt(self);
2934	if (count) {
2935		DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
2936	}
2937	DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
2938	return (0);
2939}
2940
2941int
2942pthread_spin_unlock(pthread_spinlock_t *lock)
2943{
2944	mutex_t *mp = (mutex_t *)lock;
2945	ulwp_t *self = curthread;
2946
2947	no_preempt(self);
2948	mp->mutex_owner = 0;
2949	mp->mutex_ownerpid = 0;
2950	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2951	(void) atomic_swap_32(&mp->mutex_lockword, 0);
2952	preempt(self);
2953	return (0);
2954}
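
/*
 * Spin lock usage sketch (hypothetical caller, not part of libc).
 * There is no adaptive backoff and no kernel sleep, so the critical
 * section should be kept very short:
 *
 *	pthread_spinlock_t sl;
 *
 *	(void) pthread_spin_init(&sl, PTHREAD_PROCESS_PRIVATE);
 *	(void) pthread_spin_lock(&sl);
 *	counter++;
 *	(void) pthread_spin_unlock(&sl);
 *	(void) pthread_spin_destroy(&sl);
 *
 * Here 'counter' stands for whatever shared data the lock protects.
 */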
2955
2956#define	INITIAL_LOCKS	8	/* initial size of ul_heldlocks.array */
2957
2958/*
2959 * Find/allocate an entry for 'lock' in our array of held locks.
2960 */
2961static mutex_t **
2962find_lock_entry(mutex_t *lock)
2963{
2964	ulwp_t *self = curthread;
2965	mutex_t **remembered = NULL;
2966	mutex_t **lockptr;
2967	uint_t nlocks;
2968
2969	if ((nlocks = self->ul_heldlockcnt) != 0)
2970		lockptr = self->ul_heldlocks.array;
2971	else {
2972		nlocks = 1;
2973		lockptr = &self->ul_heldlocks.single;
2974	}
2975
2976	for (; nlocks; nlocks--, lockptr++) {
2977		if (*lockptr == lock)
2978			return (lockptr);
2979		if (*lockptr == NULL && remembered == NULL)
2980			remembered = lockptr;
2981	}
2982	if (remembered != NULL) {
2983		*remembered = lock;
2984		return (remembered);
2985	}
2986
2987	/*
2988	 * No entry available.  Allocate more space, converting
2989	 * the single entry into an array of entries if necessary.
2990	 */
2991	if ((nlocks = self->ul_heldlockcnt) == 0) {
2992		/*
2993		 * Initial allocation of the array.
2994		 * Convert the single entry into an array.
2995		 */
2996		self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
2997		lockptr = lmalloc(nlocks * sizeof (mutex_t *));
2998		/*
2999		 * The single entry becomes the first entry in the array.
3000		 */
3001		*lockptr = self->ul_heldlocks.single;
3002		self->ul_heldlocks.array = lockptr;
3003		/*
3004		 * Return the next available entry in the array.
3005		 */
3006		*++lockptr = lock;
3007		return (lockptr);
3008	}
3009	/*
3010	 * Reallocate the array, double the size each time.
3011	 */
3012	lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
3013	(void) memcpy(lockptr, self->ul_heldlocks.array,
3014	    nlocks * sizeof (mutex_t *));
3015	lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3016	self->ul_heldlocks.array = lockptr;
3017	self->ul_heldlockcnt *= 2;
3018	/*
3019	 * Return the next available entry in the newly allocated array.
3020	 */
3021	*(lockptr += nlocks) = lock;
3022	return (lockptr);
3023}
3024
3025/*
3026 * Insert 'lock' into our list of held locks.
3027 * Currently only used for LOCK_ROBUST mutexes.
3028 */
3029void
3030remember_lock(mutex_t *lock)
3031{
3032	(void) find_lock_entry(lock);
3033}
3034
3035/*
3036 * Remove 'lock' from our list of held locks.
3037 * Currently only used for LOCK_ROBUST mutexes.
3038 */
3039void
3040forget_lock(mutex_t *lock)
3041{
3042	*find_lock_entry(lock) = NULL;
3043}
3044
3045/*
3046 * Free the array of held locks.
3047 */
3048void
3049heldlock_free(ulwp_t *ulwp)
3050{
3051	uint_t nlocks;
3052
3053	if ((nlocks = ulwp->ul_heldlockcnt) != 0)
3054		lfree(ulwp->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3055	ulwp->ul_heldlockcnt = 0;
3056	ulwp->ul_heldlocks.array = NULL;
3057}
3058
3059/*
3060 * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
3061 * Called from _thrp_exit() to deal with abandoned locks.
3062 */
3063void
3064heldlock_exit(void)
3065{
3066	ulwp_t *self = curthread;
3067	mutex_t **lockptr;
3068	uint_t nlocks;
3069	mutex_t *mp;
3070
3071	if ((nlocks = self->ul_heldlockcnt) != 0)
3072		lockptr = self->ul_heldlocks.array;
3073	else {
3074		nlocks = 1;
3075		lockptr = &self->ul_heldlocks.single;
3076	}
3077
3078	for (; nlocks; nlocks--, lockptr++) {
3079		/*
3080		 * The kernel takes care of transitioning held
3081		 * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
3082		 * We avoid that case here.
3083		 */
3084		if ((mp = *lockptr) != NULL &&
3085		    mutex_held(mp) &&
3086		    (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
3087		    LOCK_ROBUST) {
3088			mp->mutex_rcount = 0;
3089			if (!(mp->mutex_flag & LOCK_UNMAPPED))
3090				mp->mutex_flag |= LOCK_OWNERDEAD;
3091			(void) mutex_unlock_internal(mp, 1);
3092		}
3093	}
3094
3095	heldlock_free(self);
3096}
3097
3098#pragma weak _cond_init = cond_init
3099/* ARGSUSED2 */
3100int
3101cond_init(cond_t *cvp, int type, void *arg)
3102{
3103	if (type != USYNC_THREAD && type != USYNC_PROCESS)
3104		return (EINVAL);
3105	(void) memset(cvp, 0, sizeof (*cvp));
3106	cvp->cond_type = (uint16_t)type;
3107	cvp->cond_magic = COND_MAGIC;
3108
3109	/*
3110	 * This should be at the beginning of the function,
3111	 * but for the sake of old broken applications that
3112	 * do not have proper alignment for their condvars
3113	 * (and don't check the return code from cond_init),
3114	 * we put it here, after initializing the condvar regardless.
3115	 */
3116	if (((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1)) &&
3117	    curthread->ul_misaligned == 0)
3118		return (EINVAL);
3119
3120	return (0);
3121}
3122
3123/*
3124 * cond_sleep_queue(): utility function for cond_wait_queue().
3125 *
3126 * Go to sleep on a condvar sleep queue, expect to be waked up
3127 * by someone calling cond_signal() or cond_broadcast() or due
3128 * to receiving a UNIX signal or being cancelled, or just simply
3129 * due to a spurious wakeup (like someone calling forkall()).
3130 *
3131 * The associated mutex is *not* reacquired before returning.
3132 * That must be done by the caller of cond_sleep_queue().
3133 */
3134static int
3135cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3136{
3137	ulwp_t *self = curthread;
3138	queue_head_t *qp;
3139	queue_head_t *mqp;
3140	lwpid_t lwpid;
3141	int signalled;
3142	int error;
3143	int cv_wake;
3144	int release_all;
3145
3146	/*
3147	 * Put ourself on the CV sleep queue, unlock the mutex, then
3148	 * park ourself and unpark a candidate lwp to grab the mutex.
3149	 * We must go onto the CV sleep queue before dropping the
3150	 * mutex in order to guarantee atomicity of the operation.
3151	 */
3152	self->ul_sp = stkptr();
3153	qp = queue_lock(cvp, CV);
3154	enqueue(qp, self, 0);
3155	cvp->cond_waiters_user = 1;
3156	self->ul_cvmutex = mp;
3157	self->ul_cv_wake = cv_wake = (tsp != NULL);
3158	self->ul_signalled = 0;
3159	if (mp->mutex_flag & LOCK_OWNERDEAD) {
3160		mp->mutex_flag &= ~LOCK_OWNERDEAD;
3161		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
3162	}
3163	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
3164	lwpid = mutex_unlock_queue(mp, release_all);
3165	for (;;) {
3166		set_parking_flag(self, 1);
3167		queue_unlock(qp);
3168		if (lwpid != 0) {
3169			lwpid = preempt_unpark(self, lwpid);
3170			preempt(self);
3171		}
3172		/*
3173		 * We may have a deferred signal present,
3174		 * in which case we should return EINTR.
3175		 * Also, we may have received a SIGCANCEL; if so
3176		 * and we are cancelable we should return EINTR.
3177		 * We force an immediate EINTR return from
3178		 * __lwp_park() by turning our parking flag off.
3179		 */
3180		if (self->ul_cursig != 0 ||
3181		    (self->ul_cancelable && self->ul_cancel_pending))
3182			set_parking_flag(self, 0);
3183		/*
3184		 * __lwp_park() will return the residual time in tsp
3185		 * if we are unparked before the timeout expires.
3186		 */
3187		error = __lwp_park(tsp, lwpid);
3188		set_parking_flag(self, 0);
3189		lwpid = 0;	/* unpark the other lwp only once */
3190		/*
3191		 * We were waked up by cond_signal(), cond_broadcast(),
3192		 * by an interrupt or timeout (EINTR or ETIME),
3193		 * or we may just have gotten a spurious wakeup.
3194		 */
3195		qp = queue_lock(cvp, CV);
3196		if (!cv_wake)
3197			mqp = queue_lock(mp, MX);
3198		if (self->ul_sleepq == NULL)
3199			break;
3200		/*
3201		 * We are on either the condvar sleep queue or the
3202		 * mutex sleep queue.  Break out of the sleep if we
3203		 * were interrupted or we timed out (EINTR or ETIME).
3204		 * Else this is a spurious wakeup; continue the loop.
3205		 */
3206		if (!cv_wake && self->ul_sleepq == mqp) { /* mutex queue */
3207			if (error) {
3208				mp->mutex_waiters = dequeue_self(mqp);
3209				break;
3210			}
3211			tsp = NULL;	/* no more timeout */
3212		} else if (self->ul_sleepq == qp) {	/* condvar queue */
3213			if (error) {
3214				cvp->cond_waiters_user = dequeue_self(qp);
3215				break;
3216			}
3217			/*
3218			 * Else a spurious wakeup on the condvar queue.
3219			 * __lwp_park() has already adjusted the timeout.
3220			 */
3221		} else {
3222			thr_panic("cond_sleep_queue(): thread not on queue");
3223		}
3224		if (!cv_wake)
3225			queue_unlock(mqp);
3226	}
3227
3228	self->ul_sp = 0;
3229	self->ul_cv_wake = 0;
3230	ASSERT(self->ul_cvmutex == NULL);
3231	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
3232	    self->ul_wchan == NULL);
3233
3234	signalled = self->ul_signalled;
3235	self->ul_signalled = 0;
3236	queue_unlock(qp);
3237	if (!cv_wake)
3238		queue_unlock(mqp);
3239
3240	/*
3241	 * If we were concurrently cond_signal()d and any of:
3242	 * received a UNIX signal, were cancelled, or got a timeout,
3243	 * then perform another cond_signal() to avoid consuming it.
3244	 */
3245	if (error && signalled)
3246		(void) cond_signal(cvp);
3247
3248	return (error);
3249}
3250
3251static void
3252cond_wait_check_alignment(cond_t *cvp, mutex_t *mp)
3253{
3254	if ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1))
3255		lock_error(mp, "cond_wait", cvp, "mutex is misaligned");
3256	if ((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1))
3257		lock_error(mp, "cond_wait", cvp, "condvar is misaligned");
3258}
3259
3260int
3261cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3262{
3263	ulwp_t *self = curthread;
3264	int error;
3265	int merror;
3266
3267	if (self->ul_error_detection && self->ul_misaligned == 0)
3268		cond_wait_check_alignment(cvp, mp);
3269
3270	/*
3271	 * The old thread library was programmed to defer signals
3272	 * while in cond_wait() so that the associated mutex would
3273	 * be guaranteed to be held when the application signal
3274	 * handler was invoked.
3275	 *
3276	 * We do not behave this way by default; the state of the
3277	 * associated mutex in the signal handler is undefined.
3278	 *
3279	 * To accommodate applications that depend on the old
3280	 * behavior, the _THREAD_COND_WAIT_DEFER environment
3281	 * variable can be set to 1 and we will behave in the
3282	 * old way with respect to cond_wait().
3283	 */
3284	if (self->ul_cond_wait_defer)
3285		sigoff(self);
3286
3287	error = cond_sleep_queue(cvp, mp, tsp);
3288
3289	/*
3290	 * Reacquire the mutex.
3291	 */
3292	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3293		error = merror;
3294
3295	/*
3296	 * Take any deferred signal now, after we have reacquired the mutex.
3297	 */
3298	if (self->ul_cond_wait_defer)
3299		sigon(self);
3300
3301	return (error);
3302}
3303
3304/*
3305 * cond_sleep_kernel(): utility function for cond_wait_kernel().
3306 * See the comment ahead of cond_sleep_queue(), above.
3307 */
3308static int
3309cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3310{
3311	int mtype = mp->mutex_type;
3312	ulwp_t *self = curthread;
3313	int error;
3314
3315	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
3316		_ceil_prio_waive();
3317
3318	self->ul_sp = stkptr();
3319	self->ul_wchan = cvp;
3320	sigoff(self);
3321	mp->mutex_owner = 0;
3322	/* mp->mutex_ownerpid is cleared by ___lwp_cond_wait() */
3323	if (mtype & LOCK_PRIO_INHERIT) {
3324		mp->mutex_lockw = LOCKCLEAR;
3325		self->ul_pilocks--;
3326	}
3327	/*
3328	 * ___lwp_cond_wait() returns immediately with EINTR if
3329	 * set_parking_flag(self,0) is called on this lwp before it
3330	 * goes to sleep in the kernel.  sigacthandler() calls this
3331	 * when a deferred signal is noted.  This assures that we don't
3332	 * get stuck in ___lwp_cond_wait() with all signals blocked
3333	 * due to taking a deferred signal before going to sleep.
3334	 */
3335	set_parking_flag(self, 1);
3336	if (self->ul_cursig != 0 ||
3337	    (self->ul_cancelable && self->ul_cancel_pending))
3338		set_parking_flag(self, 0);
3339	error = ___lwp_cond_wait(cvp, mp, tsp, 1);
3340	set_parking_flag(self, 0);
3341	sigon(self);
3342	self->ul_sp = 0;
3343	self->ul_wchan = NULL;
3344	return (error);
3345}
3346
3347int
3348cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3349{
3350	ulwp_t *self = curthread;
3351	int error;
3352	int merror;
3353
3354	if (self->ul_error_detection && self->ul_misaligned == 0)
3355		cond_wait_check_alignment(cvp, mp);
3356
3357	/*
3358	 * See the large comment in cond_wait_queue(), above.
3359	 */
3360	if (self->ul_cond_wait_defer)
3361		sigoff(self);
3362
3363	error = cond_sleep_kernel(cvp, mp, tsp);
3364
3365	/*
3366	 * Override the return code from ___lwp_cond_wait()
3367	 * with any non-zero return code from mutex_lock().
3368	 * This addresses robust lock failures in particular;
3369	 * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
3370	 * errors in order to take corrective action.
3371	 */
3372	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3373		error = merror;
3374
3375	/*
3376	 * Take any deferred signal now, after we have reacquired the mutex.
3377	 */
3378	if (self->ul_cond_wait_defer)
3379		sigon(self);
3380
3381	return (error);
3382}
3383
3384/*
3385 * Common code for cond_wait() and cond_timedwait()
3386 */
3387int
3388cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3389{
3390	int mtype = mp->mutex_type;
3391	hrtime_t begin_sleep = 0;
3392	ulwp_t *self = curthread;
3393	uberdata_t *udp = self->ul_uberdata;
3394	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3395	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
3396	uint8_t rcount;
3397	int error = 0;
3398
3399	/*
3400	 * The SUSV3 Posix spec for pthread_cond_timedwait() states:
3401	 *	Except in the case of [ETIMEDOUT], all these error checks
3402	 *	shall act as if they were performed immediately at the
3403	 *	beginning of processing for the function and shall cause
3404	 *	an error return, in effect, prior to modifying the state
3405	 *	of the mutex specified by mutex or the condition variable
3406	 *	specified by cond.
3407	 * Therefore, we must return EINVAL now if the timeout is invalid.
3408	 */
3409	if (tsp != NULL &&
3410	    (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
3411		return (EINVAL);
3412
3413	if (__td_event_report(self, TD_SLEEP, udp)) {
3414		self->ul_sp = stkptr();
3415		self->ul_wchan = cvp;
3416		self->ul_td_evbuf.eventnum = TD_SLEEP;
3417		self->ul_td_evbuf.eventdata = cvp;
3418		tdb_event(TD_SLEEP, udp);
3419		self->ul_sp = 0;
3420	}
3421	if (csp) {
3422		if (tsp)
3423			tdb_incr(csp->cond_timedwait);
3424		else
3425			tdb_incr(csp->cond_wait);
3426	}
3427	if (msp)
3428		begin_sleep = record_hold_time(msp);
3429	else if (csp)
3430		begin_sleep = gethrtime();
3431
3432	if (self->ul_error_detection) {
3433		if (!mutex_held(mp))
3434			lock_error(mp, "cond_wait", cvp, NULL);
3435		if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
3436			lock_error(mp, "recursive mutex in cond_wait",
3437			    cvp, NULL);
3438		if (cvp->cond_type & USYNC_PROCESS) {
3439			if (!(mtype & USYNC_PROCESS))
3440				lock_error(mp, "cond_wait", cvp,
3441				    "condvar process-shared, "
3442				    "mutex process-private");
3443		} else {
3444			if (mtype & USYNC_PROCESS)
3445				lock_error(mp, "cond_wait", cvp,
3446				    "condvar process-private, "
3447				    "mutex process-shared");
3448		}
3449	}
3450
3451	/*
3452	 * We deal with recursive mutexes by completely
3453	 * dropping the lock and restoring the recursion
3454	 * count after waking up.  This is arguably wrong,
3455	 * but it obeys the principle of least astonishment.
3456	 */
3457	rcount = mp->mutex_rcount;
3458	mp->mutex_rcount = 0;
3459	if ((mtype &
3460	    (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
3461	    (cvp->cond_type & USYNC_PROCESS))
3462		error = cond_wait_kernel(cvp, mp, tsp);
3463	else
3464		error = cond_wait_queue(cvp, mp, tsp);
3465	mp->mutex_rcount = rcount;
3466
3467	if (csp) {
3468		hrtime_t lapse = gethrtime() - begin_sleep;
3469		if (tsp == NULL)
3470			csp->cond_wait_sleep_time += lapse;
3471		else {
3472			csp->cond_timedwait_sleep_time += lapse;
3473			if (error == ETIME)
3474				tdb_incr(csp->cond_timedwait_timeout);
3475		}
3476	}
3477	return (error);
3478}
3479
3480/*
3481 * cond_wait() is a cancellation point but __cond_wait() is not.
3482 * Internally, libc calls the non-cancellation version.
3483 * Other libraries need to use pthread_setcancelstate(), as appropriate,
3484 * since __cond_wait() is not exported from libc.
3485 */
3486int
3487__cond_wait(cond_t *cvp, mutex_t *mp)
3488{
3489	ulwp_t *self = curthread;
3490	uberdata_t *udp = self->ul_uberdata;
3491	uberflags_t *gflags;
3492
3493	if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3494	    !mutex_held(mp))
3495		return (EPERM);
3496
3497	/*
3498	 * Optimize the common case of USYNC_THREAD plus
3499	 * no error detection, no lock statistics, and no event tracing.
3500	 */
3501	if ((gflags = self->ul_schedctl_called) != NULL &&
3502	    (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
3503	    self->ul_td_events_enable |
3504	    udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
3505		return (cond_wait_queue(cvp, mp, NULL));
3506
3507	/*
3508	 * Else do it the long way.
3509	 */
3510	return (cond_wait_common(cvp, mp, NULL));
3511}
3512
3513#pragma weak _cond_wait = cond_wait
3514int
3515cond_wait(cond_t *cvp, mutex_t *mp)
3516{
3517	int error;
3518
3519	_cancelon();
3520	error = __cond_wait(cvp, mp);
3521	if (error == EINTR)
3522		_canceloff();
3523	else
3524		_canceloff_nocancel();
3525	return (error);
3526}
3527
3528/*
3529 * pthread_cond_wait() is a cancellation point.
3530 */
3531int
3532pthread_cond_wait(pthread_cond_t *_RESTRICT_KYWD cvp,
3533	pthread_mutex_t *_RESTRICT_KYWD mp)
3534{
3535	int error;
3536
3537	error = cond_wait((cond_t *)cvp, (mutex_t *)mp);
3538	return ((error == EINTR)? 0 : error);
3539}
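
/*
 * The canonical way for an application to use the interfaces above is
 * to re-test its predicate in a loop, since (as noted throughout this
 * file) wakeups may be spurious and pthread_cond_wait() swallows EINTR.
 * A minimal sketch, with 'm', 'cv', and 'ready' hypothetical:
 *
 *	(void) pthread_mutex_lock(&m);
 *	while (!ready)
 *		(void) pthread_cond_wait(&cv, &m);
 *	... use the state protected by m ...
 *	(void) pthread_mutex_unlock(&m);
 */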
3540
3541/*
3542 * cond_timedwait() is a cancellation point but __cond_timedwait() is not.
3543 */
3544int
3545__cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3546{
3547	clockid_t clock_id = cvp->cond_clockid;
3548	timespec_t reltime;
3549	int error;
3550
3551	if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3552	    !mutex_held(mp))
3553		return (EPERM);
3554
3555	if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
3556		clock_id = CLOCK_REALTIME;
3557	abstime_to_reltime(clock_id, abstime, &reltime);
3558	error = cond_wait_common(cvp, mp, &reltime);
3559	if (error == ETIME && clock_id == CLOCK_HIGHRES) {
3560		/*
3561		 * Don't return ETIME if we didn't really get a timeout.
3562		 * This can happen if we return because someone resets
3563		 * the system clock.  Just return zero in this case,
3564		 * giving a spurious wakeup but not a timeout.
3565		 */
3566		if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
3567		    abstime->tv_nsec > gethrtime())
3568			error = 0;
3569	}
3570	return (error);
3571}
3572
3573int
3574cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3575{
3576	int error;
3577
3578	_cancelon();
3579	error = __cond_timedwait(cvp, mp, abstime);
3580	if (error == EINTR)
3581		_canceloff();
3582	else
3583		_canceloff_nocancel();
3584	return (error);
3585}
3586
3587/*
3588 * pthread_cond_timedwait() is a cancellation point.
3589 */
3590int
3591pthread_cond_timedwait(pthread_cond_t *_RESTRICT_KYWD cvp,
3592	pthread_mutex_t *_RESTRICT_KYWD mp,
3593	const struct timespec *_RESTRICT_KYWD abstime)
3594{
3595	int error;
3596
3597	error = cond_timedwait((cond_t *)cvp, (mutex_t *)mp, abstime);
3598	if (error == ETIME)
3599		error = ETIMEDOUT;
3600	else if (error == EINTR)
3601		error = 0;
3602	return (error);
3603}
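
/*
 * A timed variant of the same pattern (hypothetical caller; 'm', 'cv',
 * 'done', and 'error' are assumed declarations).  The deadline is
 * absolute; with a default condvar it is measured against
 * CLOCK_REALTIME (a condvar may select CLOCK_HIGHRES instead, per
 * __cond_timedwait() above), and expiry is reported as ETIMEDOUT:
 *
 *	struct timespec deadline;
 *	(void) clock_gettime(CLOCK_REALTIME, &deadline);
 *	deadline.tv_sec += 2;
 *	(void) pthread_mutex_lock(&m);
 *	while (!done) {
 *		error = pthread_cond_timedwait(&cv, &m, &deadline);
 *		if (error == ETIMEDOUT)
 *			break;
 *	}
 *	(void) pthread_mutex_unlock(&m);
 *
 * On ETIMEDOUT the mutex is still held, so 'done' can be examined
 * safely before giving up.
 */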
3604
3605/*
3606 * cond_reltimedwait() is a cancellation point but __cond_reltimedwait() is not.
3607 */
3608int
3609__cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3610{
3611	timespec_t tslocal = *reltime;
3612
3613	if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3614	    !mutex_held(mp))
3615		return (EPERM);
3616
3617	return (cond_wait_common(cvp, mp, &tslocal));
3618}
3619
3620int
3621cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3622{
3623	int error;
3624
3625	_cancelon();
3626	error = __cond_reltimedwait(cvp, mp, reltime);
3627	if (error == EINTR)
3628		_canceloff();
3629	else
3630		_canceloff_nocancel();
3631	return (error);
3632}
3633
3634int
3635pthread_cond_reltimedwait_np(pthread_cond_t *_RESTRICT_KYWD cvp,
3636	pthread_mutex_t *_RESTRICT_KYWD mp,
3637	const struct timespec *_RESTRICT_KYWD reltime)
3638{
3639	int error;
3640
3641	error = cond_reltimedwait((cond_t *)cvp, (mutex_t *)mp, reltime);
3642	if (error == ETIME)
3643		error = ETIMEDOUT;
3644	else if (error == EINTR)
3645		error = 0;
3646	return (error);
3647}
3648
3649#pragma weak pthread_cond_signal = cond_signal
3650#pragma weak _cond_signal = cond_signal
3651int
3652cond_signal(cond_t *cvp)
3653{
3654	ulwp_t *self = curthread;
3655	uberdata_t *udp = self->ul_uberdata;
3656	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3657	int error = 0;
3658	int more;
3659	lwpid_t lwpid;
3660	queue_head_t *qp;
3661	mutex_t *mp;
3662	queue_head_t *mqp;
3663	ulwp_t **ulwpp;
3664	ulwp_t *ulwp;
3665	ulwp_t *prev;
3666
3667	if (csp)
3668		tdb_incr(csp->cond_signal);
3669
3670	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3671		error = _lwp_cond_signal(cvp);
3672
3673	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3674		return (error);
3675
3676	/*
3677	 * Move someone from the condvar sleep queue to the mutex sleep
3678	 * queue for the mutex that he will acquire on being waked up.
3679	 * We can do this only if we own the mutex he will acquire.
3680	 * If we do not own the mutex, or if his ul_cv_wake flag
3681	 * is set, just dequeue and unpark him.
3682	 */
3683	qp = queue_lock(cvp, CV);
3684	ulwpp = queue_slot(qp, &prev, &more);
3685	cvp->cond_waiters_user = more;
3686	if (ulwpp == NULL) {	/* no one on the sleep queue */
3687		queue_unlock(qp);
3688		return (error);
3689	}
3690	ulwp = *ulwpp;
3691
3692	/*
3693	 * Inform the thread that he was the recipient of a cond_signal().
3694	 * This lets him deal with cond_signal() and, concurrently,
3695	 * one or more of a cancellation, a UNIX signal, or a timeout.
3696	 * These latter conditions must not consume a cond_signal().
3697	 */
3698	ulwp->ul_signalled = 1;
3699
3700	/*
3701	 * Dequeue the waiter but leave his ul_sleepq non-NULL
3702	 * while we move him to the mutex queue so that he can
3703	 * deal properly with spurious wakeups.
3704	 */
3705	queue_unlink(qp, ulwpp, prev);
3706
3707	mp = ulwp->ul_cvmutex;		/* the mutex he will acquire */
3708	ulwp->ul_cvmutex = NULL;
3709	ASSERT(mp != NULL);
3710
3711	if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3712		/* just wake him up */
3713		lwpid = ulwp->ul_lwpid;
3714		no_preempt(self);
3715		ulwp->ul_sleepq = NULL;
3716		ulwp->ul_wchan = NULL;
3717		queue_unlock(qp);
3718		(void) __lwp_unpark(lwpid);
3719		preempt(self);
3720	} else {
3721		/* move him to the mutex queue */
3722		mqp = queue_lock(mp, MX);
3723		enqueue(mqp, ulwp, 0);
3724		mp->mutex_waiters = 1;
3725		queue_unlock(mqp);
3726		queue_unlock(qp);
3727	}
3728
3729	return (error);
3730}
3731
3732/*
3733 * Utility function called by mutex_wakeup_all(), cond_broadcast(),
3734 * and rw_queue_release() to (re)allocate a big buffer to hold the
3735 * lwpids of all the threads to be set running after they are removed
3736 * from their sleep queues.  Since we are holding a queue lock, we
3737 * cannot call any function that might acquire a lock.  mmap(), munmap(),
3738 * and lwp_unpark_all() are simple system calls and are safe in this regard.
3739 */
3740lwpid_t *
3741alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
3742{
3743	/*
3744	 * Allocate NEWLWPS ids on the first overflow.
3745	 * Double the allocation each time after that.
3746	 */
3747	int nlwpid = *nlwpid_ptr;
3748	int maxlwps = *maxlwps_ptr;
3749	int first_allocation;
3750	int newlwps;
3751	void *vaddr;
3752
3753	ASSERT(nlwpid == maxlwps);
3754
3755	first_allocation = (maxlwps == MAXLWPS);
3756	newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
3757	vaddr = mmap(NULL, newlwps * sizeof (lwpid_t),
3758	    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
3759
3760	if (vaddr == MAP_FAILED) {
3761		/*
3762		 * Let's hope this never happens.
3763		 * If it does, then we have a terrible
3764		 * thundering herd on our hands.
3765		 */
3766		(void) __lwp_unpark_all(lwpid, nlwpid);
3767		*nlwpid_ptr = 0;
3768	} else {
3769		(void) memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
3770		if (!first_allocation)
3771			(void) munmap((caddr_t)lwpid,
3772			    maxlwps * sizeof (lwpid_t));
3773		lwpid = vaddr;
3774		*maxlwps_ptr = newlwps;
3775	}
3776
3777	return (lwpid);
3778}
3779
3780#pragma weak pthread_cond_broadcast = cond_broadcast
3781#pragma weak _cond_broadcast = cond_broadcast
3782int
3783cond_broadcast(cond_t *cvp)
3784{
3785	ulwp_t *self = curthread;
3786	uberdata_t *udp = self->ul_uberdata;
3787	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3788	int error = 0;
3789	queue_head_t *qp;
3790	queue_root_t *qrp;
3791	mutex_t *mp;
3792	mutex_t *mp_cache = NULL;
3793	queue_head_t *mqp = NULL;
3794	ulwp_t *ulwp;
3795	int nlwpid = 0;
3796	int maxlwps = MAXLWPS;
3797	lwpid_t buffer[MAXLWPS];
3798	lwpid_t *lwpid = buffer;
3799
3800	if (csp)
3801		tdb_incr(csp->cond_broadcast);
3802
3803	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3804		error = _lwp_cond_broadcast(cvp);
3805
3806	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3807		return (error);
3808
	/*
	 * Move everyone from the condvar sleep queue to the mutex sleep
	 * queue for the mutex that they will acquire when they are
	 * awakened.  We can do this only if we own the mutex they will
	 * acquire.  If we do not own the mutex, or if their ul_cv_wake
	 * flag is set, just dequeue and unpark them.
	 *
	 * We keep track of the lwpids that are to be unparked in lwpid[].
	 * __lwp_unpark_all() is called to unpark all of them after
	 * they have been removed from the sleep queue and the sleep
	 * queue lock has been dropped.  If we run out of space in our
	 * on-stack buffer, we need to allocate more but we can't call
	 * lmalloc() because we are holding a queue lock when the overflow
	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
	 * either because the application may have allocated a small
	 * stack and we don't want to overrun it.  So we call
	 * alloc_lwpids() to allocate a bigger buffer directly with the
	 * mmap() system call, since that path acquires no locks.
	 */
	qp = queue_lock(cvp, CV);
	cvp->cond_waiters_user = 0;
	for (;;) {
		if ((qrp = qp->qh_root) == NULL ||
		    (ulwp = qrp->qr_head) == NULL)
			break;
		ASSERT(ulwp->ul_wchan == cvp);
		queue_unlink(qp, &qrp->qr_head, NULL);
		mp = ulwp->ul_cvmutex;		/* its mutex */
		ulwp->ul_cvmutex = NULL;
		ASSERT(mp != NULL);
		if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
			/* just wake it up */
			ulwp->ul_sleepq = NULL;
			ulwp->ul_wchan = NULL;
			if (nlwpid == maxlwps)
				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
			lwpid[nlwpid++] = ulwp->ul_lwpid;
		} else {
			/* move it to the mutex queue */
			if (mp != mp_cache) {
				mp_cache = mp;
				if (mqp != NULL)
					queue_unlock(mqp);
				mqp = queue_lock(mp, MX);
			}
			enqueue(mqp, ulwp, 0);
			mp->mutex_waiters = 1;
		}
	}
	if (mqp != NULL)
		queue_unlock(mqp);
	if (nlwpid == 0) {
		queue_unlock(qp);
	} else {
		no_preempt(self);
		queue_unlock(qp);
		if (nlwpid == 1)
			(void) __lwp_unpark(lwpid[0]);
		else
			(void) __lwp_unpark_all(lwpid, nlwpid);
		preempt(self);
	}
	if (lwpid != buffer)
		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
	return (error);
}
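
/*
 * Illustrative sketch (not part of the library): the canonical caller-side
 * pattern that the "move waiters to the mutex queue" strategy above is
 * designed for.  Because the broadcasting thread holds the mutex while it
 * calls pthread_cond_broadcast(), every waiter would immediately block on
 * that mutex anyway, so transferring them to the mutex sleep queue avoids
 * waking them only to have them go back to sleep.  The names work_ready,
 * work_lock, and work_cv are hypothetical.
 */
#if 0	/* example only, never compiled */
#include <pthread.h>

static pthread_mutex_t work_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t work_cv = PTHREAD_COND_INITIALIZER;
static int work_ready;

static void
producer(void)
{
	(void) pthread_mutex_lock(&work_lock);
	work_ready = 1;
	/* broadcast while holding the mutex: waiters are moved, not woken */
	(void) pthread_cond_broadcast(&work_cv);
	(void) pthread_mutex_unlock(&work_lock);
}

static void
consumer(void)
{
	(void) pthread_mutex_lock(&work_lock);
	while (!work_ready)
		(void) pthread_cond_wait(&work_cv, &work_lock);
	(void) pthread_mutex_unlock(&work_lock);
}
#endif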

#pragma weak pthread_cond_destroy = cond_destroy
int
cond_destroy(cond_t *cvp)
{
	cvp->cond_magic = 0;
	tdb_sync_obj_deregister(cvp);
	return (0);
}

#if defined(THREAD_DEBUG)
void
assert_no_libc_locks_held(void)
{
	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
}

/* protected by link_lock */
uint64_t spin_lock_spin;
uint64_t spin_lock_spin2;
uint64_t spin_lock_sleep;
uint64_t spin_lock_wakeup;

/*
 * Record spin lock statistics.
 * Called by a thread as it exits, in thrp_exit().
 * Also called via atexit() from the thread calling exit(),
 * to record the statistics of all the other threads as well.
 */
void
record_spin_locks(ulwp_t *ulwp)
{
	spin_lock_spin += ulwp->ul_spin_lock_spin;
	spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
	spin_lock_sleep += ulwp->ul_spin_lock_sleep;
	spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
	ulwp->ul_spin_lock_spin = 0;
	ulwp->ul_spin_lock_spin2 = 0;
	ulwp->ul_spin_lock_sleep = 0;
	ulwp->ul_spin_lock_wakeup = 0;
}

/*
 * atexit function:  dump the queue statistics to stderr.
 */
#include <stdio.h>
void
dump_queue_statistics(void)
{
	uberdata_t *udp = curthread->ul_uberdata;
	queue_head_t *qp;
	int qn;
	uint64_t spin_lock_total = 0;

	if (udp->queue_head == NULL || thread_queue_dump == 0)
		return;

	if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
		return;
	for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
		if (qp->qh_lockcount == 0)
			continue;
		spin_lock_total += qp->qh_lockcount;
		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
		    (u_longlong_t)qp->qh_lockcount,
		    qp->qh_qmax, qp->qh_hmax) < 0)
			return;
	}

	if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
		return;
	/* qp now points past the mutex queues, at the condvar queues */
	for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
		if (qp->qh_lockcount == 0)
			continue;
		spin_lock_total += qp->qh_lockcount;
		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
		    (u_longlong_t)qp->qh_lockcount,
		    qp->qh_qmax, qp->qh_hmax) < 0)
			return;
	}

	(void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
	    (u_longlong_t)spin_lock_total);
	(void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
	    (u_longlong_t)spin_lock_spin);
	(void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
	    (u_longlong_t)spin_lock_spin2);
	(void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
	    (u_longlong_t)spin_lock_sleep);
	(void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
	    (u_longlong_t)spin_lock_wakeup);
}
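
/*
 * Illustrative sketch (not part of the library): dump_queue_statistics()
 * is an atexit handler, so it runs when the process calls exit() or
 * returns from main().  The registration pattern looks like the following;
 * register_stats_dump() and dump_stats_at_exit() are hypothetical names,
 * and where libc actually registers its handler is not shown here.
 */
#if 0	/* example only, never compiled */
#include <stdio.h>
#include <stdlib.h>

static void
dump_stats_at_exit(void)
{
	(void) fprintf(stderr, "statistics dumped at process exit\n");
}

static void
register_stats_dump(void)
{
	/* handlers registered with atexit() run in reverse order at exit */
	(void) atexit(dump_stats_at_exit);
}
#endif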
#endif