synch.c revision 1219:f89f56c2d9ac
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23/*
24 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25 * Use is subject to license terms.
26 */
27
28#pragma ident	"%Z%%M%	%I%	%E% SMI"
29
30#include <sys/sdt.h>
31
32#include "lint.h"
33#include "thr_uberdata.h"
34
35/*
36 * This mutex is initialized to be held by lwp#1.
37 * It is used to block a thread that has returned from a mutex_lock()
38 * of a PTHREAD_PRIO_INHERIT mutex with an unrecoverable error.
39 */
40mutex_t	stall_mutex = DEFAULTMUTEX;
41
42static int shared_mutex_held(mutex_t *);
43
44/*
45 * Lock statistics support functions.
46 */
47void
48record_begin_hold(tdb_mutex_stats_t *msp)
49{
50	tdb_incr(msp->mutex_lock);
51	msp->mutex_begin_hold = gethrtime();
52}
53
54hrtime_t
55record_hold_time(tdb_mutex_stats_t *msp)
56{
57	hrtime_t now = gethrtime();
58
59	if (msp->mutex_begin_hold)
60		msp->mutex_hold_time += now - msp->mutex_begin_hold;
61	msp->mutex_begin_hold = 0;
62	return (now);
63}
64
65/*
66 * Called once at library initialization.
67 */
68void
69mutex_setup(void)
70{
71	if (set_lock_byte(&stall_mutex.mutex_lockw))
72		thr_panic("mutex_setup() cannot acquire stall_mutex");
73	stall_mutex.mutex_owner = (uintptr_t)curthread;
74}
75
76/*
77 * The default spin counts of 1000 and 500 are experimentally determined.
78 * On sun4u machines with any number of processors they could be raised
79 * to 10,000 but that (experimentally) makes almost no difference.
80 * The environment variables:
81 *	_THREAD_ADAPTIVE_SPIN=count
82 *	_THREAD_RELEASE_SPIN=count
83 * can be used to override and set the counts in the range [0 .. 1,000,000].
84 */
85int	thread_adaptive_spin = 1000;
86uint_t	thread_max_spinners = 100;
87int	thread_release_spin = 500;
88int	thread_queue_verify = 0;
89static	int	ncpus;
90
91/*
92 * Distinguish spinning for queue locks from spinning for regular locks.
93 * The environment variable:
94 *	_THREAD_QUEUE_SPIN=count
95 * can be used to override and set the count in the range [0 .. 1,000,000].
96 * There is no release spin concept for queue locks.
97 */
98int	thread_queue_spin = 1000;
99
100/*
101 * Use the otherwise-unused 'mutex_ownerpid' field of a USYNC_THREAD
102 * mutex to be a count of adaptive spins in progress.
103 */
104#define	mutex_spinners	mutex_ownerpid
105
106void
107_mutex_set_typeattr(mutex_t *mp, int attr)
108{
109	mp->mutex_type |= (uint8_t)attr;
110}
111
112/*
113 * 'type' can be one of USYNC_THREAD or USYNC_PROCESS, possibly
114 * augmented by the flags LOCK_RECURSIVE and/or LOCK_ERRORCHECK,
115 * or it can be USYNC_PROCESS_ROBUST with no extra flags.
116 */
117#pragma weak _private_mutex_init = __mutex_init
118#pragma weak mutex_init = __mutex_init
119#pragma weak _mutex_init = __mutex_init
120/* ARGSUSED2 */
121int
122__mutex_init(mutex_t *mp, int type, void *arg)
123{
124	int error;
125
126	switch (type & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) {
127	case USYNC_THREAD:
128	case USYNC_PROCESS:
129		(void) _memset(mp, 0, sizeof (*mp));
130		mp->mutex_type = (uint8_t)type;
131		mp->mutex_flag = LOCK_INITED;
132		error = 0;
133		break;
134	case USYNC_PROCESS_ROBUST:
135		if (type & (LOCK_RECURSIVE|LOCK_ERRORCHECK))
136			error = EINVAL;
137		else
138			error = ___lwp_mutex_init(mp, type);
139		break;
140	default:
141		error = EINVAL;
142		break;
143	}
144	if (error == 0)
145		mp->mutex_magic = MUTEX_MAGIC;
146	return (error);
147}
148
149/*
150 * Delete mp from list of ceil mutexes owned by curthread.
151 * Return 1 if the head of the chain was updated.
152 */
153int
154_ceil_mylist_del(mutex_t *mp)
155{
156	ulwp_t *self = curthread;
157	mxchain_t **mcpp;
158	mxchain_t *mcp;
159
160	mcpp = &self->ul_mxchain;
161	while ((*mcpp)->mxchain_mx != mp)
162		mcpp = &(*mcpp)->mxchain_next;
163	mcp = *mcpp;
164	*mcpp = mcp->mxchain_next;
165	lfree(mcp, sizeof (*mcp));
166	return (mcpp == &self->ul_mxchain);
167}
168
169/*
170 * Add mp to head of list of ceil mutexes owned by curthread.
171 * Return ENOMEM if no memory could be allocated.
172 */
173int
174_ceil_mylist_add(mutex_t *mp)
175{
176	ulwp_t *self = curthread;
177	mxchain_t *mcp;
178
179	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
180		return (ENOMEM);
181	mcp->mxchain_mx = mp;
182	mcp->mxchain_next = self->ul_mxchain;
183	self->ul_mxchain = mcp;
184	return (0);
185}
186
187/*
188 * Inherit priority from ceiling.  The inheritance impacts the effective
189 * priority, not the assigned priority.  See _thread_setschedparam_main().
190 */
191void
192_ceil_prio_inherit(int ceil)
193{
194	ulwp_t *self = curthread;
195	struct sched_param param;
196
197	(void) _memset(&param, 0, sizeof (param));
198	param.sched_priority = ceil;
199	if (_thread_setschedparam_main(self->ul_lwpid,
200	    self->ul_policy, &param, PRIO_INHERIT)) {
201		/*
202		 * Panic since unclear what error code to return.
203		 * If we do return the error codes returned by above
204		 * called routine, update the man page...
205		 */
206		thr_panic("_thread_setschedparam_main() fails");
207	}
208}
209
210/*
211 * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
212 * if holding at least one ceiling lock.  If no ceiling locks are held at this
213 * point, disinherit completely, reverting back to assigned priority.
214 */
215void
216_ceil_prio_waive(void)
217{
218	ulwp_t *self = curthread;
219	struct sched_param param;
220
221	(void) _memset(&param, 0, sizeof (param));
222	if (self->ul_mxchain == NULL) {
223		/*
224		 * No ceil locks held.  Zero the epri, revert back to ul_pri.
225		 * Since thread's hash lock is not held, one cannot just
226		 * read ul_pri here...do it in the called routine...
227		 */
228		param.sched_priority = self->ul_pri;	/* ignored */
229		if (_thread_setschedparam_main(self->ul_lwpid,
230		    self->ul_policy, &param, PRIO_DISINHERIT))
231			thr_panic("_thread_setschedparam_main() fails");
232	} else {
233		/*
234		 * Set priority to that of the mutex at the head
235		 * of the ceilmutex chain.
236		 */
237		param.sched_priority =
238		    self->ul_mxchain->mxchain_mx->mutex_ceiling;
239		if (_thread_setschedparam_main(self->ul_lwpid,
240		    self->ul_policy, &param, PRIO_INHERIT))
241			thr_panic("_thread_setschedparam_main() fails");
242	}
243}
244
245/*
246 * Non-preemptive spin locks.  Used by queue_lock().
247 * No lock statistics are gathered for these locks.
248 */
249void
250spin_lock_set(mutex_t *mp)
251{
252	ulwp_t *self = curthread;
253
254	no_preempt(self);
255	if (set_lock_byte(&mp->mutex_lockw) == 0) {
256		mp->mutex_owner = (uintptr_t)self;
257		return;
258	}
259	/*
260	 * Spin for a while, attempting to acquire the lock.
261	 */
262	if (self->ul_spin_lock_spin != UINT_MAX)
263		self->ul_spin_lock_spin++;
264	if (mutex_queuelock_adaptive(mp) == 0 ||
265	    set_lock_byte(&mp->mutex_lockw) == 0) {
266		mp->mutex_owner = (uintptr_t)self;
267		return;
268	}
269	/*
270	 * Try harder if we were previously at a no premption level.
271	 */
272	if (self->ul_preempt > 1) {
273		if (self->ul_spin_lock_spin2 != UINT_MAX)
274			self->ul_spin_lock_spin2++;
275		if (mutex_queuelock_adaptive(mp) == 0 ||
276		    set_lock_byte(&mp->mutex_lockw) == 0) {
277			mp->mutex_owner = (uintptr_t)self;
278			return;
279		}
280	}
281	/*
282	 * Give up and block in the kernel for the mutex.
283	 */
284	if (self->ul_spin_lock_sleep != UINT_MAX)
285		self->ul_spin_lock_sleep++;
286	(void) ___lwp_mutex_timedlock(mp, NULL);
287	mp->mutex_owner = (uintptr_t)self;
288}
289
290void
291spin_lock_clear(mutex_t *mp)
292{
293	ulwp_t *self = curthread;
294
295	mp->mutex_owner = 0;
296	if (swap32(&mp->mutex_lockword, 0) & WAITERMASK) {
297		(void) ___lwp_mutex_wakeup(mp);
298		if (self->ul_spin_lock_wakeup != UINT_MAX)
299			self->ul_spin_lock_wakeup++;
300	}
301	preempt(self);
302}
303
304/*
305 * Allocate the sleep queue hash table.
306 */
307void
308queue_alloc(void)
309{
310	ulwp_t *self = curthread;
311	uberdata_t *udp = self->ul_uberdata;
312	void *data;
313	int i;
314
315	/*
316	 * No locks are needed; we call here only when single-threaded.
317	 */
318	ASSERT(self == udp->ulwp_one);
319	ASSERT(!udp->uberflags.uf_mt);
320	if ((data = _private_mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
321	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
322	    == MAP_FAILED)
323		thr_panic("cannot allocate thread queue_head table");
324	udp->queue_head = (queue_head_t *)data;
325	for (i = 0; i < 2 * QHASHSIZE; i++)
326		udp->queue_head[i].qh_lock.mutex_magic = MUTEX_MAGIC;
327}
328
329#if defined(THREAD_DEBUG)
330
331/*
332 * Debugging: verify correctness of a sleep queue.
333 */
334void
335QVERIFY(queue_head_t *qp)
336{
337	ulwp_t *self = curthread;
338	uberdata_t *udp = self->ul_uberdata;
339	ulwp_t *ulwp;
340	ulwp_t *prev;
341	uint_t index;
342	uint32_t cnt = 0;
343	char qtype;
344	void *wchan;
345
346	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
347	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
348	ASSERT((qp->qh_head != NULL && qp->qh_tail != NULL) ||
349		(qp->qh_head == NULL && qp->qh_tail == NULL));
350	if (!thread_queue_verify)
351		return;
352	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
353	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
354	for (prev = NULL, ulwp = qp->qh_head; ulwp != NULL;
355	    prev = ulwp, ulwp = ulwp->ul_link, cnt++) {
356		ASSERT(ulwp->ul_qtype == qtype);
357		ASSERT(ulwp->ul_wchan != NULL);
358		ASSERT(ulwp->ul_sleepq == qp);
359		wchan = ulwp->ul_wchan;
360		index = QUEUE_HASH(wchan, qtype);
361		ASSERT(&udp->queue_head[index] == qp);
362	}
363	ASSERT(qp->qh_tail == prev);
364	ASSERT(qp->qh_qlen == cnt);
365}
366
367#else	/* THREAD_DEBUG */
368
369#define	QVERIFY(qp)
370
371#endif	/* THREAD_DEBUG */
372
373/*
374 * Acquire a queue head.
375 */
376queue_head_t *
377queue_lock(void *wchan, int qtype)
378{
379	uberdata_t *udp = curthread->ul_uberdata;
380	queue_head_t *qp;
381
382	ASSERT(qtype == MX || qtype == CV);
383
384	/*
385	 * It is possible that we could be called while still single-threaded.
386	 * If so, we call queue_alloc() to allocate the queue_head[] array.
387	 */
388	if ((qp = udp->queue_head) == NULL) {
389		queue_alloc();
390		qp = udp->queue_head;
391	}
392	qp += QUEUE_HASH(wchan, qtype);
393	spin_lock_set(&qp->qh_lock);
394	/*
395	 * At once per nanosecond, qh_lockcount will wrap after 512 years.
396	 * Were we to care about this, we could peg the value at UINT64_MAX.
397	 */
398	qp->qh_lockcount++;
399	QVERIFY(qp);
400	return (qp);
401}
402
403/*
404 * Release a queue head.
405 */
406void
407queue_unlock(queue_head_t *qp)
408{
409	QVERIFY(qp);
410	spin_lock_clear(&qp->qh_lock);
411}
412
413/*
414 * For rwlock queueing, we must queue writers ahead of readers of the
415 * same priority.  We do this by making writers appear to have a half
416 * point higher priority for purposes of priority comparisons below.
417 */
418#define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
419
420void
421enqueue(queue_head_t *qp, ulwp_t *ulwp, void *wchan, int qtype)
422{
423	ulwp_t **ulwpp;
424	ulwp_t *next;
425	int pri = CMP_PRIO(ulwp);
426	int force_fifo = (qtype & FIFOQ);
427	int do_fifo;
428
429	qtype &= ~FIFOQ;
430	ASSERT(qtype == MX || qtype == CV);
431	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
432	ASSERT(ulwp->ul_sleepq != qp);
433
434	/*
435	 * LIFO queue ordering is unfair and can lead to starvation,
436	 * but it gives better performance for heavily contended locks.
437	 * We use thread_queue_fifo (range is 0..8) to determine
438	 * the frequency of FIFO vs LIFO queuing:
439	 *	0 : every 256th time	(almost always LIFO)
440	 *	1 : every 128th time
441	 *	2 : every 64th  time
442	 *	3 : every 32nd  time
443	 *	4 : every 16th  time	(the default value, mostly LIFO)
444	 *	5 : every 8th   time
445	 *	6 : every 4th   time
446	 *	7 : every 2nd   time
447	 *	8 : every time		(never LIFO, always FIFO)
448	 * Note that there is always some degree of FIFO ordering.
449	 * This breaks live lock conditions that occur in applications
450	 * that are written assuming (incorrectly) that threads acquire
451	 * locks fairly, that is, in roughly round-robin order.
452	 * In any event, the queue is maintained in priority order.
453	 *
454	 * If we are given the FIFOQ flag in qtype, fifo queueing is forced.
455	 * SUSV3 requires this for semaphores.
456	 */
457	do_fifo = (force_fifo ||
458		((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0);
459
460	if (qp->qh_head == NULL) {
461		/*
462		 * The queue is empty.  LIFO/FIFO doesn't matter.
463		 */
464		ASSERT(qp->qh_tail == NULL);
465		ulwpp = &qp->qh_head;
466	} else if (do_fifo) {
467		/*
468		 * Enqueue after the last thread whose priority is greater
469		 * than or equal to the priority of the thread being queued.
470		 * Attempt first to go directly onto the tail of the queue.
471		 */
472		if (pri <= CMP_PRIO(qp->qh_tail))
473			ulwpp = &qp->qh_tail->ul_link;
474		else {
475			for (ulwpp = &qp->qh_head; (next = *ulwpp) != NULL;
476			    ulwpp = &next->ul_link)
477				if (pri > CMP_PRIO(next))
478					break;
479		}
480	} else {
481		/*
482		 * Enqueue before the first thread whose priority is less
483		 * than or equal to the priority of the thread being queued.
484		 * Hopefully we can go directly onto the head of the queue.
485		 */
486		for (ulwpp = &qp->qh_head; (next = *ulwpp) != NULL;
487		    ulwpp = &next->ul_link)
488			if (pri >= CMP_PRIO(next))
489				break;
490	}
491	if ((ulwp->ul_link = *ulwpp) == NULL)
492		qp->qh_tail = ulwp;
493	*ulwpp = ulwp;
494
495	ulwp->ul_sleepq = qp;
496	ulwp->ul_wchan = wchan;
497	ulwp->ul_qtype = qtype;
498	if (qp->qh_qmax < ++qp->qh_qlen)
499		qp->qh_qmax = qp->qh_qlen;
500}
501
502/*
503 * Return a pointer to the queue slot of the
504 * highest priority thread on the queue.
505 * On return, prevp, if not NULL, will contain a pointer
506 * to the thread's predecessor on the queue
507 */
508static ulwp_t **
509queue_slot(queue_head_t *qp, void *wchan, int *more, ulwp_t **prevp)
510{
511	ulwp_t **ulwpp;
512	ulwp_t *ulwp;
513	ulwp_t *prev = NULL;
514	ulwp_t **suspp = NULL;
515	ulwp_t *susprev;
516
517	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
518
519	/*
520	 * Find a waiter on the sleep queue.
521	 */
522	for (ulwpp = &qp->qh_head; (ulwp = *ulwpp) != NULL;
523	    prev = ulwp, ulwpp = &ulwp->ul_link) {
524		if (ulwp->ul_wchan == wchan) {
525			if (!ulwp->ul_stop)
526				break;
527			/*
528			 * Try not to return a suspended thread.
529			 * This mimics the old libthread's behavior.
530			 */
531			if (suspp == NULL) {
532				suspp = ulwpp;
533				susprev = prev;
534			}
535		}
536	}
537
538	if (ulwp == NULL && suspp != NULL) {
539		ulwp = *(ulwpp = suspp);
540		prev = susprev;
541		suspp = NULL;
542	}
543	if (ulwp == NULL) {
544		if (more != NULL)
545			*more = 0;
546		return (NULL);
547	}
548
549	if (prevp != NULL)
550		*prevp = prev;
551	if (more == NULL)
552		return (ulwpp);
553
554	/*
555	 * Scan the remainder of the queue for another waiter.
556	 */
557	if (suspp != NULL) {
558		*more = 1;
559		return (ulwpp);
560	}
561	for (ulwp = ulwp->ul_link; ulwp != NULL; ulwp = ulwp->ul_link) {
562		if (ulwp->ul_wchan == wchan) {
563			*more = 1;
564			return (ulwpp);
565		}
566	}
567
568	*more = 0;
569	return (ulwpp);
570}
571
572ulwp_t *
573dequeue(queue_head_t *qp, void *wchan, int *more)
574{
575	ulwp_t **ulwpp;
576	ulwp_t *ulwp;
577	ulwp_t *prev;
578
579	if ((ulwpp = queue_slot(qp, wchan, more, &prev)) == NULL)
580		return (NULL);
581
582	/*
583	 * Dequeue the waiter.
584	 */
585	ulwp = *ulwpp;
586	*ulwpp = ulwp->ul_link;
587	ulwp->ul_link = NULL;
588	if (qp->qh_tail == ulwp)
589		qp->qh_tail = prev;
590	qp->qh_qlen--;
591	ulwp->ul_sleepq = NULL;
592	ulwp->ul_wchan = NULL;
593
594	return (ulwp);
595}
596
597/*
598 * Return a pointer to the highest priority thread sleeping on wchan.
599 */
600ulwp_t *
601queue_waiter(queue_head_t *qp, void *wchan)
602{
603	ulwp_t **ulwpp;
604
605	if ((ulwpp = queue_slot(qp, wchan, NULL, NULL)) == NULL)
606		return (NULL);
607	return (*ulwpp);
608}
609
610uint8_t
611dequeue_self(queue_head_t *qp, void *wchan)
612{
613	ulwp_t *self = curthread;
614	ulwp_t **ulwpp;
615	ulwp_t *ulwp;
616	ulwp_t *prev = NULL;
617	int found = 0;
618	int more = 0;
619
620	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
621
622	/* find self on the sleep queue */
623	for (ulwpp = &qp->qh_head; (ulwp = *ulwpp) != NULL;
624	    prev = ulwp, ulwpp = &ulwp->ul_link) {
625		if (ulwp == self) {
626			/* dequeue ourself */
627			*ulwpp = self->ul_link;
628			if (qp->qh_tail == self)
629				qp->qh_tail = prev;
630			qp->qh_qlen--;
631			ASSERT(self->ul_wchan == wchan);
632			self->ul_cvmutex = NULL;
633			self->ul_sleepq = NULL;
634			self->ul_wchan = NULL;
635			self->ul_cv_wake = 0;
636			self->ul_link = NULL;
637			found = 1;
638			break;
639		}
640		if (ulwp->ul_wchan == wchan)
641			more = 1;
642	}
643
644	if (!found)
645		thr_panic("dequeue_self(): curthread not found on queue");
646
647	if (more)
648		return (1);
649
650	/* scan the remainder of the queue for another waiter */
651	for (ulwp = *ulwpp; ulwp != NULL; ulwp = ulwp->ul_link) {
652		if (ulwp->ul_wchan == wchan)
653			return (1);
654	}
655
656	return (0);
657}
658
659/*
660 * Called from call_user_handler() and _thrp_suspend() to take
661 * ourself off of our sleep queue so we can grab locks.
662 */
663void
664unsleep_self(void)
665{
666	ulwp_t *self = curthread;
667	queue_head_t *qp;
668
669	/*
670	 * Calling enter_critical()/exit_critical() here would lead
671	 * to recursion.  Just manipulate self->ul_critical directly.
672	 */
673	self->ul_critical++;
674	self->ul_writer = 0;
675	while (self->ul_sleepq != NULL) {
676		qp = queue_lock(self->ul_wchan, self->ul_qtype);
677		/*
678		 * We may have been moved from a CV queue to a
679		 * mutex queue while we were attempting queue_lock().
680		 * If so, just loop around and try again.
681		 * dequeue_self() clears self->ul_sleepq.
682		 */
683		if (qp == self->ul_sleepq)
684			(void) dequeue_self(qp, self->ul_wchan);
685		queue_unlock(qp);
686	}
687	self->ul_critical--;
688}
689
690/*
691 * Common code for calling the the ___lwp_mutex_timedlock() system call.
692 * Returns with mutex_owner and mutex_ownerpid set correctly.
693 */
694int
695mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
696{
697	ulwp_t *self = curthread;
698	uberdata_t *udp = self->ul_uberdata;
699	hrtime_t begin_sleep;
700	int error;
701
702	self->ul_sp = stkptr();
703	self->ul_wchan = mp;
704	if (__td_event_report(self, TD_SLEEP, udp)) {
705		self->ul_td_evbuf.eventnum = TD_SLEEP;
706		self->ul_td_evbuf.eventdata = mp;
707		tdb_event(TD_SLEEP, udp);
708	}
709	if (msp) {
710		tdb_incr(msp->mutex_sleep);
711		begin_sleep = gethrtime();
712	}
713
714	DTRACE_PROBE1(plockstat, mutex__block, mp);
715
716	for (;;) {
717		if ((error = ___lwp_mutex_timedlock(mp, tsp)) != 0) {
718			DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
719			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
720			break;
721		}
722
723		if (mp->mutex_type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
724			/*
725			 * Defend against forkall().  We may be the child,
726			 * in which case we don't actually own the mutex.
727			 */
728			enter_critical(self);
729			if (mp->mutex_ownerpid == udp->pid) {
730				mp->mutex_owner = (uintptr_t)self;
731				exit_critical(self);
732				DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
733				DTRACE_PROBE3(plockstat, mutex__acquire, mp,
734				    0, 0);
735				break;
736			}
737			exit_critical(self);
738		} else {
739			mp->mutex_owner = (uintptr_t)self;
740			DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
741			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
742			break;
743		}
744	}
745	if (msp)
746		msp->mutex_sleep_time += gethrtime() - begin_sleep;
747	self->ul_wchan = NULL;
748	self->ul_sp = 0;
749
750	return (error);
751}
752
753/*
754 * Common code for calling the ___lwp_mutex_trylock() system call.
755 * Returns with mutex_owner and mutex_ownerpid set correctly.
756 */
757int
758mutex_trylock_kernel(mutex_t *mp)
759{
760	ulwp_t *self = curthread;
761	uberdata_t *udp = self->ul_uberdata;
762	int error;
763
764	for (;;) {
765		if ((error = ___lwp_mutex_trylock(mp)) != 0) {
766			if (error != EBUSY) {
767				DTRACE_PROBE2(plockstat, mutex__error, mp,
768				    error);
769			}
770			break;
771		}
772
773		if (mp->mutex_type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
774			/*
775			 * Defend against forkall().  We may be the child,
776			 * in which case we don't actually own the mutex.
777			 */
778			enter_critical(self);
779			if (mp->mutex_ownerpid == udp->pid) {
780				mp->mutex_owner = (uintptr_t)self;
781				exit_critical(self);
782				DTRACE_PROBE3(plockstat, mutex__acquire, mp,
783				    0, 0);
784				break;
785			}
786			exit_critical(self);
787		} else {
788			mp->mutex_owner = (uintptr_t)self;
789			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
790			break;
791		}
792	}
793
794	return (error);
795}
796
797volatile sc_shared_t *
798setup_schedctl(void)
799{
800	ulwp_t *self = curthread;
801	volatile sc_shared_t *scp;
802	sc_shared_t *tmp;
803
804	if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
805	    !self->ul_vfork &&			/* not a child of vfork() */
806	    !self->ul_schedctl_called) {	/* haven't been called before */
807		enter_critical(self);
808		self->ul_schedctl_called = &self->ul_uberdata->uberflags;
809		if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
810			self->ul_schedctl = scp = tmp;
811		exit_critical(self);
812	}
813	/*
814	 * Unless the call to setup_schedctl() is surrounded
815	 * by enter_critical()/exit_critical(), the address
816	 * we are returning could be invalid due to a forkall()
817	 * having occurred in another thread.
818	 */
819	return (scp);
820}
821
822/*
823 * Interfaces from libsched, incorporated into libc.
824 * libsched.so.1 is now a filter library onto libc.
825 */
826#pragma weak schedctl_lookup = _schedctl_init
827#pragma weak _schedctl_lookup = _schedctl_init
828#pragma weak schedctl_init = _schedctl_init
829schedctl_t *
830_schedctl_init(void)
831{
832	volatile sc_shared_t *scp = setup_schedctl();
833	return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
834}
835
836#pragma weak schedctl_exit = _schedctl_exit
837void
838_schedctl_exit(void)
839{
840}
841
842/*
843 * Contract private interface for java.
844 * Set up the schedctl data if it doesn't exist yet.
845 * Return a pointer to the pointer to the schedctl data.
846 */
847volatile sc_shared_t *volatile *
848_thr_schedctl(void)
849{
850	ulwp_t *self = curthread;
851	volatile sc_shared_t *volatile *ptr;
852
853	if (self->ul_vfork)
854		return (NULL);
855	if (*(ptr = &self->ul_schedctl) == NULL)
856		(void) setup_schedctl();
857	return (ptr);
858}
859
860/*
861 * Block signals and attempt to block preemption.
862 * no_preempt()/preempt() must be used in pairs but can be nested.
863 */
864void
865no_preempt(ulwp_t *self)
866{
867	volatile sc_shared_t *scp;
868
869	if (self->ul_preempt++ == 0) {
870		enter_critical(self);
871		if ((scp = self->ul_schedctl) != NULL ||
872		    (scp = setup_schedctl()) != NULL) {
873			/*
874			 * Save the pre-existing preempt value.
875			 */
876			self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
877			scp->sc_preemptctl.sc_nopreempt = 1;
878		}
879	}
880}
881
882/*
883 * Undo the effects of no_preempt().
884 */
885void
886preempt(ulwp_t *self)
887{
888	volatile sc_shared_t *scp;
889
890	ASSERT(self->ul_preempt > 0);
891	if (--self->ul_preempt == 0) {
892		if ((scp = self->ul_schedctl) != NULL) {
893			/*
894			 * Restore the pre-existing preempt value.
895			 */
896			scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
897			if (scp->sc_preemptctl.sc_yield &&
898			    scp->sc_preemptctl.sc_nopreempt == 0) {
899				lwp_yield();
900				if (scp->sc_preemptctl.sc_yield) {
901					/*
902					 * Shouldn't happen.  This is either
903					 * a race condition or the thread
904					 * just entered the real-time class.
905					 */
906					lwp_yield();
907					scp->sc_preemptctl.sc_yield = 0;
908				}
909			}
910		}
911		exit_critical(self);
912	}
913}
914
915/*
916 * If a call to preempt() would cause the current thread to yield or to
917 * take deferred actions in exit_critical(), then unpark the specified
918 * lwp so it can run while we delay.  Return the original lwpid if the
919 * unpark was not performed, else return zero.  The tests are a repeat
920 * of some of the tests in preempt(), above.  This is a statistical
921 * optimization solely for cond_sleep_queue(), below.
922 */
923static lwpid_t
924preempt_unpark(ulwp_t *self, lwpid_t lwpid)
925{
926	volatile sc_shared_t *scp = self->ul_schedctl;
927
928	ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
929	if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
930	    (self->ul_curplease && self->ul_critical == 1)) {
931		(void) __lwp_unpark(lwpid);
932		lwpid = 0;
933	}
934	return (lwpid);
935}
936
937/*
938 * Spin for a while, trying to grab the lock.  We know that we
939 * failed set_lock_byte(&mp->mutex_lockw) once before coming here.
940 * If this fails, return EBUSY and let the caller deal with it.
941 * If this succeeds, return 0 with mutex_owner set to curthread.
942 */
943int
944mutex_trylock_adaptive(mutex_t *mp)
945{
946	ulwp_t *self = curthread;
947	ulwp_t *ulwp;
948	volatile sc_shared_t *scp;
949	volatile uint8_t *lockp;
950	volatile uint64_t *ownerp;
951	int count, max = self->ul_adaptive_spin;
952
953	ASSERT(!(mp->mutex_type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)));
954
955	if (max == 0 || (mp->mutex_spinners >= self->ul_max_spinners))
956		return (EBUSY);
957
958	lockp = (volatile uint8_t *)&mp->mutex_lockw;
959	ownerp = (volatile uint64_t *)&mp->mutex_owner;
960
961	DTRACE_PROBE1(plockstat, mutex__spin, mp);
962
963	/*
964	 * This spin loop is unfair to lwps that have already dropped into
965	 * the kernel to sleep.  They will starve on a highly-contended mutex.
966	 * This is just too bad.  The adaptive spin algorithm is intended
967	 * to allow programs with highly-contended locks (that is, broken
968	 * programs) to execute with reasonable speed despite their contention.
969	 * Being fair would reduce the speed of such programs and well-written
970	 * programs will not suffer in any case.
971	 */
972	enter_critical(self);		/* protects ul_schedctl */
973	incr32(&mp->mutex_spinners);
974	for (count = 0; count < max; count++) {
975		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
976			*ownerp = (uintptr_t)self;
977			decr32(&mp->mutex_spinners);
978			exit_critical(self);
979			DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
980			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
981			return (0);
982		}
983		SMT_PAUSE();
984		/*
985		 * Stop spinning if the mutex owner is not running on
986		 * a processor; it will not drop the lock any time soon
987		 * and we would just be wasting time to keep spinning.
988		 *
989		 * Note that we are looking at another thread (ulwp_t)
990		 * without ensuring that the other thread does not exit.
991		 * The scheme relies on ulwp_t structures never being
992		 * deallocated by the library (the library employs a free
993		 * list of ulwp_t structs that are reused when new threads
994		 * are created) and on schedctl shared memory never being
995		 * deallocated once created via __schedctl().
996		 *
997		 * Thus, the worst that can happen when the spinning thread
998		 * looks at the owner's schedctl data is that it is looking
999		 * at some other thread's schedctl data.  This almost never
1000		 * happens and is benign when it does.
1001		 */
1002		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1003		    ((scp = ulwp->ul_schedctl) == NULL ||
1004		    scp->sc_state != SC_ONPROC))
1005			break;
1006	}
1007	decr32(&mp->mutex_spinners);
1008	exit_critical(self);
1009
1010	DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
1011
1012	return (EBUSY);
1013}
1014
1015/*
1016 * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1017 * The owner field is not set here; the caller (spin_lock_set()) sets it.
1018 */
1019int
1020mutex_queuelock_adaptive(mutex_t *mp)
1021{
1022	ulwp_t *ulwp;
1023	volatile sc_shared_t *scp;
1024	volatile uint8_t *lockp;
1025	volatile uint64_t *ownerp;
1026	int count = curthread->ul_queue_spin;
1027
1028	ASSERT(mp->mutex_type == USYNC_THREAD);
1029
1030	if (count == 0)
1031		return (EBUSY);
1032
1033	lockp = (volatile uint8_t *)&mp->mutex_lockw;
1034	ownerp = (volatile uint64_t *)&mp->mutex_owner;
1035	while (--count >= 0) {
1036		if (*lockp == 0 && set_lock_byte(lockp) == 0)
1037			return (0);
1038		SMT_PAUSE();
1039		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1040		    ((scp = ulwp->ul_schedctl) == NULL ||
1041		    scp->sc_state != SC_ONPROC))
1042			break;
1043	}
1044
1045	return (EBUSY);
1046}
1047
1048/*
1049 * Like mutex_trylock_adaptive(), but for process-shared mutexes.
1050 * Spin for a while, trying to grab the lock.  We know that we
1051 * failed set_lock_byte(&mp->mutex_lockw) once before coming here.
1052 * If this fails, return EBUSY and let the caller deal with it.
1053 * If this succeeds, return 0 with mutex_owner set to curthread
1054 * and mutex_ownerpid set to the current pid.
1055 */
1056int
1057mutex_trylock_process(mutex_t *mp)
1058{
1059	ulwp_t *self = curthread;
1060	uberdata_t *udp = self->ul_uberdata;
1061	int count;
1062	volatile uint8_t *lockp;
1063	volatile uint64_t *ownerp;
1064	volatile int32_t *pidp;
1065	pid_t pid, newpid;
1066	uint64_t owner, newowner;
1067
1068	if ((count = ncpus) == 0)
1069		count = ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1070	count = (count > 1)? self->ul_adaptive_spin : 0;
1071
1072	ASSERT((mp->mutex_type & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) ==
1073		USYNC_PROCESS);
1074
1075	if (count == 0)
1076		return (EBUSY);
1077
1078	lockp = (volatile uint8_t *)&mp->mutex_lockw;
1079	ownerp = (volatile uint64_t *)&mp->mutex_owner;
1080	pidp = (volatile int32_t *)&mp->mutex_ownerpid;
1081	owner = *ownerp;
1082	pid = *pidp;
1083	/*
1084	 * This is a process-shared mutex.
1085	 * We cannot know if the owner is running on a processor.
1086	 * We just spin and hope that it is on a processor.
1087	 */
1088	while (--count >= 0) {
1089		if (*lockp == 0) {
1090			enter_critical(self);
1091			if (set_lock_byte(lockp) == 0) {
1092				*ownerp = (uintptr_t)self;
1093				*pidp = udp->pid;
1094				exit_critical(self);
1095				DTRACE_PROBE3(plockstat, mutex__acquire, mp,
1096				    0, 0);
1097				return (0);
1098			}
1099			exit_critical(self);
1100		} else if ((newowner = *ownerp) == owner &&
1101		    (newpid = *pidp) == pid) {
1102			SMT_PAUSE();
1103			continue;
1104		}
1105		/*
1106		 * The owner of the lock changed; start the count over again.
1107		 * This may be too aggressive; it needs testing.
1108		 */
1109		owner = newowner;
1110		pid = newpid;
1111		count = self->ul_adaptive_spin;
1112	}
1113
1114	return (EBUSY);
1115}
1116
1117/*
1118 * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1119 * Returns the lwpid of the thread that was dequeued, if any.
1120 * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1121 * to wake up the specified lwp.
1122 */
1123lwpid_t
1124mutex_wakeup(mutex_t *mp)
1125{
1126	lwpid_t lwpid = 0;
1127	queue_head_t *qp;
1128	ulwp_t *ulwp;
1129	int more;
1130
1131	/*
1132	 * Dequeue a waiter from the sleep queue.  Don't touch the mutex
1133	 * waiters bit if no one was found on the queue because the mutex
1134	 * might have been deallocated or reallocated for another purpose.
1135	 */
1136	qp = queue_lock(mp, MX);
1137	if ((ulwp = dequeue(qp, mp, &more)) != NULL) {
1138		lwpid = ulwp->ul_lwpid;
1139		mp->mutex_waiters = (more? 1 : 0);
1140	}
1141	queue_unlock(qp);
1142	return (lwpid);
1143}
1144
1145/*
1146 * Spin for a while, testing to see if the lock has been grabbed.
1147 * If this fails, call mutex_wakeup() to release a waiter.
1148 */
1149lwpid_t
1150mutex_unlock_queue(mutex_t *mp)
1151{
1152	ulwp_t *self = curthread;
1153	uint32_t *lockw = &mp->mutex_lockword;
1154	lwpid_t lwpid;
1155	volatile uint8_t *lockp;
1156	volatile uint32_t *spinp;
1157	int count;
1158
1159	/*
1160	 * We use the swap primitive to clear the lock, but we must
1161	 * atomically retain the waiters bit for the remainder of this
1162	 * code to work.  We first check to see if the waiters bit is
1163	 * set and if so clear the lock by swapping in a word containing
1164	 * only the waiters bit.  This could produce a false positive test
1165	 * for whether there are waiters that need to be waked up, but
1166	 * this just causes an extra call to mutex_wakeup() to do nothing.
1167	 * The opposite case is more delicate:  If there are no waiters,
1168	 * we swap in a zero lock byte and a zero waiters bit.  The result
1169	 * of the swap could indicate that there really was a waiter so in
1170	 * this case we go directly to mutex_wakeup() without performing
1171	 * any of the adaptive code because the waiter bit has been cleared
1172	 * and the adaptive code is unreliable in this case.
1173	 */
1174	if (!(*lockw & WAITERMASK)) {	/* no waiter exists right now */
1175		mp->mutex_owner = 0;
1176		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1177		if (!(swap32(lockw, 0) & WAITERMASK))	/* still no waiters */
1178			return (0);
1179		no_preempt(self);	/* ensure a prompt wakeup */
1180		lwpid = mutex_wakeup(mp);
1181	} else {
1182		no_preempt(self);	/* ensure a prompt wakeup */
1183		lockp = (volatile uint8_t *)&mp->mutex_lockw;
1184		spinp = (volatile uint32_t *)&mp->mutex_spinners;
1185		mp->mutex_owner = 0;
1186		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1187		(void) swap32(lockw, WAITER);	/* clear lock, retain waiter */
1188
1189		/*
1190		 * We spin here fewer times than mutex_trylock_adaptive().
1191		 * We are trying to balance two conflicting goals:
1192		 * 1. Avoid waking up anyone if a spinning thread
1193		 *    grabs the lock.
1194		 * 2. Wake up a sleeping thread promptly to get on
1195		 *    with useful work.
1196		 * We don't spin at all if there is no acquiring spinner;
1197		 * (mp->mutex_spinners is non-zero if there are spinners).
1198		 */
1199		for (count = self->ul_release_spin;
1200		    *spinp && count > 0; count--) {
1201			/*
1202			 * There is a waiter that we will have to wake
1203			 * up unless someone else grabs the lock while
1204			 * we are busy spinning.  Like the spin loop in
1205			 * mutex_trylock_adaptive(), this spin loop is
1206			 * unfair to lwps that have already dropped into
1207			 * the kernel to sleep.  They will starve on a
1208			 * highly-contended mutex.  Too bad.
1209			 */
1210			if (*lockp != 0) {	/* somebody grabbed the lock */
1211				preempt(self);
1212				return (0);
1213			}
1214			SMT_PAUSE();
1215		}
1216
1217		/*
1218		 * No one grabbed the lock.
1219		 * Wake up some lwp that is waiting for it.
1220		 */
1221		mp->mutex_waiters = 0;
1222		lwpid = mutex_wakeup(mp);
1223	}
1224
1225	if (lwpid == 0)
1226		preempt(self);
1227	return (lwpid);
1228}
1229
1230/*
1231 * Like mutex_unlock_queue(), but for process-shared mutexes.
1232 * We tested the waiters field before calling here and it was non-zero.
1233 */
1234void
1235mutex_unlock_process(mutex_t *mp)
1236{
1237	ulwp_t *self = curthread;
1238	int count;
1239	volatile uint8_t *lockp;
1240
1241	/*
1242	 * See the comments in mutex_unlock_queue(), above.
1243	 */
1244	if ((count = ncpus) == 0)
1245		count = ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1246	count = (count > 1)? self->ul_release_spin : 0;
1247	no_preempt(self);
1248	mp->mutex_owner = 0;
1249	mp->mutex_ownerpid = 0;
1250	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1251	if (count == 0) {
1252		/* clear lock, test waiter */
1253		if (!(swap32(&mp->mutex_lockword, 0) & WAITERMASK)) {
1254			/* no waiters now */
1255			preempt(self);
1256			return;
1257		}
1258	} else {
1259		/* clear lock, retain waiter */
1260		(void) swap32(&mp->mutex_lockword, WAITER);
1261		lockp = (volatile uint8_t *)&mp->mutex_lockw;
1262		while (--count >= 0) {
1263			if (*lockp != 0) {
1264				/* somebody grabbed the lock */
1265				preempt(self);
1266				return;
1267			}
1268			SMT_PAUSE();
1269		}
1270		/*
1271		 * We must clear the waiters field before going
1272		 * to the kernel, else it could remain set forever.
1273		 */
1274		mp->mutex_waiters = 0;
1275	}
1276	(void) ___lwp_mutex_wakeup(mp);
1277	preempt(self);
1278}
1279
1280/*
1281 * Return the real priority of a thread.
1282 */
1283int
1284real_priority(ulwp_t *ulwp)
1285{
1286	if (ulwp->ul_epri == 0)
1287		return (ulwp->ul_mappedpri? ulwp->ul_mappedpri : ulwp->ul_pri);
1288	return (ulwp->ul_emappedpri? ulwp->ul_emappedpri : ulwp->ul_epri);
1289}
1290
1291void
1292stall(void)
1293{
1294	for (;;)
1295		(void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
1296}
1297
1298/*
1299 * Acquire a USYNC_THREAD mutex via user-level sleep queues.
1300 * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
1301 * Returns with mutex_owner set correctly.
1302 */
1303int
1304mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
1305	timespec_t *tsp)
1306{
1307	uberdata_t *udp = curthread->ul_uberdata;
1308	queue_head_t *qp;
1309	hrtime_t begin_sleep;
1310	int error = 0;
1311
1312	self->ul_sp = stkptr();
1313	if (__td_event_report(self, TD_SLEEP, udp)) {
1314		self->ul_wchan = mp;
1315		self->ul_td_evbuf.eventnum = TD_SLEEP;
1316		self->ul_td_evbuf.eventdata = mp;
1317		tdb_event(TD_SLEEP, udp);
1318	}
1319	if (msp) {
1320		tdb_incr(msp->mutex_sleep);
1321		begin_sleep = gethrtime();
1322	}
1323
1324	DTRACE_PROBE1(plockstat, mutex__block, mp);
1325
1326	/*
1327	 * Put ourself on the sleep queue, and while we are
1328	 * unable to grab the lock, go park in the kernel.
1329	 * Take ourself off the sleep queue after we acquire the lock.
1330	 * The waiter bit can be set/cleared only while holding the queue lock.
1331	 */
1332	qp = queue_lock(mp, MX);
1333	enqueue(qp, self, mp, MX);
1334	mp->mutex_waiters = 1;
1335	for (;;) {
1336		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1337			mp->mutex_owner = (uintptr_t)self;
1338			DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1339			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1340			mp->mutex_waiters = dequeue_self(qp, mp);
1341			break;
1342		}
1343		set_parking_flag(self, 1);
1344		queue_unlock(qp);
1345		/*
1346		 * __lwp_park() will return the residual time in tsp
1347		 * if we are unparked before the timeout expires.
1348		 */
1349		if ((error = __lwp_park(tsp, 0)) == EINTR)
1350			error = 0;
1351		set_parking_flag(self, 0);
1352		/*
1353		 * We could have taken a signal or suspended ourself.
1354		 * If we did, then we removed ourself from the queue.
1355		 * Someone else may have removed us from the queue
1356		 * as a consequence of mutex_unlock().  We may have
1357		 * gotten a timeout from __lwp_park().  Or we may still
1358		 * be on the queue and this is just a spurious wakeup.
1359		 */
1360		qp = queue_lock(mp, MX);
1361		if (self->ul_sleepq == NULL) {
1362			if (error) {
1363				DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1364				DTRACE_PROBE2(plockstat, mutex__error, mp,
1365				    error);
1366				break;
1367			}
1368			if (set_lock_byte(&mp->mutex_lockw) == 0) {
1369				mp->mutex_owner = (uintptr_t)self;
1370				DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1371				DTRACE_PROBE3(plockstat, mutex__acquire, mp,
1372				    0, 0);
1373				break;
1374			}
1375			enqueue(qp, self, mp, MX);
1376			mp->mutex_waiters = 1;
1377		}
1378		ASSERT(self->ul_sleepq == qp &&
1379		    self->ul_qtype == MX &&
1380		    self->ul_wchan == mp);
1381		if (error) {
1382			mp->mutex_waiters = dequeue_self(qp, mp);
1383			DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1384			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1385			break;
1386		}
1387	}
1388
1389	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
1390	    self->ul_wchan == NULL);
1391	self->ul_sp = 0;
1392
1393	queue_unlock(qp);
1394	if (msp)
1395		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1396
1397	ASSERT(error == 0 || error == EINVAL || error == ETIME);
1398	return (error);
1399}
1400
1401/*
1402 * Returns with mutex_owner set correctly.
1403 */
1404int
1405mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
1406{
1407	ulwp_t *self = curthread;
1408	uberdata_t *udp = self->ul_uberdata;
1409	int mtype = mp->mutex_type;
1410	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
1411	int error = 0;
1412
1413	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1414
1415	if (!self->ul_schedctl_called)
1416		(void) setup_schedctl();
1417
1418	if (msp && try == MUTEX_TRY)
1419		tdb_incr(msp->mutex_try);
1420
1421	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_is_held(mp)) {
1422		if (mtype & LOCK_RECURSIVE) {
1423			if (mp->mutex_rcount == RECURSION_MAX) {
1424				error = EAGAIN;
1425			} else {
1426				mp->mutex_rcount++;
1427				DTRACE_PROBE3(plockstat, mutex__acquire, mp,
1428				    1, 0);
1429				return (0);
1430			}
1431		} else if (try == MUTEX_TRY) {
1432			return (EBUSY);
1433		} else {
1434			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1435			return (EDEADLK);
1436		}
1437	}
1438
1439	if (self->ul_error_detection && try == MUTEX_LOCK &&
1440	    tsp == NULL && mutex_is_held(mp))
1441		lock_error(mp, "mutex_lock", NULL, NULL);
1442
1443	if (mtype &
1444	    (USYNC_PROCESS_ROBUST|PTHREAD_PRIO_INHERIT|PTHREAD_PRIO_PROTECT)) {
1445		uint8_t ceil;
1446		int myprio;
1447
1448		if (mtype & PTHREAD_PRIO_PROTECT) {
1449			ceil = mp->mutex_ceiling;
1450			ASSERT(_validate_rt_prio(SCHED_FIFO, ceil) == 0);
1451			myprio = real_priority(self);
1452			if (myprio > ceil) {
1453				DTRACE_PROBE2(plockstat, mutex__error, mp,
1454				    EINVAL);
1455				return (EINVAL);
1456			}
1457			if ((error = _ceil_mylist_add(mp)) != 0) {
1458				DTRACE_PROBE2(plockstat, mutex__error, mp,
1459				    error);
1460				return (error);
1461			}
1462			if (myprio < ceil)
1463				_ceil_prio_inherit(ceil);
1464		}
1465
1466		if (mtype & PTHREAD_PRIO_INHERIT) {
1467			/* go straight to the kernel */
1468			if (try == MUTEX_TRY)
1469				error = mutex_trylock_kernel(mp);
1470			else	/* MUTEX_LOCK */
1471				error = mutex_lock_kernel(mp, tsp, msp);
1472			/*
1473			 * The kernel never sets or clears the lock byte
1474			 * for PTHREAD_PRIO_INHERIT mutexes.
1475			 * Set it here for debugging consistency.
1476			 */
1477			switch (error) {
1478			case 0:
1479			case EOWNERDEAD:
1480				mp->mutex_lockw = LOCKSET;
1481				break;
1482			}
1483		} else if (mtype & USYNC_PROCESS_ROBUST) {
1484			/* go straight to the kernel */
1485			if (try == MUTEX_TRY)
1486				error = mutex_trylock_kernel(mp);
1487			else	/* MUTEX_LOCK */
1488				error = mutex_lock_kernel(mp, tsp, msp);
1489		} else {	/* PTHREAD_PRIO_PROTECT */
1490			/*
1491			 * Try once at user level before going to the kernel.
1492			 * If this is a process shared mutex then protect
1493			 * against forkall() while setting mp->mutex_ownerpid.
1494			 */
1495			if (mtype & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
1496				enter_critical(self);
1497				if (set_lock_byte(&mp->mutex_lockw) == 0) {
1498					mp->mutex_owner = (uintptr_t)self;
1499					mp->mutex_ownerpid = udp->pid;
1500					exit_critical(self);
1501					DTRACE_PROBE3(plockstat,
1502					    mutex__acquire, mp, 0, 0);
1503				} else {
1504					exit_critical(self);
1505					error = EBUSY;
1506				}
1507			} else {
1508				if (set_lock_byte(&mp->mutex_lockw) == 0) {
1509					mp->mutex_owner = (uintptr_t)self;
1510					DTRACE_PROBE3(plockstat,
1511					    mutex__acquire, mp, 0, 0);
1512				} else {
1513					error = EBUSY;
1514				}
1515			}
1516			if (error && try == MUTEX_LOCK)
1517				error = mutex_lock_kernel(mp, tsp, msp);
1518		}
1519
1520		if (error) {
1521			if (mtype & PTHREAD_PRIO_INHERIT) {
1522				switch (error) {
1523				case EOWNERDEAD:
1524				case ENOTRECOVERABLE:
1525					if (mtype & PTHREAD_MUTEX_ROBUST_NP)
1526						break;
1527					if (error == EOWNERDEAD) {
1528						/*
1529						 * We own the mutex; unlock it.
1530						 * It becomes ENOTRECOVERABLE.
1531						 * All waiters are waked up.
1532						 */
1533						mp->mutex_owner = 0;
1534						mp->mutex_ownerpid = 0;
1535						DTRACE_PROBE2(plockstat,
1536						    mutex__release, mp, 0);
1537						mp->mutex_lockw = LOCKCLEAR;
1538						(void) ___lwp_mutex_unlock(mp);
1539					}
1540					/* FALLTHROUGH */
1541				case EDEADLK:
1542					if (try == MUTEX_LOCK)
1543						stall();
1544					error = EBUSY;
1545					break;
1546				}
1547			}
1548			if ((mtype & PTHREAD_PRIO_PROTECT) &&
1549			    error != EOWNERDEAD) {
1550				(void) _ceil_mylist_del(mp);
1551				if (myprio < ceil)
1552					_ceil_prio_waive();
1553			}
1554		}
1555	} else if (mtype & USYNC_PROCESS) {
1556		/*
1557		 * This is a process shared mutex.  Protect against
1558		 * forkall() while setting mp->mutex_ownerpid.
1559		 */
1560		enter_critical(self);
1561		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1562			mp->mutex_owner = (uintptr_t)self;
1563			mp->mutex_ownerpid = udp->pid;
1564			exit_critical(self);
1565			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1566		} else {
1567			/* try a little harder */
1568			exit_critical(self);
1569			error = mutex_trylock_process(mp);
1570		}
1571		if (error && try == MUTEX_LOCK)
1572			error = mutex_lock_kernel(mp, tsp, msp);
1573	} else  {	/* USYNC_THREAD */
1574		/* try once */
1575		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1576			mp->mutex_owner = (uintptr_t)self;
1577			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1578		} else {
1579			/* try a little harder if we don't own the mutex */
1580			error = EBUSY;
1581			if (MUTEX_OWNER(mp) != self)
1582				error = mutex_trylock_adaptive(mp);
1583			if (error && try == MUTEX_LOCK)		/* go park */
1584				error = mutex_lock_queue(self, msp, mp, tsp);
1585		}
1586	}
1587
1588	switch (error) {
1589	case EOWNERDEAD:
1590	case ELOCKUNMAPPED:
1591		mp->mutex_owner = (uintptr_t)self;
1592		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1593		/* FALLTHROUGH */
1594	case 0:
1595		if (msp)
1596			record_begin_hold(msp);
1597		break;
1598	default:
1599		if (try == MUTEX_TRY) {
1600			if (msp)
1601				tdb_incr(msp->mutex_try_fail);
1602			if (__td_event_report(self, TD_LOCK_TRY, udp)) {
1603				self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
1604				tdb_event(TD_LOCK_TRY, udp);
1605			}
1606		}
1607		break;
1608	}
1609
1610	return (error);
1611}
1612
1613int
1614fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
1615{
1616	ulwp_t *self = curthread;
1617	uberdata_t *udp = self->ul_uberdata;
1618
1619	/*
1620	 * We know that USYNC_PROCESS is set in mtype and that
1621	 * zero, one, or both of the flags LOCK_RECURSIVE and
1622	 * LOCK_ERRORCHECK are set, and that no other flags are set.
1623	 */
1624	enter_critical(self);
1625	if (set_lock_byte(&mp->mutex_lockw) == 0) {
1626		mp->mutex_owner = (uintptr_t)self;
1627		mp->mutex_ownerpid = udp->pid;
1628		exit_critical(self);
1629		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1630		return (0);
1631	}
1632	exit_critical(self);
1633
1634	if ((mtype & ~USYNC_PROCESS) && shared_mutex_held(mp)) {
1635		if (mtype & LOCK_RECURSIVE) {
1636			if (mp->mutex_rcount == RECURSION_MAX)
1637				return (EAGAIN);
1638			mp->mutex_rcount++;
1639			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1640			return (0);
1641		}
1642		if (try == MUTEX_LOCK) {
1643			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1644			return (EDEADLK);
1645		}
1646		return (EBUSY);
1647	}
1648
1649	/* try a little harder if we don't own the mutex */
1650	if (!shared_mutex_held(mp) && mutex_trylock_process(mp) == 0)
1651		return (0);
1652
1653	if (try == MUTEX_LOCK)
1654		return (mutex_lock_kernel(mp, tsp, NULL));
1655
1656	if (__td_event_report(self, TD_LOCK_TRY, udp)) {
1657		self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
1658		tdb_event(TD_LOCK_TRY, udp);
1659	}
1660	return (EBUSY);
1661}
1662
1663static int
1664slow_lock(ulwp_t *self, mutex_t *mp, timespec_t *tsp)
1665{
1666	int error = 0;
1667
1668	if (MUTEX_OWNER(mp) == self || mutex_trylock_adaptive(mp) != 0)
1669		error = mutex_lock_queue(self, NULL, mp, tsp);
1670	return (error);
1671}
1672
1673int
1674mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
1675{
1676	ulwp_t *self = curthread;
1677	uberdata_t *udp = self->ul_uberdata;
1678	uberflags_t *gflags;
1679	int mtype;
1680
1681	/*
1682	 * Optimize the case of USYNC_THREAD, including
1683	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
1684	 * no error detection, no lock statistics,
1685	 * and the process has only a single thread.
1686	 * (Most likely a traditional single-threaded application.)
1687	 */
1688	if ((((mtype = mp->mutex_type) & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
1689	    udp->uberflags.uf_all) == 0) {
1690		/*
1691		 * Only one thread exists so we don't need an atomic operation.
1692		 */
1693		if (mp->mutex_lockw == 0) {
1694			mp->mutex_lockw = LOCKSET;
1695			mp->mutex_owner = (uintptr_t)self;
1696			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1697			return (0);
1698		}
1699		if (mtype && MUTEX_OWNER(mp) == self) {
1700			/*
1701			 * LOCK_RECURSIVE, LOCK_ERRORCHECK, or both.
1702			 */
1703			if (mtype & LOCK_RECURSIVE) {
1704				if (mp->mutex_rcount == RECURSION_MAX)
1705					return (EAGAIN);
1706				mp->mutex_rcount++;
1707				DTRACE_PROBE3(plockstat, mutex__acquire, mp,
1708				    1, 0);
1709				return (0);
1710			}
1711			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1712			return (EDEADLK);	/* LOCK_ERRORCHECK */
1713		}
1714		/*
1715		 * We have reached a deadlock, probably because the
1716		 * process is executing non-async-signal-safe code in
1717		 * a signal handler and is attempting to acquire a lock
1718		 * that it already owns.  This is not surprising, given
1719		 * bad programming practices over the years that has
1720		 * resulted in applications calling printf() and such
1721		 * in their signal handlers.  Unless the user has told
1722		 * us that the signal handlers are safe by setting:
1723		 *	export _THREAD_ASYNC_SAFE=1
1724		 * we return EDEADLK rather than actually deadlocking.
1725		 */
1726		if (tsp == NULL &&
1727		    MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
1728			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1729			return (EDEADLK);
1730		}
1731	}
1732
1733	/*
1734	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
1735	 * no error detection, and no lock statistics.
1736	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
1737	 */
1738	if ((gflags = self->ul_schedctl_called) != NULL &&
1739	    (gflags->uf_trs_ted |
1740	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
1741
1742		if (mtype & USYNC_PROCESS)
1743			return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
1744
1745		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1746			mp->mutex_owner = (uintptr_t)self;
1747			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1748			return (0);
1749		}
1750
1751		if (mtype && MUTEX_OWNER(mp) == self) {
1752			if (mtype & LOCK_RECURSIVE) {
1753				if (mp->mutex_rcount == RECURSION_MAX)
1754					return (EAGAIN);
1755				mp->mutex_rcount++;
1756				DTRACE_PROBE3(plockstat, mutex__acquire, mp,
1757				    1, 0);
1758				return (0);
1759			}
1760			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1761			return (EDEADLK);	/* LOCK_ERRORCHECK */
1762		}
1763
1764		return (slow_lock(self, mp, tsp));
1765	}
1766
1767	/* else do it the long way */
1768	return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
1769}
1770
1771#pragma weak _private_mutex_lock = __mutex_lock
1772#pragma weak mutex_lock = __mutex_lock
1773#pragma weak _mutex_lock = __mutex_lock
1774#pragma weak pthread_mutex_lock = __mutex_lock
1775#pragma weak _pthread_mutex_lock = __mutex_lock
1776int
1777__mutex_lock(mutex_t *mp)
1778{
1779	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
1780	return (mutex_lock_impl(mp, NULL));
1781}
1782
1783#pragma weak pthread_mutex_timedlock = _pthread_mutex_timedlock
1784int
1785_pthread_mutex_timedlock(mutex_t *mp, const timespec_t *abstime)
1786{
1787	timespec_t tslocal;
1788	int error;
1789
1790	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
1791	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
1792	error = mutex_lock_impl(mp, &tslocal);
1793	if (error == ETIME)
1794		error = ETIMEDOUT;
1795	return (error);
1796}
1797
1798#pragma weak pthread_mutex_reltimedlock_np = _pthread_mutex_reltimedlock_np
1799int
1800_pthread_mutex_reltimedlock_np(mutex_t *mp, const timespec_t *reltime)
1801{
1802	timespec_t tslocal;
1803	int error;
1804
1805	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
1806	tslocal = *reltime;
1807	error = mutex_lock_impl(mp, &tslocal);
1808	if (error == ETIME)
1809		error = ETIMEDOUT;
1810	return (error);
1811}
1812
1813static int
1814slow_trylock(mutex_t *mp, ulwp_t *self)
1815{
1816	if (MUTEX_OWNER(mp) == self ||
1817	    mutex_trylock_adaptive(mp) != 0) {
1818		uberdata_t *udp = self->ul_uberdata;
1819
1820		if (__td_event_report(self, TD_LOCK_TRY, udp)) {
1821			self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
1822			tdb_event(TD_LOCK_TRY, udp);
1823		}
1824		return (EBUSY);
1825	}
1826	return (0);
1827}
1828
1829#pragma weak _private_mutex_trylock = __mutex_trylock
1830#pragma weak mutex_trylock = __mutex_trylock
1831#pragma weak _mutex_trylock = __mutex_trylock
1832#pragma weak pthread_mutex_trylock = __mutex_trylock
1833#pragma weak _pthread_mutex_trylock = __mutex_trylock
1834int
1835__mutex_trylock(mutex_t *mp)
1836{
1837	ulwp_t *self = curthread;
1838	uberdata_t *udp = self->ul_uberdata;
1839	uberflags_t *gflags;
1840	int mtype;
1841
1842	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
1843	/*
1844	 * Optimize the case of USYNC_THREAD, including
1845	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
1846	 * no error detection, no lock statistics,
1847	 * and the process has only a single thread.
1848	 * (Most likely a traditional single-threaded application.)
1849	 */
1850	if ((((mtype = mp->mutex_type) & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
1851	    udp->uberflags.uf_all) == 0) {
1852		/*
1853		 * Only one thread exists so we don't need an atomic operation.
1854		 */
1855		if (mp->mutex_lockw == 0) {
1856			mp->mutex_lockw = LOCKSET;
1857			mp->mutex_owner = (uintptr_t)self;
1858			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1859			return (0);
1860		}
1861		if (mtype && MUTEX_OWNER(mp) == self) {
1862			if (mtype & LOCK_RECURSIVE) {
1863				if (mp->mutex_rcount == RECURSION_MAX)
1864					return (EAGAIN);
1865				mp->mutex_rcount++;
1866				DTRACE_PROBE3(plockstat, mutex__acquire, mp,
1867				    1, 0);
1868				return (0);
1869			}
1870			return (EDEADLK);	/* LOCK_ERRORCHECK */
1871		}
1872		return (EBUSY);
1873	}
1874
1875	/*
1876	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
1877	 * no error detection, and no lock statistics.
1878	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
1879	 */
1880	if ((gflags = self->ul_schedctl_called) != NULL &&
1881	    (gflags->uf_trs_ted |
1882	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
1883
1884		if (mtype & USYNC_PROCESS)
1885			return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
1886
1887		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1888			mp->mutex_owner = (uintptr_t)self;
1889			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1890			return (0);
1891		}
1892
1893		if (mtype && MUTEX_OWNER(mp) == self) {
1894			if (mtype & LOCK_RECURSIVE) {
1895				if (mp->mutex_rcount == RECURSION_MAX)
1896					return (EAGAIN);
1897				mp->mutex_rcount++;
1898				DTRACE_PROBE3(plockstat, mutex__acquire, mp,
1899				    1, 0);
1900				return (0);
1901			}
1902			return (EBUSY);		/* LOCK_ERRORCHECK */
1903		}
1904
1905		return (slow_trylock(mp, self));
1906	}
1907
1908	/* else do it the long way */
1909	return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
1910}
1911
1912int
1913mutex_unlock_internal(mutex_t *mp)
1914{
1915	ulwp_t *self = curthread;
1916	uberdata_t *udp = self->ul_uberdata;
1917	int mtype = mp->mutex_type;
1918	tdb_mutex_stats_t *msp;
1919	int error;
1920	lwpid_t lwpid;
1921
1922	if ((mtype & LOCK_ERRORCHECK) && !mutex_is_held(mp))
1923		return (EPERM);
1924
1925	if (self->ul_error_detection && !mutex_is_held(mp))
1926		lock_error(mp, "mutex_unlock", NULL, NULL);
1927
1928	if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
1929		mp->mutex_rcount--;
1930		DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
1931		return (0);
1932	}
1933
1934	if ((msp = MUTEX_STATS(mp, udp)) != NULL)
1935		(void) record_hold_time(msp);
1936
1937	if (mtype &
1938	    (USYNC_PROCESS_ROBUST|PTHREAD_PRIO_INHERIT|PTHREAD_PRIO_PROTECT)) {
1939		no_preempt(self);
1940		mp->mutex_owner = 0;
1941		mp->mutex_ownerpid = 0;
1942		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1943		if (mtype & PTHREAD_PRIO_INHERIT) {
1944			mp->mutex_lockw = LOCKCLEAR;
1945			error = ___lwp_mutex_unlock(mp);
1946		} else if (mtype & USYNC_PROCESS_ROBUST) {
1947			error = ___lwp_mutex_unlock(mp);
1948		} else {
1949			if (swap32(&mp->mutex_lockword, 0) & WAITERMASK)
1950				(void) ___lwp_mutex_wakeup(mp);
1951			error = 0;
1952		}
1953		if (mtype & PTHREAD_PRIO_PROTECT) {
1954			if (_ceil_mylist_del(mp))
1955				_ceil_prio_waive();
1956		}
1957		preempt(self);
1958	} else if (mtype & USYNC_PROCESS) {
1959		if (mp->mutex_lockword & WAITERMASK)
1960			mutex_unlock_process(mp);
1961		else {
1962			mp->mutex_owner = 0;
1963			mp->mutex_ownerpid = 0;
1964			DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1965			if (swap32(&mp->mutex_lockword, 0) & WAITERMASK) {
1966				no_preempt(self);
1967				(void) ___lwp_mutex_wakeup(mp);
1968				preempt(self);
1969			}
1970		}
1971		error = 0;
1972	} else {	/* USYNC_THREAD */
1973		if ((lwpid = mutex_unlock_queue(mp)) != 0) {
1974			(void) __lwp_unpark(lwpid);
1975			preempt(self);
1976		}
1977		error = 0;
1978	}
1979
1980	return (error);
1981}
1982
1983#pragma weak _private_mutex_unlock = __mutex_unlock
1984#pragma weak mutex_unlock = __mutex_unlock
1985#pragma weak _mutex_unlock = __mutex_unlock
1986#pragma weak pthread_mutex_unlock = __mutex_unlock
1987#pragma weak _pthread_mutex_unlock = __mutex_unlock
1988int
1989__mutex_unlock(mutex_t *mp)
1990{
1991	ulwp_t *self = curthread;
1992	uberdata_t *udp = self->ul_uberdata;
1993	uberflags_t *gflags;
1994	lwpid_t lwpid;
1995	int mtype;
1996	short el;
1997
1998	/*
1999	 * Optimize the case of USYNC_THREAD, including
2000	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2001	 * no error detection, no lock statistics,
2002	 * and the process has only a single thread.
2003	 * (Most likely a traditional single-threaded application.)
2004	 */
2005	if ((((mtype = mp->mutex_type) & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2006	    udp->uberflags.uf_all) == 0) {
2007		if (mtype) {
2008			/*
2009			 * At this point we know that one or both of the
2010			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2011			 */
2012			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2013				return (EPERM);
2014			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2015				mp->mutex_rcount--;
2016				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2017				return (0);
2018			}
2019		}
2020		/*
2021		 * Only one thread exists so we don't need an atomic operation.
2022		 * Also, there can be no waiters.
2023		 */
2024		mp->mutex_owner = 0;
2025		mp->mutex_lockword = 0;
2026		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2027		return (0);
2028	}
2029
2030	/*
2031	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2032	 * no error detection, and no lock statistics.
2033	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2034	 */
2035	if ((gflags = self->ul_schedctl_called) != NULL) {
2036		if (((el = gflags->uf_trs_ted) | mtype) == 0) {
2037fast_unlock:
2038			if (!(mp->mutex_lockword & WAITERMASK)) {
2039				/* no waiter exists right now */
2040				mp->mutex_owner = 0;
2041				DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2042				if (swap32(&mp->mutex_lockword, 0) &
2043				    WAITERMASK) {
2044					/* a waiter suddenly appeared */
2045					no_preempt(self);
2046					if ((lwpid = mutex_wakeup(mp)) != 0)
2047						(void) __lwp_unpark(lwpid);
2048					preempt(self);
2049				}
2050			} else if ((lwpid = mutex_unlock_queue(mp)) != 0) {
2051				(void) __lwp_unpark(lwpid);
2052				preempt(self);
2053			}
2054			return (0);
2055		}
2056		if (el)		/* error detection or lock statistics */
2057			goto slow_unlock;
2058		if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2059			/*
2060			 * At this point we know that one or both of the
2061			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2062			 */
2063			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2064				return (EPERM);
2065			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2066				mp->mutex_rcount--;
2067				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2068				return (0);
2069			}
2070			goto fast_unlock;
2071		}
2072		if ((mtype &
2073		    ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2074			/*
2075			 * At this point we know that zero, one, or both of the
2076			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
2077			 * that the USYNC_PROCESS flag is set.
2078			 */
2079			if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
2080				return (EPERM);
2081			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2082				mp->mutex_rcount--;
2083				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2084				return (0);
2085			}
2086			if (mp->mutex_lockword & WAITERMASK)
2087				mutex_unlock_process(mp);
2088			else {
2089				mp->mutex_owner = 0;
2090				mp->mutex_ownerpid = 0;
2091				DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2092				if (swap32(&mp->mutex_lockword, 0) &
2093				    WAITERMASK) {
2094					no_preempt(self);
2095					(void) ___lwp_mutex_wakeup(mp);
2096					preempt(self);
2097				}
2098			}
2099			return (0);
2100		}
2101	}
2102
2103	/* else do it the long way */
2104slow_unlock:
2105	return (mutex_unlock_internal(mp));
2106}
2107
2108/*
2109 * Internally to the library, almost all mutex lock/unlock actions
2110 * go through these lmutex_ functions, to protect critical regions.
2111 * We replicate a bit of code from __mutex_lock() and __mutex_unlock()
2112 * to make these functions faster since we know that the mutex type
2113 * of all internal locks is USYNC_THREAD.  We also know that internal
2114 * locking can never fail, so we panic if it does.
2115 */
2116void
2117lmutex_lock(mutex_t *mp)
2118{
2119	ulwp_t *self = curthread;
2120	uberdata_t *udp = self->ul_uberdata;
2121
2122	ASSERT(mp->mutex_type == USYNC_THREAD);
2123
2124	enter_critical(self);
2125	/*
2126	 * Optimize the case of no lock statistics and only a single thread.
2127	 * (Most likely a traditional single-threaded application.)
2128	 */
2129	if (udp->uberflags.uf_all == 0) {
2130		/*
2131		 * Only one thread exists; the mutex must be free.
2132		 */
2133		ASSERT(mp->mutex_lockw == 0);
2134		mp->mutex_lockw = LOCKSET;
2135		mp->mutex_owner = (uintptr_t)self;
2136		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2137	} else {
2138		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2139
2140		if (!self->ul_schedctl_called)
2141			(void) setup_schedctl();
2142
2143		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2144			mp->mutex_owner = (uintptr_t)self;
2145			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2146		} else if (mutex_trylock_adaptive(mp) != 0) {
2147			(void) mutex_lock_queue(self, msp, mp, NULL);
2148		}
2149
2150		if (msp)
2151			record_begin_hold(msp);
2152	}
2153}
2154
2155void
2156lmutex_unlock(mutex_t *mp)
2157{
2158	ulwp_t *self = curthread;
2159	uberdata_t *udp = self->ul_uberdata;
2160
2161	ASSERT(mp->mutex_type == USYNC_THREAD);
2162
2163	/*
2164	 * Optimize the case of no lock statistics and only a single thread.
2165	 * (Most likely a traditional single-threaded application.)
2166	 */
2167	if (udp->uberflags.uf_all == 0) {
2168		/*
2169		 * Only one thread exists so there can be no waiters.
2170		 */
2171		mp->mutex_owner = 0;
2172		mp->mutex_lockword = 0;
2173		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2174	} else {
2175		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2176		lwpid_t lwpid;
2177
2178		if (msp)
2179			(void) record_hold_time(msp);
2180		if ((lwpid = mutex_unlock_queue(mp)) != 0) {
2181			(void) __lwp_unpark(lwpid);
2182			preempt(self);
2183		}
2184	}
2185	exit_critical(self);
2186}
2187
2188static int
2189shared_mutex_held(mutex_t *mparg)
2190{
2191	/*
2192	 * There is an inherent data race in the current ownership design.
2193	 * The mutex_owner and mutex_ownerpid fields cannot be set or tested
2194	 * atomically as a pair. The original implementation tested each
2195	 * field just once. This was exposed to trivial false positives in
2196	 * the case of multiple multithreaded processes with thread addresses
2197	 * in common. To close the window to an acceptable level we now use a
2198	 * sequence of five tests: pid-thr-pid-thr-pid. This ensures that any
2199	 * single interruption will still leave one uninterrupted sequence of
2200	 * pid-thr-pid tests intact.
2201	 *
2202	 * It is assumed that all updates are always ordered thr-pid and that
2203	 * we have TSO hardware.
2204	 */
2205	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2206	ulwp_t *self = curthread;
2207	uberdata_t *udp = self->ul_uberdata;
2208
2209	if (mp->mutex_ownerpid != udp->pid)
2210		return (0);
2211
2212	if (!MUTEX_OWNED(mp, self))
2213		return (0);
2214
2215	if (mp->mutex_ownerpid != udp->pid)
2216		return (0);
2217
2218	if (!MUTEX_OWNED(mp, self))
2219		return (0);
2220
2221	if (mp->mutex_ownerpid != udp->pid)
2222		return (0);
2223
2224	return (1);
2225}
2226
2227/*
2228 * Some crufty old programs define their own version of _mutex_held()
2229 * to be simply return(1).  This breaks internal libc logic, so we
2230 * define a private version for exclusive use by libc, mutex_is_held(),
2231 * and also a new public function, __mutex_held(), to be used in new
2232 * code to circumvent these crufty old programs.
2233 */
2234#pragma weak mutex_held = mutex_is_held
2235#pragma weak _mutex_held = mutex_is_held
2236#pragma weak __mutex_held = mutex_is_held
2237int
2238mutex_is_held(mutex_t *mp)
2239{
2240	if (mp->mutex_type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST))
2241		return (shared_mutex_held(mp));
2242	return (MUTEX_OWNED(mp, curthread));
2243}
2244
2245#pragma weak _private_mutex_destroy = __mutex_destroy
2246#pragma weak mutex_destroy = __mutex_destroy
2247#pragma weak _mutex_destroy = __mutex_destroy
2248#pragma weak pthread_mutex_destroy = __mutex_destroy
2249#pragma weak _pthread_mutex_destroy = __mutex_destroy
2250int
2251__mutex_destroy(mutex_t *mp)
2252{
2253	mp->mutex_magic = 0;
2254	mp->mutex_flag &= ~LOCK_INITED;
2255	tdb_sync_obj_deregister(mp);
2256	return (0);
2257}
2258
2259/*
2260 * Spin locks are separate from ordinary mutexes,
2261 * but we use the same data structure for them.
2262 */
2263
2264#pragma weak pthread_spin_init = _pthread_spin_init
2265int
2266_pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2267{
2268	mutex_t *mp = (mutex_t *)lock;
2269
2270	(void) _memset(mp, 0, sizeof (*mp));
2271	if (pshared == PTHREAD_PROCESS_SHARED)
2272		mp->mutex_type = USYNC_PROCESS;
2273	else
2274		mp->mutex_type = USYNC_THREAD;
2275	mp->mutex_flag = LOCK_INITED;
2276	mp->mutex_magic = MUTEX_MAGIC;
2277	return (0);
2278}
2279
2280#pragma weak pthread_spin_destroy = _pthread_spin_destroy
2281int
2282_pthread_spin_destroy(pthread_spinlock_t *lock)
2283{
2284	(void) _memset(lock, 0, sizeof (*lock));
2285	return (0);
2286}
2287
2288#pragma weak pthread_spin_trylock = _pthread_spin_trylock
2289int
2290_pthread_spin_trylock(pthread_spinlock_t *lock)
2291{
2292	mutex_t *mp = (mutex_t *)lock;
2293	ulwp_t *self = curthread;
2294	int error = 0;
2295
2296	no_preempt(self);
2297	if (set_lock_byte(&mp->mutex_lockw) != 0)
2298		error = EBUSY;
2299	else {
2300		mp->mutex_owner = (uintptr_t)self;
2301		if (mp->mutex_type == USYNC_PROCESS)
2302			mp->mutex_ownerpid = self->ul_uberdata->pid;
2303		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2304	}
2305	preempt(self);
2306	return (error);
2307}
2308
2309#pragma weak pthread_spin_lock = _pthread_spin_lock
2310int
2311_pthread_spin_lock(pthread_spinlock_t *lock)
2312{
2313	volatile uint8_t *lockp =
2314		(volatile uint8_t *)&((mutex_t *)lock)->mutex_lockw;
2315
2316	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2317	/*
2318	 * We don't care whether the owner is running on a processor.
2319	 * We just spin because that's what this interface requires.
2320	 */
2321	for (;;) {
2322		if (*lockp == 0) {	/* lock byte appears to be clear */
2323			if (_pthread_spin_trylock(lock) == 0)
2324				return (0);
2325		}
2326		SMT_PAUSE();
2327	}
2328}
2329
2330#pragma weak pthread_spin_unlock = _pthread_spin_unlock
2331int
2332_pthread_spin_unlock(pthread_spinlock_t *lock)
2333{
2334	mutex_t *mp = (mutex_t *)lock;
2335	ulwp_t *self = curthread;
2336
2337	no_preempt(self);
2338	mp->mutex_owner = 0;
2339	mp->mutex_ownerpid = 0;
2340	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2341	(void) swap32(&mp->mutex_lockword, 0);
2342	preempt(self);
2343	return (0);
2344}
2345
2346#pragma weak cond_init = _cond_init
2347/* ARGSUSED2 */
2348int
2349_cond_init(cond_t *cvp, int type, void *arg)
2350{
2351	if (type != USYNC_THREAD && type != USYNC_PROCESS)
2352		return (EINVAL);
2353	(void) _memset(cvp, 0, sizeof (*cvp));
2354	cvp->cond_type = (uint16_t)type;
2355	cvp->cond_magic = COND_MAGIC;
2356	return (0);
2357}
2358
2359/*
2360 * cond_sleep_queue(): utility function for cond_wait_queue().
2361 *
2362 * Go to sleep on a condvar sleep queue, expect to be waked up
2363 * by someone calling cond_signal() or cond_broadcast() or due
2364 * to receiving a UNIX signal or being cancelled, or just simply
2365 * due to a spurious wakeup (like someome calling forkall()).
2366 *
2367 * The associated mutex is *not* reacquired before returning.
2368 * That must be done by the caller of cond_sleep_queue().
2369 */
2370int
2371cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
2372{
2373	ulwp_t *self = curthread;
2374	queue_head_t *qp;
2375	queue_head_t *mqp;
2376	lwpid_t lwpid;
2377	int signalled;
2378	int error;
2379
2380	/*
2381	 * Put ourself on the CV sleep queue, unlock the mutex, then
2382	 * park ourself and unpark a candidate lwp to grab the mutex.
2383	 * We must go onto the CV sleep queue before dropping the
2384	 * mutex in order to guarantee atomicity of the operation.
2385	 */
2386	self->ul_sp = stkptr();
2387	qp = queue_lock(cvp, CV);
2388	enqueue(qp, self, cvp, CV);
2389	cvp->cond_waiters_user = 1;
2390	self->ul_cvmutex = mp;
2391	self->ul_cv_wake = (tsp != NULL);
2392	self->ul_signalled = 0;
2393	lwpid = mutex_unlock_queue(mp);
2394	for (;;) {
2395		set_parking_flag(self, 1);
2396		queue_unlock(qp);
2397		if (lwpid != 0) {
2398			lwpid = preempt_unpark(self, lwpid);
2399			preempt(self);
2400		}
2401		/*
2402		 * We may have a deferred signal present,
2403		 * in which case we should return EINTR.
2404		 * Also, we may have received a SIGCANCEL; if so
2405		 * and we are cancelable we should return EINTR.
2406		 * We force an immediate EINTR return from
2407		 * __lwp_park() by turning our parking flag off.
2408		 */
2409		if (self->ul_cursig != 0 ||
2410		    (self->ul_cancelable && self->ul_cancel_pending))
2411			set_parking_flag(self, 0);
2412		/*
2413		 * __lwp_park() will return the residual time in tsp
2414		 * if we are unparked before the timeout expires.
2415		 */
2416		error = __lwp_park(tsp, lwpid);
2417		set_parking_flag(self, 0);
2418		lwpid = 0;	/* unpark the other lwp only once */
2419		/*
2420		 * We were waked up by cond_signal(), cond_broadcast(),
2421		 * by an interrupt or timeout (EINTR or ETIME),
2422		 * or we may just have gotten a spurious wakeup.
2423		 */
2424		qp = queue_lock(cvp, CV);
2425		mqp = queue_lock(mp, MX);
2426		if (self->ul_sleepq == NULL)
2427			break;
2428		/*
2429		 * We are on either the condvar sleep queue or the
2430		 * mutex sleep queue.  If we are on the mutex sleep
2431		 * queue, continue sleeping.  If we are on the condvar
2432		 * sleep queue, break out of the sleep if we were
2433		 * interrupted or we timed out (EINTR or ETIME).
2434		 * Else this is a spurious wakeup; continue the loop.
2435		 */
2436		if (self->ul_sleepq == mqp)		/* mutex queue */
2437			tsp = NULL;
2438		else if (self->ul_sleepq == qp) {	/* condvar queue */
2439			if (error) {
2440				cvp->cond_waiters_user = dequeue_self(qp, cvp);
2441				break;
2442			}
2443			/*
2444			 * Else a spurious wakeup on the condvar queue.
2445			 * __lwp_park() has already adjusted the timeout.
2446			 */
2447		} else {
2448			thr_panic("cond_sleep_queue(): thread not on queue");
2449		}
2450		queue_unlock(mqp);
2451	}
2452
2453	self->ul_sp = 0;
2454	ASSERT(self->ul_cvmutex == NULL && self->ul_cv_wake == 0);
2455	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
2456	    self->ul_wchan == NULL);
2457
2458	signalled = self->ul_signalled;
2459	self->ul_signalled = 0;
2460	queue_unlock(qp);
2461	queue_unlock(mqp);
2462
2463	/*
2464	 * If we were concurrently cond_signal()d and any of:
2465	 * received a UNIX signal, were cancelled, or got a timeout,
2466	 * then perform another cond_signal() to avoid consuming it.
2467	 */
2468	if (error && signalled)
2469		(void) cond_signal_internal(cvp);
2470
2471	return (error);
2472}
2473
2474int
2475cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp,
2476	tdb_mutex_stats_t *msp)
2477{
2478	ulwp_t *self = curthread;
2479	int error;
2480
2481	/*
2482	 * The old thread library was programmed to defer signals
2483	 * while in cond_wait() so that the associated mutex would
2484	 * be guaranteed to be held when the application signal
2485	 * handler was invoked.
2486	 *
2487	 * We do not behave this way by default; the state of the
2488	 * associated mutex in the signal handler is undefined.
2489	 *
2490	 * To accommodate applications that depend on the old
2491	 * behavior, the _THREAD_COND_WAIT_DEFER environment
2492	 * variable can be set to 1 and we will behave in the
2493	 * old way with respect to cond_wait().
2494	 */
2495	if (self->ul_cond_wait_defer)
2496		sigoff(self);
2497
2498	error = cond_sleep_queue(cvp, mp, tsp);
2499
2500	/*
2501	 * Reacquire the mutex.
2502	 */
2503	if (set_lock_byte(&mp->mutex_lockw) == 0) {
2504		mp->mutex_owner = (uintptr_t)self;
2505		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2506	} else if (mutex_trylock_adaptive(mp) != 0) {
2507		(void) mutex_lock_queue(self, msp, mp, NULL);
2508	}
2509
2510	if (msp)
2511		record_begin_hold(msp);
2512
2513	/*
2514	 * Take any deferred signal now, after we have reacquired the mutex.
2515	 */
2516	if (self->ul_cond_wait_defer)
2517		sigon(self);
2518
2519	return (error);
2520}
2521
2522/*
2523 * cond_sleep_kernel(): utility function for cond_wait_kernel().
2524 * See the comment ahead of cond_sleep_queue(), above.
2525 */
2526int
2527cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
2528{
2529	int mtype = mp->mutex_type;
2530	ulwp_t *self = curthread;
2531	int error;
2532
2533	if (mtype & PTHREAD_PRIO_PROTECT) {
2534		if (_ceil_mylist_del(mp))
2535			_ceil_prio_waive();
2536	}
2537
2538	self->ul_sp = stkptr();
2539	self->ul_wchan = cvp;
2540	mp->mutex_owner = 0;
2541	mp->mutex_ownerpid = 0;
2542	if (mtype & PTHREAD_PRIO_INHERIT)
2543		mp->mutex_lockw = LOCKCLEAR;
2544	/*
2545	 * ___lwp_cond_wait() returns immediately with EINTR if
2546	 * set_parking_flag(self,0) is called on this lwp before it
2547	 * goes to sleep in the kernel.  sigacthandler() calls this
2548	 * when a deferred signal is noted.  This assures that we don't
2549	 * get stuck in ___lwp_cond_wait() with all signals blocked
2550	 * due to taking a deferred signal before going to sleep.
2551	 */
2552	set_parking_flag(self, 1);
2553	if (self->ul_cursig != 0 ||
2554	    (self->ul_cancelable && self->ul_cancel_pending))
2555		set_parking_flag(self, 0);
2556	error = ___lwp_cond_wait(cvp, mp, tsp, 1);
2557	set_parking_flag(self, 0);
2558	self->ul_sp = 0;
2559	self->ul_wchan = NULL;
2560	return (error);
2561}
2562
2563int
2564cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
2565{
2566	ulwp_t *self = curthread;
2567	int error;
2568	int merror;
2569
2570	/*
2571	 * See the large comment in cond_wait_queue(), above.
2572	 */
2573	if (self->ul_cond_wait_defer)
2574		sigoff(self);
2575
2576	error = cond_sleep_kernel(cvp, mp, tsp);
2577
2578	/*
2579	 * Override the return code from ___lwp_cond_wait()
2580	 * with any non-zero return code from mutex_lock().
2581	 * This addresses robust lock failures in particular;
2582	 * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
2583	 * errors in order to take corrective action.
2584	 */
2585	if ((merror = _private_mutex_lock(mp)) != 0)
2586		error = merror;
2587
2588	/*
2589	 * Take any deferred signal now, after we have reacquired the mutex.
2590	 */
2591	if (self->ul_cond_wait_defer)
2592		sigon(self);
2593
2594	return (error);
2595}
2596
2597/*
2598 * Common code for _cond_wait() and _cond_timedwait()
2599 */
2600int
2601cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
2602{
2603	int mtype = mp->mutex_type;
2604	hrtime_t begin_sleep = 0;
2605	ulwp_t *self = curthread;
2606	uberdata_t *udp = self->ul_uberdata;
2607	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
2608	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2609	uint8_t rcount;
2610	int error = 0;
2611
2612	/*
2613	 * The SUSV3 Posix spec for pthread_cond_timedwait() states:
2614	 *	Except in the case of [ETIMEDOUT], all these error checks
2615	 *	shall act as if they were performed immediately at the
2616	 *	beginning of processing for the function and shall cause
2617	 *	an error return, in effect, prior to modifying the state
2618	 *	of the mutex specified by mutex or the condition variable
2619	 *	specified by cond.
2620	 * Therefore, we must return EINVAL now if the timout is invalid.
2621	 */
2622	if (tsp != NULL &&
2623	    (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
2624		return (EINVAL);
2625
2626	if (__td_event_report(self, TD_SLEEP, udp)) {
2627		self->ul_sp = stkptr();
2628		self->ul_wchan = cvp;
2629		self->ul_td_evbuf.eventnum = TD_SLEEP;
2630		self->ul_td_evbuf.eventdata = cvp;
2631		tdb_event(TD_SLEEP, udp);
2632		self->ul_sp = 0;
2633	}
2634	if (csp) {
2635		if (tsp)
2636			tdb_incr(csp->cond_timedwait);
2637		else
2638			tdb_incr(csp->cond_wait);
2639	}
2640	if (msp)
2641		begin_sleep = record_hold_time(msp);
2642	else if (csp)
2643		begin_sleep = gethrtime();
2644
2645	if (self->ul_error_detection) {
2646		if (!mutex_is_held(mp))
2647			lock_error(mp, "cond_wait", cvp, NULL);
2648		if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
2649			lock_error(mp, "recursive mutex in cond_wait",
2650				cvp, NULL);
2651		if (cvp->cond_type & USYNC_PROCESS) {
2652			if (!(mtype & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)))
2653				lock_error(mp, "cond_wait", cvp,
2654					"condvar process-shared, "
2655					"mutex process-private");
2656		} else {
2657			if (mtype & (USYNC_PROCESS | USYNC_PROCESS_ROBUST))
2658				lock_error(mp, "cond_wait", cvp,
2659					"condvar process-private, "
2660					"mutex process-shared");
2661		}
2662	}
2663
2664	/*
2665	 * We deal with recursive mutexes by completely
2666	 * dropping the lock and restoring the recursion
2667	 * count after waking up.  This is arguably wrong,
2668	 * but it obeys the principle of least astonishment.
2669	 */
2670	rcount = mp->mutex_rcount;
2671	mp->mutex_rcount = 0;
2672	if ((mtype & (USYNC_PROCESS | USYNC_PROCESS_ROBUST |
2673	    PTHREAD_PRIO_INHERIT | PTHREAD_PRIO_PROTECT)) |
2674	    (cvp->cond_type & USYNC_PROCESS))
2675		error = cond_wait_kernel(cvp, mp, tsp);
2676	else
2677		error = cond_wait_queue(cvp, mp, tsp, msp);
2678	mp->mutex_rcount = rcount;
2679
2680	if (csp) {
2681		hrtime_t lapse = gethrtime() - begin_sleep;
2682		if (tsp == NULL)
2683			csp->cond_wait_sleep_time += lapse;
2684		else {
2685			csp->cond_timedwait_sleep_time += lapse;
2686			if (error == ETIME)
2687				tdb_incr(csp->cond_timedwait_timeout);
2688		}
2689	}
2690	return (error);
2691}
2692
2693/*
2694 * cond_wait() is a cancellation point but _cond_wait() is not.
2695 * System libraries call the non-cancellation version.
2696 * It is expected that only applications call the cancellation version.
2697 */
2698int
2699_cond_wait(cond_t *cvp, mutex_t *mp)
2700{
2701	ulwp_t *self = curthread;
2702	uberdata_t *udp = self->ul_uberdata;
2703	uberflags_t *gflags;
2704
2705	/*
2706	 * Optimize the common case of USYNC_THREAD plus
2707	 * no error detection, no lock statistics, and no event tracing.
2708	 */
2709	if ((gflags = self->ul_schedctl_called) != NULL &&
2710	    (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
2711	    self->ul_td_events_enable |
2712	    udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
2713		return (cond_wait_queue(cvp, mp, NULL, NULL));
2714
2715	/*
2716	 * Else do it the long way.
2717	 */
2718	return (cond_wait_common(cvp, mp, NULL));
2719}
2720
2721int
2722cond_wait(cond_t *cvp, mutex_t *mp)
2723{
2724	int error;
2725
2726	_cancelon();
2727	error = _cond_wait(cvp, mp);
2728	if (error == EINTR)
2729		_canceloff();
2730	else
2731		_canceloff_nocancel();
2732	return (error);
2733}
2734
2735#pragma weak pthread_cond_wait = _pthread_cond_wait
2736int
2737_pthread_cond_wait(cond_t *cvp, mutex_t *mp)
2738{
2739	int error;
2740
2741	error = cond_wait(cvp, mp);
2742	return ((error == EINTR)? 0 : error);
2743}
2744
2745/*
2746 * cond_timedwait() is a cancellation point but _cond_timedwait() is not.
2747 * System libraries call the non-cancellation version.
2748 * It is expected that only applications call the cancellation version.
2749 */
2750int
2751_cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
2752{
2753	clockid_t clock_id = cvp->cond_clockid;
2754	timespec_t reltime;
2755	int error;
2756
2757	if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
2758		clock_id = CLOCK_REALTIME;
2759	abstime_to_reltime(clock_id, abstime, &reltime);
2760	error = cond_wait_common(cvp, mp, &reltime);
2761	if (error == ETIME && clock_id == CLOCK_HIGHRES) {
2762		/*
2763		 * Don't return ETIME if we didn't really get a timeout.
2764		 * This can happen if we return because someone resets
2765		 * the system clock.  Just return zero in this case,
2766		 * giving a spurious wakeup but not a timeout.
2767		 */
2768		if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
2769		    abstime->tv_nsec > gethrtime())
2770			error = 0;
2771	}
2772	return (error);
2773}
2774
2775int
2776cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
2777{
2778	int error;
2779
2780	_cancelon();
2781	error = _cond_timedwait(cvp, mp, abstime);
2782	if (error == EINTR)
2783		_canceloff();
2784	else
2785		_canceloff_nocancel();
2786	return (error);
2787}
2788
2789#pragma weak pthread_cond_timedwait = _pthread_cond_timedwait
2790int
2791_pthread_cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
2792{
2793	int error;
2794
2795	error = cond_timedwait(cvp, mp, abstime);
2796	if (error == ETIME)
2797		error = ETIMEDOUT;
2798	else if (error == EINTR)
2799		error = 0;
2800	return (error);
2801}
2802
2803/*
2804 * cond_reltimedwait() is a cancellation point but _cond_reltimedwait()
2805 * is not.  System libraries call the non-cancellation version.
2806 * It is expected that only applications call the cancellation version.
2807 */
2808int
2809_cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
2810{
2811	timespec_t tslocal = *reltime;
2812
2813	return (cond_wait_common(cvp, mp, &tslocal));
2814}
2815
2816#pragma weak cond_reltimedwait = _cond_reltimedwait_cancel
2817int
2818_cond_reltimedwait_cancel(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
2819{
2820	int error;
2821
2822	_cancelon();
2823	error = _cond_reltimedwait(cvp, mp, reltime);
2824	if (error == EINTR)
2825		_canceloff();
2826	else
2827		_canceloff_nocancel();
2828	return (error);
2829}
2830
2831#pragma weak pthread_cond_reltimedwait_np = _pthread_cond_reltimedwait_np
2832int
2833_pthread_cond_reltimedwait_np(cond_t *cvp, mutex_t *mp,
2834	const timespec_t *reltime)
2835{
2836	int error;
2837
2838	error = _cond_reltimedwait_cancel(cvp, mp, reltime);
2839	if (error == ETIME)
2840		error = ETIMEDOUT;
2841	else if (error == EINTR)
2842		error = 0;
2843	return (error);
2844}
2845
2846#pragma weak pthread_cond_signal = cond_signal_internal
2847#pragma weak _pthread_cond_signal = cond_signal_internal
2848#pragma weak cond_signal = cond_signal_internal
2849#pragma weak _cond_signal = cond_signal_internal
2850int
2851cond_signal_internal(cond_t *cvp)
2852{
2853	ulwp_t *self = curthread;
2854	uberdata_t *udp = self->ul_uberdata;
2855	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
2856	int error = 0;
2857	queue_head_t *qp;
2858	mutex_t *mp;
2859	queue_head_t *mqp;
2860	ulwp_t **ulwpp;
2861	ulwp_t *ulwp;
2862	ulwp_t *prev = NULL;
2863	ulwp_t *next;
2864	ulwp_t **suspp = NULL;
2865	ulwp_t *susprev;
2866
2867	if (csp)
2868		tdb_incr(csp->cond_signal);
2869
2870	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
2871		error = __lwp_cond_signal(cvp);
2872
2873	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
2874		return (error);
2875
2876	/*
2877	 * Move someone from the condvar sleep queue to the mutex sleep
2878	 * queue for the mutex that he will acquire on being waked up.
2879	 * We can do this only if we own the mutex he will acquire.
2880	 * If we do not own the mutex, or if his ul_cv_wake flag
2881	 * is set, just dequeue and unpark him.
2882	 */
2883	qp = queue_lock(cvp, CV);
2884	for (ulwpp = &qp->qh_head; (ulwp = *ulwpp) != NULL;
2885	    prev = ulwp, ulwpp = &ulwp->ul_link) {
2886		if (ulwp->ul_wchan == cvp) {
2887			if (!ulwp->ul_stop)
2888				break;
2889			/*
2890			 * Try not to dequeue a suspended thread.
2891			 * This mimics the old libthread's behavior.
2892			 */
2893			if (suspp == NULL) {
2894				suspp = ulwpp;
2895				susprev = prev;
2896			}
2897		}
2898	}
2899	if (ulwp == NULL && suspp != NULL) {
2900		ulwp = *(ulwpp = suspp);
2901		prev = susprev;
2902		suspp = NULL;
2903	}
2904	if (ulwp == NULL) {	/* no one on the sleep queue */
2905		cvp->cond_waiters_user = 0;
2906		queue_unlock(qp);
2907		return (error);
2908	}
2909	/*
2910	 * Scan the remainder of the CV queue for another waiter.
2911	 */
2912	if (suspp != NULL) {
2913		next = *suspp;
2914	} else {
2915		for (next = ulwp->ul_link; next != NULL; next = next->ul_link)
2916			if (next->ul_wchan == cvp)
2917				break;
2918	}
2919	if (next == NULL)
2920		cvp->cond_waiters_user = 0;
2921
2922	/*
2923	 * Inform the thread that he was the recipient of a cond_signal().
2924	 * This lets him deal with cond_signal() and, concurrently,
2925	 * one or more of a cancellation, a UNIX signal, or a timeout.
2926	 * These latter conditions must not consume a cond_signal().
2927	 */
2928	ulwp->ul_signalled = 1;
2929
2930	/*
2931	 * Dequeue the waiter but leave his ul_sleepq non-NULL
2932	 * while we move him to the mutex queue so that he can
2933	 * deal properly with spurious wakeups.
2934	 */
2935	*ulwpp = ulwp->ul_link;
2936	if (qp->qh_tail == ulwp)
2937		qp->qh_tail = prev;
2938	qp->qh_qlen--;
2939	ulwp->ul_link = NULL;
2940
2941	mp = ulwp->ul_cvmutex;		/* the mutex he will acquire */
2942	ulwp->ul_cvmutex = NULL;
2943	ASSERT(mp != NULL);
2944
2945	if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
2946		lwpid_t lwpid = ulwp->ul_lwpid;
2947
2948		no_preempt(self);
2949		ulwp->ul_sleepq = NULL;
2950		ulwp->ul_wchan = NULL;
2951		ulwp->ul_cv_wake = 0;
2952		queue_unlock(qp);
2953		(void) __lwp_unpark(lwpid);
2954		preempt(self);
2955	} else {
2956		mqp = queue_lock(mp, MX);
2957		enqueue(mqp, ulwp, mp, MX);
2958		mp->mutex_waiters = 1;
2959		queue_unlock(mqp);
2960		queue_unlock(qp);
2961	}
2962
2963	return (error);
2964}
2965
2966#define	MAXLWPS	128	/* max remembered lwpids before overflow */
2967#define	NEWLWPS	2048	/* max remembered lwpids at first overflow */
2968
2969#pragma weak pthread_cond_broadcast = cond_broadcast_internal
2970#pragma weak _pthread_cond_broadcast = cond_broadcast_internal
2971#pragma weak cond_broadcast = cond_broadcast_internal
2972#pragma weak _cond_broadcast = cond_broadcast_internal
2973int
2974cond_broadcast_internal(cond_t *cvp)
2975{
2976	ulwp_t *self = curthread;
2977	uberdata_t *udp = self->ul_uberdata;
2978	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
2979	int error = 0;
2980	queue_head_t *qp;
2981	mutex_t *mp;
2982	queue_head_t *mqp;
2983	mutex_t *mp_cache = NULL;
2984	queue_head_t *mqp_cache = NULL;
2985	ulwp_t **ulwpp;
2986	ulwp_t *ulwp;
2987	ulwp_t *prev = NULL;
2988	lwpid_t buffer[MAXLWPS];
2989	lwpid_t *lwpid = buffer;
2990	int nlwpid = 0;
2991	int maxlwps = MAXLWPS;
2992
2993	if (csp)
2994		tdb_incr(csp->cond_broadcast);
2995
2996	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
2997		error = __lwp_cond_broadcast(cvp);
2998
2999	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3000		return (error);
3001
3002	/*
3003	 * Move everyone from the condvar sleep queue to the mutex sleep
3004	 * queue for the mutex that they will acquire on being waked up.
3005	 * We can do this only if we own the mutex they will acquire.
3006	 * If we do not own the mutex, or if their ul_cv_wake flag
3007	 * is set, just dequeue and unpark them.
3008	 *
3009	 * We keep track of lwpids that are to be unparked in lwpid[].
3010	 * __lwp_unpark_all() is called to unpark all of them after
3011	 * they have been removed from the sleep queue and the sleep
3012	 * queue lock has been dropped.  If we run out of space in our
3013	 * on-stack buffer, we need to allocate more but we can't call
3014	 * lmalloc() because we are holding a queue lock when the overflow
3015	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
3016	 * either because the application may have allocated a small stack
3017	 * and we don't want to overrun the stack.  So we use the mmap()
3018	 * system call directly since that path acquires no locks.
3019	 */
3020	qp = queue_lock(cvp, CV);
3021	cvp->cond_waiters_user = 0;
3022	ulwpp = &qp->qh_head;
3023	while ((ulwp = *ulwpp) != NULL) {
3024
3025		if (ulwp->ul_wchan != cvp) {
3026			prev = ulwp;
3027			ulwpp = &ulwp->ul_link;
3028			continue;
3029		}
3030
3031		*ulwpp = ulwp->ul_link;
3032		if (qp->qh_tail == ulwp)
3033			qp->qh_tail = prev;
3034		qp->qh_qlen--;
3035		ulwp->ul_link = NULL;
3036
3037		mp = ulwp->ul_cvmutex;		/* his mutex */
3038		ulwp->ul_cvmutex = NULL;
3039		ASSERT(mp != NULL);
3040
3041		if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3042			ulwp->ul_sleepq = NULL;
3043			ulwp->ul_wchan = NULL;
3044			ulwp->ul_cv_wake = 0;
3045			if (nlwpid == maxlwps) {
3046				/*
3047				 * Allocate NEWLWPS ids on the first overflow.
3048				 * Double the allocation each time after that.
3049				 */
3050				int newlwps = (lwpid == buffer)? NEWLWPS :
3051						2 * maxlwps;
3052				void *vaddr = _private_mmap(NULL,
3053					newlwps * sizeof (lwpid_t),
3054					PROT_READ|PROT_WRITE,
3055					MAP_PRIVATE|MAP_ANON, -1, (off_t)0);
3056				if (vaddr == MAP_FAILED) {
3057					/*
3058					 * Let's hope this never happens.
3059					 * If it does, then we have a terrible
3060					 * thundering herd on our hands.
3061					 */
3062					(void) __lwp_unpark_all(lwpid, nlwpid);
3063					nlwpid = 0;
3064				} else {
3065					(void) _memcpy(vaddr, lwpid,
3066						maxlwps * sizeof (lwpid_t));
3067					if (lwpid != buffer)
3068						(void) _private_munmap(lwpid,
3069						    maxlwps * sizeof (lwpid_t));
3070					lwpid = vaddr;
3071					maxlwps = newlwps;
3072				}
3073			}
3074			lwpid[nlwpid++] = ulwp->ul_lwpid;
3075		} else {
3076			if (mp != mp_cache) {
3077				if (mqp_cache != NULL)
3078					queue_unlock(mqp_cache);
3079				mqp_cache = queue_lock(mp, MX);
3080				mp_cache = mp;
3081			}
3082			mqp = mqp_cache;
3083			enqueue(mqp, ulwp, mp, MX);
3084			mp->mutex_waiters = 1;
3085		}
3086	}
3087	if (mqp_cache != NULL)
3088		queue_unlock(mqp_cache);
3089	queue_unlock(qp);
3090	if (nlwpid) {
3091		if (nlwpid == 1)
3092			(void) __lwp_unpark(lwpid[0]);
3093		else
3094			(void) __lwp_unpark_all(lwpid, nlwpid);
3095	}
3096	if (lwpid != buffer)
3097		(void) _private_munmap(lwpid, maxlwps * sizeof (lwpid_t));
3098
3099	return (error);
3100}
3101
3102#pragma weak pthread_cond_destroy = _cond_destroy
3103#pragma weak _pthread_cond_destroy = _cond_destroy
3104#pragma weak cond_destroy = _cond_destroy
3105int
3106_cond_destroy(cond_t *cvp)
3107{
3108	cvp->cond_magic = 0;
3109	tdb_sync_obj_deregister(cvp);
3110	return (0);
3111}
3112
3113#if defined(THREAD_DEBUG)
3114void
3115assert_no_libc_locks_held(void)
3116{
3117	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
3118}
3119#endif
3120
3121/* protected by link_lock */
3122uint64_t spin_lock_spin;
3123uint64_t spin_lock_spin2;
3124uint64_t spin_lock_sleep;
3125uint64_t spin_lock_wakeup;
3126
3127/*
3128 * Record spin lock statistics.
3129 * Called by a thread exiting itself in thrp_exit().
3130 * Also called via atexit() from the thread calling
3131 * exit() to do all the other threads as well.
3132 */
3133void
3134record_spin_locks(ulwp_t *ulwp)
3135{
3136	spin_lock_spin += ulwp->ul_spin_lock_spin;
3137	spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
3138	spin_lock_sleep += ulwp->ul_spin_lock_sleep;
3139	spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
3140	ulwp->ul_spin_lock_spin = 0;
3141	ulwp->ul_spin_lock_spin2 = 0;
3142	ulwp->ul_spin_lock_sleep = 0;
3143	ulwp->ul_spin_lock_wakeup = 0;
3144}
3145
3146/*
3147 * atexit function:  dump the queue statistics to stderr.
3148 */
3149#if !defined(__lint)
3150#define	fprintf	_fprintf
3151#endif
3152#include <stdio.h>
3153void
3154dump_queue_statistics(void)
3155{
3156	uberdata_t *udp = curthread->ul_uberdata;
3157	queue_head_t *qp;
3158	int qn;
3159	uint64_t spin_lock_total = 0;
3160
3161	if (udp->queue_head == NULL || thread_queue_dump == 0)
3162		return;
3163
3164	if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
3165	    fprintf(stderr, "queue#   lockcount    max qlen\n") < 0)
3166		return;
3167	for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
3168		if (qp->qh_lockcount == 0)
3169			continue;
3170		spin_lock_total += qp->qh_lockcount;
3171		if (fprintf(stderr, "%5d %12llu%12u\n", qn,
3172			(u_longlong_t)qp->qh_lockcount, qp->qh_qmax) < 0)
3173				return;
3174	}
3175
3176	if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
3177	    fprintf(stderr, "queue#   lockcount    max qlen\n") < 0)
3178		return;
3179	for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
3180		if (qp->qh_lockcount == 0)
3181			continue;
3182		spin_lock_total += qp->qh_lockcount;
3183		if (fprintf(stderr, "%5d %12llu%12u\n", qn,
3184			(u_longlong_t)qp->qh_lockcount, qp->qh_qmax) < 0)
3185				return;
3186	}
3187
3188	(void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
3189		(u_longlong_t)spin_lock_total);
3190	(void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
3191		(u_longlong_t)spin_lock_spin);
3192	(void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
3193		(u_longlong_t)spin_lock_spin2);
3194	(void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
3195		(u_longlong_t)spin_lock_sleep);
3196	(void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
3197		(u_longlong_t)spin_lock_wakeup);
3198}
3199