1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28/*	  All Rights Reserved	*/
29
30#include <sys/param.h>
31#include <sys/types.h>
32#include <sys/sysmacros.h>
33#include <sys/systm.h>
34#include <sys/cred.h>
35#include <sys/user.h>
36#include <sys/errno.h>
37#include <sys/file.h>
38#include <sys/proc.h>
39#include <sys/prsystm.h>
40#include <sys/kmem.h>
41#include <sys/sobject.h>
42#include <sys/fault.h>
43#include <sys/procfs.h>
44#include <sys/watchpoint.h>
45#include <sys/time.h>
46#include <sys/cmn_err.h>
47#include <sys/machlock.h>
48#include <sys/debug.h>
49#include <sys/synch.h>
50#include <sys/synch32.h>
51#include <sys/mman.h>
52#include <sys/class.h>
53#include <sys/schedctl.h>
54#include <sys/sleepq.h>
55#include <sys/policy.h>
56#include <sys/tnf_probe.h>
57#include <sys/lwpchan_impl.h>
58#include <sys/turnstile.h>
59#include <sys/atomic.h>
60#include <sys/lwp_timer_impl.h>
61#include <sys/lwp_upimutex_impl.h>
62#include <vm/as.h>
63#include <sys/sdt.h>
64
65static kthread_t *lwpsobj_owner(caddr_t);
66static void lwp_unsleep(kthread_t *t);
67static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
68static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
69static void lwp_mutex_unregister(void *uaddr);
70static void set_owner_pid(lwp_mutex_t *, uintptr_t, pid_t);
71static int iswanted(kthread_t *, lwpchan_t *);
72
73extern int lwp_cond_signal(lwp_cond_t *cv);
74
75/*
76 * Maximum number of user priority-inheritance locks that can be held by a
77 * thread.  Used to limit kmem consumed by each thread.  This is a per-thread
78 * limit that can be administered on a system-wide basis (using /etc/system).
79 *
80 * Also, if a limit, say maxlwps, were added for the number of lwps within a
81 * process, the per-thread limit would automatically become a process-wide
82 * limit on the maximum number of held upi locks within a process:
83 *      maxheldupimx = maxnestupimx * maxlwps;
84 */
85static uint32_t maxnestupimx = 2000;
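/*
 * A worked instance of the relationship above (maxlwps is hypothetical; no
 * such tunable exists today): with the default maxnestupimx of 2000 and a
 * notional limit of 8 lwps per process,
 *      maxheldupimx = 2000 * 8 = 16000
 * upi locks could be held within a single process at any one time.
 */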
86
87/*
88 * The sobj_ops vector exports a set of functions needed when a thread
89 * is asleep on a synchronization object of this type.
90 */
91static sobj_ops_t lwp_sobj_ops = {
92	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
93};
94
95static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
96
97static sobj_ops_t lwp_sobj_pi_ops = {
98	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
99	turnstile_change_pri
100};
101
102static sleepq_head_t	lwpsleepq[NSLEEPQ];
103upib_t			upimutextab[UPIMUTEX_TABSIZE];
104
105#define	LWPCHAN_LOCK_SHIFT	10	/* 1024 locks for each pool */
106#define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)
107
108/*
109 * We know that both lc_wchan and lc_wchan0 are addresses that most
110 * likely are 8-byte aligned, so we shift off the low-order 3 bits.
111 * 'pool' is either 0 or 1.
112 */
113#define	LWPCHAN_LOCK_HASH(X, pool) \
114	(((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
115	(LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))
116
117static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
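/*
 * To make the hash concrete (an illustrative expansion only, using the
 * constants defined above): with LWPCHAN_LOCK_SHIFT == 10 the macro reduces to
 *
 *	index = (((X >> 3) ^ (X >> 13)) & 1023) + ((pool) ? 1024 : 0)
 *
 * so pool 0 selects a lock in lwpchanlock[0 .. 1023], pool 1 selects one in
 * lwpchanlock[1024 .. 2047], and values of X differing only in their
 * low-order 3 bits map to the same lock, consistent with the 8-byte
 * alignment assumption noted above.
 */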
118
119/*
120 * Is this a POSIX threads user-level lock requiring priority inheritance?
121 */
122#define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
123
124static sleepq_head_t *
125lwpsqhash(lwpchan_t *lwpchan)
126{
127	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
128	return (&lwpsleepq[SQHASHINDEX(x)]);
129}
130
131/*
132 * Lock an lwpchan.
133 * Keep this in sync with lwpchan_unlock(), below.
134 */
135static void
136lwpchan_lock(lwpchan_t *lwpchan, int pool)
137{
138	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
139	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
140}
141
142/*
143 * Unlock an lwpchan.
144 * Keep this in sync with lwpchan_lock(), above.
145 */
146static void
147lwpchan_unlock(lwpchan_t *lwpchan, int pool)
148{
149	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
150	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
151}
152
153/*
154 * Delete mappings from the lwpchan cache for pages that are being
155 * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
156 * all mappings within the range are deleted from the lwpchan cache.
157 */
158void
159lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
160{
161	lwpchan_data_t *lcp;
162	lwpchan_hashbucket_t *hashbucket;
163	lwpchan_hashbucket_t *endbucket;
164	lwpchan_entry_t *ent;
165	lwpchan_entry_t **prev;
166	caddr_t addr;
167
168	mutex_enter(&p->p_lcp_lock);
169	lcp = p->p_lcp;
170	hashbucket = lcp->lwpchan_cache;
171	endbucket = hashbucket + lcp->lwpchan_size;
172	for (; hashbucket < endbucket; hashbucket++) {
173		if (hashbucket->lwpchan_chain == NULL)
174			continue;
175		mutex_enter(&hashbucket->lwpchan_lock);
176		prev = &hashbucket->lwpchan_chain;
177		/* check entire chain */
178		while ((ent = *prev) != NULL) {
179			addr = ent->lwpchan_addr;
180			if (start <= addr && addr < end) {
181				*prev = ent->lwpchan_next;
182				/*
183				 * We do this only for the obsolete type
184				 * USYNC_PROCESS_ROBUST.  Otherwise robust
185				 * locks do not draw ELOCKUNMAPPED or
186				 * EOWNERDEAD due to being unmapped.
187				 */
188				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
189				    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
190					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
191				/*
192				 * If there is a user-level robust lock
193				 * registration, mark it as invalid.
194				 */
195				if ((addr = ent->lwpchan_uaddr) != NULL)
196					lwp_mutex_unregister(addr);
197				kmem_free(ent, sizeof (*ent));
198				atomic_add_32(&lcp->lwpchan_entries, -1);
199			} else {
200				prev = &ent->lwpchan_next;
201			}
202		}
203		mutex_exit(&hashbucket->lwpchan_lock);
204	}
205	mutex_exit(&p->p_lcp_lock);
206}
207
208/*
209 * Given an lwpchan cache pointer and a process virtual address,
210 * return a pointer to the corresponding lwpchan hash bucket.
211 */
212static lwpchan_hashbucket_t *
213lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
214{
215	uint_t i;
216
217	/*
218	 * All user-level sync object addresses are 8-byte aligned.
219	 * Ignore the lowest 3 bits of the address and use the
220	 * higher-order 2*lwpchan_bits bits for the hash index.
221	 */
222	addr >>= 3;
223	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
224	return (lcp->lwpchan_cache + i);
225}
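/*
 * Written as a single expression (illustration only), the bucket index
 * computed above is
 *
 *	i = ((addr >> 3) ^ (addr >> (3 + lwpchan_bits))) & (lwpchan_size - 1)
 *
 * For example, a hypothetical 4-bucket cache (lwpchan_bits == 2, smaller
 * than any real configuration) would place an address whose value after the
 * shift is 0x6 into bucket (0x6 ^ 0x1) & 0x3 == 0x7 & 0x3 == 3.
 */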
226
227/*
228 * (Re)allocate the per-process lwpchan cache.
229 */
230static void
231lwpchan_alloc_cache(proc_t *p, uint_t bits)
232{
233	lwpchan_data_t *lcp;
234	lwpchan_data_t *old_lcp;
235	lwpchan_hashbucket_t *hashbucket;
236	lwpchan_hashbucket_t *endbucket;
237	lwpchan_hashbucket_t *newbucket;
238	lwpchan_entry_t *ent;
239	lwpchan_entry_t *next;
240	uint_t count;
241
242	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
243
244	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
245	lcp->lwpchan_bits = bits;
246	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
247	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
248	lcp->lwpchan_entries = 0;
249	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
250	    sizeof (lwpchan_hashbucket_t), KM_SLEEP);
251	lcp->lwpchan_next_data = NULL;
252
253	mutex_enter(&p->p_lcp_lock);
254	if ((old_lcp = p->p_lcp) != NULL) {
255		if (old_lcp->lwpchan_bits >= bits) {
256			/* someone beat us to it */
257			mutex_exit(&p->p_lcp_lock);
258			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
259			    sizeof (lwpchan_hashbucket_t));
260			kmem_free(lcp, sizeof (lwpchan_data_t));
261			return;
262		}
263		/*
264		 * Acquire all of the old hash table locks.
265		 */
266		hashbucket = old_lcp->lwpchan_cache;
267		endbucket = hashbucket + old_lcp->lwpchan_size;
268		for (; hashbucket < endbucket; hashbucket++)
269			mutex_enter(&hashbucket->lwpchan_lock);
270		/*
271		 * Move all of the old hash table entries to the
272		 * new hash table.  The new hash table has not yet
273		 * been installed so we don't need any of its locks.
274		 */
275		count = 0;
276		hashbucket = old_lcp->lwpchan_cache;
277		for (; hashbucket < endbucket; hashbucket++) {
278			ent = hashbucket->lwpchan_chain;
279			while (ent != NULL) {
280				next = ent->lwpchan_next;
281				newbucket = lwpchan_bucket(lcp,
282				    (uintptr_t)ent->lwpchan_addr);
283				ent->lwpchan_next = newbucket->lwpchan_chain;
284				newbucket->lwpchan_chain = ent;
285				ent = next;
286				count++;
287			}
288			hashbucket->lwpchan_chain = NULL;
289		}
290		lcp->lwpchan_entries = count;
291	}
292
293	/*
294	 * Retire the old hash table.  We can't actually kmem_free() it
295	 * now because someone may still have a pointer to it.  Instead,
296	 * we link it onto the new hash table's list of retired hash tables.
297	 * The new hash table is double the size of the previous one, so
298	 * the total size of all retired hash tables is less than the size
299	 * of the new one.  exit() and exec() free the retired hash tables
300	 * (see lwpchan_destroy_cache(), below).
301	 */
302	lcp->lwpchan_next_data = old_lcp;
303
304	/*
305	 * As soon as we store the new lcp, future locking operations will
306	 * use it.  Therefore, we must ensure that all the state we've just
307	 * established reaches global visibility before the new lcp does.
308	 */
309	membar_producer();
310	p->p_lcp = lcp;
311
312	if (old_lcp != NULL) {
313		/*
314		 * Release all of the old hash table locks.
315		 */
316		hashbucket = old_lcp->lwpchan_cache;
317		for (; hashbucket < endbucket; hashbucket++)
318			mutex_exit(&hashbucket->lwpchan_lock);
319	}
320	mutex_exit(&p->p_lcp_lock);
321}
322
323/*
324 * Deallocate the lwpchan cache, and any dynamically allocated mappings.
325 * Called when the process exits or execs.  All lwps except one have
326 * exited so we need no locks here.
327 */
328void
329lwpchan_destroy_cache(int exec)
330{
331	proc_t *p = curproc;
332	lwpchan_hashbucket_t *hashbucket;
333	lwpchan_hashbucket_t *endbucket;
334	lwpchan_data_t *lcp;
335	lwpchan_entry_t *ent;
336	lwpchan_entry_t *next;
337	uint16_t lockflg;
338
339	lcp = p->p_lcp;
340	p->p_lcp = NULL;
341
342	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
343	hashbucket = lcp->lwpchan_cache;
344	endbucket = hashbucket + lcp->lwpchan_size;
345	for (; hashbucket < endbucket; hashbucket++) {
346		ent = hashbucket->lwpchan_chain;
347		hashbucket->lwpchan_chain = NULL;
348		while (ent != NULL) {
349			next = ent->lwpchan_next;
350			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
351			    (ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
352			    == (USYNC_PROCESS | LOCK_ROBUST))
353				lwp_mutex_cleanup(ent, lockflg);
354			kmem_free(ent, sizeof (*ent));
355			ent = next;
356		}
357	}
358
359	while (lcp != NULL) {
360		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
361		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
362		    sizeof (lwpchan_hashbucket_t));
363		kmem_free(lcp, sizeof (lwpchan_data_t));
364		lcp = next_lcp;
365	}
366}
367
368/*
369 * Return zero when there is an entry in the lwpchan cache for the
370 * given process virtual address and non-zero when there is not.
371 * The returned non-zero value is the current length of the
372 * hash chain plus one.  The caller holds the hash bucket lock.
373 */
374static uint_t
375lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
376	lwpchan_hashbucket_t *hashbucket)
377{
378	lwpchan_entry_t *ent;
379	uint_t count = 1;
380
381	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
382		if (ent->lwpchan_addr == addr) {
383			if (ent->lwpchan_type != type ||
384			    ent->lwpchan_pool != pool) {
385				/*
386				 * This shouldn't happen, but might if the
387				 * process reuses its memory for different
388				 * types of sync objects.  We test first
389				 * to avoid grabbing the memory cache line.
390				 */
391				ent->lwpchan_type = (uint16_t)type;
392				ent->lwpchan_pool = (uint16_t)pool;
393			}
394			*lwpchan = ent->lwpchan_lwpchan;
395			return (0);
396		}
397		count++;
398	}
399	return (count);
400}
401
402/*
403 * Return the cached lwpchan mapping if cached, otherwise insert
404 * a virtual address to lwpchan mapping into the cache.
405 */
406static int
407lwpchan_get_mapping(struct as *as, caddr_t addr, caddr_t uaddr,
408	int type, lwpchan_t *lwpchan, int pool)
409{
410	proc_t *p = curproc;
411	lwpchan_data_t *lcp;
412	lwpchan_hashbucket_t *hashbucket;
413	lwpchan_entry_t *ent;
414	memid_t	memid;
415	uint_t count;
416	uint_t bits;
417
418top:
419	/* initialize the lwpchan cache, if necessary */
420	if ((lcp = p->p_lcp) == NULL) {
421		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
422		goto top;
423	}
424	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
425	mutex_enter(&hashbucket->lwpchan_lock);
426	if (lcp != p->p_lcp) {
427		/* someone resized the lwpchan cache; start over */
428		mutex_exit(&hashbucket->lwpchan_lock);
429		goto top;
430	}
431	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
432		/* it's in the cache */
433		mutex_exit(&hashbucket->lwpchan_lock);
434		return (1);
435	}
436	mutex_exit(&hashbucket->lwpchan_lock);
437	if (as_getmemid(as, addr, &memid) != 0)
438		return (0);
439	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
440	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
441	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
442	mutex_enter(&hashbucket->lwpchan_lock);
443	if (lcp != p->p_lcp) {
444		/* someone resized the lwpchan cache; start over */
445		mutex_exit(&hashbucket->lwpchan_lock);
446		kmem_free(ent, sizeof (*ent));
447		goto top;
448	}
449	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
450	if (count == 0) {
451		/* someone else added this entry to the cache */
452		mutex_exit(&hashbucket->lwpchan_lock);
453		kmem_free(ent, sizeof (*ent));
454		return (1);
455	}
456	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
457	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
458		/* hash chain too long; reallocate the hash table */
459		mutex_exit(&hashbucket->lwpchan_lock);
460		kmem_free(ent, sizeof (*ent));
461		lwpchan_alloc_cache(p, bits + 1);
462		goto top;
463	}
464	ent->lwpchan_addr = addr;
465	ent->lwpchan_uaddr = uaddr;
466	ent->lwpchan_type = (uint16_t)type;
467	ent->lwpchan_pool = (uint16_t)pool;
468	ent->lwpchan_lwpchan = *lwpchan;
469	ent->lwpchan_next = hashbucket->lwpchan_chain;
470	hashbucket->lwpchan_chain = ent;
471	atomic_add_32(&lcp->lwpchan_entries, 1);
472	mutex_exit(&hashbucket->lwpchan_lock);
473	return (1);
474}
475
476/*
477 * Return a unique pair of identifiers that corresponds to a
478 * synchronization object's virtual address.  Process-shared
479 * sync objects usually get vnode/offset from as_getmemid().
480 */
481static int
482get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
483{
484	/*
485	 * If the lwp synch object is defined to be process-private,
486	 * we just make the first field of the lwpchan be 'as' and
487	 * the second field be the synch object's virtual address.
488	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
489	 * The lwpchan cache is used only for process-shared objects.
490	 */
491	if (!(type & USYNC_PROCESS)) {
492		lwpchan->lc_wchan0 = (caddr_t)as;
493		lwpchan->lc_wchan = addr;
494		return (1);
495	}
496
497	return (lwpchan_get_mapping(as, addr, NULL, type, lwpchan, pool));
498}
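/*
 * A typical caller pattern, distilled from the functions below (a sketch
 * only; error handling abbreviated):
 *
 *	lwpchan_t lwpchan;
 *
 *	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
 *	    &lwpchan, LWPCHAN_MPPOOL)) {
 *		error = EFAULT;
 *		goto out;
 *	}
 *	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
 *	... examine or update the user-level synch object ...
 *	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
 *
 * Process-private objects never touch the lwpchan cache; only USYNC_PROCESS
 * objects go through lwpchan_get_mapping() and as_getmemid().
 */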
499
500static void
501lwp_block(lwpchan_t *lwpchan)
502{
503	kthread_t *t = curthread;
504	klwp_t *lwp = ttolwp(t);
505	sleepq_head_t *sqh;
506
507	thread_lock(t);
508	t->t_flag |= T_WAKEABLE;
509	t->t_lwpchan = *lwpchan;
510	t->t_sobj_ops = &lwp_sobj_ops;
511	t->t_release = 0;
512	sqh = lwpsqhash(lwpchan);
513	disp_lock_enter_high(&sqh->sq_lock);
514	CL_SLEEP(t);
515	DTRACE_SCHED(sleep);
516	THREAD_SLEEP(t, &sqh->sq_lock);
517	sleepq_insert(&sqh->sq_queue, t);
518	thread_unlock(t);
519	lwp->lwp_asleep = 1;
520	lwp->lwp_sysabort = 0;
521	lwp->lwp_ru.nvcsw++;
522	(void) new_mstate(curthread, LMS_SLEEP);
523}
524
525static kthread_t *
526lwpsobj_pi_owner(upimutex_t *up)
527{
528	return (up->upi_owner);
529}
530
531static struct upimutex *
532upi_get(upib_t *upibp, lwpchan_t *lcp)
533{
534	struct upimutex *upip;
535
536	for (upip = upibp->upib_first; upip != NULL;
537	    upip = upip->upi_nextchain) {
538		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
539		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
540			break;
541	}
542	return (upip);
543}
544
545static void
546upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
547{
548	ASSERT(MUTEX_HELD(&upibp->upib_lock));
549
550	/*
551	 * Insert upimutex at front of list. Maybe a bit unfair
552	 * but assume that not many lwpchans hash to the same
553	 * upimutextab bucket, i.e. the list of upimutexes from
554	 * upib_first is not too long.
555	 */
556	upimutex->upi_nextchain = upibp->upib_first;
557	upibp->upib_first = upimutex;
558}
559
560static void
561upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
562{
563	struct upimutex **prev;
564
565	ASSERT(MUTEX_HELD(&upibp->upib_lock));
566
567	prev = &upibp->upib_first;
568	while (*prev != upimutex) {
569		prev = &(*prev)->upi_nextchain;
570	}
571	*prev = upimutex->upi_nextchain;
572	upimutex->upi_nextchain = NULL;
573}
574
575/*
576 * Add upimutex to chain of upimutexes held by curthread.
577 * Returns number of upimutexes held by curthread.
578 */
579static uint32_t
580upi_mylist_add(struct upimutex *upimutex)
581{
582	kthread_t *t = curthread;
583
584	/*
585	 * Insert upimutex at front of list of upimutexes owned by t. This
586	 * would match typical LIFO order in which nested locks are acquired
587	 * and released.
588	 */
589	upimutex->upi_nextowned = t->t_upimutex;
590	t->t_upimutex = upimutex;
591	t->t_nupinest++;
592	ASSERT(t->t_nupinest > 0);
593	return (t->t_nupinest);
594}
595
596/*
597 * Delete upimutex from list of upimutexes owned by curthread.
598 */
599static void
600upi_mylist_del(struct upimutex *upimutex)
601{
602	kthread_t *t = curthread;
603	struct upimutex **prev;
604
605	/*
606	 * Since the order in which nested locks are acquired and released
607	 * is typically LIFO, and typical nesting levels are not too deep, the
608	 * following should not be expensive in the general case.
609	 */
610	prev = &t->t_upimutex;
611	while (*prev != upimutex) {
612		prev = &(*prev)->upi_nextowned;
613	}
614	*prev = upimutex->upi_nextowned;
615	upimutex->upi_nextowned = NULL;
616	ASSERT(t->t_nupinest > 0);
617	t->t_nupinest--;
618}
619
620/*
621 * Returns true if the upimutex is owned by curthread.  Should be called
622 * only when upim points to kmem which cannot disappear from underneath.
623 */
624static int
625upi_owned(upimutex_t *upim)
626{
627	return (upim->upi_owner == curthread);
628}
629
630/*
631 * Returns pointer to the kernel object (upimutex_t *) if curthread owns lp.
632 */
633static struct upimutex *
634lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
635{
636	lwpchan_t lwpchan;
637	upib_t *upibp;
638	struct upimutex *upimutex;
639
640	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
641	    &lwpchan, LWPCHAN_MPPOOL))
642		return (NULL);
643
644	upibp = &UPI_CHAIN(lwpchan);
645	mutex_enter(&upibp->upib_lock);
646	upimutex = upi_get(upibp, &lwpchan);
647	if (upimutex == NULL || upimutex->upi_owner != curthread) {
648		mutex_exit(&upibp->upib_lock);
649		return (NULL);
650	}
651	mutex_exit(&upibp->upib_lock);
652	return (upimutex);
653}
654
655/*
656 * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
657 * no lock hand-off occurs.
658 */
659static void
660upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
661{
662	turnstile_t *ts;
663	upib_t *upibp;
664	kthread_t *newowner;
665
666	upi_mylist_del(upimutex);
667	upibp = upimutex->upi_upibp;
668	mutex_enter(&upibp->upib_lock);
669	if (upimutex->upi_waiter != 0) { /* if waiters */
670		ts = turnstile_lookup(upimutex);
671		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
672			/* hand-off lock to highest prio waiter */
673			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
674			upimutex->upi_owner = newowner;
675			if (ts->ts_waiters == 1)
676				upimutex->upi_waiter = 0;
677			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
678			mutex_exit(&upibp->upib_lock);
679			return;
680		} else if (ts != NULL) {
681			/* LOCK_NOTRECOVERABLE: wakeup all */
682			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
683		} else {
684			/*
685			 * Misleading w bit. Waiters might have been
686			 * interrupted. No need to clear the w bit (upimutex
687			 * will soon be freed). Re-calculate PI from existing
688			 * waiters.
689			 */
690			turnstile_exit(upimutex);
691			turnstile_pi_recalc();
692		}
693	}
694	/*
695	 * no waiters, or LOCK_NOTRECOVERABLE.
696	 * remove from the bucket chain of upi mutexes.
697	 * de-allocate kernel memory (upimutex).
698	 */
699	upi_chain_del(upimutex->upi_upibp, upimutex);
700	mutex_exit(&upibp->upib_lock);
701	kmem_free(upimutex, sizeof (upimutex_t));
702}
703
704static int
705lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
706{
707	label_t ljb;
708	int error = 0;
709	lwpchan_t lwpchan;
710	uint16_t flag;
711	upib_t *upibp;
712	volatile struct upimutex *upimutex = NULL;
713	turnstile_t *ts;
714	uint32_t nupinest;
715	volatile int upilocked = 0;
716
717	if (on_fault(&ljb)) {
718		if (upilocked)
719			upimutex_unlock((upimutex_t *)upimutex, 0);
720		error = EFAULT;
721		goto out;
722	}
723	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
724	    &lwpchan, LWPCHAN_MPPOOL)) {
725		error = EFAULT;
726		goto out;
727	}
728	upibp = &UPI_CHAIN(lwpchan);
729retry:
730	mutex_enter(&upibp->upib_lock);
731	upimutex = upi_get(upibp, &lwpchan);
732	if (upimutex == NULL)  {
733		/* lock available since lwpchan has no upimutex */
734		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
735		upi_chain_add(upibp, (upimutex_t *)upimutex);
736		upimutex->upi_owner = curthread; /* grab lock */
737		upimutex->upi_upibp = upibp;
738		upimutex->upi_vaddr = lp;
739		upimutex->upi_lwpchan = lwpchan;
740		mutex_exit(&upibp->upib_lock);
741		nupinest = upi_mylist_add((upimutex_t *)upimutex);
742		upilocked = 1;
743		fuword16_noerr(&lp->mutex_flag, &flag);
744		if (nupinest > maxnestupimx &&
745		    secpolicy_resource(CRED()) != 0) {
746			upimutex_unlock((upimutex_t *)upimutex, flag);
747			error = ENOMEM;
748			goto out;
749		}
750		if (flag & LOCK_NOTRECOVERABLE) {
751			/*
752			 * Since the setting of LOCK_NOTRECOVERABLE
753			 * was done under the high-level upi mutex,
754			 * in lwp_upimutex_unlock(), this flag needs to
755			 * be checked while holding the upi mutex.
756			 * If set, this thread should return without
757			 * the lock held, and with the right error code.
758			 */
759			upimutex_unlock((upimutex_t *)upimutex, flag);
760			upilocked = 0;
761			error = ENOTRECOVERABLE;
762		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
763			if (flag & LOCK_OWNERDEAD)
764				error = EOWNERDEAD;
765			else if (type & USYNC_PROCESS_ROBUST)
766				error = ELOCKUNMAPPED;
767			else
768				error = EOWNERDEAD;
769		}
770		goto out;
771	}
772	/*
773	 * If a upimutex object exists, it must have an owner.
774	 * This is due to lock hand-off, and release of upimutex when no
775	 * waiters are present at unlock time.
776	 */
777	ASSERT(upimutex->upi_owner != NULL);
778	if (upimutex->upi_owner == curthread) {
779		/*
780		 * The user wrapper can check if the mutex type is
781		 * ERRORCHECK: if not, it should stall at user-level.
782		 * If so, it should return the error code.
783		 */
784		mutex_exit(&upibp->upib_lock);
785		error = EDEADLK;
786		goto out;
787	}
788	if (try == UPIMUTEX_TRY) {
789		mutex_exit(&upibp->upib_lock);
790		error = EBUSY;
791		goto out;
792	}
793	/*
794	 * Block for the lock.
795	 */
796	if ((error = lwptp->lwpt_time_error) != 0) {
797		/*
798		 * The SUSV3 Posix spec is very clear that we
799		 * should get no error from validating the
800		 * timer until we would actually sleep.
801		 */
802		mutex_exit(&upibp->upib_lock);
803		goto out;
804	}
805	if (lwptp->lwpt_tsp != NULL) {
806		/*
807		 * Unlike the protocol for other lwp timedwait operations,
808		 * we must drop t_delay_lock before going to sleep in
809		 * turnstile_block() for a upi mutex.
810		 * See the comments below and in turnstile.c
811		 */
812		mutex_enter(&curthread->t_delay_lock);
813		(void) lwp_timer_enqueue(lwptp);
814		mutex_exit(&curthread->t_delay_lock);
815	}
816	/*
817	 * Now, set the waiter bit and block for the lock in turnstile_block().
818	 * No need to preserve the previous wbit since a lock try is not
819	 * attempted after setting the wait bit. Wait bit is set under
820	 * the upib_lock, which is not released until the turnstile lock
821	 * is acquired. Say, the upimutex is L:
822	 *
823	 * 1. upib_lock is held so the waiter does not have to retry L after
824	 *    setting the wait bit: since the owner has to grab the upib_lock
825	 *    to unlock L, it will certainly see the wait bit set.
826	 * 2. upib_lock is not released until the turnstile lock is acquired.
827	 *    This is the key to preventing a missed wake-up. Otherwise, the
828	 *    owner could acquire the upib_lock, and the tc_lock, to call
829	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
830	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
831	 *    find this waiter, resulting in the missed wakeup.
832	 * 3. The upib_lock, being a kernel mutex, cannot be released while
833	 *    holding the tc_lock (since mutex_exit() could need to acquire
834	 *    the same tc_lock)...and so is held when calling turnstile_block().
835	 *    The address of upib_lock is passed to turnstile_block() which
836	 *    releases it after releasing all turnstile locks, and before going
837	 *    to sleep in swtch().
838	 * 4. The waiter value cannot be a count of waiters, because a waiter
839	 *    can be interrupted. The interrupt occurs under the tc_lock, at
840	 *    which point, the upib_lock cannot be locked, to decrement waiter
841	 *    count. So, just treat the waiter state as a bit, not a count.
842	 */
843	ts = turnstile_lookup((upimutex_t *)upimutex);
844	upimutex->upi_waiter = 1;
845	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
846	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
847	/*
848	 * Hand-off implies that we wakeup holding the lock, except when:
849	 *	- deadlock is detected
850	 *	- lock is not recoverable
851	 *	- we got an interrupt or timeout
852	 * If we wake up due to an interrupt or timeout, we may
853	 * or may not be holding the lock due to mutex hand-off.
854	 * Use lwp_upimutex_owned() to check if we do hold the lock.
855	 */
856	if (error != 0) {
857		if ((error == EINTR || error == ETIME) &&
858		    (upimutex = lwp_upimutex_owned(lp, type))) {
859			/*
860			 * Unlock and return - the re-startable syscall will
861			 * try the lock again if we got EINTR.
862			 */
863			(void) upi_mylist_add((upimutex_t *)upimutex);
864			upimutex_unlock((upimutex_t *)upimutex, 0);
865		}
866		/*
867		 * The only other possible error is EDEADLK.  If so, upimutex
868		 * is valid, since its owner is deadlocked with curthread.
869		 */
870		ASSERT(error == EINTR || error == ETIME ||
871		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
872		ASSERT(!lwp_upimutex_owned(lp, type));
873		goto out;
874	}
875	if (lwp_upimutex_owned(lp, type)) {
876		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
877		nupinest = upi_mylist_add((upimutex_t *)upimutex);
878		upilocked = 1;
879	}
880	/*
881	 * Now, need to read the user-level lp->mutex_flag to do the following:
882	 *
883	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
884	 *   should be returned.
885	 * - if lock isn't held, check if ENOTRECOVERABLE should
886	 *   be returned.
887	 *
888	 * Now, either lp->mutex_flag is readable or it's not. If not
889	 * readable, the on_fault path will cause a return with EFAULT
890	 * as it should.  If it is readable, the state of the flag
891	 * encodes the robustness state of the lock:
892	 *
893	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
894	 * or LOCK_UNMAPPED setting will influence the return code
895	 * appropriately.  If the upimutex is not locked here, this
896	 * could be due to a spurious wake-up or a NOTRECOVERABLE
897	 * event.  The flag's setting can be used to distinguish
898	 * between these two events.
899	 */
900	fuword16_noerr(&lp->mutex_flag, &flag);
901	if (upilocked) {
902		/*
903		 * If the thread wakes up from turnstile_block with the lock
904		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
905		 * since it would not have been handed-off the lock.
906		 * So, no need to check for this case.
907		 */
908		if (nupinest > maxnestupimx &&
909		    secpolicy_resource(CRED()) != 0) {
910			upimutex_unlock((upimutex_t *)upimutex, flag);
911			upilocked = 0;
912			error = ENOMEM;
913		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
914			if (flag & LOCK_OWNERDEAD)
915				error = EOWNERDEAD;
916			else if (type & USYNC_PROCESS_ROBUST)
917				error = ELOCKUNMAPPED;
918			else
919				error = EOWNERDEAD;
920		}
921	} else {
922		/*
923		 * Wake-up without the upimutex held. Either this is a
924		 * spurious wake-up (due to signals, forkall(), whatever), or
925		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
926		 * of the mutex flag can be used to distinguish between the
927		 * two events.
928		 */
929		if (flag & LOCK_NOTRECOVERABLE) {
930			error = ENOTRECOVERABLE;
931		} else {
932			/*
933			 * Here, the flag could be set to LOCK_OWNERDEAD or
934			 * not. In both cases, this is a spurious wakeup,
935			 * since the upi lock is not held, but the thread
936			 * has returned from turnstile_block().
937			 *
938			 * The user flag could be LOCK_OWNERDEAD if, at the
939			 * same time as curthread having been woken up
940			 * spuriously, the owner (say Tdead) has died, marked
941			 * the mutex flag accordingly, and handed off the lock
942			 * to some other waiter (say Tnew). curthread just
943			 * happened to read the flag while Tnew has yet to deal
944			 * with the owner-dead event.
945			 *
946			 * In this event, curthread should retry the lock.
947			 * If Tnew is able to clean up the lock, curthread
948			 * will eventually get the lock with a zero error code.
949			 * If Tnew is unable to clean up, its eventual call to
950			 * unlock the lock will result in the mutex flag being
951			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
952			 * all waiters, including curthread, which will then
953			 * eventually return ENOTRECOVERABLE due to the above
954			 * check.
955			 *
956			 * Of course, if the user-flag is not set with
957			 * LOCK_OWNERDEAD, retrying is the thing to do, since
958			 * this is definitely a spurious wakeup.
959			 */
960			goto retry;
961		}
962	}
963
964out:
965	no_fault();
966	return (error);
967}
968
969
970static int
971lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
972{
973	label_t ljb;
974	int error = 0;
975	lwpchan_t lwpchan;
976	uint16_t flag;
977	upib_t *upibp;
978	volatile struct upimutex *upimutex = NULL;
979	volatile int upilocked = 0;
980
981	if (on_fault(&ljb)) {
982		if (upilocked)
983			upimutex_unlock((upimutex_t *)upimutex, 0);
984		error = EFAULT;
985		goto out;
986	}
987	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
988	    &lwpchan, LWPCHAN_MPPOOL)) {
989		error = EFAULT;
990		goto out;
991	}
992	upibp = &UPI_CHAIN(lwpchan);
993	mutex_enter(&upibp->upib_lock);
994	upimutex = upi_get(upibp, &lwpchan);
995	/*
996	 * If the lock is not held, or the owner is not curthread, return
997	 * error. The user-level wrapper can return this error or stall,
998	 * depending on whether mutex is of ERRORCHECK type or not.
999	 */
1000	if (upimutex == NULL || upimutex->upi_owner != curthread) {
1001		mutex_exit(&upibp->upib_lock);
1002		error = EPERM;
1003		goto out;
1004	}
1005	mutex_exit(&upibp->upib_lock); /* release for user memory access */
1006	upilocked = 1;
1007	fuword16_noerr(&lp->mutex_flag, &flag);
1008	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1009		/*
1010		 * transition mutex to the LOCK_NOTRECOVERABLE state.
1011		 */
1012		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
1013		flag |= LOCK_NOTRECOVERABLE;
1014		suword16_noerr(&lp->mutex_flag, flag);
1015	}
1016	set_owner_pid(lp, 0, 0);
1017	upimutex_unlock((upimutex_t *)upimutex, flag);
1018	upilocked = 0;
1019out:
1020	no_fault();
1021	return (error);
1022}
1023
1024/*
1025 * Set the owner and ownerpid fields of a user-level mutex.
1026 */
1027static void
1028set_owner_pid(lwp_mutex_t *lp, uintptr_t owner, pid_t pid)
1029{
1030	union {
1031		uint64_t word64;
1032		uint32_t word32[2];
1033	} un;
1034
1035	un.word64 = (uint64_t)owner;
1036
1037	suword32_noerr(&lp->mutex_ownerpid, pid);
1038#if defined(_LP64)
1039	if (((uintptr_t)lp & (_LONG_LONG_ALIGNMENT - 1)) == 0) { /* aligned */
1040		suword64_noerr(&lp->mutex_owner, un.word64);
1041		return;
1042	}
1043#endif
1044	/* mutex is unaligned or we are running on a 32-bit kernel */
1045	suword32_noerr((uint32_t *)&lp->mutex_owner, un.word32[0]);
1046	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, un.word32[1]);
1047}
1048
1049/*
1050 * Clear the contents of a user-level mutex; return the flags.
1051 * Used only by upi_dead() and lwp_mutex_cleanup(), below.
1052 */
1053static uint16_t
1054lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
1055{
1056	uint16_t flag;
1057
1058	fuword16_noerr(&lp->mutex_flag, &flag);
1059	if ((flag &
1060	    (LOCK_OWNERDEAD | LOCK_UNMAPPED | LOCK_NOTRECOVERABLE)) == 0) {
1061		flag |= lockflg;
1062		suword16_noerr(&lp->mutex_flag, flag);
1063	}
1064	set_owner_pid(lp, 0, 0);
1065	suword8_noerr(&lp->mutex_rcount, 0);
1066
1067	return (flag);
1068}
1069
1070/*
1071 * Mark user mutex state, corresponding to kernel upimutex,
1072 * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
1073 */
1074static int
1075upi_dead(upimutex_t *upip, uint16_t lockflg)
1076{
1077	label_t ljb;
1078	int error = 0;
1079	lwp_mutex_t *lp;
1080
1081	if (on_fault(&ljb)) {
1082		error = EFAULT;
1083		goto out;
1084	}
1085
1086	lp = upip->upi_vaddr;
1087	(void) lwp_clear_mutex(lp, lockflg);
1088	suword8_noerr(&lp->mutex_lockw, 0);
1089out:
1090	no_fault();
1091	return (error);
1092}
1093
1094/*
1095 * Unlock all upimutexes held by curthread, since curthread is dying.
1096 * For each upimutex, attempt to mark its corresponding user mutex object as
1097 * dead.
1098 */
1099void
1100upimutex_cleanup()
1101{
1102	kthread_t *t = curthread;
1103	uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
1104	    LOCK_UNMAPPED : LOCK_OWNERDEAD;
1105	struct upimutex *upip;
1106
1107	while ((upip = t->t_upimutex) != NULL) {
1108		if (upi_dead(upip, lockflg) != 0) {
1109			/*
1110			 * If the user object associated with this upimutex is
1111			 * unmapped, unlock upimutex with the
1112			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1113			 * woken up. Since user object is unmapped, it could
1114			 * not be marked as dead or notrecoverable.
1115			 * The waiters will now all wake up and return
1116			 * ENOTRECOVERABLE, since they would find that the lock
1117			 * has not been handed-off to them.
1118			 * See lwp_upimutex_lock().
1119			 */
1120			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1121		} else {
1122			/*
1123			 * The user object has been updated as dead.
1124			 * Unlock the upimutex: if no waiters, upip kmem will
1125			 * be freed. If there is a waiter, the lock will be
1126			 * handed off. If exit() is in progress, each existing
1127			 * waiter will successively get the lock, as owners
1128			 * die, and each new owner will call this routine as
1129			 * it dies. The last owner will free kmem, since
1130			 * it will find the upimutex has no waiters. So,
1131			 * eventually, the kmem is guaranteed to be freed.
1132			 */
1133			upimutex_unlock(upip, 0);
1134		}
1135		/*
1136		 * Note that the call to upimutex_unlock() above will delete
1137		 * upimutex from the t_upimutexes chain. And so the
1138		 * while loop will eventually terminate.
1139		 */
1140	}
1141}
1142
1143int
1144lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp, uintptr_t owner)
1145{
1146	kthread_t *t = curthread;
1147	klwp_t *lwp = ttolwp(t);
1148	proc_t *p = ttoproc(t);
1149	lwp_timer_t lwpt;
1150	caddr_t timedwait;
1151	int error = 0;
1152	int time_error;
1153	clock_t tim = -1;
1154	uchar_t waiters;
1155	volatile int locked = 0;
1156	volatile int watched = 0;
1157	label_t ljb;
1158	volatile uint8_t type = 0;
1159	lwpchan_t lwpchan;
1160	sleepq_head_t *sqh;
1161	uint16_t flag;
1162	int imm_timeout = 0;
1163
1164	if ((caddr_t)lp >= p->p_as->a_userlimit)
1165		return (set_errno(EFAULT));
1166
1167	/*
1168	 * Put the lwp in an orderly state for debugging,
1169	 * in case we are stopped while sleeping, below.
1170	 */
1171	prstop(PR_REQUESTED, 0);
1172
1173	timedwait = (caddr_t)tsp;
1174	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1175	    lwpt.lwpt_imm_timeout) {
1176		imm_timeout = 1;
1177		timedwait = NULL;
1178	}
1179
1180	/*
1181	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1182	 * this micro state is really a run state. If the thread indeed blocks,
1183	 * this state becomes valid. If not, the state is converted back to
1184	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1185	 * when blocking.
1186	 */
1187	(void) new_mstate(t, LMS_USER_LOCK);
1188	if (on_fault(&ljb)) {
1189		if (locked)
1190			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1191		error = EFAULT;
1192		goto out;
1193	}
1194	/*
1195	 * Force Copy-on-write if necessary and ensure that the
1196	 * synchronization object resides in read/write memory.
1197	 * Cause an EFAULT return now if this is not so.
1198	 */
1199	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1200	suword8_noerr(&lp->mutex_type, type);
1201	if (UPIMUTEX(type)) {
1202		no_fault();
1203		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
1204		if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED)
1205			set_owner_pid(lp, owner,
1206			    (type & USYNC_PROCESS)? p->p_pid : 0);
1207		if (tsp && !time_error)	/* copyout the residual time left */
1208			error = lwp_timer_copyout(&lwpt, error);
1209		if (error)
1210			return (set_errno(error));
1211		return (0);
1212	}
1213	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1214	    &lwpchan, LWPCHAN_MPPOOL)) {
1215		error = EFAULT;
1216		goto out;
1217	}
1218	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1219	locked = 1;
1220	if (type & LOCK_ROBUST) {
1221		fuword16_noerr(&lp->mutex_flag, &flag);
1222		if (flag & LOCK_NOTRECOVERABLE) {
1223			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1224			error = ENOTRECOVERABLE;
1225			goto out;
1226		}
1227	}
1228	fuword8_noerr(&lp->mutex_waiters, &waiters);
1229	suword8_noerr(&lp->mutex_waiters, 1);
1230
1231	/*
1232	 * If watchpoints are set, they need to be restored, since
1233	 * atomic accesses of memory such as the call to ulock_try()
1234	 * below cannot be watched.
1235	 */
1236
1237	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1238
1239	while (!ulock_try(&lp->mutex_lockw)) {
1240		if (time_error) {
1241			/*
1242			 * The SUSV3 Posix spec is very clear that we
1243			 * should get no error from validating the
1244			 * timer until we would actually sleep.
1245			 */
1246			error = time_error;
1247			break;
1248		}
1249
1250		if (watched) {
1251			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1252			watched = 0;
1253		}
1254
1255		if (timedwait) {
1256			/*
1257			 * If we successfully queue the timeout,
1258			 * then don't drop t_delay_lock until
1259			 * we are on the sleep queue (below).
1260			 */
1261			mutex_enter(&t->t_delay_lock);
1262			if (lwp_timer_enqueue(&lwpt) != 0) {
1263				mutex_exit(&t->t_delay_lock);
1264				imm_timeout = 1;
1265				timedwait = NULL;
1266			}
1267		}
1268		lwp_block(&lwpchan);
1269		/*
1270		 * Nothing should happen to cause the lwp to go to
1271		 * sleep again until after it returns from swtch().
1272		 */
1273		if (timedwait)
1274			mutex_exit(&t->t_delay_lock);
1275		locked = 0;
1276		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1277		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1278			setrun(t);
1279		swtch();
1280		t->t_flag &= ~T_WAKEABLE;
1281		if (timedwait)
1282			tim = lwp_timer_dequeue(&lwpt);
1283		setallwatch();
1284		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1285			error = EINTR;
1286		else if (imm_timeout || (timedwait && tim == -1))
1287			error = ETIME;
1288		if (error) {
1289			lwp->lwp_asleep = 0;
1290			lwp->lwp_sysabort = 0;
1291			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1292			    S_WRITE);
1293
1294			/*
1295			 * Need to re-compute waiters bit. The waiters field in
1296			 * the lock is not reliable. Either of two things could
1297			 * have occurred: no lwp may have called lwp_release()
1298			 * for me but I have woken up due to a signal or
1299			 * timeout.  In this case, the waiter bit is incorrect
1300			 * since it is still set to 1, set above.
1301			 * OR an lwp_release() did occur for some other lwp on
1302			 * the same lwpchan. In this case, the waiter bit is
1303			 * correct.  One can't tell which event occurred,
1304			 * so recompute.
1305			 */
1306			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1307			locked = 1;
1308			sqh = lwpsqhash(&lwpchan);
1309			disp_lock_enter(&sqh->sq_lock);
1310			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1311			disp_lock_exit(&sqh->sq_lock);
1312			break;
1313		}
1314		lwp->lwp_asleep = 0;
1315		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1316		    S_WRITE);
1317		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1318		locked = 1;
1319		fuword8_noerr(&lp->mutex_waiters, &waiters);
1320		suword8_noerr(&lp->mutex_waiters, 1);
1321		if (type & LOCK_ROBUST) {
1322			fuword16_noerr(&lp->mutex_flag, &flag);
1323			if (flag & LOCK_NOTRECOVERABLE) {
1324				error = ENOTRECOVERABLE;
1325				break;
1326			}
1327		}
1328	}
1329
1330	if (t->t_mstate == LMS_USER_LOCK)
1331		(void) new_mstate(t, LMS_SYSTEM);
1332
1333	if (error == 0) {
1334		set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
1335		if (type & LOCK_ROBUST) {
1336			fuword16_noerr(&lp->mutex_flag, &flag);
1337			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1338				if (flag & LOCK_OWNERDEAD)
1339					error = EOWNERDEAD;
1340				else if (type & USYNC_PROCESS_ROBUST)
1341					error = ELOCKUNMAPPED;
1342				else
1343					error = EOWNERDEAD;
1344			}
1345		}
1346	}
1347	suword8_noerr(&lp->mutex_waiters, waiters);
1348	locked = 0;
1349	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1350out:
1351	no_fault();
1352	if (watched)
1353		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1354	if (tsp && !time_error)		/* copyout the residual time left */
1355		error = lwp_timer_copyout(&lwpt, error);
1356	if (error)
1357		return (set_errno(error));
1358	return (0);
1359}
1360
1361static int
1362iswanted(kthread_t *t, lwpchan_t *lwpchan)
1363{
1364	/*
1365	 * The caller holds the dispatcher lock on the sleep queue.
1366	 */
1367	while (t != NULL) {
1368		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1369		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1370			return (1);
1371		t = t->t_link;
1372	}
1373	return (0);
1374}
1375
1376/*
1377 * Return the highest priority thread sleeping on this lwpchan.
1378 */
1379static kthread_t *
1380lwp_queue_waiter(lwpchan_t *lwpchan)
1381{
1382	sleepq_head_t *sqh;
1383	kthread_t *tp;
1384
1385	sqh = lwpsqhash(lwpchan);
1386	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1387	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1388		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1389		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1390			break;
1391	}
1392	disp_lock_exit(&sqh->sq_lock);
1393	return (tp);
1394}
1395
1396static int
1397lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1398{
1399	sleepq_head_t *sqh;
1400	kthread_t *tp;
1401	kthread_t **tpp;
1402
1403	sqh = lwpsqhash(lwpchan);
1404	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1405	tpp = &sqh->sq_queue.sq_first;
1406	while ((tp = *tpp) != NULL) {
1407		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1408		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1409			/*
1410			 * The following is typically false. It could be true
1411			 * only if lwp_release() is called from
1412			 * lwp_mutex_wakeup() after reading the waiters field
1413			 * from memory in which the lwp lock used to be, but has
1414			 * since been re-used to hold a lwp cv or lwp semaphore.
1415			 * The thread "tp" found to match the lwp lock's wchan
1416			 * is actually sleeping for the cv or semaphore which
1417			 * now has the same wchan. In this case, lwp_release()
1418			 * should return failure.
1419			 */
1420			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1421				ASSERT(sync_type == 0);
1422				/*
1423				 * assert that this can happen only for mutexes
1424				 * i.e. sync_type == 0, for correctly written
1425				 * user programs.
1426				 */
1427				disp_lock_exit(&sqh->sq_lock);
1428				return (0);
1429			}
1430			*waiters = iswanted(tp->t_link, lwpchan);
1431			sleepq_unlink(tpp, tp);
1432			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1433			tp->t_wchan0 = NULL;
1434			tp->t_wchan = NULL;
1435			tp->t_sobj_ops = NULL;
1436			tp->t_release = 1;
1437			THREAD_TRANSITION(tp);	/* drops sleepq lock */
1438			CL_WAKEUP(tp);
1439			thread_unlock(tp);	/* drop run queue lock */
1440			return (1);
1441		}
1442		tpp = &tp->t_link;
1443	}
1444	*waiters = 0;
1445	disp_lock_exit(&sqh->sq_lock);
1446	return (0);
1447}
1448
1449static void
1450lwp_release_all(lwpchan_t *lwpchan)
1451{
1452	sleepq_head_t	*sqh;
1453	kthread_t *tp;
1454	kthread_t **tpp;
1455
1456	sqh = lwpsqhash(lwpchan);
1457	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
1458	tpp = &sqh->sq_queue.sq_first;
1459	while ((tp = *tpp) != NULL) {
1460		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1461		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1462			sleepq_unlink(tpp, tp);
1463			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1464			tp->t_wchan0 = NULL;
1465			tp->t_wchan = NULL;
1466			tp->t_sobj_ops = NULL;
1467			CL_WAKEUP(tp);
1468			thread_unlock_high(tp);	/* release run queue lock */
1469		} else {
1470			tpp = &tp->t_link;
1471		}
1472	}
1473	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
1474}
1475
1476/*
1477 * unblock a lwp that is trying to acquire this mutex.  The blocked
1478 * lwp resumes and retries to acquire the lock.
1479 */
1480int
1481lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
1482{
1483	proc_t *p = ttoproc(curthread);
1484	lwpchan_t lwpchan;
1485	uchar_t waiters;
1486	volatile int locked = 0;
1487	volatile int watched = 0;
1488	volatile uint8_t type = 0;
1489	label_t ljb;
1490	int error = 0;
1491
1492	if ((caddr_t)lp >= p->p_as->a_userlimit)
1493		return (set_errno(EFAULT));
1494
1495	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1496
1497	if (on_fault(&ljb)) {
1498		if (locked)
1499			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1500		error = EFAULT;
1501		goto out;
1502	}
1503	/*
1504	 * Force Copy-on-write if necessary and ensure that the
1505	 * synchronization object resides in read/write memory.
1506	 * Cause an EFAULT return now if this is not so.
1507	 */
1508	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1509	suword8_noerr(&lp->mutex_type, type);
1510	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1511	    &lwpchan, LWPCHAN_MPPOOL)) {
1512		error = EFAULT;
1513		goto out;
1514	}
1515	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1516	locked = 1;
1517	/*
1518	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
1519	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
1520	 * may fail.  If it fails, do not write into the waiter bit.
1521	 * The call to lwp_release() might fail due to one of three reasons:
1522	 *
1523	 * 	1. due to the thread which set the waiter bit not actually
1524	 *	   sleeping since it got the lock on the re-try. The waiter
1525	 *	   bit will then be correctly updated by that thread. This
1526	 *	   window may be closed by reading the wait bit again here
1527	 *	   and not calling lwp_release() at all if it is zero.
1528	 *	2. the thread which set the waiter bit and went to sleep
1529	 *	   was woken up by a signal. This time, the waiter recomputes
1530	 *	   the wait bit on the EINTR return path.
1531	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
1532	 *	   memory that has been re-used after the lock was dropped.
1533	 *	   In this case, writing into the waiter bit would cause data
1534	 *	   corruption.
1535	 */
1536	if (release_all)
1537		lwp_release_all(&lwpchan);
1538	else if (lwp_release(&lwpchan, &waiters, 0))
1539		suword8_noerr(&lp->mutex_waiters, waiters);
1540	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1541out:
1542	no_fault();
1543	if (watched)
1544		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1545	if (error)
1546		return (set_errno(error));
1547	return (0);
1548}
1549
1550/*
1551 * lwp_cond_wait() has four arguments, a pointer to a condition variable,
1552 * a pointer to a mutex, a pointer to a timespec for a timed wait and
1553 * a flag telling the kernel whether or not to honor the kernel/user
1554 * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
1555 * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
1556 * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
1557 * it is used as an in/out parameter.  On entry, it contains the relative
1558 * time until timeout.  On exit, we copyout the residual time left to it.
1559 */
1560int
1561lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
1562{
1563	kthread_t *t = curthread;
1564	klwp_t *lwp = ttolwp(t);
1565	proc_t *p = ttoproc(t);
1566	lwp_timer_t lwpt;
1567	lwpchan_t cv_lwpchan;
1568	lwpchan_t m_lwpchan;
1569	caddr_t timedwait;
1570	volatile uint16_t type = 0;
1571	volatile uint8_t mtype = 0;
1572	uchar_t waiters;
1573	volatile int error;
1574	clock_t tim = -1;
1575	volatile int locked = 0;
1576	volatile int m_locked = 0;
1577	volatile int cvwatched = 0;
1578	volatile int mpwatched = 0;
1579	label_t ljb;
1580	volatile int no_lwpchan = 1;
1581	int imm_timeout = 0;
1582	int imm_unpark = 0;
1583
1584	if ((caddr_t)cv >= p->p_as->a_userlimit ||
1585	    (caddr_t)mp >= p->p_as->a_userlimit)
1586		return (set_errno(EFAULT));
1587
1588	/*
1589	 * Put the lwp in an orderly state for debugging,
1590	 * in case we are stopped while sleeping, below.
1591	 */
1592	prstop(PR_REQUESTED, 0);
1593
1594	timedwait = (caddr_t)tsp;
1595	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
1596		return (set_errno(error));
1597	if (lwpt.lwpt_imm_timeout) {
1598		imm_timeout = 1;
1599		timedwait = NULL;
1600	}
1601
1602	(void) new_mstate(t, LMS_USER_LOCK);
1603
1604	if (on_fault(&ljb)) {
1605		if (no_lwpchan) {
1606			error = EFAULT;
1607			goto out;
1608		}
1609		if (m_locked) {
1610			m_locked = 0;
1611			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1612		}
1613		if (locked) {
1614			locked = 0;
1615			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1616		}
1617		/*
1618		 * set up another on_fault() for a possible fault
1619		 * on the user lock accessed at "efault"
1620		 */
1621		if (on_fault(&ljb)) {
1622			if (m_locked) {
1623				m_locked = 0;
1624				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1625			}
1626			goto out;
1627		}
1628		error = EFAULT;
1629		goto efault;
1630	}
1631
1632	/*
1633	 * Force Copy-on-write if necessary and ensure that the
1634	 * synchronization object resides in read/write memory.
1635	 * Cause an EFAULT return now if this is not so.
1636	 */
1637	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
1638	suword8_noerr(&mp->mutex_type, mtype);
1639	if (UPIMUTEX(mtype) == 0) {
1640		/* convert user level mutex, "mp", to a unique lwpchan */
1641		/* check if mtype is ok to use below, instead of type from cv */
1642		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
1643		    &m_lwpchan, LWPCHAN_MPPOOL)) {
1644			error = EFAULT;
1645			goto out;
1646		}
1647	}
1648	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1649	suword16_noerr(&cv->cond_type, type);
1650	/* convert user level condition variable, "cv", to a unique lwpchan */
1651	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
1652	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
1653		error = EFAULT;
1654		goto out;
1655	}
1656	no_lwpchan = 0;
1657	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1658	if (UPIMUTEX(mtype) == 0)
1659		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
1660		    S_WRITE);
1661
1662	/*
1663	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
1664	 * with respect to a possible wakeup which is a result of either
1665	 * an lwp_cond_signal() or an lwp_cond_broadcast().
1666	 *
1667	 * What's misleading is that the lwp is put to sleep after the
1668	 * condition variable's mutex is released.  This is OK as long as
1669	 * the release operation is also done while holding lwpchan_lock.
1670	 * The lwp is then put to sleep when the possibility of pagefaulting
1671	 * or sleeping is completely eliminated.
1672	 */
1673	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
1674	locked = 1;
1675	if (UPIMUTEX(mtype) == 0) {
1676		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1677		m_locked = 1;
1678		suword8_noerr(&cv->cond_waiters_kernel, 1);
1679		/*
1680		 * unlock the condition variable's mutex. (pagefaults are
1681		 * possible here.)
1682		 */
1683		set_owner_pid(mp, 0, 0);
1684		ulock_clear(&mp->mutex_lockw);
1685		fuword8_noerr(&mp->mutex_waiters, &waiters);
1686		if (waiters != 0) {
1687			/*
1688			 * Given the locking of lwpchan_lock around the release
1689			 * of the mutex and checking for waiters, the following
1690			 * call to lwp_release() can fail ONLY if the lock
1691			 * acquirer is interrupted after setting the waiter bit,
1692			 * calling lwp_block() and releasing lwpchan_lock.
1693			 * In this case, it could get pulled off the lwp sleep
1694			 * q (via setrun()) before the following call to
1695			 * lwp_release() occurs. In this case, the lock
1696			 * requestor will update the waiter bit correctly by
1697			 * re-evaluating it.
1698			 */
1699			if (lwp_release(&m_lwpchan, &waiters, 0))
1700				suword8_noerr(&mp->mutex_waiters, waiters);
1701		}
1702		m_locked = 0;
1703		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1704	} else {
1705		suword8_noerr(&cv->cond_waiters_kernel, 1);
1706		error = lwp_upimutex_unlock(mp, mtype);
1707		if (error) {	/* if the upimutex unlock failed */
1708			locked = 0;
1709			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1710			goto out;
1711		}
1712	}
1713	no_fault();
1714
1715	if (mpwatched) {
1716		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1717		mpwatched = 0;
1718	}
1719	if (cvwatched) {
1720		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1721		cvwatched = 0;
1722	}
1723
1724	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
1725		/*
1726		 * We received a signal at user-level before calling here
1727		 * or another thread wants us to return immediately
1728		 * with EINTR.  See lwp_unpark().
1729		 */
1730		imm_unpark = 1;
1731		t->t_unpark = 0;
1732		timedwait = NULL;
1733	} else if (timedwait) {
1734		/*
1735		 * If we successfully queue the timeout,
1736		 * then don't drop t_delay_lock until
1737		 * we are on the sleep queue (below).
1738		 */
1739		mutex_enter(&t->t_delay_lock);
1740		if (lwp_timer_enqueue(&lwpt) != 0) {
1741			mutex_exit(&t->t_delay_lock);
1742			imm_timeout = 1;
1743			timedwait = NULL;
1744		}
1745	}
1746	t->t_flag |= T_WAITCVSEM;
1747	lwp_block(&cv_lwpchan);
1748	/*
1749	 * Nothing should happen to cause the lwp to go to sleep
1750	 * until after it returns from swtch().
1751	 */
1752	if (timedwait)
1753		mutex_exit(&t->t_delay_lock);
1754	locked = 0;
1755	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1756	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
1757	    (imm_timeout | imm_unpark))
1758		setrun(t);
1759	swtch();
1760	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
1761	if (timedwait)
1762		tim = lwp_timer_dequeue(&lwpt);
1763	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
1764	    MUSTRETURN(p, t) || imm_unpark)
1765		error = EINTR;
1766	else if (imm_timeout || (timedwait && tim == -1))
1767		error = ETIME;
1768	lwp->lwp_asleep = 0;
1769	lwp->lwp_sysabort = 0;
1770	setallwatch();
1771
1772	if (t->t_mstate == LMS_USER_LOCK)
1773		(void) new_mstate(t, LMS_SYSTEM);
1774
1775	if (tsp && check_park)		/* copyout the residual time left */
1776		error = lwp_timer_copyout(&lwpt, error);
1777
1778	/* the mutex is reacquired by the caller on return to user level */
1779	if (error) {
1780		/*
1781		 * If we were concurrently lwp_cond_signal()d and we
1782		 * received a UNIX signal or got a timeout, then perform
1783		 * another lwp_cond_signal() to avoid consuming the wakeup.
1784		 */
1785		if (t->t_release)
1786			(void) lwp_cond_signal(cv);
1787		return (set_errno(error));
1788	}
1789	return (0);
1790
1791efault:
1792	/*
1793	 * make sure that the user level lock is dropped before
1794	 * returning to caller, since the caller always re-acquires it.
1795	 */
1796	if (UPIMUTEX(mtype) == 0) {
1797		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1798		m_locked = 1;
1799		set_owner_pid(mp, 0, 0);
1800		ulock_clear(&mp->mutex_lockw);
1801		fuword8_noerr(&mp->mutex_waiters, &waiters);
1802		if (waiters != 0) {
1803			/*
1804			 * See comment above on lock clearing and lwp_release()
1805			 * success/failure.
1806			 */
1807			if (lwp_release(&m_lwpchan, &waiters, 0))
1808				suword8_noerr(&mp->mutex_waiters, waiters);
1809		}
1810		m_locked = 0;
1811		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1812	} else {
1813		(void) lwp_upimutex_unlock(mp, mtype);
1814	}
1815out:
1816	no_fault();
1817	if (mpwatched)
1818		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1819	if (cvwatched)
1820		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1821	if (t->t_mstate == LMS_USER_LOCK)
1822		(void) new_mstate(t, LMS_SYSTEM);
1823	return (set_errno(error));
1824}
1825
1826/*
 * Wake up one lwp that's blocked on this condition variable.
1828 */
1829int
1830lwp_cond_signal(lwp_cond_t *cv)
1831{
1832	proc_t *p = ttoproc(curthread);
1833	lwpchan_t lwpchan;
1834	uchar_t waiters;
1835	volatile uint16_t type = 0;
1836	volatile int locked = 0;
1837	volatile int watched = 0;
1838	label_t ljb;
1839	int error = 0;
1840
1841	if ((caddr_t)cv >= p->p_as->a_userlimit)
1842		return (set_errno(EFAULT));
1843
1844	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1845
1846	if (on_fault(&ljb)) {
1847		if (locked)
1848			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1849		error = EFAULT;
1850		goto out;
1851	}
1852	/*
1853	 * Force Copy-on-write if necessary and ensure that the
1854	 * synchronization object resides in read/write memory.
1855	 * Cause an EFAULT return now if this is not so.
1856	 */
1857	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1858	suword16_noerr(&cv->cond_type, type);
1859	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1860	    &lwpchan, LWPCHAN_CVPOOL)) {
1861		error = EFAULT;
1862		goto out;
1863	}
1864	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1865	locked = 1;
1866	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1867	if (waiters != 0) {
1868		/*
1869		 * The following call to lwp_release() might fail but it is
1870		 * OK to write into the waiters bit below, since the memory
1871		 * could not have been re-used or unmapped (for correctly
1872		 * written user programs) as in the case of lwp_mutex_wakeup().
		 * For an incorrect program, we do not care about data
		 * corruption, since this is just one of many places where
		 * corruption can occur for such a program. Of course,
		 * if the memory is unmapped, normal fault recovery occurs.
1877		 */
1878		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1879		suword8_noerr(&cv->cond_waiters_kernel, waiters);
1880	}
1881	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1882out:
1883	no_fault();
1884	if (watched)
1885		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1886	if (error)
1887		return (set_errno(error));
1888	return (0);
1889}
1890
1891/*
 * Wake up every lwp that's blocked on this condition variable.
1893 */
1894int
1895lwp_cond_broadcast(lwp_cond_t *cv)
1896{
1897	proc_t *p = ttoproc(curthread);
1898	lwpchan_t lwpchan;
1899	volatile uint16_t type = 0;
1900	volatile int locked = 0;
1901	volatile int watched = 0;
1902	label_t ljb;
1903	uchar_t waiters;
1904	int error = 0;
1905
1906	if ((caddr_t)cv >= p->p_as->a_userlimit)
1907		return (set_errno(EFAULT));
1908
1909	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1910
1911	if (on_fault(&ljb)) {
1912		if (locked)
1913			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1914		error = EFAULT;
1915		goto out;
1916	}
1917	/*
1918	 * Force Copy-on-write if necessary and ensure that the
1919	 * synchronization object resides in read/write memory.
1920	 * Cause an EFAULT return now if this is not so.
1921	 */
1922	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1923	suword16_noerr(&cv->cond_type, type);
1924	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1925	    &lwpchan, LWPCHAN_CVPOOL)) {
1926		error = EFAULT;
1927		goto out;
1928	}
1929	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1930	locked = 1;
1931	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1932	if (waiters != 0) {
1933		lwp_release_all(&lwpchan);
1934		suword8_noerr(&cv->cond_waiters_kernel, 0);
1935	}
1936	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1937out:
1938	no_fault();
1939	if (watched)
1940		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1941	if (error)
1942		return (set_errno(error));
1943	return (0);
1944}
1945
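/*
 * Try to decrement the semaphore's count without blocking.
 * Returns EBUSY if the count is already zero.
 */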
1946int
1947lwp_sema_trywait(lwp_sema_t *sp)
1948{
1949	kthread_t *t = curthread;
1950	proc_t *p = ttoproc(t);
1951	label_t ljb;
1952	volatile int locked = 0;
1953	volatile int watched = 0;
1954	volatile uint16_t type = 0;
1955	int count;
1956	lwpchan_t lwpchan;
1957	uchar_t waiters;
1958	int error = 0;
1959
1960	if ((caddr_t)sp >= p->p_as->a_userlimit)
1961		return (set_errno(EFAULT));
1962
1963	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1964
1965	if (on_fault(&ljb)) {
1966		if (locked)
1967			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1968		error = EFAULT;
1969		goto out;
1970	}
1971	/*
1972	 * Force Copy-on-write if necessary and ensure that the
1973	 * synchronization object resides in read/write memory.
1974	 * Cause an EFAULT return now if this is not so.
1975	 */
1976	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1977	suword16_noerr((void *)&sp->sema_type, type);
1978	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1979	    &lwpchan, LWPCHAN_CVPOOL)) {
1980		error = EFAULT;
1981		goto out;
1982	}
1983	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1984	locked = 1;
1985	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
1986	if (count == 0)
1987		error = EBUSY;
1988	else
1989		suword32_noerr((void *)&sp->sema_count, --count);
1990	if (count != 0) {
1991		fuword8_noerr(&sp->sema_waiters, &waiters);
1992		if (waiters != 0) {
1993			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1994			suword8_noerr(&sp->sema_waiters, waiters);
1995		}
1996	}
1997	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1998out:
1999	no_fault();
2000	if (watched)
2001		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2002	if (error)
2003		return (set_errno(error));
2004	return (0);
2005}
2006
2007/*
2008 * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
2009 */
2010int
2011lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
2012{
2013	kthread_t *t = curthread;
2014	klwp_t *lwp = ttolwp(t);
2015	proc_t *p = ttoproc(t);
2016	lwp_timer_t lwpt;
2017	caddr_t timedwait;
2018	clock_t tim = -1;
2019	label_t ljb;
2020	volatile int locked = 0;
2021	volatile int watched = 0;
2022	volatile uint16_t type = 0;
2023	int count;
2024	lwpchan_t lwpchan;
2025	uchar_t waiters;
2026	int error = 0;
2027	int time_error;
2028	int imm_timeout = 0;
2029	int imm_unpark = 0;
2030
2031	if ((caddr_t)sp >= p->p_as->a_userlimit)
2032		return (set_errno(EFAULT));
2033
2034	/*
2035	 * Put the lwp in an orderly state for debugging,
2036	 * in case we are stopped while sleeping, below.
2037	 */
2038	prstop(PR_REQUESTED, 0);
2039
2040	timedwait = (caddr_t)tsp;
2041	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2042	    lwpt.lwpt_imm_timeout) {
2043		imm_timeout = 1;
2044		timedwait = NULL;
2045	}
2046
2047	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2048
2049	if (on_fault(&ljb)) {
2050		if (locked)
2051			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2052		error = EFAULT;
2053		goto out;
2054	}
2055	/*
2056	 * Force Copy-on-write if necessary and ensure that the
2057	 * synchronization object resides in read/write memory.
2058	 * Cause an EFAULT return now if this is not so.
2059	 */
2060	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
2061	suword16_noerr((void *)&sp->sema_type, type);
2062	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
2063	    &lwpchan, LWPCHAN_CVPOOL)) {
2064		error = EFAULT;
2065		goto out;
2066	}
2067	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2068	locked = 1;
2069	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2070	while (error == 0 && count == 0) {
2071		if (time_error) {
2072			/*
			 * The SUSV3 POSIX spec is very clear that we
2074			 * should get no error from validating the
2075			 * timer until we would actually sleep.
2076			 */
2077			error = time_error;
2078			break;
2079		}
2080		suword8_noerr(&sp->sema_waiters, 1);
2081		if (watched)
2082			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2083		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
2084			/*
2085			 * We received a signal at user-level before calling
2086			 * here or another thread wants us to return
2087			 * immediately with EINTR.  See lwp_unpark().
2088			 */
2089			imm_unpark = 1;
2090			t->t_unpark = 0;
2091			timedwait = NULL;
2092		} else if (timedwait) {
2093			/*
2094			 * If we successfully queue the timeout,
2095			 * then don't drop t_delay_lock until
2096			 * we are on the sleep queue (below).
2097			 */
2098			mutex_enter(&t->t_delay_lock);
2099			if (lwp_timer_enqueue(&lwpt) != 0) {
2100				mutex_exit(&t->t_delay_lock);
2101				imm_timeout = 1;
2102				timedwait = NULL;
2103			}
2104		}
2105		t->t_flag |= T_WAITCVSEM;
2106		lwp_block(&lwpchan);
2107		/*
2108		 * Nothing should happen to cause the lwp to sleep
2109		 * again until after it returns from swtch().
2110		 */
2111		if (timedwait)
2112			mutex_exit(&t->t_delay_lock);
2113		locked = 0;
2114		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2115		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
2116		    (imm_timeout | imm_unpark))
2117			setrun(t);
2118		swtch();
2119		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2120		if (timedwait)
2121			tim = lwp_timer_dequeue(&lwpt);
2122		setallwatch();
2123		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
2124		    MUSTRETURN(p, t) || imm_unpark)
2125			error = EINTR;
2126		else if (imm_timeout || (timedwait && tim == -1))
2127			error = ETIME;
2128		lwp->lwp_asleep = 0;
2129		lwp->lwp_sysabort = 0;
2130		watched = watch_disable_addr((caddr_t)sp,
2131		    sizeof (*sp), S_WRITE);
2132		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2133		locked = 1;
2134		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2135	}
2136	if (error == 0)
2137		suword32_noerr((void *)&sp->sema_count, --count);
	if (count != 0) {
		fuword8_noerr(&sp->sema_waiters, &waiters);
		if (waiters != 0) {
			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
			suword8_noerr(&sp->sema_waiters, waiters);
		}
	}
2142	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2143out:
2144	no_fault();
2145	if (watched)
2146		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2147	if (tsp && check_park && !time_error)
2148		error = lwp_timer_copyout(&lwpt, error);
2149	if (error)
2150		return (set_errno(error));
2151	return (0);
2152}
2153
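/*
 * Increment the semaphore's count and wake up one waiter, if any.
 * Returns EOVERFLOW if the count is already at _SEM_VALUE_MAX.
 */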
2154int
2155lwp_sema_post(lwp_sema_t *sp)
2156{
2157	proc_t *p = ttoproc(curthread);
2158	label_t ljb;
2159	volatile int locked = 0;
2160	volatile int watched = 0;
2161	volatile uint16_t type = 0;
2162	int count;
2163	lwpchan_t lwpchan;
2164	uchar_t waiters;
2165	int error = 0;
2166
2167	if ((caddr_t)sp >= p->p_as->a_userlimit)
2168		return (set_errno(EFAULT));
2169
2170	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2171
2172	if (on_fault(&ljb)) {
2173		if (locked)
2174			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2175		error = EFAULT;
2176		goto out;
2177	}
2178	/*
2179	 * Force Copy-on-write if necessary and ensure that the
2180	 * synchronization object resides in read/write memory.
2181	 * Cause an EFAULT return now if this is not so.
2182	 */
2183	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2184	suword16_noerr(&sp->sema_type, type);
2185	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2186	    &lwpchan, LWPCHAN_CVPOOL)) {
2187		error = EFAULT;
2188		goto out;
2189	}
2190	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2191	locked = 1;
2192	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2193	if (count == _SEM_VALUE_MAX)
2194		error = EOVERFLOW;
2195	else
2196		suword32_noerr(&sp->sema_count, ++count);
2197	if (count == 1) {
2198		fuword8_noerr(&sp->sema_waiters, &waiters);
2199		if (waiters) {
2200			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2201			suword8_noerr(&sp->sema_waiters, waiters);
2202		}
2203	}
2204	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2205out:
2206	no_fault();
2207	if (watched)
2208		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2209	if (error)
2210		return (set_errno(error));
2211	return (0);
2212}
2213
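/*
 * Flags kept in the waiting thread's t_writer field while it is blocked
 * on a user-level readers/writer lock.  TRW_WANT_WRITE marks a waiter
 * that wants the write lock; TRW_LOCK_GRANTED is set by
 * lwp_rwlock_release() when the lock has been handed to the waiter.
 */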
2214#define	TRW_WANT_WRITE		0x1
2215#define	TRW_LOCK_GRANTED	0x2
2216
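/*
 * Encodings of the rd_wr argument to lwp_rwlock_lock(), as dispatched
 * from lwp_rwlock_sys().  TRY_FLAG selects the non-blocking (trylock)
 * variants.
 */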
2217#define	READ_LOCK		0
2218#define	WRITE_LOCK		1
2219#define	TRY_FLAG		0x10
2220#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
2221#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)
2222
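/*
 * As used below, the rwlock_readers word of the user-level lwp_rwlock_t
 * encodes the lock state: the low-order bits (URW_READERS_MASK) hold the
 * count of readers holding the lock, URW_WRITE_LOCKED indicates that a
 * writer holds it, and URW_HAS_WAITERS indicates that threads may be
 * queued in the kernel waiting for it.
 */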
2223/*
2224 * Release one writer or one or more readers. Compute the rwstate word to
2225 * reflect the new state of the queue. For a safe hand-off we copy the new
2226 * rwstate value back to userland before we wake any of the new lock holders.
2227 *
2228 * Note that sleepq_insert() implements a prioritized FIFO (with writers
2229 * being given precedence over readers of the same priority).
2230 *
2231 * If the first thread is a reader we scan the queue releasing all readers
2232 * until we hit a writer or the end of the queue. If the first thread is a
2233 * writer we still need to check for another writer.
2234 */
2235void
2236lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2237{
2238	sleepq_head_t *sqh;
2239	kthread_t *tp;
2240	kthread_t **tpp;
2241	kthread_t *tpnext;
2242	kthread_t *wakelist = NULL;
2243	uint32_t rwstate = 0;
2244	int wcount = 0;
2245	int rcount = 0;
2246
2247	sqh = lwpsqhash(lwpchan);
2248	disp_lock_enter(&sqh->sq_lock);
2249	tpp = &sqh->sq_queue.sq_first;
2250	while ((tp = *tpp) != NULL) {
2251		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2252		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2253			if (tp->t_writer & TRW_WANT_WRITE) {
2254				if ((wcount++ == 0) && (rcount == 0)) {
2255					rwstate |= URW_WRITE_LOCKED;
2256
2257					/* Just one writer to wake. */
2258					sleepq_unlink(tpp, tp);
2259					wakelist = tp;
2260
2261					/* tpp already set for next thread. */
2262					continue;
2263				} else {
2264					rwstate |= URW_HAS_WAITERS;
2265					/* We need look no further. */
2266					break;
2267				}
2268			} else {
2269				rcount++;
2270				if (wcount == 0) {
2271					rwstate++;
2272
2273					/* Add reader to wake list. */
2274					sleepq_unlink(tpp, tp);
2275					tp->t_link = wakelist;
2276					wakelist = tp;
2277
2278					/* tpp already set for next thread. */
2279					continue;
2280				} else {
2281					rwstate |= URW_HAS_WAITERS;
2282					/* We need look no further. */
2283					break;
2284				}
2285			}
2286		}
2287		tpp = &tp->t_link;
2288	}
2289
2290	/* Copy the new rwstate back to userland. */
2291	suword32_noerr(&rw->rwlock_readers, rwstate);
2292
2293	/* Wake the new lock holder(s) up. */
2294	tp = wakelist;
2295	while (tp != NULL) {
2296		DTRACE_SCHED1(wakeup, kthread_t *, tp);
2297		tp->t_wchan0 = NULL;
2298		tp->t_wchan = NULL;
2299		tp->t_sobj_ops = NULL;
2300		tp->t_writer |= TRW_LOCK_GRANTED;
2301		tpnext = tp->t_link;
2302		tp->t_link = NULL;
2303		CL_WAKEUP(tp);
2304		thread_unlock_high(tp);
2305		tp = tpnext;
2306	}
2307
2308	disp_lock_exit(&sqh->sq_lock);
2309}
2310
2311/*
2312 * We enter here holding the user-level mutex, which we must release before
2313 * returning or blocking. Based on lwp_cond_wait().
2314 */
2315static int
2316lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2317{
2318	lwp_mutex_t *mp = NULL;
2319	kthread_t *t = curthread;
2320	kthread_t *tp;
2321	klwp_t *lwp = ttolwp(t);
2322	proc_t *p = ttoproc(t);
2323	lwp_timer_t lwpt;
2324	lwpchan_t lwpchan;
2325	lwpchan_t mlwpchan;
2326	caddr_t timedwait;
2327	volatile uint16_t type = 0;
2328	volatile uint8_t mtype = 0;
2329	uchar_t mwaiters;
2330	volatile int error = 0;
2331	int time_error;
2332	clock_t tim = -1;
2333	volatile int locked = 0;
2334	volatile int mlocked = 0;
2335	volatile int watched = 0;
2336	volatile int mwatched = 0;
2337	label_t ljb;
2338	volatile int no_lwpchan = 1;
2339	int imm_timeout = 0;
2340	int try_flag;
2341	uint32_t rwstate;
2342	int acquired = 0;
2343
2344	/* We only check rw because the mutex is included in it. */
2345	if ((caddr_t)rw >= p->p_as->a_userlimit)
2346		return (set_errno(EFAULT));
2347
2348	/*
2349	 * Put the lwp in an orderly state for debugging,
2350	 * in case we are stopped while sleeping, below.
2351	 */
2352	prstop(PR_REQUESTED, 0);
2353
2354	/* We must only report this error if we are about to sleep (later). */
2355	timedwait = (caddr_t)tsp;
2356	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2357	    lwpt.lwpt_imm_timeout) {
2358		imm_timeout = 1;
2359		timedwait = NULL;
2360	}
2361
2362	(void) new_mstate(t, LMS_USER_LOCK);
2363
2364	if (on_fault(&ljb)) {
2365		if (no_lwpchan) {
2366			error = EFAULT;
2367			goto out_nodrop;
2368		}
2369		if (mlocked) {
2370			mlocked = 0;
2371			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2372		}
2373		if (locked) {
2374			locked = 0;
2375			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2376		}
2377		/*
2378		 * Set up another on_fault() for a possible fault
2379		 * on the user lock accessed at "out_drop".
2380		 */
2381		if (on_fault(&ljb)) {
2382			if (mlocked) {
2383				mlocked = 0;
2384				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2385			}
2386			error = EFAULT;
2387			goto out_nodrop;
2388		}
		error = EFAULT;
		goto out_drop;
2391	}
2392
2393	/* Process rd_wr (including sanity check). */
2394	try_flag = (rd_wr & TRY_FLAG);
2395	rd_wr &= ~TRY_FLAG;
2396	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2397		error = EINVAL;
2398		goto out_nodrop;
2399	}
2400
2401	/*
2402	 * Force Copy-on-write if necessary and ensure that the
2403	 * synchronization object resides in read/write memory.
2404	 * Cause an EFAULT return now if this is not so.
2405	 */
2406	mp = &rw->mutex;
2407	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2408	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2409	suword8_noerr(&mp->mutex_type, mtype);
2410	suword16_noerr(&rw->rwlock_type, type);
2411
2412	/* We can only continue for simple USYNC_PROCESS locks. */
2413	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2414		error = EINVAL;
2415		goto out_nodrop;
2416	}
2417
2418	/* Convert user level mutex, "mp", to a unique lwpchan. */
2419	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2420	    &mlwpchan, LWPCHAN_MPPOOL)) {
2421		error = EFAULT;
2422		goto out_nodrop;
2423	}
2424
2425	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2426	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2427	    &lwpchan, LWPCHAN_CVPOOL)) {
2428		error = EFAULT;
2429		goto out_nodrop;
2430	}
2431
2432	no_lwpchan = 0;
2433	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2434	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2435
2436	/*
2437	 * lwpchan_lock() ensures that the calling LWP is put to sleep
2438	 * atomically with respect to a possible wakeup which is a result
2439	 * of lwp_rwlock_unlock().
2440	 *
2441	 * What's misleading is that the LWP is put to sleep after the
2442	 * rwlock's mutex is released. This is OK as long as the release
2443	 * operation is also done while holding mlwpchan. The LWP is then
2444	 * put to sleep when the possibility of pagefaulting or sleeping
2445	 * has been completely eliminated.
2446	 */
2447	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2448	locked = 1;
2449	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2450	mlocked = 1;
2451
2452	/*
2453	 * Fetch the current rwlock state.
2454	 *
2455	 * The possibility of spurious wake-ups or killed waiters means
2456	 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
2457	 * We only fix these if they are important to us.
2458	 *
2459	 * Although various error states can be observed here (e.g. the lock
	 * is not held, but there are waiters) we assume these are application
2461	 * errors and so we take no corrective action.
2462	 */
2463	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2464	/*
2465	 * We cannot legitimately get here from user-level
2466	 * without URW_HAS_WAITERS being set.
2467	 * Set it now to guard against user-level error.
2468	 */
2469	rwstate |= URW_HAS_WAITERS;
2470
2471	/*
2472	 * We can try only if the lock isn't held by a writer.
2473	 */
2474	if (!(rwstate & URW_WRITE_LOCKED)) {
2475		tp = lwp_queue_waiter(&lwpchan);
2476		if (tp == NULL) {
2477			/*
2478			 * Hmmm, rwstate indicates waiters but there are
2479			 * none queued. This could just be the result of a
2480			 * spurious wakeup, so let's ignore it.
2481			 *
2482			 * We now have a chance to acquire the lock
2483			 * uncontended, but this is the last chance for
2484			 * a writer to acquire the lock without blocking.
2485			 */
2486			if (rd_wr == READ_LOCK) {
2487				rwstate++;
2488				acquired = 1;
2489			} else if ((rwstate & URW_READERS_MASK) == 0) {
2490				rwstate |= URW_WRITE_LOCKED;
2491				acquired = 1;
2492			}
2493		} else if (rd_wr == READ_LOCK) {
2494			/*
2495			 * This is the last chance for a reader to acquire
2496			 * the lock now, but it can only do so if there is
2497			 * no writer of equal or greater priority at the
			 * head of the queue.
2499			 *
2500			 * It is also just possible that there is a reader
2501			 * at the head of the queue. This may be the result
2502			 * of a spurious wakeup or an application failure.
2503			 * In this case we only acquire the lock if we have
2504			 * equal or greater priority. It is not our job to
2505			 * release spurious waiters.
2506			 */
2507			pri_t our_pri = DISP_PRIO(t);
2508			pri_t his_pri = DISP_PRIO(tp);
2509
2510			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2511			    !(tp->t_writer & TRW_WANT_WRITE))) {
2512				rwstate++;
2513				acquired = 1;
2514			}
2515		}
2516	}
2517
2518	if (acquired || try_flag || time_error) {
2519		/*
2520		 * We're not going to block this time.
2521		 */
2522		suword32_noerr(&rw->rwlock_readers, rwstate);
2523		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2524		locked = 0;
2525
2526		if (acquired) {
2527			/*
2528			 * Got the lock!
2529			 */
2530			error = 0;
2531
2532		} else if (try_flag) {
2533			/*
2534			 * We didn't get the lock and we're about to block.
2535			 * If we're doing a trylock, return EBUSY instead.
2536			 */
2537			error = EBUSY;
2538
2539		} else if (time_error) {
2540			/*
2541			 * The SUSV3 POSIX spec is very clear that we should
2542			 * get no error from validating the timer (above)
2543			 * until we would actually sleep.
2544			 */
2545			error = time_error;
2546		}
2547
2548		goto out_drop;
2549	}
2550
2551	/*
2552	 * We're about to block, so indicate what kind of waiter we are.
2553	 */
2554	t->t_writer = 0;
2555	if (rd_wr == WRITE_LOCK)
2556		t->t_writer = TRW_WANT_WRITE;
2557	suword32_noerr(&rw->rwlock_readers, rwstate);
2558
2559	/*
2560	 * Unlock the rwlock's mutex (pagefaults are possible here).
2561	 */
2562	set_owner_pid(mp, 0, 0);
2563	ulock_clear(&mp->mutex_lockw);
2564	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2565	if (mwaiters != 0) {
2566		/*
2567		 * Given the locking of mlwpchan around the release of
2568		 * the mutex and checking for waiters, the following
2569		 * call to lwp_release() can fail ONLY if the lock
2570		 * acquirer is interrupted after setting the waiter bit,
2571		 * calling lwp_block() and releasing mlwpchan.
2572		 * In this case, it could get pulled off the LWP sleep
2573		 * queue (via setrun()) before the following call to
2574		 * lwp_release() occurs, and the lock requestor will
2575		 * update the waiter bit correctly by re-evaluating it.
2576		 */
2577		if (lwp_release(&mlwpchan, &mwaiters, 0))
2578			suword8_noerr(&mp->mutex_waiters, mwaiters);
2579	}
2580	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2581	mlocked = 0;
2582	no_fault();
2583
2584	if (mwatched) {
2585		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2586		mwatched = 0;
2587	}
2588	if (watched) {
2589		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2590		watched = 0;
2591	}
2592
2593	if (timedwait) {
2594		/*
2595		 * If we successfully queue the timeout,
2596		 * then don't drop t_delay_lock until
2597		 * we are on the sleep queue (below).
2598		 */
2599		mutex_enter(&t->t_delay_lock);
2600		if (lwp_timer_enqueue(&lwpt) != 0) {
2601			mutex_exit(&t->t_delay_lock);
2602			imm_timeout = 1;
2603			timedwait = NULL;
2604		}
2605	}
2606	t->t_flag |= T_WAITCVSEM;
2607	lwp_block(&lwpchan);
2608
2609	/*
	 * Nothing should happen to cause the LWP to go to sleep until after
2611	 * it returns from swtch().
2612	 */
2613	if (timedwait)
2614		mutex_exit(&t->t_delay_lock);
2615	locked = 0;
2616	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2617	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
2618		setrun(t);
2619	swtch();
2620
2621	/*
	 * We're back, but we need to work out why. Were we interrupted?
	 * Did we time out? Were we granted the lock?
2624	 */
2625	error = EAGAIN;
2626	acquired = (t->t_writer & TRW_LOCK_GRANTED);
2627	t->t_writer = 0;
2628	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2629	if (timedwait)
2630		tim = lwp_timer_dequeue(&lwpt);
2631	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2632		error = EINTR;
2633	else if (imm_timeout || (timedwait && tim == -1))
2634		error = ETIME;
2635	lwp->lwp_asleep = 0;
2636	lwp->lwp_sysabort = 0;
2637	setallwatch();
2638
2639	/*
2640	 * If we were granted the lock we don't care about EINTR or ETIME.
2641	 */
2642	if (acquired)
2643		error = 0;
2644
2645	if (t->t_mstate == LMS_USER_LOCK)
2646		(void) new_mstate(t, LMS_SYSTEM);
2647
2648	if (error)
2649		return (set_errno(error));
2650	return (0);
2651
2652out_drop:
2653	/*
2654	 * Make sure that the user level lock is dropped before returning
2655	 * to the caller.
2656	 */
2657	if (!mlocked) {
2658		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2659		mlocked = 1;
2660	}
2661	set_owner_pid(mp, 0, 0);
2662	ulock_clear(&mp->mutex_lockw);
2663	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2664	if (mwaiters != 0) {
2665		/*
2666		 * See comment above on lock clearing and lwp_release()
2667		 * success/failure.
2668		 */
2669		if (lwp_release(&mlwpchan, &mwaiters, 0))
2670			suword8_noerr(&mp->mutex_waiters, mwaiters);
2671	}
2672	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2673	mlocked = 0;
2674
2675out_nodrop:
2676	no_fault();
2677	if (mwatched)
2678		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2679	if (watched)
2680		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2681	if (t->t_mstate == LMS_USER_LOCK)
2682		(void) new_mstate(t, LMS_SYSTEM);
2683	if (error)
2684		return (set_errno(error));
2685	return (0);
2686}
2687
2688/*
2689 * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2690 * we never drop the lock.
2691 */
2692static int
2693lwp_rwlock_unlock(lwp_rwlock_t *rw)
2694{
2695	kthread_t *t = curthread;
2696	proc_t *p = ttoproc(t);
2697	lwpchan_t lwpchan;
2698	volatile uint16_t type = 0;
2699	volatile int error = 0;
2700	volatile int locked = 0;
2701	volatile int watched = 0;
2702	label_t ljb;
2703	volatile int no_lwpchan = 1;
2704	uint32_t rwstate;
2705
2706	/* We only check rw because the mutex is included in it. */
2707	if ((caddr_t)rw >= p->p_as->a_userlimit)
2708		return (set_errno(EFAULT));
2709
2710	if (on_fault(&ljb)) {
2711		if (no_lwpchan) {
2712			error = EFAULT;
2713			goto out_nodrop;
2714		}
2715		if (locked) {
2716			locked = 0;
2717			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2718		}
2719		error = EFAULT;
2720		goto out_nodrop;
2721	}
2722
2723	/*
2724	 * Force Copy-on-write if necessary and ensure that the
2725	 * synchronization object resides in read/write memory.
2726	 * Cause an EFAULT return now if this is not so.
2727	 */
2728	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2729	suword16_noerr(&rw->rwlock_type, type);
2730
2731	/* We can only continue for simple USYNC_PROCESS locks. */
2732	if (type != USYNC_PROCESS) {
2733		error = EINVAL;
2734		goto out_nodrop;
2735	}
2736
2737	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2738	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2739	    &lwpchan, LWPCHAN_CVPOOL)) {
2740		error = EFAULT;
2741		goto out_nodrop;
2742	}
2743
2744	no_lwpchan = 0;
2745	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2746
2747	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2748	locked = 1;
2749
2750	/*
2751	 * We can resolve multiple readers (except the last reader) here.
2752	 * For the last reader or a writer we need lwp_rwlock_release(),
2753	 * to which we also delegate the task of copying the new rwstate
2754	 * back to userland (see the comment there).
2755	 */
2756	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2757	if (rwstate & URW_WRITE_LOCKED)
2758		lwp_rwlock_release(&lwpchan, rw);
2759	else if ((rwstate & URW_READERS_MASK) > 0) {
2760		rwstate--;
2761		if ((rwstate & URW_READERS_MASK) == 0)
2762			lwp_rwlock_release(&lwpchan, rw);
2763		else
2764			suword32_noerr(&rw->rwlock_readers, rwstate);
2765	}
2766
2767	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2768	locked = 0;
2769	error = 0;
2770
2771out_nodrop:
2772	no_fault();
2773	if (watched)
2774		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2775	if (error)
2776		return (set_errno(error));
2777	return (0);
2778}
2779
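/*
 * Dispatch the user-level readers/writer lock operations.  The subcode
 * selects rdlock (0), wrlock (1), tryrdlock (2), trywrlock (3) or
 * unlock (4).
 */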
2780int
2781lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2782{
2783	switch (subcode) {
2784	case 0:
2785		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2786	case 1:
2787		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2788	case 2:
2789		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2790	case 3:
2791		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2792	case 4:
2793		return (lwp_rwlock_unlock(rwlp));
2794	}
2795	return (set_errno(EINVAL));
2796}
2797
2798/*
2799 * Return the owner of the user-level s-object.
2800 * Since we can't really do this, return NULL.
2801 */
2802/* ARGSUSED */
2803static kthread_t *
2804lwpsobj_owner(caddr_t sobj)
2805{
2806	return ((kthread_t *)NULL);
2807}
2808
2809/*
2810 * Wake up a thread asleep on a user-level synchronization
2811 * object.
2812 */
2813static void
2814lwp_unsleep(kthread_t *t)
2815{
2816	ASSERT(THREAD_LOCK_HELD(t));
2817	if (t->t_wchan0 != NULL) {
2818		sleepq_head_t *sqh;
2819		sleepq_t *sqp = t->t_sleepq;
2820
2821		if (sqp != NULL) {
2822			sqh = lwpsqhash(&t->t_lwpchan);
2823			ASSERT(&sqh->sq_queue == sqp);
2824			sleepq_unsleep(t);
2825			disp_lock_exit_high(&sqh->sq_lock);
2826			CL_SETRUN(t);
2827			return;
2828		}
2829	}
2830	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2831}
2832
2833/*
2834 * Change the priority of a thread asleep on a user-level
2835 * synchronization object. To maintain proper priority order,
2836 * we:
2837 *	o dequeue the thread.
2838 *	o change its priority.
2839 *	o re-enqueue the thread.
2840 * Assumption: the thread is locked on entry.
2841 */
2842static void
2843lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2844{
2845	ASSERT(THREAD_LOCK_HELD(t));
2846	if (t->t_wchan0 != NULL) {
2847		sleepq_t   *sqp = t->t_sleepq;
2848
2849		sleepq_dequeue(t);
2850		*t_prip = pri;
2851		sleepq_insert(sqp, t);
2852	} else
2853		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2854}
2855
2856/*
2857 * Clean up a left-over process-shared robust mutex
2858 */
2859static void
2860lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
2861{
2862	uint16_t flag;
2863	uchar_t waiters;
2864	label_t ljb;
2865	pid_t owner_pid;
2866	lwp_mutex_t *lp;
2867	volatile int locked = 0;
2868	volatile int watched = 0;
2869	volatile struct upimutex *upimutex = NULL;
2870	volatile int upilocked = 0;
2871
2872	if ((ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
2873	    != (USYNC_PROCESS | LOCK_ROBUST))
2874		return;
2875
2876	lp = (lwp_mutex_t *)ent->lwpchan_addr;
2877	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2878	if (on_fault(&ljb)) {
2879		if (locked)
2880			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2881		if (upilocked)
2882			upimutex_unlock((upimutex_t *)upimutex, 0);
2883		goto out;
2884	}
2885
2886	fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
2887
2888	if (UPIMUTEX(ent->lwpchan_type)) {
2889		lwpchan_t lwpchan = ent->lwpchan_lwpchan;
2890		upib_t *upibp = &UPI_CHAIN(lwpchan);
2891
2892		if (owner_pid != curproc->p_pid)
2893			goto out;
2894		mutex_enter(&upibp->upib_lock);
2895		upimutex = upi_get(upibp, &lwpchan);
2896		if (upimutex == NULL || upimutex->upi_owner != curthread) {
2897			mutex_exit(&upibp->upib_lock);
2898			goto out;
2899		}
2900		mutex_exit(&upibp->upib_lock);
2901		upilocked = 1;
2902		flag = lwp_clear_mutex(lp, lockflg);
2903		suword8_noerr(&lp->mutex_lockw, 0);
2904		upimutex_unlock((upimutex_t *)upimutex, flag);
2905	} else {
2906		lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2907		locked = 1;
2908		/*
2909		 * Clear the spinners count because one of our
2910		 * threads could have been spinning for this lock
2911		 * at user level when the process was suddenly killed.
2912		 * There is no harm in this since user-level libc code
2913		 * will adapt to the sudden change in the spinner count.
2914		 */
2915		suword8_noerr(&lp->mutex_spinners, 0);
2916		if (owner_pid != curproc->p_pid) {
2917			/*
2918			 * We are not the owner.  There may or may not be one.
2919			 * If there are waiters, we wake up one or all of them.
2920			 * It doesn't hurt to wake them up in error since
2921			 * they will just retry the lock and go to sleep
2922			 * again if necessary.
2923			 */
2924			fuword8_noerr(&lp->mutex_waiters, &waiters);
2925			if (waiters != 0) {	/* there are waiters */
2926				fuword16_noerr(&lp->mutex_flag, &flag);
2927				if (flag & LOCK_NOTRECOVERABLE) {
2928					lwp_release_all(&ent->lwpchan_lwpchan);
2929					suword8_noerr(&lp->mutex_waiters, 0);
2930				} else if (lwp_release(&ent->lwpchan_lwpchan,
2931				    &waiters, 0)) {
2932					suword8_noerr(&lp->mutex_waiters,
2933					    waiters);
2934				}
2935			}
2936		} else {
2937			/*
2938			 * We are the owner.  Release it.
2939			 */
2940			(void) lwp_clear_mutex(lp, lockflg);
2941			ulock_clear(&lp->mutex_lockw);
2942			fuword8_noerr(&lp->mutex_waiters, &waiters);
2943			if (waiters &&
2944			    lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
2945				suword8_noerr(&lp->mutex_waiters, waiters);
2946		}
2947		lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2948	}
2949out:
2950	no_fault();
2951	if (watched)
2952		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2953}
2954
2955/*
2956 * Register a process-shared robust mutex in the lwpchan cache.
2957 */
2958int
2959lwp_mutex_register(lwp_mutex_t *lp, caddr_t uaddr)
2960{
2961	int error = 0;
2962	volatile int watched;
2963	label_t ljb;
2964	uint8_t type;
2965	lwpchan_t lwpchan;
2966
2967	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2968		return (set_errno(EFAULT));
2969
2970	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2971
2972	if (on_fault(&ljb)) {
2973		error = EFAULT;
2974	} else {
2975		/*
2976		 * Force Copy-on-write if necessary and ensure that the
2977		 * synchronization object resides in read/write memory.
2978		 * Cause an EFAULT return now if this is not so.
2979		 */
2980		fuword8_noerr(&lp->mutex_type, &type);
2981		suword8_noerr(&lp->mutex_type, type);
2982		if ((type & (USYNC_PROCESS|LOCK_ROBUST))
2983		    != (USYNC_PROCESS|LOCK_ROBUST)) {
2984			error = EINVAL;
2985		} else if (!lwpchan_get_mapping(curproc->p_as, (caddr_t)lp,
2986		    uaddr, type, &lwpchan, LWPCHAN_MPPOOL)) {
2987			error = EFAULT;
2988		}
2989	}
2990	no_fault();
2991	if (watched)
2992		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2993	if (error)
2994		return (set_errno(error));
2995	return (0);
2996}
2997
2998/*
2999 * There is a user-level robust lock registration in libc.
3000 * Mark it as invalid by storing -1 into the location of the pointer.
3001 */
3002static void
3003lwp_mutex_unregister(void *uaddr)
3004{
3005	if (get_udatamodel() == DATAMODEL_NATIVE) {
3006		(void) sulword(uaddr, (ulong_t)-1);
3007#ifdef _SYSCALL32_IMPL
3008	} else {
3009		(void) suword32(uaddr, (uint32_t)-1);
3010#endif
3011	}
3012}
3013
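/*
 * Try to acquire the user-level mutex without blocking.  On success
 * (or on EOWNERDEAD/ELOCKUNMAPPED for robust locks), the caller-supplied
 * owner value (and the pid, for USYNC_PROCESS locks) is recorded in the
 * mutex via set_owner_pid().  Returns EBUSY if the lock is already held.
 */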
3014int
3015lwp_mutex_trylock(lwp_mutex_t *lp, uintptr_t owner)
3016{
3017	kthread_t *t = curthread;
3018	proc_t *p = ttoproc(t);
3019	int error = 0;
3020	volatile int locked = 0;
3021	volatile int watched = 0;
3022	label_t ljb;
3023	volatile uint8_t type = 0;
3024	uint16_t flag;
3025	lwpchan_t lwpchan;
3026
3027	if ((caddr_t)lp >= p->p_as->a_userlimit)
3028		return (set_errno(EFAULT));
3029
3030	(void) new_mstate(t, LMS_USER_LOCK);
3031
3032	if (on_fault(&ljb)) {
3033		if (locked)
3034			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3035		error = EFAULT;
3036		goto out;
3037	}
3038	/*
3039	 * Force Copy-on-write if necessary and ensure that the
3040	 * synchronization object resides in read/write memory.
3041	 * Cause an EFAULT return now if this is not so.
3042	 */
3043	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3044	suword8_noerr(&lp->mutex_type, type);
3045	if (UPIMUTEX(type)) {
3046		no_fault();
3047		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
3048		if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED)
3049			set_owner_pid(lp, owner,
3050			    (type & USYNC_PROCESS)? p->p_pid : 0);
3051		if (error)
3052			return (set_errno(error));
3053		return (0);
3054	}
3055	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3056	    &lwpchan, LWPCHAN_MPPOOL)) {
3057		error = EFAULT;
3058		goto out;
3059	}
3060	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3061	locked = 1;
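	/*
	 * A robust mutex that has been marked LOCK_NOTRECOVERABLE can
	 * never be acquired again; fail with ENOTRECOVERABLE.
	 */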
3062	if (type & LOCK_ROBUST) {
3063		fuword16_noerr(&lp->mutex_flag, &flag);
3064		if (flag & LOCK_NOTRECOVERABLE) {
3065			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
			error = ENOTRECOVERABLE;
3067			goto out;
3068		}
3069	}
3070
3071	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3072
3073	if (!ulock_try(&lp->mutex_lockw))
3074		error = EBUSY;
3075	else {
3076		set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
3077		if (type & LOCK_ROBUST) {
3078			fuword16_noerr(&lp->mutex_flag, &flag);
3079			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3080				if (flag & LOCK_OWNERDEAD)
3081					error = EOWNERDEAD;
3082				else if (type & USYNC_PROCESS_ROBUST)
3083					error = ELOCKUNMAPPED;
3084				else
3085					error = EOWNERDEAD;
3086			}
3087		}
3088	}
3089	locked = 0;
3090	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3091out:
3092
3093	if (t->t_mstate == LMS_USER_LOCK)
3094		(void) new_mstate(t, LMS_SYSTEM);
3095
3096	no_fault();
3097	if (watched)
3098		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3099	if (error)
3100		return (set_errno(error));
3101	return (0);
3102}
3103
3104/*
 * Unlock the mutex and unblock any lwps that are trying to acquire it.
 * The blocked lwps resume and retry acquiring the lock.
3107 */
3108int
3109lwp_mutex_unlock(lwp_mutex_t *lp)
3110{
3111	proc_t *p = ttoproc(curthread);
3112	lwpchan_t lwpchan;
3113	uchar_t waiters;
3114	volatile int locked = 0;
3115	volatile int watched = 0;
3116	volatile uint8_t type = 0;
3117	label_t ljb;
3118	uint16_t flag;
3119	int error = 0;
3120
3121	if ((caddr_t)lp >= p->p_as->a_userlimit)
3122		return (set_errno(EFAULT));
3123
3124	if (on_fault(&ljb)) {
3125		if (locked)
3126			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3127		error = EFAULT;
3128		goto out;
3129	}
3130
3131	/*
3132	 * Force Copy-on-write if necessary and ensure that the
3133	 * synchronization object resides in read/write memory.
3134	 * Cause an EFAULT return now if this is not so.
3135	 */
3136	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3137	suword8_noerr(&lp->mutex_type, type);
3138
3139	if (UPIMUTEX(type)) {
3140		no_fault();
3141		error = lwp_upimutex_unlock(lp, type);
3142		if (error)
3143			return (set_errno(error));
3144		return (0);
3145	}
3146
3147	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3148
3149	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3150	    &lwpchan, LWPCHAN_MPPOOL)) {
3151		error = EFAULT;
3152		goto out;
3153	}
3154	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3155	locked = 1;
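	/*
	 * Unlocking a robust mutex that is still marked LOCK_OWNERDEAD
	 * or LOCK_UNMAPPED means it was never made consistent; mark it
	 * LOCK_NOTRECOVERABLE so that subsequent lock attempts fail.
	 */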
3156	if (type & LOCK_ROBUST) {
3157		fuword16_noerr(&lp->mutex_flag, &flag);
3158		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3159			flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3160			flag |= LOCK_NOTRECOVERABLE;
3161			suword16_noerr(&lp->mutex_flag, flag);
3162		}
3163	}
3164	set_owner_pid(lp, 0, 0);
3165	ulock_clear(&lp->mutex_lockw);
3166	/*
3167	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3168	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3169	 * may fail.  If it fails, do not write into the waiter bit.
3170	 * The call to lwp_release() might fail due to one of three reasons:
3171	 *
3172	 * 	1. due to the thread which set the waiter bit not actually
3173	 *	   sleeping since it got the lock on the re-try. The waiter
3174	 *	   bit will then be correctly updated by that thread. This
3175	 *	   window may be closed by reading the wait bit again here
3176	 *	   and not calling lwp_release() at all if it is zero.
3177	 *	2. the thread which set the waiter bit and went to sleep
3178	 *	   was woken up by a signal. This time, the waiter recomputes
3179	 *	   the wait bit in the return with EINTR code.
3180	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
3181	 *	   memory that has been re-used after the lock was dropped.
3182	 *	   In this case, writing into the waiter bit would cause data
3183	 *	   corruption.
3184	 */
3185	fuword8_noerr(&lp->mutex_waiters, &waiters);
3186	if (waiters) {
3187		if ((type & LOCK_ROBUST) &&
3188		    (flag & LOCK_NOTRECOVERABLE)) {
3189			lwp_release_all(&lwpchan);
3190			suword8_noerr(&lp->mutex_waiters, 0);
3191		} else if (lwp_release(&lwpchan, &waiters, 0)) {
3192			suword8_noerr(&lp->mutex_waiters, waiters);
3193		}
3194	}
3195
3196	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3197out:
3198	no_fault();
3199	if (watched)
3200		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3201	if (error)
3202		return (set_errno(error));
3203	return (0);
3204}
3205