kern_umtx.c revision 251684
1/*-
2 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 251684 2013-06-13 09:33:22Z kib $");
30
31#include "opt_compat.h"
32#include "opt_umtx_profiling.h"
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/limits.h>
37#include <sys/lock.h>
38#include <sys/malloc.h>
39#include <sys/mutex.h>
40#include <sys/priv.h>
41#include <sys/proc.h>
42#include <sys/sbuf.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/sysctl.h>
46#include <sys/sysent.h>
47#include <sys/systm.h>
48#include <sys/sysproto.h>
49#include <sys/syscallsubr.h>
50#include <sys/eventhandler.h>
51#include <sys/umtx.h>
52
53#include <vm/vm.h>
54#include <vm/vm_param.h>
55#include <vm/pmap.h>
56#include <vm/vm_map.h>
57#include <vm/vm_object.h>
58
59#include <machine/cpu.h>
60
61#ifdef COMPAT_FREEBSD32
62#include <compat/freebsd32/freebsd32_proto.h>
63#endif
64
65#define _UMUTEX_TRY		1
66#define _UMUTEX_WAIT		2
67
68#ifdef UMTX_PROFILING
69#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
70	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
71#endif
72
73/* Priority inheritance mutex info. */
74struct umtx_pi {
75	/* Owner thread */
76	struct thread		*pi_owner;
77
78	/* Reference count */
79	int			pi_refcount;
80
81	/* List entry on the owning thread's uq_pi_contested list */
82	TAILQ_ENTRY(umtx_pi)	pi_link;
83
84	/* List entry in hash */
85	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
86
87	/* List for waiters */
88	TAILQ_HEAD(,umtx_q)	pi_blocked;
89
90	/* Identify a userland lock object */
91	struct umtx_key		pi_key;
92};
93
94/* A waiter on a userland synchronization object. */
95struct umtx_q {
96	/* Linked list for the hash. */
97	TAILQ_ENTRY(umtx_q)	uq_link;
98
99	/* Umtx key. */
100	struct umtx_key		uq_key;
101
102	/* Umtx flags. */
103	int			uq_flags;
104#define UQF_UMTXQ	0x0001
105
106	/* The waiting thread. */
107	struct thread		*uq_thread;
108
109	/*
110	 * The PI mutex this thread is blocked on.  A reader needs either
111	 * the chain lock or umtx_lock; a writer must hold both the chain
112	 * lock and umtx_lock.
113	 */
114	struct umtx_pi		*uq_pi_blocked;
115
116	/* On blocked list */
117	TAILQ_ENTRY(umtx_q)	uq_lockq;
118
119	/* PI mutexes owned by us that other threads contend for */
120	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
121
122	/* Inherited priority from PP mutex */
123	u_char			uq_inherited_pri;
124
125	/* Spare queue ready to be reused */
126	struct umtxq_queue	*uq_spare_queue;
127
128	/* The queue we are currently on */
129	struct umtxq_queue	*uq_cur_queue;
130};
131
132TAILQ_HEAD(umtxq_head, umtx_q);
133
134/* Per-key wait-queue */
135struct umtxq_queue {
136	struct umtxq_head	head;
137	struct umtx_key		key;
138	LIST_ENTRY(umtxq_queue)	link;
139	int			length;
140};
141
142LIST_HEAD(umtxq_list, umtxq_queue);
143
144/* Userland lock object's wait-queue chain */
145struct umtxq_chain {
146	/* Lock for this chain. */
147	struct mtx		uc_lock;
148
149	/* List of sleep queues. */
150	struct umtxq_list	uc_queue[2];
151#define UMTX_SHARED_QUEUE	0
152#define UMTX_EXCLUSIVE_QUEUE	1
153
154	LIST_HEAD(, umtxq_queue) uc_spare_queue;
155
156	/* Busy flag */
157	char			uc_busy;
158
159	/* Chain lock waiters */
160	int			uc_waiters;
161
162	/* All PI mutexes hashed to this chain */
163	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
164
165#ifdef UMTX_PROFILING
166	u_int 			length;
167	u_int			max_length;
168#endif
169};
170
171#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
172#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
173
174/*
175 * Don't propagate time-sharing priority; there is a security reason.
176 * A user could simply create a PI mutex, let thread A lock it, and
177 * let another thread B block on it.  Because B is sleeping, its
178 * priority would be boosted, so A's priority would be boosted via
179 * priority propagation too, and it would never be lowered even if A
180 * were using 100% CPU.  This would be unfair to other processes.
181 */
182
183#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
184			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
185			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
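
/*
 * Added note (not in the original source): UPRI() clamps every
 * time-sharing priority to PRI_MAX_TIMESHARE, so a time-sharing
 * waiter can never lend anything better than the worst time-sharing
 * priority; only real-time (and other non-time-sharing) priorities
 * propagate through PI mutexes, per the comment above.
 */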
186
187#define	GOLDEN_RATIO_PRIME	2654404609U
188#define	UMTX_CHAINS		512
189#define	UMTX_SHIFTS		(__WORD_BIT - 9)
190
191#define	GET_SHARE(flags)	\
192    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
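
/*
 * Added note: GET_SHARE() only distinguishes explicitly process-shared
 * objects (USYNC_PROCESS_SHARED) from thread-private ones.  The
 * separate AUTO_SHARE value used by some callers instead asks
 * umtx_key_get() to infer sharing from the VM map entry's inheritance.
 */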
193
194#define BUSY_SPINS		200
195
196struct abs_timeout {
197	int clockid;
198	struct timespec cur;
199	struct timespec end;
200};
201
202static uma_zone_t		umtx_pi_zone;
203static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
204static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
205static int			umtx_pi_allocated;
206
207static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
208SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
209    &umtx_pi_allocated, 0, "Allocated umtx_pi");
210
211#ifdef UMTX_PROFILING
212static long max_length;
213SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
214static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
215#endif
216
217static void umtxq_sysinit(void *);
218static void umtxq_hash(struct umtx_key *key);
219static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
220static void umtxq_lock(struct umtx_key *key);
221static void umtxq_unlock(struct umtx_key *key);
222static void umtxq_busy(struct umtx_key *key);
223static void umtxq_unbusy(struct umtx_key *key);
224static void umtxq_insert_queue(struct umtx_q *uq, int q);
225static void umtxq_remove_queue(struct umtx_q *uq, int q);
226static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
227static int umtxq_count(struct umtx_key *key);
228static struct umtx_pi *umtx_pi_alloc(int);
229static void umtx_pi_free(struct umtx_pi *pi);
230static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
231static void umtx_thread_cleanup(struct thread *td);
232static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
233	struct image_params *imgp __unused);
234SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
235
236#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
237#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
238#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
239
240static struct mtx umtx_lock;
241
242#ifdef UMTX_PROFILING
243static void
244umtx_init_profiling(void)
245{
246	struct sysctl_oid *chain_oid;
247	char chain_name[10];
248	int i;
249
250	for (i = 0; i < UMTX_CHAINS; ++i) {
251		snprintf(chain_name, sizeof(chain_name), "%d", i);
252		chain_oid = SYSCTL_ADD_NODE(NULL,
253		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
254		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
255		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
256		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
257		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
258		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
259	}
260}
261
262static int
263sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
264{
265	char buf[512];
266	struct sbuf sb;
267	struct umtxq_chain *uc;
268	u_int fract, i, j, tot, whole;
269	u_int sf0, sf1, sf2, sf3, sf4;
270	u_int si0, si1, si2, si3, si4;
271	u_int sw0, sw1, sw2, sw3, sw4;
272
273	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
274	for (i = 0; i < 2; i++) {
275		tot = 0;
276		for (j = 0; j < UMTX_CHAINS; ++j) {
277			uc = &umtxq_chains[i][j];
278			mtx_lock(&uc->uc_lock);
279			tot += uc->max_length;
280			mtx_unlock(&uc->uc_lock);
281		}
282		if (tot == 0)
283			sbuf_printf(&sb, "%u) Empty ", i);
284		else {
285			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
286			si0 = si1 = si2 = si3 = si4 = 0;
287			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
288			for (j = 0; j < UMTX_CHAINS; j++) {
289				uc = &umtxq_chains[i][j];
290				mtx_lock(&uc->uc_lock);
291				whole = uc->max_length * 100;
292				mtx_unlock(&uc->uc_lock);
293				fract = (whole % tot) * 100;
294				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
295					sf0 = fract;
296					si0 = j;
297					sw0 = whole;
298				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
299				    sf1)) {
300					sf1 = fract;
301					si1 = j;
302					sw1 = whole;
303				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
304				    sf2)) {
305					sf2 = fract;
306					si2 = j;
307					sw2 = whole;
308				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
309				    sf3)) {
310					sf3 = fract;
311					si3 = j;
312					sw3 = whole;
313				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
314				    sf4)) {
315					sf4 = fract;
316					si4 = j;
317					sw4 = whole;
318				}
319			}
320			sbuf_printf(&sb, "queue %u:\n", i);
321			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
322			    sf0 / tot, si0);
323			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
324			    sf1 / tot, si1);
325			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
326			    sf2 / tot, si2);
327			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
328			    sf3 / tot, si3);
329			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
330			    sf4 / tot, si4);
331		}
332	}
333	sbuf_trim(&sb);
334	sbuf_finish(&sb);
335	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
336	sbuf_delete(&sb);
337	return (0);
338}
339
340static int
341sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
342{
343	struct umtxq_chain *uc;
344	u_int i, j;
345	int clear, error;
346
347	clear = 0;
348	error = sysctl_handle_int(oidp, &clear, 0, req);
349	if (error != 0 || req->newptr == NULL)
350		return (error);
351
352	if (clear != 0) {
353		for (i = 0; i < 2; ++i) {
354			for (j = 0; j < UMTX_CHAINS; ++j) {
355				uc = &umtxq_chains[i][j];
356				mtx_lock(&uc->uc_lock);
357				uc->length = 0;
358				uc->max_length = 0;
359				mtx_unlock(&uc->uc_lock);
360			}
361		}
362	}
363	return (0);
364}
365
366SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
367    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
368    sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
369SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
370    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
371    sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
372#endif
373
374static void
375umtxq_sysinit(void *arg __unused)
376{
377	int i, j;
378
379	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
380		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
381	for (i = 0; i < 2; ++i) {
382		for (j = 0; j < UMTX_CHAINS; ++j) {
383			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
384				 MTX_DEF | MTX_DUPOK);
385			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
386			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
387			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
388			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
389			umtxq_chains[i][j].uc_busy = 0;
390			umtxq_chains[i][j].uc_waiters = 0;
391#ifdef UMTX_PROFILING
392			umtxq_chains[i][j].length = 0;
393			umtxq_chains[i][j].max_length = 0;
394#endif
395		}
396	}
397#ifdef UMTX_PROFILING
398	umtx_init_profiling();
399#endif
400	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
401	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
402	    EVENTHANDLER_PRI_ANY);
403}
404
405struct umtx_q *
406umtxq_alloc(void)
407{
408	struct umtx_q *uq;
409
410	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
411	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
412	TAILQ_INIT(&uq->uq_spare_queue->head);
413	TAILQ_INIT(&uq->uq_pi_contested);
414	uq->uq_inherited_pri = PRI_MAX;
415	return (uq);
416}
417
418void
419umtxq_free(struct umtx_q *uq)
420{
421	MPASS(uq->uq_spare_queue != NULL);
422	free(uq->uq_spare_queue, M_UMTX);
423	free(uq, M_UMTX);
424}
425
426static inline void
427umtxq_hash(struct umtx_key *key)
428{
429	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
430	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
431}
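
/*
 * Added sketch (not part of the original source): umtxq_hash() is
 * multiplicative (Fibonacci-style) hashing.  The two key words are
 * folded into one, multiplied by a constant close to 2^32/phi, and
 * the top bits are kept:
 *
 *	unsigned n = (uintptr_t)a + b;			(fold the key)
 *	unsigned h = (n * 2654404609U) >> (32 - 9);	(top 9 bits)
 *	h %= 512;					(UMTX_CHAINS)
 *
 * The multiplication scrambles low-entropy pointer bits into the high
 * bits, which the shift then selects; the final modulus is a no-op
 * here because UMTX_CHAINS == 512 == 2^9, but it keeps the index in
 * range regardless of the constants chosen.
 */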
432
433static inline struct umtxq_chain *
434umtxq_getchain(struct umtx_key *key)
435{
436	if (key->type <= TYPE_SEM)
437		return (&umtxq_chains[1][key->hash]);
438	return (&umtxq_chains[0][key->hash]);
439}
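
/*
 * Added note: keys with type at or below TYPE_SEM (the wait, condvar
 * and semaphore style objects) hash into the second chain array, so
 * their typically longer sleep queues never share a chain lock with
 * the mutex-style objects in the first array.
 */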
440
441/*
442 * Lock a chain.
443 */
444static inline void
445umtxq_lock(struct umtx_key *key)
446{
447	struct umtxq_chain *uc;
448
449	uc = umtxq_getchain(key);
450	mtx_lock(&uc->uc_lock);
451}
452
453/*
454 * Unlock a chain.
455 */
456static inline void
457umtxq_unlock(struct umtx_key *key)
458{
459	struct umtxq_chain *uc;
460
461	uc = umtxq_getchain(key);
462	mtx_unlock(&uc->uc_lock);
463}
464
465/*
466 * Set the chain to the busy state when the following operation
467 * may block (a kernel mutex cannot be held across it).
468 */
469static inline void
470umtxq_busy(struct umtx_key *key)
471{
472	struct umtxq_chain *uc;
473
474	uc = umtxq_getchain(key);
475	mtx_assert(&uc->uc_lock, MA_OWNED);
476	if (uc->uc_busy) {
477#ifdef SMP
478		if (smp_cpus > 1) {
479			int count = BUSY_SPINS;
480			if (count > 0) {
481				umtxq_unlock(key);
482				while (uc->uc_busy && --count > 0)
483					cpu_spinwait();
484				umtxq_lock(key);
485			}
486		}
487#endif
488		while (uc->uc_busy) {
489			uc->uc_waiters++;
490			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
491			uc->uc_waiters--;
492		}
493	}
494	uc->uc_busy = 1;
495}
496
497/*
498 * Unbusy a chain.
499 */
500static inline void
501umtxq_unbusy(struct umtx_key *key)
502{
503	struct umtxq_chain *uc;
504
505	uc = umtxq_getchain(key);
506	mtx_assert(&uc->uc_lock, MA_OWNED);
507	KASSERT(uc->uc_busy != 0, ("not busy"));
508	uc->uc_busy = 0;
509	if (uc->uc_waiters)
510		wakeup_one(uc);
511}
512
513static struct umtxq_queue *
514umtxq_queue_lookup(struct umtx_key *key, int q)
515{
516	struct umtxq_queue *uh;
517	struct umtxq_chain *uc;
518
519	uc = umtxq_getchain(key);
520	UMTXQ_LOCKED_ASSERT(uc);
521	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
522		if (umtx_key_match(&uh->key, key))
523			return (uh);
524	}
525
526	return (NULL);
527}
528
529static inline void
530umtxq_insert_queue(struct umtx_q *uq, int q)
531{
532	struct umtxq_queue *uh;
533	struct umtxq_chain *uc;
534
535	uc = umtxq_getchain(&uq->uq_key);
536	UMTXQ_LOCKED_ASSERT(uc);
537	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
538	uh = umtxq_queue_lookup(&uq->uq_key, q);
539	if (uh != NULL) {
540		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
541	} else {
542		uh = uq->uq_spare_queue;
543		uh->key = uq->uq_key;
544		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
545#ifdef UMTX_PROFILING
546		uc->length++;
547		if (uc->length > uc->max_length) {
548			uc->max_length = uc->length;
549			if (uc->max_length > max_length)
550				max_length = uc->max_length;
551		}
552#endif
553	}
554	uq->uq_spare_queue = NULL;
555
556	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
557	uh->length++;
558	uq->uq_flags |= UQF_UMTXQ;
559	uq->uq_cur_queue = uh;
560	return;
561}
562
563static inline void
564umtxq_remove_queue(struct umtx_q *uq, int q)
565{
566	struct umtxq_chain *uc;
567	struct umtxq_queue *uh;
568
569	uc = umtxq_getchain(&uq->uq_key);
570	UMTXQ_LOCKED_ASSERT(uc);
571	if (uq->uq_flags & UQF_UMTXQ) {
572		uh = uq->uq_cur_queue;
573		TAILQ_REMOVE(&uh->head, uq, uq_link);
574		uh->length--;
575		uq->uq_flags &= ~UQF_UMTXQ;
576		if (TAILQ_EMPTY(&uh->head)) {
577			KASSERT(uh->length == 0,
578			    ("inconsistent umtxq_queue length"));
579#ifdef UMTX_PROFILING
580			uc->length--;
581#endif
582			LIST_REMOVE(uh, link);
583		} else {
584			uh = LIST_FIRST(&uc->uc_spare_queue);
585			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
586			LIST_REMOVE(uh, link);
587		}
588		uq->uq_spare_queue = uh;
589		uq->uq_cur_queue = NULL;
590	}
591}
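
/*
 * Added design note: every umtx_q carries one spare umtxq_queue.  On
 * insert it either reuses the existing per-key queue (donating its
 * spare to the chain's spare list) or installs its own spare as the
 * new per-key queue; on remove it takes one back, either the emptied
 * queue itself or one from the spare list.  Queue management on these
 * hot paths therefore never calls malloc().
 */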
592
593/*
594 * Return the number of waiters on the shared queue.
595 */
596static int
597umtxq_count(struct umtx_key *key)
598{
599	struct umtxq_chain *uc;
600	struct umtxq_queue *uh;
601
602	uc = umtxq_getchain(key);
603	UMTXQ_LOCKED_ASSERT(uc);
604	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
605	if (uh != NULL)
606		return (uh->length);
607	return (0);
608}
609
610/*
611 * Return the number of PI waiters and, through *first, the
612 * first waiter on the queue.
613 */
614static int
615umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
616{
617	struct umtxq_chain *uc;
618	struct umtxq_queue *uh;
619
620	*first = NULL;
621	uc = umtxq_getchain(key);
622	UMTXQ_LOCKED_ASSERT(uc);
623	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
624	if (uh != NULL) {
625		*first = TAILQ_FIRST(&uh->head);
626		return (uh->length);
627	}
628	return (0);
629}
630
631static int
632umtxq_check_susp(struct thread *td)
633{
634	struct proc *p;
635	int error;
636
637	/*
638	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
639	 * eventually break the lockstep loop.
640	 */
641	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
642		return (0);
643	error = 0;
644	p = td->td_proc;
645	PROC_LOCK(p);
646	if (P_SHOULDSTOP(p) ||
647	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
648		if (p->p_flag & P_SINGLE_EXIT)
649			error = EINTR;
650		else
651			error = ERESTART;
652	}
653	PROC_UNLOCK(p);
654	return (error);
655}
656
657/*
658 * Wake up threads waiting on a userland object.
659 */
660
661static int
662umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
663{
664	struct umtxq_chain *uc;
665	struct umtxq_queue *uh;
666	struct umtx_q *uq;
667	int ret;
668
669	ret = 0;
670	uc = umtxq_getchain(key);
671	UMTXQ_LOCKED_ASSERT(uc);
672	uh = umtxq_queue_lookup(key, q);
673	if (uh != NULL) {
674		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
675			umtxq_remove_queue(uq, q);
676			wakeup(uq);
677			if (++ret >= n_wake)
678				return (ret);
679		}
680	}
681	return (ret);
682}
683
684
685/*
686 * Wake up specified thread.
687 */
688static inline void
689umtxq_signal_thread(struct umtx_q *uq)
690{
691	struct umtxq_chain *uc;
692
693	uc = umtxq_getchain(&uq->uq_key);
694	UMTXQ_LOCKED_ASSERT(uc);
695	umtxq_remove(uq);
696	wakeup(uq);
697}
698
699static inline int
700tstohz(const struct timespec *tsp)
701{
702	struct timeval tv;
703
704	TIMESPEC_TO_TIMEVAL(&tv, tsp);
705	return tvtohz(&tv);
706}
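
/*
 * Added example: with hz = 1000, a timespec of 1.5 seconds becomes the
 * timeval { 1, 500000 } and tvtohz() returns roughly 1500 ticks (one
 * extra tick is added to cover the current, partial tick).
 */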
707
708static void
709abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
710	const struct timespec *timeout)
711{
712
713	timo->clockid = clockid;
714	if (!absolute) {
715		kern_clock_gettime(curthread, clockid, &timo->end);
716		timo->cur = timo->end;
717		timespecadd(&timo->end, timeout);
718	} else {
719		timo->end = *timeout;
720		kern_clock_gettime(curthread, clockid, &timo->cur);
721	}
722}
723
724static void
725abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
726{
727
728	abs_timeout_init(timo, umtxtime->_clockid,
729		(umtxtime->_flags & UMTX_ABSTIME) != 0,
730		&umtxtime->_timeout);
731}
732
733static inline void
734abs_timeout_update(struct abs_timeout *timo)
735{
736	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
737}
738
739static int
740abs_timeout_gethz(struct abs_timeout *timo)
741{
742	struct timespec tts;
743
744	if (timespeccmp(&timo->end, &timo->cur, <=))
745		return (-1);
746	tts = timo->end;
747	timespecsub(&tts, &timo->cur);
748	return (tstohz(&tts));
749}
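
/*
 * Added usage sketch (mirroring umtxq_sleep() below): an absolute
 * timeout is initialized once and re-checked around the sleep loop:
 *
 *	struct abs_timeout timo;
 *
 *	abs_timeout_init(&timo, CLOCK_REALTIME, 0, &ts);
 *	for (;;) {
 *		int hz_left = abs_timeout_gethz(&timo);
 *		if (hz_left < 0)
 *			return (ETIMEDOUT);
 *		(void)msleep(chan, mtx, 0, "sleep", hz_left);
 *		abs_timeout_update(&timo);
 *	}
 *
 * Re-reading the clock after every wakeup is what makes spurious
 * wakeups safe with respect to the absolute deadline.
 */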
750
751/*
752 * Put the thread into a sleep state; before sleeping, check
753 * whether the thread was removed from the umtx queue.
754 */
755static inline int
756umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
757{
758	struct umtxq_chain *uc;
759	int error, timo;
760
761	uc = umtxq_getchain(&uq->uq_key);
762	UMTXQ_LOCKED_ASSERT(uc);
763	for (;;) {
764		if (!(uq->uq_flags & UQF_UMTXQ))
765			return (0);
766		if (abstime != NULL) {
767			timo = abs_timeout_gethz(abstime);
768			if (timo < 0)
769				return (ETIMEDOUT);
770		} else
771			timo = 0;
772		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
773		if (error != EWOULDBLOCK) {
774			umtxq_lock(&uq->uq_key);
775			break;
776		}
777		if (abstime != NULL)
778			abs_timeout_update(abstime);
779		umtxq_lock(&uq->uq_key);
780	}
781	return (error);
782}
783
784/*
785 * Convert a userspace address into a unique logical address.
786 */
787int
788umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
789{
790	struct thread *td = curthread;
791	vm_map_t map;
792	vm_map_entry_t entry;
793	vm_pindex_t pindex;
794	vm_prot_t prot;
795	boolean_t wired;
796
797	key->type = type;
798	if (share == THREAD_SHARE) {
799		key->shared = 0;
800		key->info.private.vs = td->td_proc->p_vmspace;
801		key->info.private.addr = (uintptr_t)addr;
802	} else {
803		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
804		map = &td->td_proc->p_vmspace->vm_map;
805		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
806		    &entry, &key->info.shared.object, &pindex, &prot,
807		    &wired) != KERN_SUCCESS) {
808			return EFAULT;
809		}
810
811		if ((share == PROCESS_SHARE) ||
812		    (share == AUTO_SHARE &&
813		     VM_INHERIT_SHARE == entry->inheritance)) {
814			key->shared = 1;
815			key->info.shared.offset = entry->offset + entry->start -
816				(vm_offset_t)addr;
817			vm_object_reference(key->info.shared.object);
818		} else {
819			key->shared = 0;
820			key->info.private.vs = td->td_proc->p_vmspace;
821			key->info.private.addr = (uintptr_t)addr;
822		}
823		vm_map_lookup_done(map, entry);
824	}
825
826	umtxq_hash(key);
827	return (0);
828}
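
/*
 * Added note: the key is what makes two mappings of one lock compare
 * equal.  A private object is keyed by (vmspace, virtual address); a
 * shared one by (backing VM object, offset derived from the map
 * entry), so two processes mapping the same shared page compute the
 * same key and thus reach the same wait queue chain.
 */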
829
830/*
831 * Release key.
832 */
833void
834umtx_key_release(struct umtx_key *key)
835{
836	if (key->shared)
837		vm_object_deallocate(key->info.shared.object);
838}
839
840/*
841 * Lock a umtx object.
842 */
843static int
844do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
845	const struct timespec *timeout)
846{
847	struct abs_timeout timo;
848	struct umtx_q *uq;
849	u_long owner;
850	u_long old;
851	int error = 0;
852
853	uq = td->td_umtxq;
854	if (timeout != NULL)
855		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
856
857	/*
858	 * Care must be exercised when dealing with the umtx structure.  It
859	 * can fault on any access.
860	 */
861	for (;;) {
862		/*
863		 * Try the uncontested case.  This should be done in userland.
864		 */
865		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
866
867		/* The acquire succeeded. */
868		if (owner == UMTX_UNOWNED)
869			return (0);
870
871		/* The address was invalid. */
872		if (owner == -1)
873			return (EFAULT);
874
875		/* If no one owns it but it is contested, try to acquire it. */
876		if (owner == UMTX_CONTESTED) {
877			owner = casuword(&umtx->u_owner,
878			    UMTX_CONTESTED, id | UMTX_CONTESTED);
879
880			if (owner == UMTX_CONTESTED)
881				return (0);
882
883			/* The address was invalid. */
884			if (owner == -1)
885				return (EFAULT);
886
887			error = umtxq_check_susp(td);
888			if (error != 0)
889				break;
890
891			/* If this failed the lock has changed, restart. */
892			continue;
893		}
894
895		/*
896		 * If we caught a signal, we have already retried;
897		 * exit immediately.
898		 */
899		if (error != 0)
900			break;
901
902		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
903			AUTO_SHARE, &uq->uq_key)) != 0)
904			return (error);
905
906		umtxq_lock(&uq->uq_key);
907		umtxq_busy(&uq->uq_key);
908		umtxq_insert(uq);
909		umtxq_unbusy(&uq->uq_key);
910		umtxq_unlock(&uq->uq_key);
911
912		/*
913		 * Set the contested bit so that a release in user space
914		 * knows to use the system call for unlock.  If this fails
915		 * either some one else has acquired the lock or it has been
916		 * released.
917		 */
918		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
919
920		/* The address was invalid. */
921		if (old == -1) {
922			umtxq_lock(&uq->uq_key);
923			umtxq_remove(uq);
924			umtxq_unlock(&uq->uq_key);
925			umtx_key_release(&uq->uq_key);
926			return (EFAULT);
927		}
928
929		/*
930		 * If we set the contested bit, sleep.  Otherwise the lock
931		 * changed and we need to retry, or we lost a race to the
932		 * thread unlocking the umtx.
933		 */
934		umtxq_lock(&uq->uq_key);
935		if (old == owner)
936			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
937			    &timo);
938		umtxq_remove(uq);
939		umtxq_unlock(&uq->uq_key);
940		umtx_key_release(&uq->uq_key);
941
942		if (error == 0)
943			error = umtxq_check_susp(td);
944	}
945
946	if (timeout == NULL) {
947		/* Mutex locking is restarted if it is interrupted. */
948		if (error == EINTR)
949			error = ERESTART;
950	} else {
951		/* Timed-locking is not restarted. */
952		if (error == ERESTART)
953			error = EINTR;
954	}
955	return (error);
956}
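
/*
 * Added illustration (an assumption, not code from this file): the
 * userland fast path that the comments above refer to would look
 * roughly like
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id))
 *		return (0);
 *	return (_umtx_lock(umtx));
 *
 * The cmpset handles the uncontested case without a syscall, so that
 * do_lock_umtx() only ever sees contested acquisitions.
 */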
957
958/*
959 * Unlock a umtx object.
960 */
961static int
962do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
963{
964	struct umtx_key key;
965	u_long owner;
966	u_long old;
967	int error;
968	int count;
969
970	/*
971	 * Make sure we own this mtx.
972	 */
973	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
974	if (owner == -1)
975		return (EFAULT);
976
977	if ((owner & ~UMTX_CONTESTED) != id)
978		return (EPERM);
979
980	/* This should be done in userland */
981	if ((owner & UMTX_CONTESTED) == 0) {
982		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
983		if (old == -1)
984			return (EFAULT);
985		if (old == owner)
986			return (0);
987		owner = old;
988	}
989
990	/* We should only ever be in here for contested locks */
991	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
992		&key)) != 0)
993		return (error);
994
995	umtxq_lock(&key);
996	umtxq_busy(&key);
997	count = umtxq_count(&key);
998	umtxq_unlock(&key);
999
1000	/*
1001	 * When unlocking the umtx, it must be marked as unowned if
1002	 * there is at most one thread waiting for it.
1003	 * Otherwise, it must be marked as contested.
1004	 */
1005	old = casuword(&umtx->u_owner, owner,
1006		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
1007	umtxq_lock(&key);
1008	umtxq_signal(&key,1);
1009	umtxq_unbusy(&key);
1010	umtxq_unlock(&key);
1011	umtx_key_release(&key);
1012	if (old == -1)
1013		return (EFAULT);
1014	if (old != owner)
1015		return (EINVAL);
1016	return (0);
1017}
1018
1019#ifdef COMPAT_FREEBSD32
1020
1021/*
1022 * Lock a umtx object.
1023 */
1024static int
1025do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
1026	const struct timespec *timeout)
1027{
1028	struct abs_timeout timo;
1029	struct umtx_q *uq;
1030	uint32_t owner;
1031	uint32_t old;
1032	int error = 0;
1033
1034	uq = td->td_umtxq;
1035
1036	if (timeout != NULL)
1037		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
1038
1039	/*
1040	 * Care must be exercised when dealing with the umtx structure.  It
1041	 * can fault on any access.
1042	 */
1043	for (;;) {
1044		/*
1045		 * Try the uncontested case.  This should be done in userland.
1046		 */
1047		owner = casuword32(m, UMUTEX_UNOWNED, id);
1048
1049		/* The acquire succeeded. */
1050		if (owner == UMUTEX_UNOWNED)
1051			return (0);
1052
1053		/* The address was invalid. */
1054		if (owner == -1)
1055			return (EFAULT);
1056
1057		/* If no one owns it but it is contested, try to acquire it. */
1058		if (owner == UMUTEX_CONTESTED) {
1059			owner = casuword32(m,
1060			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1061			if (owner == UMUTEX_CONTESTED)
1062				return (0);
1063
1064			/* The address was invalid. */
1065			if (owner == -1)
1066				return (EFAULT);
1067
1068			error = umtxq_check_susp(td);
1069			if (error != 0)
1070				break;
1071
1072			/* If this failed the lock has changed, restart. */
1073			continue;
1074		}
1075
1076		/*
1077		 * If we caught a signal, we have already retried;
1078		 * exit immediately.
1079		 */
1080		if (error != 0)
1081			return (error);
1082
1083		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
1084			AUTO_SHARE, &uq->uq_key)) != 0)
1085			return (error);
1086
1087		umtxq_lock(&uq->uq_key);
1088		umtxq_busy(&uq->uq_key);
1089		umtxq_insert(uq);
1090		umtxq_unbusy(&uq->uq_key);
1091		umtxq_unlock(&uq->uq_key);
1092
1093		/*
1094		 * Set the contested bit so that a release in user space
1095		 * knows to use the system call for unlock.  If this fails
1096		 * either someone else has acquired the lock or it has been
1097		 * released.
1098		 */
1099		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
1100
1101		/* The address was invalid. */
1102		if (old == -1) {
1103			umtxq_lock(&uq->uq_key);
1104			umtxq_remove(uq);
1105			umtxq_unlock(&uq->uq_key);
1106			umtx_key_release(&uq->uq_key);
1107			return (EFAULT);
1108		}
1109
1110		/*
1111		 * If we set the contested bit, sleep.  Otherwise the lock
1112		 * changed and we need to retry, or we lost a race to the
1113		 * thread unlocking the umtx.
1114		 */
1115		umtxq_lock(&uq->uq_key);
1116		if (old == owner)
1117			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
1118			    NULL : &timo);
1119		umtxq_remove(uq);
1120		umtxq_unlock(&uq->uq_key);
1121		umtx_key_release(&uq->uq_key);
1122
1123		if (error == 0)
1124			error = umtxq_check_susp(td);
1125	}
1126
1127	if (timeout == NULL) {
1128		/* Mutex locking is restarted if it is interrupted. */
1129		if (error == EINTR)
1130			error = ERESTART;
1131	} else {
1132		/* Timed-locking is not restarted. */
1133		if (error == ERESTART)
1134			error = EINTR;
1135	}
1136	return (error);
1137}
1138
1139/*
1140 * Unlock a umtx object.
1141 */
1142static int
1143do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
1144{
1145	struct umtx_key key;
1146	uint32_t owner;
1147	uint32_t old;
1148	int error;
1149	int count;
1150
1151	/*
1152	 * Make sure we own this mtx.
1153	 */
1154	owner = fuword32(m);
1155	if (owner == -1)
1156		return (EFAULT);
1157
1158	if ((owner & ~UMUTEX_CONTESTED) != id)
1159		return (EPERM);
1160
1161	/* This should be done in userland */
1162	if ((owner & UMUTEX_CONTESTED) == 0) {
1163		old = casuword32(m, owner, UMUTEX_UNOWNED);
1164		if (old == -1)
1165			return (EFAULT);
1166		if (old == owner)
1167			return (0);
1168		owner = old;
1169	}
1170
1171	/* We should only ever be in here for contested locks */
1172	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
1173		&key)) != 0)
1174		return (error);
1175
1176	umtxq_lock(&key);
1177	umtxq_busy(&key);
1178	count = umtxq_count(&key);
1179	umtxq_unlock(&key);
1180
1181	/*
1182	 * When unlocking the umtx, it must be marked as unowned if
1183	 * there is at most one thread waiting for it.
1184	 * Otherwise, it must be marked as contested.
1185	 */
1186	old = casuword32(m, owner,
1187		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1188	umtxq_lock(&key);
1189	umtxq_signal(&key,1);
1190	umtxq_unbusy(&key);
1191	umtxq_unlock(&key);
1192	umtx_key_release(&key);
1193	if (old == -1)
1194		return (EFAULT);
1195	if (old != owner)
1196		return (EINVAL);
1197	return (0);
1198}
1199#endif
1200
1201/*
1202 * Fetch and compare the value; sleep on the address if it is unchanged.
1203 */
1204static int
1205do_wait(struct thread *td, void *addr, u_long id,
1206	struct _umtx_time *timeout, int compat32, int is_private)
1207{
1208	struct abs_timeout timo;
1209	struct umtx_q *uq;
1210	u_long tmp;
1211	int error = 0;
1212
1213	uq = td->td_umtxq;
1214	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1215		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1216		return (error);
1217
1218	if (timeout != NULL)
1219		abs_timeout_init2(&timo, timeout);
1220
1221	umtxq_lock(&uq->uq_key);
1222	umtxq_insert(uq);
1223	umtxq_unlock(&uq->uq_key);
1224	if (compat32 == 0)
1225		tmp = fuword(addr);
1226	else
1227		tmp = (unsigned int)fuword32(addr);
1228	umtxq_lock(&uq->uq_key);
1229	if (tmp == id)
1230		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
1231		    NULL : &timo);
1232	if ((uq->uq_flags & UQF_UMTXQ) == 0)
1233		error = 0;
1234	else
1235		umtxq_remove(uq);
1236	umtxq_unlock(&uq->uq_key);
1237	umtx_key_release(&uq->uq_key);
1238	if (error == ERESTART)
1239		error = EINTR;
1240	return (error);
1241}
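
/*
 * Added design note: do_wait() re-reads the word only after the thread
 * is already on the wait queue.  A concurrent writer that updates the
 * value and then calls kern_umtx_wake() must either observe the waiter
 * on the queue (both paths take the chain lock) or have its store seen
 * by the fuword() above, which closes the classic futex lost-wakeup
 * window.
 */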
1242
1243/*
1244 * Wake up threads sleeping on the specified address.
1245 */
1246int
1247kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1248{
1249	struct umtx_key key;
1250	int ret;
1251
1252	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1253		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1254		return (ret);
1255	umtxq_lock(&key);
1256	ret = umtxq_signal(&key, n_wake);
1257	umtxq_unlock(&key);
1258	umtx_key_release(&key);
1259	return (0);
1260}
1261
1262/*
1263 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1264 */
1265static int
1266do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1267	struct _umtx_time *timeout, int mode)
1268{
1269	struct abs_timeout timo;
1270	struct umtx_q *uq;
1271	uint32_t owner, old, id;
1272	int error = 0;
1273
1274	id = td->td_tid;
1275	uq = td->td_umtxq;
1276
1277	if (timeout != NULL)
1278		abs_timeout_init2(&timo, timeout);
1279
1280	/*
1281	 * Care must be exercised when dealing with the umtx structure.  It
1282	 * can fault on any access.
1283	 */
1284	for (;;) {
1285		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1286		if (mode == _UMUTEX_WAIT) {
1287			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1288				return (0);
1289		} else {
1290			/*
1291			 * Try the uncontested case.  This should be done in userland.
1292			 */
1293			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1294
1295			/* The acquire succeeded. */
1296			if (owner == UMUTEX_UNOWNED)
1297				return (0);
1298
1299			/* The address was invalid. */
1300			if (owner == -1)
1301				return (EFAULT);
1302
1303			/* If no one owns it but it is contested, try to acquire it. */
1304			if (owner == UMUTEX_CONTESTED) {
1305				owner = casuword32(&m->m_owner,
1306				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1307
1308				if (owner == UMUTEX_CONTESTED)
1309					return (0);
1310
1311				/* The address was invalid. */
1312				if (owner == -1)
1313					return (EFAULT);
1314
1315				error = umtxq_check_susp(td);
1316				if (error != 0)
1317					return (error);
1318
1319				/* If this failed the lock has changed, restart. */
1320				continue;
1321			}
1322		}
1323
1324		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1325		    (owner & ~UMUTEX_CONTESTED) == id)
1326			return (EDEADLK);
1327
1328		if (mode == _UMUTEX_TRY)
1329			return (EBUSY);
1330
1331		/*
1332		 * If we caught a signal, we have already retried;
1333		 * exit immediately.
1334		 */
1335		if (error != 0)
1336			return (error);
1337
1338		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1339		    GET_SHARE(flags), &uq->uq_key)) != 0)
1340			return (error);
1341
1342		umtxq_lock(&uq->uq_key);
1343		umtxq_busy(&uq->uq_key);
1344		umtxq_insert(uq);
1345		umtxq_unlock(&uq->uq_key);
1346
1347		/*
1348		 * Set the contested bit so that a release in user space
1349		 * knows to use the system call for unlock.  If this fails
1350		 * either someone else has acquired the lock or it has been
1351		 * released.
1352		 */
1353		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1354
1355		/* The address was invalid. */
1356		if (old == -1) {
1357			umtxq_lock(&uq->uq_key);
1358			umtxq_remove(uq);
1359			umtxq_unbusy(&uq->uq_key);
1360			umtxq_unlock(&uq->uq_key);
1361			umtx_key_release(&uq->uq_key);
1362			return (EFAULT);
1363		}
1364
1365		/*
1366		 * If we set the contested bit, sleep.  Otherwise the lock
1367		 * changed and we need to retry, or we lost a race to the
1368		 * thread unlocking the umtx.
1369		 */
1370		umtxq_lock(&uq->uq_key);
1371		umtxq_unbusy(&uq->uq_key);
1372		if (old == owner)
1373			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1374			    NULL : &timo);
1375		umtxq_remove(uq);
1376		umtxq_unlock(&uq->uq_key);
1377		umtx_key_release(&uq->uq_key);
1378
1379		if (error == 0)
1380			error = umtxq_check_susp(td);
1381	}
1382
1383	return (0);
1384}
1385
1386/*
1387 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1388 */
1389static int
1390do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1391{
1392	struct umtx_key key;
1393	uint32_t owner, old, id;
1394	int error;
1395	int count;
1396
1397	id = td->td_tid;
1398	/*
1399	 * Make sure we own this mtx.
1400	 */
1401	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1402	if (owner == -1)
1403		return (EFAULT);
1404
1405	if ((owner & ~UMUTEX_CONTESTED) != id)
1406		return (EPERM);
1407
1408	if ((owner & UMUTEX_CONTESTED) == 0) {
1409		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1410		if (old == -1)
1411			return (EFAULT);
1412		if (old == owner)
1413			return (0);
1414		owner = old;
1415	}
1416
1417	/* We should only ever be in here for contested locks */
1418	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1419	    &key)) != 0)
1420		return (error);
1421
1422	umtxq_lock(&key);
1423	umtxq_busy(&key);
1424	count = umtxq_count(&key);
1425	umtxq_unlock(&key);
1426
1427	/*
1428	 * When unlocking the umtx, it must be marked as unowned if
1429	 * there is zero or one thread only waiting for it.
1430	 * Otherwise, it must be marked as contested.
1431	 */
1432	old = casuword32(&m->m_owner, owner,
1433		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1434	umtxq_lock(&key);
1435	umtxq_signal(&key,1);
1436	umtxq_unbusy(&key);
1437	umtxq_unlock(&key);
1438	umtx_key_release(&key);
1439	if (old == -1)
1440		return (EFAULT);
1441	if (old != owner)
1442		return (EINVAL);
1443	return (0);
1444}
1445
1446/*
1447 * Check if the mutex is available and wake up a waiter;
1448 * this is only for a simple (PTHREAD_PRIO_NONE) mutex.
1449 */
1450static int
1451do_wake_umutex(struct thread *td, struct umutex *m)
1452{
1453	struct umtx_key key;
1454	uint32_t owner;
1455	uint32_t flags;
1456	int error;
1457	int count;
1458
1459	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1460	if (owner == -1)
1461		return (EFAULT);
1462
1463	if ((owner & ~UMUTEX_CONTESTED) != 0)
1464		return (0);
1465
1466	flags = fuword32(&m->m_flags);
1467
1468	/* We should only ever be in here for contested locks */
1469	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1470	    &key)) != 0)
1471		return (error);
1472
1473	umtxq_lock(&key);
1474	umtxq_busy(&key);
1475	count = umtxq_count(&key);
1476	umtxq_unlock(&key);
1477
1478	if (count <= 1)
1479		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1480
1481	umtxq_lock(&key);
1482	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1483		umtxq_signal(&key, 1);
1484	umtxq_unbusy(&key);
1485	umtxq_unlock(&key);
1486	umtx_key_release(&key);
1487	return (0);
1488}
1489
1490/*
1491 * Check if the mutex has waiters and try to fix the contention bit.
1492 */
1493static int
1494do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1495{
1496	struct umtx_key key;
1497	uint32_t owner, old;
1498	int type;
1499	int error;
1500	int count;
1501
1502	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1503	case 0:
1504		type = TYPE_NORMAL_UMUTEX;
1505		break;
1506	case UMUTEX_PRIO_INHERIT:
1507		type = TYPE_PI_UMUTEX;
1508		break;
1509	case UMUTEX_PRIO_PROTECT:
1510		type = TYPE_PP_UMUTEX;
1511		break;
1512	default:
1513		return (EINVAL);
1514	}
1515	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1516	    &key)) != 0)
1517		return (error);
1518
1519	owner = 0;
1520	umtxq_lock(&key);
1521	umtxq_busy(&key);
1522	count = umtxq_count(&key);
1523	umtxq_unlock(&key);
1524	/*
1525	 * Only repair the contention bit if there is a waiter; that means
1526	 * the mutex is still being referenced by userland code.  Otherwise,
1527	 * don't update any memory.
1528	 */
1529	if (count > 1) {
1530		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1531		while ((owner & UMUTEX_CONTESTED) == 0) {
1532			old = casuword32(&m->m_owner, owner,
1533			    owner | UMUTEX_CONTESTED);
1534			if (old == owner)
1535				break;
1536			owner = old;
1537			if (old == -1)
1538				break;
1539			error = umtxq_check_susp(td);
1540			if (error != 0)
1541				break;
1542		}
1543	} else if (count == 1) {
1544		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1545		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
1546		       (owner & UMUTEX_CONTESTED) == 0) {
1547			old = casuword32(&m->m_owner, owner,
1548			    owner | UMUTEX_CONTESTED);
1549			if (old == owner)
1550				break;
1551			owner = old;
1552			if (old == -1)
1553				break;
1554			error = umtxq_check_susp(td);
1555			if (error != 0)
1556				break;
1557		}
1558	}
1559	umtxq_lock(&key);
1560	if (owner == -1) {
1561		error = EFAULT;
1562		umtxq_signal(&key, INT_MAX);
1563	}
1564	else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1565		umtxq_signal(&key, 1);
1566	umtxq_unbusy(&key);
1567	umtxq_unlock(&key);
1568	umtx_key_release(&key);
1569	return (error);
1570}
1571
1572static inline struct umtx_pi *
1573umtx_pi_alloc(int flags)
1574{
1575	struct umtx_pi *pi;
1576
1577	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1578	TAILQ_INIT(&pi->pi_blocked);
1579	atomic_add_int(&umtx_pi_allocated, 1);
1580	return (pi);
1581}
1582
1583static inline void
1584umtx_pi_free(struct umtx_pi *pi)
1585{
1586	uma_zfree(umtx_pi_zone, pi);
1587	atomic_add_int(&umtx_pi_allocated, -1);
1588}
1589
1590/*
1591 * Adjust the thread's position on a PI mutex's blocked list after
1592 * its priority has been changed.
1593 */
1594static int
1595umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1596{
1597	struct umtx_q *uq, *uq1, *uq2;
1598	struct thread *td1;
1599
1600	mtx_assert(&umtx_lock, MA_OWNED);
1601	if (pi == NULL)
1602		return (0);
1603
1604	uq = td->td_umtxq;
1605
1606	/*
1607	 * Check if the thread needs to be moved on the blocked chain.
1608	 * It needs to be moved if its priority value is either less than
1609	 * the previous thread's or greater than the next thread's.
1610	 */
1611	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1612	uq2 = TAILQ_NEXT(uq, uq_lockq);
1613	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1614	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1615		/*
1616		 * Remove thread from blocked chain and determine where
1617		 * it should be moved to.
1618		 */
1619		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1620		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1621			td1 = uq1->uq_thread;
1622			MPASS(td1->td_proc->p_magic == P_MAGIC);
1623			if (UPRI(td1) > UPRI(td))
1624				break;
1625		}
1626
1627		if (uq1 == NULL)
1628			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1629		else
1630			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1631	}
1632	return (1);
1633}
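
/*
 * Added note: pi_blocked is kept sorted by ascending UPRI() value,
 * i.e. best priority first, so TAILQ_FIRST() always yields the waiter
 * that should run next; the re-insertion above is a linear walk to the
 * first entry with a worse priority.
 */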
1634
1635/*
1636 * Propagate priority when a thread is blocked on POSIX
1637 * PI mutex.
1638 */
1639static void
1640umtx_propagate_priority(struct thread *td)
1641{
1642	struct umtx_q *uq;
1643	struct umtx_pi *pi;
1644	int pri;
1645
1646	mtx_assert(&umtx_lock, MA_OWNED);
1647	pri = UPRI(td);
1648	uq = td->td_umtxq;
1649	pi = uq->uq_pi_blocked;
1650	if (pi == NULL)
1651		return;
1652
1653	for (;;) {
1654		td = pi->pi_owner;
1655		if (td == NULL || td == curthread)
1656			return;
1657
1658		MPASS(td->td_proc != NULL);
1659		MPASS(td->td_proc->p_magic == P_MAGIC);
1660
1661		thread_lock(td);
1662		if (td->td_lend_user_pri > pri)
1663			sched_lend_user_prio(td, pri);
1664		else {
1665			thread_unlock(td);
1666			break;
1667		}
1668		thread_unlock(td);
1669
1670		/*
1671		 * Pick up the lock that td is blocked on.
1672		 */
1673		uq = td->td_umtxq;
1674		pi = uq->uq_pi_blocked;
1675		if (pi == NULL)
1676			break;
1677		/* Resort td on the list if needed. */
1678		umtx_pi_adjust_thread(pi, td);
1679	}
1680}
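
/*
 * Added worked example: if thread C blocks on PI mutex M1 owned by A
 * while A is itself blocked on M2 owned by B, the loop above lends
 * C's priority first to A and then, following A's uq_pi_blocked link,
 * to B.  It stops as soon as an owner already runs at an equal or
 * better priority, or when the chain ends.
 */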
1681
1682/*
1683 * Unpropagate priority for a PI mutex when a thread blocked on
1684 * it is interrupted by a signal or resumed by others.
1685 */
1686static void
1687umtx_repropagate_priority(struct umtx_pi *pi)
1688{
1689	struct umtx_q *uq, *uq_owner;
1690	struct umtx_pi *pi2;
1691	int pri;
1692
1693	mtx_assert(&umtx_lock, MA_OWNED);
1694
1695	while (pi != NULL && pi->pi_owner != NULL) {
1696		pri = PRI_MAX;
1697		uq_owner = pi->pi_owner->td_umtxq;
1698
1699		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1700			uq = TAILQ_FIRST(&pi2->pi_blocked);
1701			if (uq != NULL) {
1702				if (pri > UPRI(uq->uq_thread))
1703					pri = UPRI(uq->uq_thread);
1704			}
1705		}
1706
1707		if (pri > uq_owner->uq_inherited_pri)
1708			pri = uq_owner->uq_inherited_pri;
1709		thread_lock(pi->pi_owner);
1710		sched_lend_user_prio(pi->pi_owner, pri);
1711		thread_unlock(pi->pi_owner);
1712		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1713			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1714	}
1715}
1716
1717/*
1718 * Insert a PI mutex into the owning thread's contested list.
1719 */
1720static void
1721umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1722{
1723	struct umtx_q *uq_owner;
1724
1725	uq_owner = owner->td_umtxq;
1726	mtx_assert(&umtx_lock, MA_OWNED);
1727	if (pi->pi_owner != NULL)
1728		panic("pi_owner != NULL");
1729	pi->pi_owner = owner;
1730	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1731}
1732
1733/*
1734 * Claim ownership of a PI mutex.
1735 */
1736static int
1737umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1738{
1739	struct umtx_q *uq, *uq_owner;
1740
1741	uq_owner = owner->td_umtxq;
1742	mtx_lock_spin(&umtx_lock);
1743	if (pi->pi_owner == owner) {
1744		mtx_unlock_spin(&umtx_lock);
1745		return (0);
1746	}
1747
1748	if (pi->pi_owner != NULL) {
1749		/*
1750		 * Userland may have already messed up the mutex, sigh.
1751		 */
1752		mtx_unlock_spin(&umtx_lock);
1753		return (EPERM);
1754	}
1755	umtx_pi_setowner(pi, owner);
1756	uq = TAILQ_FIRST(&pi->pi_blocked);
1757	if (uq != NULL) {
1758		int pri;
1759
1760		pri = UPRI(uq->uq_thread);
1761		thread_lock(owner);
1762		if (pri < UPRI(owner))
1763			sched_lend_user_prio(owner, pri);
1764		thread_unlock(owner);
1765	}
1766	mtx_unlock_spin(&umtx_lock);
1767	return (0);
1768}
1769
1770/*
1771 * Adjust a thread's position on the blocked list of its PI mutex;
1772 * this may trigger a new round of priority propagation.
1773 */
1774void
1775umtx_pi_adjust(struct thread *td, u_char oldpri)
1776{
1777	struct umtx_q *uq;
1778	struct umtx_pi *pi;
1779
1780	uq = td->td_umtxq;
1781	mtx_lock_spin(&umtx_lock);
1782	/*
1783	 * Pick up the lock that td is blocked on.
1784	 */
1785	pi = uq->uq_pi_blocked;
1786	if (pi != NULL) {
1787		umtx_pi_adjust_thread(pi, td);
1788		umtx_repropagate_priority(pi);
1789	}
1790	mtx_unlock_spin(&umtx_lock);
1791}
1792
1793/*
1794 * Sleep on a PI mutex.
1795 */
1796static int
1797umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1798	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1799{
1800	struct umtxq_chain *uc;
1801	struct thread *td, *td1;
1802	struct umtx_q *uq1;
1803	int pri;
1804	int error = 0;
1805
1806	td = uq->uq_thread;
1807	KASSERT(td == curthread, ("inconsistent uq_thread"));
1808	uc = umtxq_getchain(&uq->uq_key);
1809	UMTXQ_LOCKED_ASSERT(uc);
1810	UMTXQ_BUSY_ASSERT(uc);
1811	umtxq_insert(uq);
1812	mtx_lock_spin(&umtx_lock);
1813	if (pi->pi_owner == NULL) {
1814		mtx_unlock_spin(&umtx_lock);
1815		/* XXX Only look up thread in current process. */
1816		td1 = tdfind(owner, curproc->p_pid);
1817		mtx_lock_spin(&umtx_lock);
1818		if (td1 != NULL) {
1819			if (pi->pi_owner == NULL)
1820				umtx_pi_setowner(pi, td1);
1821			PROC_UNLOCK(td1->td_proc);
1822		}
1823	}
1824
1825	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1826		pri = UPRI(uq1->uq_thread);
1827		if (pri > UPRI(td))
1828			break;
1829	}
1830
1831	if (uq1 != NULL)
1832		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1833	else
1834		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1835
1836	uq->uq_pi_blocked = pi;
1837	thread_lock(td);
1838	td->td_flags |= TDF_UPIBLOCKED;
1839	thread_unlock(td);
1840	umtx_propagate_priority(td);
1841	mtx_unlock_spin(&umtx_lock);
1842	umtxq_unbusy(&uq->uq_key);
1843
1844	error = umtxq_sleep(uq, wmesg, timo);
1845	umtxq_remove(uq);
1846
1847	mtx_lock_spin(&umtx_lock);
1848	uq->uq_pi_blocked = NULL;
1849	thread_lock(td);
1850	td->td_flags &= ~TDF_UPIBLOCKED;
1851	thread_unlock(td);
1852	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1853	umtx_repropagate_priority(pi);
1854	mtx_unlock_spin(&umtx_lock);
1855	umtxq_unlock(&uq->uq_key);
1856
1857	return (error);
1858}
1859
1860/*
1861 * Increment the reference count of a PI mutex.
1862 */
1863static void
1864umtx_pi_ref(struct umtx_pi *pi)
1865{
1866	struct umtxq_chain *uc;
1867
1868	uc = umtxq_getchain(&pi->pi_key);
1869	UMTXQ_LOCKED_ASSERT(uc);
1870	pi->pi_refcount++;
1871}
1872
1873/*
1874 * Decrease the reference count for a PI mutex; if the counter
1875 * drops to zero, its memory is freed.
1876 */
1877static void
1878umtx_pi_unref(struct umtx_pi *pi)
1879{
1880	struct umtxq_chain *uc;
1881
1882	uc = umtxq_getchain(&pi->pi_key);
1883	UMTXQ_LOCKED_ASSERT(uc);
1884	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1885	if (--pi->pi_refcount == 0) {
1886		mtx_lock_spin(&umtx_lock);
1887		if (pi->pi_owner != NULL) {
1888			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1889				pi, pi_link);
1890			pi->pi_owner = NULL;
1891		}
1892		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1893			("blocked queue not empty"));
1894		mtx_unlock_spin(&umtx_lock);
1895		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1896		umtx_pi_free(pi);
1897	}
1898}
1899
1900/*
1901 * Find a PI mutex in hash table.
1902 */
1903static struct umtx_pi *
1904umtx_pi_lookup(struct umtx_key *key)
1905{
1906	struct umtxq_chain *uc;
1907	struct umtx_pi *pi;
1908
1909	uc = umtxq_getchain(key);
1910	UMTXQ_LOCKED_ASSERT(uc);
1911
1912	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1913		if (umtx_key_match(&pi->pi_key, key)) {
1914			return (pi);
1915		}
1916	}
1917	return (NULL);
1918}
1919
1920/*
1921 * Insert a PI mutex into hash table.
1922 */
1923static inline void
1924umtx_pi_insert(struct umtx_pi *pi)
1925{
1926	struct umtxq_chain *uc;
1927
1928	uc = umtxq_getchain(&pi->pi_key);
1929	UMTXQ_LOCKED_ASSERT(uc);
1930	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1931}
1932
1933/*
1934 * Lock a PI mutex.
1935 */
1936static int
1937do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1938    struct _umtx_time *timeout, int try)
1939{
1940	struct abs_timeout timo;
1941	struct umtx_q *uq;
1942	struct umtx_pi *pi, *new_pi;
1943	uint32_t id, owner, old;
1944	int error;
1945
1946	id = td->td_tid;
1947	uq = td->td_umtxq;
1948
1949	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1950	    &uq->uq_key)) != 0)
1951		return (error);
1952
1953	if (timeout != NULL)
1954		abs_timeout_init2(&timo, timeout);
1955
1956	umtxq_lock(&uq->uq_key);
1957	pi = umtx_pi_lookup(&uq->uq_key);
1958	if (pi == NULL) {
1959		new_pi = umtx_pi_alloc(M_NOWAIT);
1960		if (new_pi == NULL) {
1961			umtxq_unlock(&uq->uq_key);
1962			new_pi = umtx_pi_alloc(M_WAITOK);
1963			umtxq_lock(&uq->uq_key);
1964			pi = umtx_pi_lookup(&uq->uq_key);
1965			if (pi != NULL) {
1966				umtx_pi_free(new_pi);
1967				new_pi = NULL;
1968			}
1969		}
1970		if (new_pi != NULL) {
1971			new_pi->pi_key = uq->uq_key;
1972			umtx_pi_insert(new_pi);
1973			pi = new_pi;
1974		}
1975	}
1976	umtx_pi_ref(pi);
1977	umtxq_unlock(&uq->uq_key);
1978
1979	/*
1980	 * Care must be exercised when dealing with the umtx structure.  It
1981	 * can fault on any access.
1982	 */
1983	for (;;) {
1984		/*
1985		 * Try the uncontested case.  This should be done in userland.
1986		 */
1987		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1988
1989		/* The acquire succeeded. */
1990		if (owner == UMUTEX_UNOWNED) {
1991			error = 0;
1992			break;
1993		}
1994
1995		/* The address was invalid. */
1996		if (owner == -1) {
1997			error = EFAULT;
1998			break;
1999		}
2000
2001		/* If no one owns it but it is contested try to acquire it. */
2002		if (owner == UMUTEX_CONTESTED) {
2003			owner = casuword32(&m->m_owner,
2004			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2005
2006			if (owner == UMUTEX_CONTESTED) {
2007				umtxq_lock(&uq->uq_key);
2008				umtxq_busy(&uq->uq_key);
2009				error = umtx_pi_claim(pi, td);
2010				umtxq_unbusy(&uq->uq_key);
2011				umtxq_unlock(&uq->uq_key);
2012				break;
2013			}
2014
2015			/* The address was invalid. */
2016			if (owner == -1) {
2017				error = EFAULT;
2018				break;
2019			}
2020
2021			error = umtxq_check_susp(td);
2022			if (error != 0)
2023				break;
2024
2025			/* If this failed the lock has changed, restart. */
2026			continue;
2027		}
2028
2029		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2030		    (owner & ~UMUTEX_CONTESTED) == id) {
2031			error = EDEADLK;
2032			break;
2033		}
2034
2035		if (try != 0) {
2036			error = EBUSY;
2037			break;
2038		}
2039
2040		/*
2041		 * If we caught a signal, we have already retried;
2042		 * exit immediately.
2043		 */
2044		if (error != 0)
2045			break;
2046
2047		umtxq_lock(&uq->uq_key);
2048		umtxq_busy(&uq->uq_key);
2049		umtxq_unlock(&uq->uq_key);
2050
2051		/*
2052		 * Set the contested bit so that a release in user space
2053		 * knows to use the system call for unlock.  If this fails
2054		 * either someone else has acquired the lock or it has been
2055		 * released.
2056		 */
2057		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
2058
2059		/* The address was invalid. */
2060		if (old == -1) {
2061			umtxq_lock(&uq->uq_key);
2062			umtxq_unbusy(&uq->uq_key);
2063			umtxq_unlock(&uq->uq_key);
2064			error = EFAULT;
2065			break;
2066		}
2067
2068		umtxq_lock(&uq->uq_key);
2069		/*
2070		 * If we set the contested bit, sleep.  Otherwise the lock
2071		 * changed and we need to retry, or we lost a race to the
2072		 * thread unlocking the umtx.
2073		 */
2074		if (old == owner)
2075			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
2076			    "umtxpi", timeout == NULL ? NULL : &timo);
2077		else {
2078			umtxq_unbusy(&uq->uq_key);
2079			umtxq_unlock(&uq->uq_key);
2080		}
2081
2082		error = umtxq_check_susp(td);
2083		if (error != 0)
2084			break;
2085	}
2086
2087	umtxq_lock(&uq->uq_key);
2088	umtx_pi_unref(pi);
2089	umtxq_unlock(&uq->uq_key);
2090
2091	umtx_key_release(&uq->uq_key);
2092	return (error);
2093}
2094
2095/*
2096 * Unlock a PI mutex.
2097 */
2098static int
2099do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
2100{
2101	struct umtx_key key;
2102	struct umtx_q *uq_first, *uq_first2, *uq_me;
2103	struct umtx_pi *pi, *pi2;
2104	uint32_t owner, old, id;
2105	int error;
2106	int count;
2107	int pri;
2108
2109	id = td->td_tid;
2110	/*
2111	 * Make sure we own this mtx.
2112	 */
2113	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2114	if (owner == -1)
2115		return (EFAULT);
2116
2117	if ((owner & ~UMUTEX_CONTESTED) != id)
2118		return (EPERM);
2119
2120	/* This should be done in userland */
2121	if ((owner & UMUTEX_CONTESTED) == 0) {
2122		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
2123		if (old == -1)
2124			return (EFAULT);
2125		if (old == owner)
2126			return (0);
2127		owner = old;
2128	}
2129
2130	/* We should only ever be in here for contested locks */
2131	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
2132	    &key)) != 0)
2133		return (error);
2134
2135	umtxq_lock(&key);
2136	umtxq_busy(&key);
2137	count = umtxq_count_pi(&key, &uq_first);
2138	if (uq_first != NULL) {
2139		mtx_lock_spin(&umtx_lock);
2140		pi = uq_first->uq_pi_blocked;
2141		KASSERT(pi != NULL, ("pi == NULL?"));
2142		if (pi->pi_owner != curthread) {
2143			mtx_unlock_spin(&umtx_lock);
2144			umtxq_unbusy(&key);
2145			umtxq_unlock(&key);
2146			umtx_key_release(&key);
2147			/* userland messed up the mutex */
2148			return (EPERM);
2149		}
2150		uq_me = curthread->td_umtxq;
2151		pi->pi_owner = NULL;
2152		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
2153		/* Get the highest-priority thread that is still sleeping. */
2154		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2155		while (uq_first != NULL &&
2156		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2157			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2158		}
2159		pri = PRI_MAX;
2160		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2161			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2162			if (uq_first2 != NULL) {
2163				if (pri > UPRI(uq_first2->uq_thread))
2164					pri = UPRI(uq_first2->uq_thread);
2165			}
2166		}
2167		thread_lock(curthread);
2168		sched_lend_user_prio(curthread, pri);
2169		thread_unlock(curthread);
2170		mtx_unlock_spin(&umtx_lock);
2171		if (uq_first)
2172			umtxq_signal_thread(uq_first);
2173	}
2174	umtxq_unlock(&key);
2175
2176	/*
2177	 * When unlocking the umtx, it must be marked as unowned if
2178	 * there is at most one thread waiting for it.
2179	 * Otherwise, it must be marked as contested.
2180	 */
2181	old = casuword32(&m->m_owner, owner,
2182		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
2183
2184	umtxq_lock(&key);
2185	umtxq_unbusy(&key);
2186	umtxq_unlock(&key);
2187	umtx_key_release(&key);
2188	if (old == -1)
2189		return (EFAULT);
2190	if (old != owner)
2191		return (EINVAL);
2192	return (0);
2193}
2194
2195/*
2196 * Lock a PP mutex.
2197 */
2198static int
2199do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2200    struct _umtx_time *timeout, int try)
2201{
2202	struct abs_timeout timo;
2203	struct umtx_q *uq, *uq2;
2204	struct umtx_pi *pi;
2205	uint32_t ceiling;
2206	uint32_t owner, id;
2207	int error, pri, old_inherited_pri, su;
2208
2209	id = td->td_tid;
2210	uq = td->td_umtxq;
2211	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2212	    &uq->uq_key)) != 0)
2213		return (error);
2214
2215	if (timeout != NULL)
2216		abs_timeout_init2(&timo, timeout);
2217
2218	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2219	for (;;) {
2220		old_inherited_pri = uq->uq_inherited_pri;
2221		umtxq_lock(&uq->uq_key);
2222		umtxq_busy(&uq->uq_key);
2223		umtxq_unlock(&uq->uq_key);
2224
2225		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
2226		if (ceiling > RTP_PRIO_MAX) {
2227			error = EINVAL;
2228			goto out;
2229		}
2230
2231		mtx_lock_spin(&umtx_lock);
2232		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2233			mtx_unlock_spin(&umtx_lock);
2234			error = EINVAL;
2235			goto out;
2236		}
2237		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2238			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2239			thread_lock(td);
2240			if (uq->uq_inherited_pri < UPRI(td))
2241				sched_lend_user_prio(td, uq->uq_inherited_pri);
2242			thread_unlock(td);
2243		}
2244		mtx_unlock_spin(&umtx_lock);
2245
2246		owner = casuword32(&m->m_owner,
2247		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2248
2249		if (owner == UMUTEX_CONTESTED) {
2250			error = 0;
2251			break;
2252		}
2253
2254		/* The address was invalid. */
2255		if (owner == -1) {
2256			error = EFAULT;
2257			break;
2258		}
2259
2260		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2261		    (owner & ~UMUTEX_CONTESTED) == id) {
2262			error = EDEADLK;
2263			break;
2264		}
2265
2266		if (try != 0) {
2267			error = EBUSY;
2268			break;
2269		}
2270
2271		/*
2272		 * If we caught a signal during the previous sleep, we have
2273		 * already retried once; exit immediately.
2274		 */
2275		if (error != 0)
2276			break;
2277
2278		umtxq_lock(&uq->uq_key);
2279		umtxq_insert(uq);
2280		umtxq_unbusy(&uq->uq_key);
2281		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2282		    NULL : &timo);
2283		umtxq_remove(uq);
2284		umtxq_unlock(&uq->uq_key);
2285
2286		mtx_lock_spin(&umtx_lock);
2287		uq->uq_inherited_pri = old_inherited_pri;
2288		pri = PRI_MAX;
2289		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2290			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2291			if (uq2 != NULL) {
2292				if (pri > UPRI(uq2->uq_thread))
2293					pri = UPRI(uq2->uq_thread);
2294			}
2295		}
2296		if (pri > uq->uq_inherited_pri)
2297			pri = uq->uq_inherited_pri;
2298		thread_lock(td);
2299		sched_lend_user_prio(td, pri);
2300		thread_unlock(td);
2301		mtx_unlock_spin(&umtx_lock);
2302	}
2303
2304	if (error != 0) {
2305		mtx_lock_spin(&umtx_lock);
2306		uq->uq_inherited_pri = old_inherited_pri;
2307		pri = PRI_MAX;
2308		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2309			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2310			if (uq2 != NULL) {
2311				if (pri > UPRI(uq2->uq_thread))
2312					pri = UPRI(uq2->uq_thread);
2313			}
2314		}
2315		if (pri > uq->uq_inherited_pri)
2316			pri = uq->uq_inherited_pri;
2317		thread_lock(td);
2318		sched_lend_user_prio(td, pri);
2319		thread_unlock(td);
2320		mtx_unlock_spin(&umtx_lock);
2321	}
2322
2323out:
2324	umtxq_lock(&uq->uq_key);
2325	umtxq_unbusy(&uq->uq_key);
2326	umtxq_unlock(&uq->uq_key);
2327	umtx_key_release(&uq->uq_key);
2328	return (error);
2329}
2330
2331/*
2332 * Unlock a PP mutex.
2333 */
2334static int
2335do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2336{
2337	struct umtx_key key;
2338	struct umtx_q *uq, *uq2;
2339	struct umtx_pi *pi;
2340	uint32_t owner, id;
2341	uint32_t rceiling;
2342	int error, pri, new_inherited_pri, su;
2343
2344	id = td->td_tid;
2345	uq = td->td_umtxq;
2346	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2347
2348	/*
2349	 * Make sure we own this mtx.
2350	 */
2351	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2352	if (owner == -1)
2353		return (EFAULT);
2354
2355	if ((owner & ~UMUTEX_CONTESTED) != id)
2356		return (EPERM);
2357
2358	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2359	if (error != 0)
2360		return (error);
2361
2362	if (rceiling == -1)
2363		new_inherited_pri = PRI_MAX;
2364	else {
2365		rceiling = RTP_PRIO_MAX - rceiling;
2366		if (rceiling > RTP_PRIO_MAX)
2367			return (EINVAL);
2368		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2369	}
2370
2371	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2372	    &key)) != 0)
2373		return (error);
2374	umtxq_lock(&key);
2375	umtxq_busy(&key);
2376	umtxq_unlock(&key);
2377	/*
2378	 * For a priority-protected mutex, always set the unlocked state
2379	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2380	 * to lock the mutex.  This is necessary because the thread
2381	 * priority has to be adjusted for such a mutex.
2382	 */
2383	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2384		UMUTEX_CONTESTED);
2385
2386	umtxq_lock(&key);
2387	if (error == 0)
2388		umtxq_signal(&key, 1);
2389	umtxq_unbusy(&key);
2390	umtxq_unlock(&key);
2391
2392	if (error == -1)
2393		error = EFAULT;
2394	else {
2395		mtx_lock_spin(&umtx_lock);
2396		if (su != 0)
2397			uq->uq_inherited_pri = new_inherited_pri;
2398		pri = PRI_MAX;
2399		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2400			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2401			if (uq2 != NULL) {
2402				if (pri > UPRI(uq2->uq_thread))
2403					pri = UPRI(uq2->uq_thread);
2404			}
2405		}
2406		if (pri > uq->uq_inherited_pri)
2407			pri = uq->uq_inherited_pri;
2408		thread_lock(td);
2409		sched_lend_user_prio(td, pri);
2410		thread_unlock(td);
2411		mtx_unlock_spin(&umtx_lock);
2412	}
2413	umtx_key_release(&key);
2414	return (error);
2415}
2416
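/*
 * A note on the ceiling mapping used by do_lock_pp() and do_unlock_pp()
 * above (a worked example, assuming the stock RTP_PRIO_MAX value of 31):
 * m_ceilings[] holds a POSIX-style value where a larger number means a
 * higher priority, while kernel priorities grow downward, so both
 * functions invert it:
 *
 *	pri = PRI_MIN_REALTIME + (RTP_PRIO_MAX - ceiling)
 *
 * A ceiling of 31 therefore lends PRI_MIN_REALTIME + 0, the strongest
 * realtime priority, and a ceiling of 0 lends PRI_MIN_REALTIME + 31,
 * the weakest.  A user value above RTP_PRIO_MAX wraps the unsigned
 * subtraction past RTP_PRIO_MAX, which is what the EINVAL checks catch.
 */
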
2417static int
2418do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2419	uint32_t *old_ceiling)
2420{
2421	struct umtx_q *uq;
2422	uint32_t save_ceiling;
2423	uint32_t owner, id;
2424	uint32_t flags;
2425	int error;
2426
2427	flags = fuword32(&m->m_flags);
2428	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2429		return (EINVAL);
2430	if (ceiling > RTP_PRIO_MAX)
2431		return (EINVAL);
2432	id = td->td_tid;
2433	uq = td->td_umtxq;
2434	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2435	   &uq->uq_key)) != 0)
2436		return (error);
2437	for (;;) {
2438		umtxq_lock(&uq->uq_key);
2439		umtxq_busy(&uq->uq_key);
2440		umtxq_unlock(&uq->uq_key);
2441
2442		save_ceiling = fuword32(&m->m_ceilings[0]);
2443
2444		owner = casuword32(&m->m_owner,
2445		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2446
2447		if (owner == UMUTEX_CONTESTED) {
2448			suword32(&m->m_ceilings[0], ceiling);
2449			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2450				UMUTEX_CONTESTED);
2451			error = 0;
2452			break;
2453		}
2454
2455		/* The address was invalid. */
2456		if (owner == -1) {
2457			error = EFAULT;
2458			break;
2459		}
2460
2461		if ((owner & ~UMUTEX_CONTESTED) == id) {
2462			suword32(&m->m_ceilings[0], ceiling);
2463			error = 0;
2464			break;
2465		}
2466
2467		/*
2468		 * If we caught a signal during the previous sleep, we have
2469		 * already retried once; exit immediately.
2470		 */
2471		if (error != 0)
2472			break;
2473
2474		/*
2475		 * If we set the contested bit, sleep.  Otherwise the lock
2476		 * changed and we need to retry, or we lost a race to the thread
2477		 * unlocking the umtx.
2478		 */
2479		umtxq_lock(&uq->uq_key);
2480		umtxq_insert(uq);
2481		umtxq_unbusy(&uq->uq_key);
2482		error = umtxq_sleep(uq, "umtxpp", NULL);
2483		umtxq_remove(uq);
2484		umtxq_unlock(&uq->uq_key);
2485	}
2486	umtxq_lock(&uq->uq_key);
2487	if (error == 0)
2488		umtxq_signal(&uq->uq_key, INT_MAX);
2489	umtxq_unbusy(&uq->uq_key);
2490	umtxq_unlock(&uq->uq_key);
2491	umtx_key_release(&uq->uq_key);
2492	if (error == 0 && old_ceiling != NULL)
2493		suword32(old_ceiling, save_ceiling);
2494	return (error);
2495}
2496
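/*
 * A hypothetical userland sketch (not part of this file, kept under
 * "#if 0" so it is never compiled): changing the ceiling of a PP mutex
 * through UMTX_OP_SET_CEILING, which dispatches to do_set_ceiling()
 * above.  "val" carries the new ceiling and "uaddr1" receives the old
 * one; assumes the userland <sys/umtx.h> declaration of _umtx_op().
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static int
sketch_set_pp_ceiling(struct umutex *m, uint32_t new_ceiling)
{
	uint32_t old_ceiling;

	/* On failure errno is set, e.g. EINVAL for a bad ceiling. */
	if (_umtx_op(m, UMTX_OP_SET_CEILING, new_ceiling,
	    &old_ceiling, NULL) == -1)
		return (-1);
	return ((int)old_ceiling);
}
#endif
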
2497/*
2498 * Lock a userland POSIX mutex.
2499 */
2500static int
2501do_lock_umutex(struct thread *td, struct umutex *m,
2502    struct _umtx_time *timeout, int mode)
2503{
2504	uint32_t flags;
2505	int error;
2506
2507	flags = fuword32(&m->m_flags);
2508	if (flags == -1)
2509		return (EFAULT);
2510
2511	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2512	case 0:
2513		error = do_lock_normal(td, m, flags, timeout, mode);
2514		break;
2515	case UMUTEX_PRIO_INHERIT:
2516		error = do_lock_pi(td, m, flags, timeout, mode);
2517		break;
2518	case UMUTEX_PRIO_PROTECT:
2519		error = do_lock_pp(td, m, flags, timeout, mode);
2520		break;
2521	default:
2522		return (EINVAL);
2523	}
2524	if (timeout == NULL) {
2525		if (error == EINTR && mode != _UMUTEX_WAIT)
2526			error = ERESTART;
2527	} else {
2528		/* Timed-locking is not restarted. */
2529		if (error == ERESTART)
2530			error = EINTR;
2531	}
2532	return (error);
2533}
2534
2535/*
2536 * Unlock a userland POSIX mutex.
2537 */
2538static int
2539do_unlock_umutex(struct thread *td, struct umutex *m)
2540{
2541	uint32_t flags;
2542
2543	flags = fuword32(&m->m_flags);
2544	if (flags == -1)
2545		return (EFAULT);
2546
2547	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2548	case 0:
2549		return (do_unlock_normal(td, m, flags));
2550	case UMUTEX_PRIO_INHERIT:
2551		return (do_unlock_pi(td, m, flags));
2552	case UMUTEX_PRIO_PROTECT:
2553		return (do_unlock_pp(td, m, flags));
2554	}
2555
2556	return (EINVAL);
2557}
2558
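/*
 * A hypothetical userland sketch (not part of this file, kept under
 * "#if 0" so it is never compiled): the fast paths that pair with
 * do_lock_umutex()/do_unlock_umutex() above.  Only a mutex whose owner
 * word cannot be flipped with a single CAS enters the kernel through
 * _umtx_op().  Assumes the userland <sys/umtx.h> declaration of
 * _umtx_op() and thr_self() from <sys/thr.h> for the tid that the
 * kernel compares against m_owner.
 */
#if 0
#include <sys/types.h>
#include <sys/thr.h>
#include <sys/umtx.h>
#include <machine/atomic.h>

static int
sketch_mutex_lock(struct umutex *m)
{
	long tid;

	thr_self(&tid);
	/* Uncontested fast path: CAS our tid into the owner word. */
	if (atomic_cmpset_acq_32((volatile uint32_t *)&m->m_owner,
	    UMUTEX_UNOWNED, (uint32_t)tid))
		return (0);
	/* Contested: let the kernel queue us and sleep. */
	return (_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL));
}

static int
sketch_mutex_unlock(struct umutex *m)
{
	long tid;

	thr_self(&tid);
	/* No contested bit recorded: release with a plain CAS. */
	if (atomic_cmpset_rel_32((volatile uint32_t *)&m->m_owner,
	    (uint32_t)tid, UMUTEX_UNOWNED))
		return (0);
	/* UMUTEX_CONTESTED is set: the kernel must wake a waiter. */
	return (_umtx_op(m, UMTX_OP_MUTEX_UNLOCK, 0, NULL, NULL));
}
#endif
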
2559static int
2560do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2561	struct timespec *timeout, u_long wflags)
2562{
2563	struct abs_timeout timo;
2564	struct umtx_q *uq;
2565	uint32_t flags;
2566	uint32_t clockid;
2567	int error;
2568
2569	uq = td->td_umtxq;
2570	flags = fuword32(&cv->c_flags);
2571	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2572	if (error != 0)
2573		return (error);
2574
2575	if ((wflags & CVWAIT_CLOCKID) != 0) {
2576		clockid = fuword32(&cv->c_clockid);
2577		if (clockid < CLOCK_REALTIME ||
2578		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2579			/* Only predefined clock IDs are supported. */
2580			return (EINVAL);
2581		}
2582	} else {
2583		clockid = CLOCK_REALTIME;
2584	}
2585
2586	umtxq_lock(&uq->uq_key);
2587	umtxq_busy(&uq->uq_key);
2588	umtxq_insert(uq);
2589	umtxq_unlock(&uq->uq_key);
2590
2591	/*
2592	 * Set c_has_waiters to 1 before releasing the user mutex; avoid
2593	 * dirtying the cache line when the flag is already set.
2594	 */
2595	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2596		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2597
2598	umtxq_lock(&uq->uq_key);
2599	umtxq_unbusy(&uq->uq_key);
2600	umtxq_unlock(&uq->uq_key);
2601
2602	error = do_unlock_umutex(td, m);
2603
2604	if (timeout != NULL)
2605		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2606			timeout);
2607
2608	umtxq_lock(&uq->uq_key);
2609	if (error == 0) {
2610		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2611		    NULL : &timo);
2612	}
2613
2614	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2615		error = 0;
2616	else {
2617		/*
2618		 * We were woken by a timeout, a signal, or a
2619		 * spurious wakeup; clear the c_has_waiters flag
2620		 * when necessary.
2621		 */
2622		umtxq_busy(&uq->uq_key);
2623		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2624			int oldlen = uq->uq_cur_queue->length;
2625			umtxq_remove(uq);
2626			if (oldlen == 1) {
2627				umtxq_unlock(&uq->uq_key);
2628				suword32(
2629				    __DEVOLATILE(uint32_t *,
2630					 &cv->c_has_waiters), 0);
2631				umtxq_lock(&uq->uq_key);
2632			}
2633		}
2634		umtxq_unbusy(&uq->uq_key);
2635		if (error == ERESTART)
2636			error = EINTR;
2637	}
2638
2639	umtxq_unlock(&uq->uq_key);
2640	umtx_key_release(&uq->uq_key);
2641	return (error);
2642}
2643
2644/*
2645 * Signal a userland condition variable.
2646 */
2647static int
2648do_cv_signal(struct thread *td, struct ucond *cv)
2649{
2650	struct umtx_key key;
2651	int error, cnt, nwake;
2652	uint32_t flags;
2653
2654	flags = fuword32(&cv->c_flags);
2655	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2656		return (error);
2657	umtxq_lock(&key);
2658	umtxq_busy(&key);
2659	cnt = umtxq_count(&key);
2660	nwake = umtxq_signal(&key, 1);
2661	if (cnt <= nwake) {
2662		umtxq_unlock(&key);
2663		error = suword32(
2664		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2665		umtxq_lock(&key);
2666	}
2667	umtxq_unbusy(&key);
2668	umtxq_unlock(&key);
2669	umtx_key_release(&key);
2670	return (error);
2671}
2672
2673static int
2674do_cv_broadcast(struct thread *td, struct ucond *cv)
2675{
2676	struct umtx_key key;
2677	int error;
2678	uint32_t flags;
2679
2680	flags = fuword32(&cv->c_flags);
2681	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2682		return (error);
2683
2684	umtxq_lock(&key);
2685	umtxq_busy(&key);
2686	umtxq_signal(&key, INT_MAX);
2687	umtxq_unlock(&key);
2688
2689	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2690
2691	umtxq_lock(&key);
2692	umtxq_unbusy(&key);
2693	umtxq_unlock(&key);
2694
2695	umtx_key_release(&key);
2696	return (error);
2697}
2698
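/*
 * A hypothetical userland sketch (not part of this file, kept under
 * "#if 0" so it is never compiled): the wait/signal pairing for
 * do_cv_wait()/do_cv_signal() above.  The kernel keeps c_has_waiters
 * up to date, so an uncontended signal can skip the syscall, and the
 * wait side hands its locked mutex to the kernel, which queues the
 * thread before unlocking it and thereby closes the missed-wakeup
 * window.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static int
sketch_cond_signal(struct ucond *cv)
{
	/* No sleepers recorded: nothing to wake. */
	if (cv->c_has_waiters == 0)
		return (0);
	return (_umtx_op(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL));
}

static int
sketch_cond_wait(struct ucond *cv, struct umutex *m)
{
	/* val = wflags (0: CLOCK_REALTIME, relative), uaddr2 = timeout. */
	return (_umtx_op(cv, UMTX_OP_CV_WAIT, 0, m, NULL));
}
#endif
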
2699static int
2700do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2701{
2702	struct abs_timeout timo;
2703	struct umtx_q *uq;
2704	uint32_t flags, wrflags;
2705	int32_t state, oldstate;
2706	int32_t blocked_readers;
2707	int error;
2708
2709	uq = td->td_umtxq;
2710	flags = fuword32(&rwlock->rw_flags);
2711	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2712	if (error != 0)
2713		return (error);
2714
2715	if (timeout != NULL)
2716		abs_timeout_init2(&timo, timeout);
2717
2718	wrflags = URWLOCK_WRITE_OWNER;
2719	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2720		wrflags |= URWLOCK_WRITE_WAITERS;
2721
2722	for (;;) {
2723		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2724		/* try to lock it */
2725		while (!(state & wrflags)) {
2726			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2727				umtx_key_release(&uq->uq_key);
2728				return (EAGAIN);
2729			}
2730			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2731			if (oldstate == -1) {
2732				umtx_key_release(&uq->uq_key);
2733				return (EFAULT);
2734			}
2735			if (oldstate == state) {
2736				umtx_key_release(&uq->uq_key);
2737				return (0);
2738			}
2739			error = umtxq_check_susp(td);
2740			if (error != 0)
2741				break;
2742			state = oldstate;
2743		}
2744
2745		if (error)
2746			break;
2747
2748		/* grab monitor lock */
2749		umtxq_lock(&uq->uq_key);
2750		umtxq_busy(&uq->uq_key);
2751		umtxq_unlock(&uq->uq_key);
2752
2753		/*
2754		 * re-read the state, in case it changed between the try-lock above
2755		 * and the check below
2756		 */
2757		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2758
2759		/* set read contention bit */
2760		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2761			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2762			if (oldstate == -1) {
2763				error = EFAULT;
2764				break;
2765			}
2766			if (oldstate == state)
2767				goto sleep;
2768			state = oldstate;
2769			error = umtxq_check_susp(td);
2770			if (error != 0)
2771				break;
2772		}
2773		if (error != 0) {
2774			umtxq_lock(&uq->uq_key);
2775			umtxq_unbusy(&uq->uq_key);
2776			umtxq_unlock(&uq->uq_key);
2777			break;
2778		}
2779
2780		/* state is changed while setting flags, restart */
2781		/* The state changed while we were setting the flags; restart. */
2782			umtxq_lock(&uq->uq_key);
2783			umtxq_unbusy(&uq->uq_key);
2784			umtxq_unlock(&uq->uq_key);
2785			error = umtxq_check_susp(td);
2786			if (error != 0)
2787				break;
2788			continue;
2789		}
2790
2791sleep:
2792		/* contention bit is set, before sleeping, increase read waiter count */
2793		/* The contention bit is set; increase the read-waiter count before sleeping. */
2794		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2795
2796		while (state & wrflags) {
2797			umtxq_lock(&uq->uq_key);
2798			umtxq_insert(uq);
2799			umtxq_unbusy(&uq->uq_key);
2800
2801			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2802			    NULL : &timo);
2803
2804			umtxq_busy(&uq->uq_key);
2805			umtxq_remove(uq);
2806			umtxq_unlock(&uq->uq_key);
2807			if (error)
2808				break;
2809			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2810		}
2811
2812		/* decrease read waiter count, and may clear read contention bit */
2813		/* Decrease the read-waiter count, and possibly clear the read-contention bit. */
2814		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2815		if (blocked_readers == 1) {
2816			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2817			for (;;) {
2818				oldstate = casuword32(&rwlock->rw_state, state,
2819					 state & ~URWLOCK_READ_WAITERS);
2820				if (oldstate == -1) {
2821					error = EFAULT;
2822					break;
2823				}
2824				if (oldstate == state)
2825					break;
2826				state = oldstate;
2827				error = umtxq_check_susp(td);
2828				if (error != 0)
2829					break;
2830			}
2831		}
2832
2833		umtxq_lock(&uq->uq_key);
2834		umtxq_unbusy(&uq->uq_key);
2835		umtxq_unlock(&uq->uq_key);
2836		if (error != 0)
2837			break;
2838	}
2839	umtx_key_release(&uq->uq_key);
2840	if (error == ERESTART)
2841		error = EINTR;
2842	return (error);
2843}
2844
2845static int
2846do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2847{
2848	struct abs_timeout timo;
2849	struct umtx_q *uq;
2850	uint32_t flags;
2851	int32_t state, oldstate;
2852	int32_t blocked_writers;
2853	int32_t blocked_readers;
2854	int error;
2855
2856	uq = td->td_umtxq;
2857	flags = fuword32(&rwlock->rw_flags);
2858	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2859	if (error != 0)
2860		return (error);
2861
2862	if (timeout != NULL)
2863		abs_timeout_init2(&timo, timeout);
2864
2865	blocked_readers = 0;
2866	for (;;) {
2867		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2868		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2869			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2870			if (oldstate == -1) {
2871				umtx_key_release(&uq->uq_key);
2872				return (EFAULT);
2873			}
2874			if (oldstate == state) {
2875				umtx_key_release(&uq->uq_key);
2876				return (0);
2877			}
2878			state = oldstate;
2879			error = umtxq_check_susp(td);
2880			if (error != 0)
2881				break;
2882		}
2883
2884		if (error) {
2885			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2886			    blocked_readers != 0) {
2887				umtxq_lock(&uq->uq_key);
2888				umtxq_busy(&uq->uq_key);
2889				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2890				umtxq_unbusy(&uq->uq_key);
2891				umtxq_unlock(&uq->uq_key);
2892			}
2893
2894			break;
2895		}
2896
2897		/* grab monitor lock */
2898		umtxq_lock(&uq->uq_key);
2899		umtxq_busy(&uq->uq_key);
2900		umtxq_unlock(&uq->uq_key);
2901
2902		/*
2903		 * re-read the state, in case it changed between the try-lock above
2904		 * and the check below
2905		 */
2906		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2907
2908		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2909		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2910			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2911			if (oldstate == -1) {
2912				error = EFAULT;
2913				break;
2914			}
2915			if (oldstate == state)
2916				goto sleep;
2917			state = oldstate;
2918			error = umtxq_check_susp(td);
2919			if (error != 0)
2920				break;
2921		}
2922		if (error != 0) {
2923			umtxq_lock(&uq->uq_key);
2924			umtxq_unbusy(&uq->uq_key);
2925			umtxq_unlock(&uq->uq_key);
2926			break;
2927		}
2928
2929		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2930			umtxq_lock(&uq->uq_key);
2931			umtxq_unbusy(&uq->uq_key);
2932			umtxq_unlock(&uq->uq_key);
2933			error = umtxq_check_susp(td);
2934			if (error != 0)
2935				break;
2936			continue;
2937		}
2938sleep:
2939		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2940		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2941
2942		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2943			umtxq_lock(&uq->uq_key);
2944			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2945			umtxq_unbusy(&uq->uq_key);
2946
2947			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2948			    NULL : &timo);
2949
2950			umtxq_busy(&uq->uq_key);
2951			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2952			umtxq_unlock(&uq->uq_key);
2953			if (error)
2954				break;
2955			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2956		}
2957
2958		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2959		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2960		if (blocked_writers == 1) {
2961			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2962			for (;;) {
2963				oldstate = casuword32(&rwlock->rw_state, state,
2964					 state & ~URWLOCK_WRITE_WAITERS);
2965				if (oldstate == -1) {
2966					error = EFAULT;
2967					break;
2968				}
2969				if (oldstate == state)
2970					break;
2971				state = oldstate;
2972				error = umtxq_check_susp(td);
2973				/*
2974				 * We may be leaving the URWLOCK_WRITE_WAITERS
2975				 * flag behind, but this should not harm
2976				 * correctness.
2977				 */
2978				if (error != 0)
2979					break;
2980			}
2981			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2982		} else
2983			blocked_readers = 0;
2984
2985		umtxq_lock(&uq->uq_key);
2986		umtxq_unbusy(&uq->uq_key);
2987		umtxq_unlock(&uq->uq_key);
2988	}
2989
2990	umtx_key_release(&uq->uq_key);
2991	if (error == ERESTART)
2992		error = EINTR;
2993	return (error);
2994}
2995
2996static int
2997do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2998{
2999	struct umtx_q *uq;
3000	uint32_t flags;
3001	int32_t state, oldstate;
3002	int error, q, count;
3003
3004	uq = td->td_umtxq;
3005	flags = fuword32(&rwlock->rw_flags);
3006	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
3007	if (error != 0)
3008		return (error);
3009
3010	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
3011	if (state & URWLOCK_WRITE_OWNER) {
3012		for (;;) {
3013			oldstate = casuword32(&rwlock->rw_state, state,
3014				state & ~URWLOCK_WRITE_OWNER);
3015			if (oldstate == -1) {
3016				error = EFAULT;
3017				goto out;
3018			}
3019			if (oldstate != state) {
3020				state = oldstate;
3021				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
3022					error = EPERM;
3023					goto out;
3024				}
3025				error = umtxq_check_susp(td);
3026				if (error != 0)
3027					goto out;
3028			} else
3029				break;
3030		}
3031	} else if (URWLOCK_READER_COUNT(state) != 0) {
3032		for (;;) {
3033			oldstate = casuword32(&rwlock->rw_state, state,
3034				state - 1);
3035			if (oldstate == -1) {
3036				error = EFAULT;
3037				goto out;
3038			}
3039			if (oldstate != state) {
3040				state = oldstate;
3041				if (URWLOCK_READER_COUNT(oldstate) == 0) {
3042					error = EPERM;
3043					goto out;
3044				}
3045				error = umtxq_check_susp(td);
3046				if (error != 0)
3047					goto out;
3048			} else
3049				break;
3050		}
3051	} else {
3052		error = EPERM;
3053		goto out;
3054	}
3055
3056	count = 0;
3057
3058	if (!(flags & URWLOCK_PREFER_READER)) {
3059		if (state & URWLOCK_WRITE_WAITERS) {
3060			count = 1;
3061			q = UMTX_EXCLUSIVE_QUEUE;
3062		} else if (state & URWLOCK_READ_WAITERS) {
3063			count = INT_MAX;
3064			q = UMTX_SHARED_QUEUE;
3065		}
3066	} else {
3067		if (state & URWLOCK_READ_WAITERS) {
3068			count = INT_MAX;
3069			q = UMTX_SHARED_QUEUE;
3070		} else if (state & URWLOCK_WRITE_WAITERS) {
3071			count = 1;
3072			q = UMTX_EXCLUSIVE_QUEUE;
3073		}
3074	}
3075
3076	if (count) {
3077		umtxq_lock(&uq->uq_key);
3078		umtxq_busy(&uq->uq_key);
3079		umtxq_signal_queue(&uq->uq_key, count, q);
3080		umtxq_unbusy(&uq->uq_key);
3081		umtxq_unlock(&uq->uq_key);
3082	}
3083out:
3084	umtx_key_release(&uq->uq_key);
3085	return (error);
3086}
3087
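/*
 * A hypothetical userland sketch (not part of this file, kept under
 * "#if 0" so it is never compiled): the read-lock fast path that
 * do_rw_rdlock() above backs up.  The low bits of rw_state count the
 * readers (extracted by URWLOCK_READER_COUNT()), while the
 * URWLOCK_WRITE_OWNER / URWLOCK_WRITE_WAITERS / URWLOCK_READ_WAITERS
 * bits flag the writer and the two waiter queues.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>
#include <errno.h>

static int
sketch_rdlock(struct urwlock *rw)
{
	int32_t state;

	for (;;) {
		state = rw->rw_state;
		/* A writer owns or waits for the lock: kernel slow path. */
		if ((state & (URWLOCK_WRITE_OWNER | URWLOCK_WRITE_WAITERS))
		    != 0)
			return (_umtx_op(rw, UMTX_OP_RW_RDLOCK, 0, NULL,
			    NULL));
		if (URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS) {
			errno = EAGAIN;
			return (-1);
		}
		/* One more reader: bump the count in the state word. */
		if (atomic_cmpset_acq_32((volatile uint32_t *)&rw->rw_state,
		    state, state + 1))
			return (0);
	}
}
#endif
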
3088static int
3089do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
3090{
3091	struct abs_timeout timo;
3092	struct umtx_q *uq;
3093	uint32_t flags, count;
3094	int error;
3095
3096	uq = td->td_umtxq;
3097	flags = fuword32(&sem->_flags);
3098	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3099	if (error != 0)
3100		return (error);
3101
3102	if (timeout != NULL)
3103		abs_timeout_init2(&timo, timeout);
3104
3105	umtxq_lock(&uq->uq_key);
3106	umtxq_busy(&uq->uq_key);
3107	umtxq_insert(uq);
3108	umtxq_unlock(&uq->uq_key);
3109	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
3110	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
3111	if (count != 0) {
3112		umtxq_lock(&uq->uq_key);
3113		umtxq_unbusy(&uq->uq_key);
3114		umtxq_remove(uq);
3115		umtxq_unlock(&uq->uq_key);
3116		umtx_key_release(&uq->uq_key);
3117		return (0);
3118	}
3119	umtxq_lock(&uq->uq_key);
3120	umtxq_unbusy(&uq->uq_key);
3121
3122	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3123
3124	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3125		error = 0;
3126	else {
3127		umtxq_remove(uq);
3128		/* A relative timeout cannot be restarted. */
3129		if (error == ERESTART && timeout != NULL &&
3130		    (timeout->_flags & UMTX_ABSTIME) == 0)
3131			error = EINTR;
3132	}
3133	umtxq_unlock(&uq->uq_key);
3134	umtx_key_release(&uq->uq_key);
3135	return (error);
3136}
3137
3138/*
3139 * Wake up threads waiting on a userland semaphore.
3140 */
3141static int
3142do_sem_wake(struct thread *td, struct _usem *sem)
3143{
3144	struct umtx_key key;
3145	int error, cnt;
3146	uint32_t flags;
3147
3148	flags = fuword32(&sem->_flags);
3149	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3150		return (error);
3151	umtxq_lock(&key);
3152	umtxq_busy(&key);
3153	cnt = umtxq_count(&key);
3154	if (cnt > 0) {
3155		umtxq_signal(&key, 1);
3156		/*
3157		 * The count was greater than 0, which means the memory is
3158		 * still being referenced by user code, so we can safely
3159		 * update the _has_waiters flag.
3160		 */
3161		if (cnt == 1) {
3162			umtxq_unlock(&key);
3163			error = suword32(
3164			    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
3165			umtxq_lock(&key);
3166		}
3167	}
3168	umtxq_unbusy(&key);
3169	umtxq_unlock(&key);
3170	umtx_key_release(&key);
3171	return (error);
3172}
3173
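/*
 * A hypothetical userland sketch (not part of this file, kept under
 * "#if 0" so it is never compiled): the _usem protocol whose slow
 * paths do_sem_wait() and do_sem_wake() above implement.  A post bumps
 * _count and only enters the kernel while _has_waiters is set, which
 * do_sem_wait() raises before re-checking _count and sleeping.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>

static int
sketch_sem_post(struct _usem *sem)
{
	atomic_add_rel_32(&sem->_count, 1);
	/* do_sem_wait() set _has_waiters before sleeping. */
	if (sem->_has_waiters)
		return (_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL));
	return (0);
}

static int
sketch_sem_wait(struct _usem *sem)
{
	uint32_t count;

	for (;;) {
		while ((count = sem->_count) > 0) {
			/* Tokens available: claim one without the kernel. */
			if (atomic_cmpset_acq_32(&sem->_count, count,
			    count - 1))
				return (0);
		}
		/* Empty: sleep in do_sem_wait() until a post wakes us. */
		if (_umtx_op(sem, UMTX_OP_SEM_WAIT, 0, NULL, NULL) == -1)
			return (-1);
	}
}
#endif
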
3174int
3175sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
3176    /* struct umtx *umtx */
3177{
3178	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
3179}
3180
3181int
3182sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
3183    /* struct umtx *umtx */
3184{
3185	return do_unlock_umtx(td, uap->umtx, td->td_tid);
3186}
3187
3188inline int
3189umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3190{
3191	int error;
3192
3193	error = copyin(addr, tsp, sizeof(struct timespec));
3194	if (error == 0) {
3195		if (tsp->tv_sec < 0 ||
3196		    tsp->tv_nsec >= 1000000000 ||
3197		    tsp->tv_nsec < 0)
3198			error = EINVAL;
3199	}
3200	return (error);
3201}
3202
3203static inline int
3204umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3205{
3206	int error;
3207
3208	if (size <= sizeof(struct timespec)) {
3209		tp->_clockid = CLOCK_REALTIME;
3210		tp->_flags = 0;
3211		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3212	} else
3213		error = copyin(addr, tp, sizeof(struct _umtx_time));
3214	if (error != 0)
3215		return (error);
3216	if (tp->_timeout.tv_sec < 0 ||
3217	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3218		return (EINVAL);
3219	return (0);
3220}
3221
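/*
 * A hypothetical userland sketch (not part of this file, kept under
 * "#if 0" so it is never compiled): callers tag the timeout argument
 * with its size in uaddr1, which is how umtx_copyin_umtx_time() above
 * distinguishes a bare struct timespec (CLOCK_REALTIME, relative) from
 * a full _umtx_time.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <time.h>

static int
sketch_wait_uint(uint32_t *p, uint32_t expected, time_t secs)
{
	struct _umtx_time to;

	to._timeout.tv_sec = secs;
	to._timeout.tv_nsec = 0;
	to._flags = 0;			/* relative; UMTX_ABSTIME not set */
	to._clockid = CLOCK_MONOTONIC;
	/* uaddr1 carries sizeof(to), uaddr2 points at it. */
	return (_umtx_op(p, UMTX_OP_WAIT_UINT, expected,
	    (void *)sizeof(to), &to));
}
#endif
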
3222static int
3223__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
3224{
3225	struct timespec *ts, timeout;
3226	int error;
3227
3228	/* Allow a null timespec (wait forever). */
3229	if (uap->uaddr2 == NULL)
3230		ts = NULL;
3231	else {
3232		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3233		if (error != 0)
3234			return (error);
3235		ts = &timeout;
3236	}
3237	return (do_lock_umtx(td, uap->obj, uap->val, ts));
3238}
3239
3240static int
3241__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
3242{
3243	return (do_unlock_umtx(td, uap->obj, uap->val));
3244}
3245
3246static int
3247__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3248{
3249	struct _umtx_time timeout, *tm_p;
3250	int error;
3251
3252	if (uap->uaddr2 == NULL)
3253		tm_p = NULL;
3254	else {
3255		error = umtx_copyin_umtx_time(
3256		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3257		if (error != 0)
3258			return (error);
3259		tm_p = &timeout;
3260	}
3261	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3262}
3263
3264static int
3265__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3266{
3267	struct _umtx_time timeout, *tm_p;
3268	int error;
3269
3270	if (uap->uaddr2 == NULL)
3271		tm_p = NULL;
3272	else {
3273		error = umtx_copyin_umtx_time(
3274		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3275		if (error != 0)
3276			return (error);
3277		tm_p = &timeout;
3278	}
3279	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3280}
3281
3282static int
3283__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3284{
3285	struct _umtx_time *tm_p, timeout;
3286	int error;
3287
3288	if (uap->uaddr2 == NULL)
3289		tm_p = NULL;
3290	else {
3291		error = umtx_copyin_umtx_time(
3292		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3293		if (error != 0)
3294			return (error);
3295		tm_p = &timeout;
3296	}
3297	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3298}
3299
3300static int
3301__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3302{
3303	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3304}
3305
3306#define BATCH_SIZE	128
3307static int
3308__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3309{
3310	int count = uap->val;
3311	void *uaddrs[BATCH_SIZE];
3312	char **upp = (char **)uap->obj;
3313	int tocopy;
3314	int error = 0;
3315	int i, pos = 0;
3316
3317	while (count > 0) {
3318		tocopy = count;
3319		if (tocopy > BATCH_SIZE)
3320			tocopy = BATCH_SIZE;
3321		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3322		if (error != 0)
3323			break;
3324		for (i = 0; i < tocopy; ++i)
3325			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3326		count -= tocopy;
3327		pos += tocopy;
3328	}
3329	return (error);
3330}
3331
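/*
 * A hypothetical userland sketch (not part of this file, kept under
 * "#if 0" so it is never compiled): UMTX_OP_NWAKE_PRIVATE lets one
 * syscall wake an arbitrary set of private words;
 * __umtx_op_nwake_private() above copies the pointers in from uap->obj
 * in BATCH_SIZE chunks.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static int
sketch_wake_all(void **addrs, int naddrs)
{
	/* obj is the pointer array, val its length. */
	return (_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, naddrs, NULL, NULL));
}
#endif
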
3332static int
3333__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3334{
3335	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3336}
3337
3338static int
3339__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3340{
3341	struct _umtx_time *tm_p, timeout;
3342	int error;
3343
3344	/* Allow a null timespec (wait forever). */
3345	if (uap->uaddr2 == NULL)
3346		tm_p = NULL;
3347	else {
3348		error = umtx_copyin_umtx_time(
3349		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3350		if (error != 0)
3351			return (error);
3352		tm_p = &timeout;
3353	}
3354	return do_lock_umutex(td, uap->obj, tm_p, 0);
3355}
3356
3357static int
3358__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3359{
3360	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3361}
3362
3363static int
3364__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3365{
3366	struct _umtx_time *tm_p, timeout;
3367	int error;
3368
3369	/* Allow a null timespec (wait forever). */
3370	if (uap->uaddr2 == NULL)
3371		tm_p = NULL;
3372	else {
3373		error = umtx_copyin_umtx_time(
3374		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3375		if (error != 0)
3376			return (error);
3377		tm_p = &timeout;
3378	}
3379	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3380}
3381
3382static int
3383__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3384{
3385	return do_wake_umutex(td, uap->obj);
3386}
3387
3388static int
3389__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3390{
3391	return do_unlock_umutex(td, uap->obj);
3392}
3393
3394static int
3395__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3396{
3397	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3398}
3399
3400static int
3401__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3402{
3403	struct timespec *ts, timeout;
3404	int error;
3405
3406	/* Allow a null timespec (wait forever). */
3407	if (uap->uaddr2 == NULL)
3408		ts = NULL;
3409	else {
3410		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3411		if (error != 0)
3412			return (error);
3413		ts = &timeout;
3414	}
3415	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3416}
3417
3418static int
3419__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3420{
3421	return do_cv_signal(td, uap->obj);
3422}
3423
3424static int
3425__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3426{
3427	return do_cv_broadcast(td, uap->obj);
3428}
3429
3430static int
3431__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3432{
3433	struct _umtx_time timeout;
3434	int error;
3435
3436	/* Allow a null timespec (wait forever). */
3437	if (uap->uaddr2 == NULL) {
3438		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3439	} else {
3440		error = umtx_copyin_umtx_time(uap->uaddr2,
3441		   (size_t)uap->uaddr1, &timeout);
3442		if (error != 0)
3443			return (error);
3444		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3445	}
3446	return (error);
3447}
3448
3449static int
3450__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3451{
3452	struct _umtx_time timeout;
3453	int error;
3454
3455	/* Allow a null timespec (wait forever). */
3456	if (uap->uaddr2 == NULL) {
3457		error = do_rw_wrlock(td, uap->obj, 0);
3458	} else {
3459		error = umtx_copyin_umtx_time(uap->uaddr2,
3460		   (size_t)uap->uaddr1, &timeout);
3461		if (error != 0)
3462			return (error);
3463
3464		error = do_rw_wrlock(td, uap->obj, &timeout);
3465	}
3466	return (error);
3467}
3468
3469static int
3470__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3471{
3472	return do_rw_unlock(td, uap->obj);
3473}
3474
3475static int
3476__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3477{
3478	struct _umtx_time *tm_p, timeout;
3479	int error;
3480
3481	/* Allow a null timespec (wait forever). */
3482	if (uap->uaddr2 == NULL)
3483		tm_p = NULL;
3484	else {
3485		error = umtx_copyin_umtx_time(
3486		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3487		if (error != 0)
3488			return (error);
3489		tm_p = &timeout;
3490	}
3491	return (do_sem_wait(td, uap->obj, tm_p));
3492}
3493
3494static int
3495__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3496{
3497	return do_sem_wake(td, uap->obj);
3498}
3499
3500static int
3501__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3502{
3503	return do_wake2_umutex(td, uap->obj, uap->val);
3504}
3505
3506typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3507
3508static _umtx_op_func op_table[] = {
3509	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3510	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3511	__umtx_op_wait,			/* UMTX_OP_WAIT */
3512	__umtx_op_wake,			/* UMTX_OP_WAKE */
3513	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3514	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3515	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3516	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3517	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3518	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3519	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3520	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3521	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3522	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3523	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3524	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3525	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3526	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3527	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3528	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3529	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3530	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3531	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3532};
3533
3534int
3535sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3536{
3537	if ((unsigned)uap->op < UMTX_OP_MAX)
3538		return (*op_table[uap->op])(td, uap);
3539	return (EINVAL);
3540}
3541
3542#ifdef COMPAT_FREEBSD32
3543int
3544freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3545    /* struct umtx *umtx */
3546{
3547	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3548}
3549
3550int
3551freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3552    /* struct umtx *umtx */
3553{
3554	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3555}
3556
3557struct timespec32 {
3558	int32_t tv_sec;
3559	int32_t tv_nsec;
3560};
3561
3562struct umtx_time32 {
3563	struct	timespec32	timeout;
3564	uint32_t		flags;
3565	uint32_t		clockid;
3566};
3567
3568static inline int
3569umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3570{
3571	struct timespec32 ts32;
3572	int error;
3573
3574	error = copyin(addr, &ts32, sizeof(struct timespec32));
3575	if (error == 0) {
3576		if (ts32.tv_sec < 0 ||
3577		    ts32.tv_nsec >= 1000000000 ||
3578		    ts32.tv_nsec < 0)
3579			error = EINVAL;
3580		else {
3581			tsp->tv_sec = ts32.tv_sec;
3582			tsp->tv_nsec = ts32.tv_nsec;
3583		}
3584	}
3585	return (error);
3586}
3587
3588static inline int
3589umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3590{
3591	struct umtx_time32 t32;
3592	int error;
3593
3594	t32.clockid = CLOCK_REALTIME;
3595	t32.flags   = 0;
3596	if (size <= sizeof(struct timespec32))
3597		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3598	else
3599		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3600	if (error != 0)
3601		return (error);
3602	if (t32.timeout.tv_sec < 0 ||
3603	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3604		return (EINVAL);
3605	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3606	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3607	tp->_flags = t32.flags;
3608	tp->_clockid = t32.clockid;
3609	return (0);
3610}
3611
3612static int
3613__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3614{
3615	struct timespec *ts, timeout;
3616	int error;
3617
3618	/* Allow a null timespec (wait forever). */
3619	if (uap->uaddr2 == NULL)
3620		ts = NULL;
3621	else {
3622		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3623		if (error != 0)
3624			return (error);
3625		ts = &timeout;
3626	}
3627	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3628}
3629
3630static int
3631__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3632{
3633	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3634}
3635
3636static int
3637__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3638{
3639	struct _umtx_time *tm_p, timeout;
3640	int error;
3641
3642	if (uap->uaddr2 == NULL)
3643		tm_p = NULL;
3644	else {
3645		error = umtx_copyin_umtx_time32(uap->uaddr2,
3646			(size_t)uap->uaddr1, &timeout);
3647		if (error != 0)
3648			return (error);
3649		tm_p = &timeout;
3650	}
3651	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3652}
3653
3654static int
3655__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3656{
3657	struct _umtx_time *tm_p, timeout;
3658	int error;
3659
3660	/* Allow a null timespec (wait forever). */
3661	if (uap->uaddr2 == NULL)
3662		tm_p = NULL;
3663	else {
3664		error = umtx_copyin_umtx_time32(uap->uaddr2,
3665		    (size_t)uap->uaddr1, &timeout);
3666		if (error != 0)
3667			return (error);
3668		tm_p = &timeout;
3669	}
3670	return do_lock_umutex(td, uap->obj, tm_p, 0);
3671}
3672
3673static int
3674__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3675{
3676	struct _umtx_time *tm_p, timeout;
3677	int error;
3678
3679	/* Allow a null timespec (wait forever). */
3680	if (uap->uaddr2 == NULL)
3681		tm_p = NULL;
3682	else {
3683		error = umtx_copyin_umtx_time32(uap->uaddr2,
3684		    (size_t)uap->uaddr1, &timeout);
3685		if (error != 0)
3686			return (error);
3687		tm_p = &timeout;
3688	}
3689	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3690}
3691
3692static int
3693__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3694{
3695	struct timespec *ts, timeout;
3696	int error;
3697
3698	/* Allow a null timespec (wait forever). */
3699	if (uap->uaddr2 == NULL)
3700		ts = NULL;
3701	else {
3702		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3703		if (error != 0)
3704			return (error);
3705		ts = &timeout;
3706	}
3707	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3708}
3709
3710static int
3711__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3712{
3713	struct _umtx_time timeout;
3714	int error;
3715
3716	/* Allow a null timespec (wait forever). */
3717	if (uap->uaddr2 == NULL) {
3718		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3719	} else {
3720		error = umtx_copyin_umtx_time32(uap->uaddr2,
3721		    (size_t)uap->uaddr1, &timeout);
3722		if (error != 0)
3723			return (error);
3724		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3725	}
3726	return (error);
3727}
3728
3729static int
3730__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3731{
3732	struct _umtx_time timeout;
3733	int error;
3734
3735	/* Allow a null timespec (wait forever). */
3736	if (uap->uaddr2 == NULL) {
3737		error = do_rw_wrlock(td, uap->obj, 0);
3738	} else {
3739		error = umtx_copyin_umtx_time32(uap->uaddr2,
3740		    (size_t)uap->uaddr1, &timeout);
3741		if (error != 0)
3742			return (error);
3743		error = do_rw_wrlock(td, uap->obj, &timeout);
3744	}
3745	return (error);
3746}
3747
3748static int
3749__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3750{
3751	struct _umtx_time *tm_p, timeout;
3752	int error;
3753
3754	if (uap->uaddr2 == NULL)
3755		tm_p = NULL;
3756	else {
3757		error = umtx_copyin_umtx_time32(
3758		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3759		if (error != 0)
3760			return (error);
3761		tm_p = &timeout;
3762	}
3763	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3764}
3765
3766static int
3767__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3768{
3769	struct _umtx_time *tm_p, timeout;
3770	int error;
3771
3772	/* Allow a null timespec (wait forever). */
3773	if (uap->uaddr2 == NULL)
3774		tm_p = NULL;
3775	else {
3776		error = umtx_copyin_umtx_time32(uap->uaddr2,
3777		    (size_t)uap->uaddr1, &timeout);
3778		if (error != 0)
3779			return (error);
3780		tm_p = &timeout;
3781	}
3782	return (do_sem_wait(td, uap->obj, tm_p));
3783}
3784
3785static int
3786__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3787{
3788	int count = uap->val;
3789	uint32_t uaddrs[BATCH_SIZE];
3790	uint32_t **upp = (uint32_t **)uap->obj;
3791	int tocopy;
3792	int error = 0;
3793	int i, pos = 0;
3794
3795	while (count > 0) {
3796		tocopy = count;
3797		if (tocopy > BATCH_SIZE)
3798			tocopy = BATCH_SIZE;
3799		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3800		if (error != 0)
3801			break;
3802		for (i = 0; i < tocopy; ++i)
3803			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3804				INT_MAX, 1);
3805		count -= tocopy;
3806		pos += tocopy;
3807	}
3808	return (error);
3809}
3810
3811static _umtx_op_func op_table_compat32[] = {
3812	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3813	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3814	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3815	__umtx_op_wake,			/* UMTX_OP_WAKE */
3816	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3817	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3818	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3819	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3820	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3821	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3822	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3823	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3824	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3825	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3826	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3827	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3828	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3829	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3830	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3831	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3832	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3833	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3834	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3835};
3836
3837int
3838freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3839{
3840	if ((unsigned)uap->op < UMTX_OP_MAX)
3841		return (*op_table_compat32[uap->op])(td,
3842			(struct _umtx_op_args *)uap);
3843	return (EINVAL);
3844}
3845#endif
3846
3847void
3848umtx_thread_init(struct thread *td)
3849{
3850	td->td_umtxq = umtxq_alloc();
3851	td->td_umtxq->uq_thread = td;
3852}
3853
3854void
3855umtx_thread_fini(struct thread *td)
3856{
3857	umtxq_free(td->td_umtxq);
3858}
3859
3860/*
3861 * Called when a new thread is created, e.g. by fork().
3862 */
3863void
3864umtx_thread_alloc(struct thread *td)
3865{
3866	struct umtx_q *uq;
3867
3868	uq = td->td_umtxq;
3869	uq->uq_inherited_pri = PRI_MAX;
3870
3871	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3872	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3873	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3874	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3875}
3876
3877/*
3878 * exec() hook.
3879 */
3880static void
3881umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3882	struct image_params *imgp __unused)
3883{
3884	umtx_thread_cleanup(curthread);
3885}
3886
3887/*
3888 * thread_exit() hook.
3889 */
3890void
3891umtx_thread_exit(struct thread *td)
3892{
3893	umtx_thread_cleanup(td);
3894}
3895
3896/*
3897 * Clean up umtx data.
3898 */
3899static void
3900umtx_thread_cleanup(struct thread *td)
3901{
3902	struct umtx_q *uq;
3903	struct umtx_pi *pi;
3904
3905	if ((uq = td->td_umtxq) == NULL)
3906		return;
3907
3908	mtx_lock_spin(&umtx_lock);
3909	uq->uq_inherited_pri = PRI_MAX;
3910	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3911		pi->pi_owner = NULL;
3912		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3913	}
3914	mtx_unlock_spin(&umtx_lock);
3915	thread_lock(td);
3916	sched_lend_user_prio(td, PRI_MAX);
3917	thread_unlock(td);
3918}
3919