/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 248591 2013-03-21 19:58:25Z attilio $");

#include "opt_compat.h"
#include "opt_umtx_profiling.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

#ifdef UMTX_PROFILING
#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
#endif

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry on the owning thread's list of contested PI mutexes */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object user. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread that is waiting. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this thread is blocked on.  Reads may hold either
	 * the chain lock or umtx_lock; writes must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes owned by this thread that other threads contend for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;

	/* Spare queue ready to be reused */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are on */
	struct umtxq_queue	*uq_cur_queue;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Per-key wait-queue */
struct umtxq_queue {
	struct umtxq_head	head;
	struct umtx_key		key;
	LIST_ENTRY(umtxq_queue)	link;
	int			length;
};

LIST_HEAD(umtxq_list, umtxq_queue);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;

#ifdef UMTX_PROFILING
	u_int			length;
	u_int			max_length;
#endif
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))

/*
 * Don't propagate time-sharing priority.  There is a security reason:
 * a user could simply create a PI mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * would be boosted, which would in turn boost A's priority via priority
 * propagation, and A's priority would never be lowered even if A were
 * using 100% CPU; this would be unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
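
/*
 * Illustrative example (added comment; not in the original code): with
 * FreeBSD's inverted numeric priorities, any time-sharing thread whose
 * td_user_pri lies in [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE] yields
 * UPRI(td) == PRI_MAX_TIMESHARE, the weakest time-sharing value, so any
 * boost it received is neutralized for propagation purposes.  Priorities
 * outside that band (notably real-time ones) pass through unchanged and
 * therefore do propagate.
 */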

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		512
#define	UMTX_SHIFTS		(__WORD_BIT - 9)

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

#define BUSY_SPINS		200

struct abs_timeout {
	int clockid;
	struct timespec cur;
	struct timespec end;
};

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

#ifdef UMTX_PROFILING
static long max_length;
SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
#endif

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
static int umtxq_count(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

#ifdef UMTX_PROFILING
static void
umtx_init_profiling(void)
{
	struct sysctl_oid *chain_oid;
	char chain_name[10];
	int i;

	for (i = 0; i < UMTX_CHAINS; ++i) {
		snprintf(chain_name, sizeof(chain_name), "%d", i);
		chain_oid = SYSCTL_ADD_NODE(NULL,
		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
	}
}

static int
sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
{
	char buf[512];
	struct sbuf sb;
	struct umtxq_chain *uc;
	u_int fract, i, j, tot, whole;
	u_int sf0, sf1, sf2, sf3, sf4;
	u_int si0, si1, si2, si3, si4;
	u_int sw0, sw1, sw2, sw3, sw4;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	for (i = 0; i < 2; i++) {
		tot = 0;
		for (j = 0; j < UMTX_CHAINS; ++j) {
			uc = &umtxq_chains[i][j];
			mtx_lock(&uc->uc_lock);
			tot += uc->max_length;
			mtx_unlock(&uc->uc_lock);
		}
		if (tot == 0)
			sbuf_printf(&sb, "%u) Empty ", i);
		else {
			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
			si0 = si1 = si2 = si3 = si4 = 0;
			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
			for (j = 0; j < UMTX_CHAINS; j++) {
				uc = &umtxq_chains[i][j];
				mtx_lock(&uc->uc_lock);
				whole = uc->max_length * 100;
				mtx_unlock(&uc->uc_lock);
				fract = (whole % tot) * 100;
				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
					sf0 = fract;
					si0 = j;
					sw0 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
				    sf1)) {
					sf1 = fract;
					si1 = j;
					sw1 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
				    sf2)) {
					sf2 = fract;
					si2 = j;
					sw2 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
				    sf3)) {
					sf3 = fract;
					si3 = j;
					sw3 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
				    sf4)) {
					sf4 = fract;
					si4 = j;
					sw4 = whole;
				}
			}
			sbuf_printf(&sb, "queue %u:\n", i);
			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
			    sf0 / tot, si0);
			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
			    sf1 / tot, si1);
			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
			    sf2 / tot, si2);
			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
			    sf3 / tot, si3);
			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
			    sf4 / tot, si4);
		}
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);
	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
	sbuf_delete(&sb);
	return (0);
}

static int
sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
{
	struct umtxq_chain *uc;
	u_int i, j;
	int clear, error;

	clear = 0;
	error = sysctl_handle_int(oidp, &clear, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (clear != 0) {
		for (i = 0; i < 2; ++i) {
			for (j = 0; j < UMTX_CHAINS; ++j) {
				uc = &umtxq_chains[i][j];
				mtx_lock(&uc->uc_lock);
				uc->length = 0;
				uc->max_length = 0;
				mtx_unlock(&uc->uc_lock);
			}
		}
	}
	return (0);
}

SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
    sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
    sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
#endif

static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
#ifdef UMTX_PROFILING
			umtxq_chains[i][j].length = 0;
			umtxq_chains[i][j].max_length = 0;
#endif
		}
	}
#ifdef UMTX_PROFILING
	umtx_init_profiling();
#endif
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_spare_queue->head);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	MPASS(uq->uq_spare_queue != NULL);
	free(uq->uq_spare_queue, M_UMTX);
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
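
/*
 * Illustrative note (added comment; not in the original code): this is
 * Fibonacci (multiplicative) hashing.  GOLDEN_RATIO_PRIME is close to
 * 2^32 divided by the golden ratio, so conceptually the computation is
 *
 *	chain = ((n * 2654404609U) >> (__WORD_BIT - 9)) % 512;
 *
 * the multiplication scrambles the sum of the key's address words, the
 * shift keeps the well-mixed top 9 bits of the 32-bit product, and the
 * modulus maps the result onto the UMTX_CHAINS (512) buckets.
 */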

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_SEM)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set the chain to the busy state when the following operation
 * may block (a kernel mutex can not be used).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}
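
/*
 * Illustrative sketch (added comment; not in the original code): the busy
 * flag serializes operations that may sleep or fault, during which the
 * chain mutex itself cannot remain held.  The typical calling pattern,
 * as used by the lock/unlock paths below, is:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);
 *	umtxq_unlock(&key);
 *	... touch userland memory, which may fault and sleep ...
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);
 *	umtxq_unlock(&key);
 */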

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
		if (umtx_key_match(&uh->key, key))
			return (uh);
	}

	return (NULL);
}

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
	uh = umtxq_queue_lookup(&uq->uq_key, q);
	if (uh != NULL) {
		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
	} else {
		uh = uq->uq_spare_queue;
		uh->key = uq->uq_key;
		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
#ifdef UMTX_PROFILING
		uc->length++;
		if (uc->length > uc->max_length) {
			uc->max_length = uc->length;
			if (uc->max_length > max_length)
				max_length = uc->max_length;
		}
#endif
	}
	uq->uq_spare_queue = NULL;

	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
	uh->length++;
	uq->uq_flags |= UQF_UMTXQ;
	uq->uq_cur_queue = uh;
	return;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		uh = uq->uq_cur_queue;
		TAILQ_REMOVE(&uh->head, uq, uq_link);
		uh->length--;
		uq->uq_flags &= ~UQF_UMTXQ;
		if (TAILQ_EMPTY(&uh->head)) {
			KASSERT(uh->length == 0,
			    ("inconsistent umtxq_queue length"));
#ifdef UMTX_PROFILING
			uc->length--;
#endif
			LIST_REMOVE(uh, link);
		} else {
			uh = LIST_FIRST(&uc->uc_spare_queue);
			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
			LIST_REMOVE(uh, link);
		}
		uq->uq_spare_queue = uh;
		uq->uq_cur_queue = NULL;
	}
}
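
/*
 * Illustrative note (added comment; not in the original code): every
 * umtx_q owns exactly one spare umtxq_queue.  On insert, the spare is
 * either donated to the chain (becoming the per-key wait queue) or
 * parked on uc_spare_queue; on remove, the thread takes back either the
 * emptied per-key queue or any parked spare.  This "one queue per busy
 * key, one spare per thread" invariant lets both paths run without
 * allocating memory.
 */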

/*
 * Return the number of waiters on the shared queue for a key.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL)
		return (uh->length);
	return (0);
}

/*
 * Return the number of PI waiters and, through *first, the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL) {
		*first = TAILQ_FIRST(&uh->head);
		return (uh->length);
	}
	return (0);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;
	struct umtx_q *uq;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, q);
	if (uh != NULL) {
		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				return (ret);
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

static inline int
tstohz(const struct timespec *tsp)
{
	struct timeval tv;

	TIMESPEC_TO_TIMEVAL(&tv, tsp);
	return (tvtohz(&tv));
}

static void
abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
	const struct timespec *timeout)
{

	timo->clockid = clockid;
	if (!absolute) {
		kern_clock_gettime(curthread, clockid, &timo->end);
		timo->cur = timo->end;
		timespecadd(&timo->end, timeout);
	} else {
		timo->end = *timeout;
		kern_clock_gettime(curthread, clockid, &timo->cur);
	}
}

static void
abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
{

	abs_timeout_init(timo, umtxtime->_clockid,
		(umtxtime->_flags & UMTX_ABSTIME) != 0,
		&umtxtime->_timeout);
}

static inline void
abs_timeout_update(struct abs_timeout *timo)
{
	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
}

static int
abs_timeout_gethz(struct abs_timeout *timo)
{
	struct timespec tts;

	if (timespeccmp(&timo->end, &timo->cur, <=))
		return (-1);
	tts = timo->end;
	timespecsub(&tts, &timo->cur);
	return (tstohz(&tts));
}
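
/*
 * Illustrative sketch (added comment; not in the original code): callers
 * initialize an abs_timeout once and then loop, as umtxq_sleep() below
 * does.  abs_timeout_gethz() returns -1 once "cur" has reached "end"
 * (which callers map to ETIMEDOUT), otherwise the remaining time in
 * ticks for msleep(); after each wakeup, abs_timeout_update() refreshes
 * "cur" from the timeout's clock before the deadline is re-checked.
 */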

/*
 * Put the thread into a sleep state; before sleeping, check whether
 * the thread was removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
{
	struct umtxq_chain *uc;
	int error, timo;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	for (;;) {
		if (!(uq->uq_flags & UQF_UMTXQ))
			return (0);
		if (abstime != NULL) {
			timo = abs_timeout_gethz(abstime);
			if (timo < 0)
				return (ETIMEDOUT);
		} else
			timo = 0;
		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
		if (error != EWOULDBLOCK) {
			umtxq_lock(&uq->uq_key);
			break;
		}
		if (abstime != NULL)
			abs_timeout_update(abstime);
		umtxq_lock(&uq->uq_key);
	}
	return (error);
}

/*
 * Convert a userspace address into a unique logical address.
 */
int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return EFAULT;
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
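
/*
 * Illustrative note (added comment; not in the original code): the key
 * makes two mappings of the same lock compare equal exactly when they
 * should.  A process-private lock is keyed by (vmspace, virtual
 * address); a process-shared lock is keyed by (backing VM object,
 * offset), so two processes that mmap() the same shared page at
 * different addresses still hash to the same wait queue.
 */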

/*
 * Release key.
 */
void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	const struct timespec *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;
	if (timeout != NULL)
		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
			    &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	if (timeout == NULL) {
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* Timed locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
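
/*
 * Illustrative sketch (added comment; not in the original code): as the
 * comments above say, the uncontested transitions belong in userland.  A
 * hypothetical fast path in front of the legacy _umtx_lock(2) syscall
 * would look roughly like:
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, tid))
 *		return (0);		(no kernel involvement)
 *	return (_umtx_lock(umtx));	(contested: reaches do_lock_umtx())
 *
 * so only contested locks, with UMTX_CONTESTED eventually set, are ever
 * seen by the code above.
 */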

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or only one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_FREEBSD32

/*
 * Lock a umtx object (COMPAT_FREEBSD32).
 */
static int
do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
	const struct timespec *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	if (timeout != NULL)
		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
			    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	if (timeout == NULL) {
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* Timed locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object (COMPAT_FREEBSD32).
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or only one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value has not
 * changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct _umtx_time *timeout, int compat32, int is_private)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = (unsigned int)fuword32(addr);
	umtxq_lock(&uq->uq_key);
	if (tmp == id)
		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
		    NULL : &timo);
	if ((uq->uq_flags & UQF_UMTXQ) == 0)
		error = 0;
	else
		umtxq_remove(uq);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
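
/*
 * Illustrative sketch (added comment; not in the original code):
 * do_wait() and kern_umtx_wake() form the usual futex-style protocol.
 * A hypothetical userland wait might be:
 *
 *	seq = atomic_load_acq_int(&c->c_seq);
 *	... decide to sleep ...
 *	_umtx_op(&c->c_seq, UMTX_OP_WAIT_UINT, seq, NULL, NULL);
 *
 * with the waker incrementing c_seq before issuing UMTX_OP_WAKE.
 * Because do_wait() queues the thread before re-reading the word, a
 * wakeup that slips in between the userland read and the sleep is not
 * lost.
 */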

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
	struct _umtx_time *timeout, int mode)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If no one owns it but it is contested, try to acquire it. */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/* If this failed the lock has changed, restart. */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
			    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
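
/*
 * Illustrative note (added comment; not in the original code):
 * do_lock_normal() serves three entry points selected by "mode":
 *
 *	_UMUTEX_TRY	return EBUSY instead of sleeping;
 *	_UMUTEX_WAIT	never acquire ownership, just sleep until the
 *			mutex looks free (paired with do_wake_umutex());
 *	0 (default)	acquire the mutex, sleeping as needed.
 */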

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or only one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Check if the mutex is available and wake up a waiter;
 * this is done only for a simple mutex.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	flags = fuword32(&m->m_flags);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1)
		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);

	umtxq_lock(&key);
	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Check if the mutex has waiters and try to fix the contention bit.
 */
static int
do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old;
	int type;
	int error;
	int count;

	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		type = TYPE_NORMAL_UMUTEX;
		break;
	case UMUTEX_PRIO_INHERIT:
		type = TYPE_PI_UMUTEX;
		break;
	case UMUTEX_PRIO_PROTECT:
		type = TYPE_PP_UMUTEX;
		break;
	default:
		return (EINVAL);
	}
	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	owner = 0;
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);
	/*
	 * Only repair the contention bit if there is a waiter; this means
	 * the mutex is still being referenced by userland code.  Otherwise,
	 * don't update any memory.
	 */
	if (count > 1) {
		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
		while ((owner & UMUTEX_CONTESTED) == 0) {
			old = casuword32(&m->m_owner, owner,
			    owner | UMUTEX_CONTESTED);
			if (old == owner)
				break;
			owner = old;
		}
	} else if (count == 1) {
		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
		       (owner & UMUTEX_CONTESTED) == 0) {
			old = casuword32(&m->m_owner, owner,
			    owner | UMUTEX_CONTESTED);
			if (old == owner)
				break;
			owner = old;
		}
	}
	umtxq_lock(&key);
	if (owner == -1) {
		error = EFAULT;
		umtxq_signal(&key, INT_MAX);
	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the PI mutex's blocked list after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL || td == curthread)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		thread_lock(td);
		if (td->td_lend_user_pri > pri)
			sched_lend_user_prio(td, pri);
		else {
			thread_unlock(td);
			break;
		}
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		if (pi == NULL)
			break;
		/* Resort td on the list if needed. */
		umtx_pi_adjust_thread(pi, td);
	}
}
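
/*
 * Illustrative example (added comment; not in the original code) of the
 * loop above: if T1 blocks on mutex M1 owned by T2 while T2 is itself
 * blocked on M2 owned by T3, propagating T1's priority lends it to T2,
 * re-sorts T2 on M2's blocked list, then lends it to T3 as well.  The
 * walk stops at a thread that is not blocked on a PI mutex or that
 * already runs at an equal or better priority.
 */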

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_repropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		sched_lend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		if ((pi = uq_owner->uq_pi_blocked) != NULL)
			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

/*
 * Adjust a thread's position in the blocked list of its PI mutex;
 * this may result in a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL) {
		umtx_pi_adjust_thread(pi, td);
		umtx_repropagate_priority(pi);
	}
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		mtx_unlock_spin(&umtx_lock);
		/* XXX Only look up thread in current process. */
		td1 = tdfind(owner, curproc->p_pid);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			PROC_UNLOCK(td1->td_proc);
		}
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	error = umtxq_sleep(uq, wmesg, timo);
	umtxq_remove(uq);

	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_repropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}

/*
 * Increment the reference count of a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count of a PI mutex; when the count drops
 * to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		umtx_pi_free(pi);
	}
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
			    "umtxpi", timeout == NULL ? NULL : &timo);
		else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}
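
/*
 * Illustrative note (added comment; not in the original code): the
 * lookup/alloc dance at the top of do_lock_pi() above is the classic
 * unlock-allocate-relock pattern: try umtx_pi_alloc(M_NOWAIT) under the
 * chain lock; if that fails, drop the lock, allocate with M_WAITOK
 * (which may sleep), retake the lock, and redo the lookup, freeing the
 * fresh umtx_pi if another thread raced in and inserted one meanwhile.
 */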
2029
2030/*
2031 * Unlock a PI mutex.
2032 */
2033static int
2034do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
2035{
2036	struct umtx_key key;
2037	struct umtx_q *uq_first, *uq_first2, *uq_me;
2038	struct umtx_pi *pi, *pi2;
2039	uint32_t owner, old, id;
2040	int error;
2041	int count;
2042	int pri;
2043
2044	id = td->td_tid;
2045	/*
2046	 * Make sure we own this mtx.
2047	 */
2048	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2049	if (owner == -1)
2050		return (EFAULT);
2051
2052	if ((owner & ~UMUTEX_CONTESTED) != id)
2053		return (EPERM);
2054
2055	/* This should be done in userland */
2056	if ((owner & UMUTEX_CONTESTED) == 0) {
2057		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
2058		if (old == -1)
2059			return (EFAULT);
2060		if (old == owner)
2061			return (0);
2062		owner = old;
2063	}
2064
2065	/* We should only ever be in here for contested locks */
2066	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
2067	    &key)) != 0)
2068		return (error);
2069
2070	umtxq_lock(&key);
2071	umtxq_busy(&key);
2072	count = umtxq_count_pi(&key, &uq_first);
2073	if (uq_first != NULL) {
2074		mtx_lock_spin(&umtx_lock);
2075		pi = uq_first->uq_pi_blocked;
2076		KASSERT(pi != NULL, ("pi == NULL?"));
2077		if (pi->pi_owner != curthread) {
2078			mtx_unlock_spin(&umtx_lock);
2079			umtxq_unbusy(&key);
2080			umtxq_unlock(&key);
2081			umtx_key_release(&key);
2082			/* userland messed the mutex */
2083			return (EPERM);
2084		}
2085		uq_me = curthread->td_umtxq;
2086		pi->pi_owner = NULL;
2087		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
2088		/* get highest priority thread which is still sleeping. */
2089		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2090		while (uq_first != NULL &&
2091		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2092			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2093		}
2094		pri = PRI_MAX;
2095		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2096			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2097			if (uq_first2 != NULL) {
2098				if (pri > UPRI(uq_first2->uq_thread))
2099					pri = UPRI(uq_first2->uq_thread);
2100			}
2101		}
2102		thread_lock(curthread);
2103		sched_lend_user_prio(curthread, pri);
2104		thread_unlock(curthread);
2105		mtx_unlock_spin(&umtx_lock);
2106		if (uq_first)
2107			umtxq_signal_thread(uq_first);
2108	}
2109	umtxq_unlock(&key);
2110
2111	/*
2112	 * When unlocking the umtx, it must be marked as unowned if
2113	 * there is zero or one thread only waiting for it.
2114	 * Otherwise, it must be marked as contested.
2115	 */
2116	old = casuword32(&m->m_owner, owner,
2117		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
2118
2119	umtxq_lock(&key);
2120	umtxq_unbusy(&key);
2121	umtxq_unlock(&key);
2122	umtx_key_release(&key);
2123	if (old == -1)
2124		return (EFAULT);
2125	if (old != owner)
2126		return (EINVAL);
2127	return (0);
2128}
2129
2130/*
2131 * Lock a PP mutex.
2132 */
2133static int
2134do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2135    struct _umtx_time *timeout, int try)
2136{
2137	struct abs_timeout timo;
2138	struct umtx_q *uq, *uq2;
2139	struct umtx_pi *pi;
2140	uint32_t ceiling;
2141	uint32_t owner, id;
2142	int error, pri, old_inherited_pri, su;
2143
2144	id = td->td_tid;
2145	uq = td->td_umtxq;
2146	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2147	    &uq->uq_key)) != 0)
2148		return (error);
2149
2150	if (timeout != NULL)
2151		abs_timeout_init2(&timo, timeout);
2152
2153	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2154	for (;;) {
2155		old_inherited_pri = uq->uq_inherited_pri;
2156		umtxq_lock(&uq->uq_key);
2157		umtxq_busy(&uq->uq_key);
2158		umtxq_unlock(&uq->uq_key);
2159
2160		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
2161		if (ceiling > RTP_PRIO_MAX) {
2162			error = EINVAL;
2163			goto out;
2164		}
2165
2166		mtx_lock_spin(&umtx_lock);
2167		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2168			mtx_unlock_spin(&umtx_lock);
2169			error = EINVAL;
2170			goto out;
2171		}
2172		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2173			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2174			thread_lock(td);
2175			if (uq->uq_inherited_pri < UPRI(td))
2176				sched_lend_user_prio(td, uq->uq_inherited_pri);
2177			thread_unlock(td);
2178		}
2179		mtx_unlock_spin(&umtx_lock);
2180
2181		owner = casuword32(&m->m_owner,
2182		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2183
2184		if (owner == UMUTEX_CONTESTED) {
2185			error = 0;
2186			break;
2187		}
2188
2189		/* The address was invalid. */
2190		if (owner == -1) {
2191			error = EFAULT;
2192			break;
2193		}
2194
2195		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2196		    (owner & ~UMUTEX_CONTESTED) == id) {
2197			error = EDEADLK;
2198			break;
2199		}
2200
2201		if (try != 0) {
2202			error = EBUSY;
2203			break;
2204		}
2205
2206		/*
2207		 * If we caught a signal, we have already retried;
2208		 * exit immediately now.
2209		 */
2210		if (error != 0)
2211			break;
2212
2213		umtxq_lock(&uq->uq_key);
2214		umtxq_insert(uq);
2215		umtxq_unbusy(&uq->uq_key);
2216		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2217		    NULL : &timo);
2218		umtxq_remove(uq);
2219		umtxq_unlock(&uq->uq_key);
2220
2221		mtx_lock_spin(&umtx_lock);
2222		uq->uq_inherited_pri = old_inherited_pri;
2223		pri = PRI_MAX;
2224		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2225			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2226			if (uq2 != NULL) {
2227				if (pri > UPRI(uq2->uq_thread))
2228					pri = UPRI(uq2->uq_thread);
2229			}
2230		}
2231		if (pri > uq->uq_inherited_pri)
2232			pri = uq->uq_inherited_pri;
2233		thread_lock(td);
2234		sched_lend_user_prio(td, pri);
2235		thread_unlock(td);
2236		mtx_unlock_spin(&umtx_lock);
2237	}
2238
2239	if (error != 0) {
2240		mtx_lock_spin(&umtx_lock);
2241		uq->uq_inherited_pri = old_inherited_pri;
2242		pri = PRI_MAX;
2243		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2244			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2245			if (uq2 != NULL) {
2246				if (pri > UPRI(uq2->uq_thread))
2247					pri = UPRI(uq2->uq_thread);
2248			}
2249		}
2250		if (pri > uq->uq_inherited_pri)
2251			pri = uq->uq_inherited_pri;
2252		thread_lock(td);
2253		sched_lend_user_prio(td, pri);
2254		thread_unlock(td);
2255		mtx_unlock_spin(&umtx_lock);
2256	}
2257
2258out:
2259	umtxq_lock(&uq->uq_key);
2260	umtxq_unbusy(&uq->uq_key);
2261	umtxq_unlock(&uq->uq_key);
2262	umtx_key_release(&uq->uq_key);
2263	return (error);
2264}
2265
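/*
 * Worked example for the ceiling arithmetic above (informational): the
 * scale is inverted before use, ceiling = RTP_PRIO_MAX - m_ceilings[0],
 * and the lent priority is PRI_MIN_REALTIME + ceiling.  A userland
 * ceiling of RTP_PRIO_MAX therefore yields PRI_MIN_REALTIME (the
 * strongest real-time priority), while a ceiling of 0 yields
 * PRI_MIN_REALTIME + RTP_PRIO_MAX (the weakest).  Because "ceiling" is
 * unsigned, a stored value greater than RTP_PRIO_MAX wraps around and
 * is caught by the "ceiling > RTP_PRIO_MAX" check.
 */
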
2266/*
2267 * Unlock a PP mutex.
2268 */
2269static int
2270do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2271{
2272	struct umtx_key key;
2273	struct umtx_q *uq, *uq2;
2274	struct umtx_pi *pi;
2275	uint32_t owner, id;
2276	uint32_t rceiling;
2277	int error, pri, new_inherited_pri, su;
2278
2279	id = td->td_tid;
2280	uq = td->td_umtxq;
2281	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2282
2283	/*
2284	 * Make sure we own this mtx.
2285	 */
2286	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2287	if (owner == -1)
2288		return (EFAULT);
2289
2290	if ((owner & ~UMUTEX_CONTESTED) != id)
2291		return (EPERM);
2292
2293	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2294	if (error != 0)
2295		return (error);
2296
2297	if (rceiling == -1)
2298		new_inherited_pri = PRI_MAX;
2299	else {
2300		rceiling = RTP_PRIO_MAX - rceiling;
2301		if (rceiling > RTP_PRIO_MAX)
2302			return (EINVAL);
2303		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2304	}
2305
2306	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2307	    &key)) != 0)
2308		return (error);
2309	umtxq_lock(&key);
2310	umtxq_busy(&key);
2311	umtxq_unlock(&key);
2312	/*
2313	 * For a priority-protected mutex, always set the unlocked state
2314	 * to UMUTEX_CONTESTED, so that userland always enters the kernel
2315	 * to lock the mutex; this is necessary because the thread
2316	 * priority has to be adjusted for such a mutex.
2317	 */
2318	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2319		UMUTEX_CONTESTED);
2320
2321	umtxq_lock(&key);
2322	if (error == 0)
2323		umtxq_signal(&key, 1);
2324	umtxq_unbusy(&key);
2325	umtxq_unlock(&key);
2326
2327	if (error == -1)
2328		error = EFAULT;
2329	else {
2330		mtx_lock_spin(&umtx_lock);
2331		if (su != 0)
2332			uq->uq_inherited_pri = new_inherited_pri;
2333		pri = PRI_MAX;
2334		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2335			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2336			if (uq2 != NULL) {
2337				if (pri > UPRI(uq2->uq_thread))
2338					pri = UPRI(uq2->uq_thread);
2339			}
2340		}
2341		if (pri > uq->uq_inherited_pri)
2342			pri = uq->uq_inherited_pri;
2343		thread_lock(td);
2344		sched_lend_user_prio(td, pri);
2345		thread_unlock(td);
2346		mtx_unlock_spin(&umtx_lock);
2347	}
2348	umtx_key_release(&key);
2349	return (error);
2350}
2351
2352static int
2353do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2354	uint32_t *old_ceiling)
2355{
2356	struct umtx_q *uq;
2357	uint32_t save_ceiling;
2358	uint32_t owner, id;
2359	uint32_t flags;
2360	int error;
2361
2362	flags = fuword32(&m->m_flags);
2363	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2364		return (EINVAL);
2365	if (ceiling > RTP_PRIO_MAX)
2366		return (EINVAL);
2367	id = td->td_tid;
2368	uq = td->td_umtxq;
2369	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2370	   &uq->uq_key)) != 0)
2371		return (error);
2372	for (;;) {
2373		umtxq_lock(&uq->uq_key);
2374		umtxq_busy(&uq->uq_key);
2375		umtxq_unlock(&uq->uq_key);
2376
2377		save_ceiling = fuword32(&m->m_ceilings[0]);
2378
2379		owner = casuword32(&m->m_owner,
2380		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2381
2382		if (owner == UMUTEX_CONTESTED) {
2383			suword32(&m->m_ceilings[0], ceiling);
2384			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2385				UMUTEX_CONTESTED);
2386			error = 0;
2387			break;
2388		}
2389
2390		/* The address was invalid. */
2391		if (owner == -1) {
2392			error = EFAULT;
2393			break;
2394		}
2395
2396		if ((owner & ~UMUTEX_CONTESTED) == id) {
2397			suword32(&m->m_ceilings[0], ceiling);
2398			error = 0;
2399			break;
2400		}
2401
2402		/*
2403		 * If we caught a signal, we have already retried;
2404		 * exit immediately now.
2405		 */
2406		if (error != 0)
2407			break;
2408
2409		/*
2410		 * If we set the contested bit, sleep. Otherwise the lock
2411		 * changed and we need to retry, or we lost a race with the
2412		 * thread unlocking the umtx.
2413		 */
2414		umtxq_lock(&uq->uq_key);
2415		umtxq_insert(uq);
2416		umtxq_unbusy(&uq->uq_key);
2417		error = umtxq_sleep(uq, "umtxpp", NULL);
2418		umtxq_remove(uq);
2419		umtxq_unlock(&uq->uq_key);
2420	}
2421	umtxq_lock(&uq->uq_key);
2422	if (error == 0)
2423		umtxq_signal(&uq->uq_key, INT_MAX);
2424	umtxq_unbusy(&uq->uq_key);
2425	umtxq_unlock(&uq->uq_key);
2426	umtx_key_release(&uq->uq_key);
2427	if (error == 0 && old_ceiling != NULL)
2428		suword32(old_ceiling, save_ceiling);
2429	return (error);
2430}
2431
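/*
 * A minimal usage sketch (hedged: the surrounding code is made up; only
 * _umtx_op(2), UMTX_OP_SET_CEILING and the argument layout used by
 * __umtx_op_set_ceiling() below are real):
 *
 *	uint32_t old_ceiling;
 *
 *	// val carries the new ceiling, uaddr1 receives the old one.
 *	if (_umtx_op(m, UMTX_OP_SET_CEILING, new_ceiling,
 *	    &old_ceiling, NULL) == -1)
 *		err(1, "UMTX_OP_SET_CEILING");
 */
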
2432/*
2433 * Lock a userland POSIX mutex.
2434 */
2435static int
2436do_lock_umutex(struct thread *td, struct umutex *m,
2437    struct _umtx_time *timeout, int mode)
2438{
2439	uint32_t flags;
2440	int error;
2441
2442	flags = fuword32(&m->m_flags);
2443	if (flags == -1)
2444		return (EFAULT);
2445
2446	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2447	case 0:
2448		error = do_lock_normal(td, m, flags, timeout, mode);
2449		break;
2450	case UMUTEX_PRIO_INHERIT:
2451		error = do_lock_pi(td, m, flags, timeout, mode);
2452		break;
2453	case UMUTEX_PRIO_PROTECT:
2454		error = do_lock_pp(td, m, flags, timeout, mode);
2455		break;
2456	default:
2457		return (EINVAL);
2458	}
2459	if (timeout == NULL) {
2460		if (error == EINTR && mode != _UMUTEX_WAIT)
2461			error = ERESTART;
2462	} else {
2463		/* Timed-locking is not restarted. */
2464		if (error == ERESTART)
2465			error = EINTR;
2466	}
2467	return (error);
2468}
2469
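/*
 * Restart-semantics note for do_lock_umutex() above: an untimed lock
 * interrupted by a signal is transparently restarted (ERESTART),
 * except in _UMUTEX_WAIT mode; a timed lock is never restarted,
 * because the timeout would otherwise be applied from scratch, so
 * ERESTART is mapped back to EINTR for the caller.
 */
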
2470/*
2471 * Unlock a userland POSIX mutex.
2472 */
2473static int
2474do_unlock_umutex(struct thread *td, struct umutex *m)
2475{
2476	uint32_t flags;
2477
2478	flags = fuword32(&m->m_flags);
2479	if (flags == -1)
2480		return (EFAULT);
2481
2482	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2483	case 0:
2484		return (do_unlock_normal(td, m, flags));
2485	case UMUTEX_PRIO_INHERIT:
2486		return (do_unlock_pi(td, m, flags));
2487	case UMUTEX_PRIO_PROTECT:
2488		return (do_unlock_pp(td, m, flags));
2489	}
2490
2491	return (EINVAL);
2492}
2493
2494static int
2495do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2496	struct timespec *timeout, u_long wflags)
2497{
2498	struct abs_timeout timo;
2499	struct umtx_q *uq;
2500	uint32_t flags;
2501	uint32_t clockid;
2502	int error;
2503
2504	uq = td->td_umtxq;
2505	flags = fuword32(&cv->c_flags);
2506	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2507	if (error != 0)
2508		return (error);
2509
2510	if ((wflags & CVWAIT_CLOCKID) != 0) {
2511		clockid = fuword32(&cv->c_clockid);
2512		if (clockid < CLOCK_REALTIME ||
2513		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2514			/* Only the predefined hardware clock ids work here. */
			umtx_key_release(&uq->uq_key);
2515			return (EINVAL);
2516		}
2517	} else {
2518		clockid = CLOCK_REALTIME;
2519	}
2520
2521	umtxq_lock(&uq->uq_key);
2522	umtxq_busy(&uq->uq_key);
2523	umtxq_insert(uq);
2524	umtxq_unlock(&uq->uq_key);
2525
2526	/*
2527	 * Set c_has_waiters to 1 before releasing the user mutex, and
2528	 * avoid dirtying the cache line when it is already set.
2529	 */
2530	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2531		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2532
2533	umtxq_lock(&uq->uq_key);
2534	umtxq_unbusy(&uq->uq_key);
2535	umtxq_unlock(&uq->uq_key);
2536
2537	error = do_unlock_umutex(td, m);
2538
2539	if (timeout != NULL)
2540		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2541			timeout);
2542
2543	umtxq_lock(&uq->uq_key);
2544	if (error == 0) {
2545		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2546		    NULL : &timo);
2547	}
2548
2549	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2550		error = 0;
2551	else {
2552		/*
2553		 * This must be a timeout, an interruption by a signal, or
2554		 * a spurious wakeup; clear the c_has_waiters flag when
2555		 * necessary.
2556		 */
2557		umtxq_busy(&uq->uq_key);
2558		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2559			int oldlen = uq->uq_cur_queue->length;
2560			umtxq_remove(uq);
2561			if (oldlen == 1) {
2562				umtxq_unlock(&uq->uq_key);
2563				suword32(
2564				    __DEVOLATILE(uint32_t *,
2565					 &cv->c_has_waiters), 0);
2566				umtxq_lock(&uq->uq_key);
2567			}
2568		}
2569		umtxq_unbusy(&uq->uq_key);
2570		if (error == ERESTART)
2571			error = EINTR;
2572	}
2573
2574	umtxq_unlock(&uq->uq_key);
2575	umtx_key_release(&uq->uq_key);
2576	return (error);
2577}
2578
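/*
 * Sketch of the userland call sequence served above (the variables are
 * hypothetical; the argument layout matches __umtx_op_cv_wait() below):
 *
 *	// Caller holds the umutex m.  The kernel queues this thread,
 *	// sets c_has_waiters, and only then unlocks m, so a wakeup
 *	// sent after the unlock cannot be lost.
 *	error = _umtx_op(cv, UMTX_OP_CV_WAIT, 0, m, ts);
 *	// On return the mutex is no longer held; the caller must
 *	// re-acquire it before re-checking the predicate.
 */
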
2579/*
2580 * Signal a userland condition variable.
2581 */
2582static int
2583do_cv_signal(struct thread *td, struct ucond *cv)
2584{
2585	struct umtx_key key;
2586	int error, cnt, nwake;
2587	uint32_t flags;
2588
2589	flags = fuword32(&cv->c_flags);
2590	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2591		return (error);
2592	umtxq_lock(&key);
2593	umtxq_busy(&key);
2594	cnt = umtxq_count(&key);
2595	nwake = umtxq_signal(&key, 1);
2596	if (cnt <= nwake) {
2597		umtxq_unlock(&key);
2598		error = suword32(
2599		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2600		umtxq_lock(&key);
2601	}
2602	umtxq_unbusy(&key);
2603	umtxq_unlock(&key);
2604	umtx_key_release(&key);
2605	return (error);
2606}
2607
2608static int
2609do_cv_broadcast(struct thread *td, struct ucond *cv)
2610{
2611	struct umtx_key key;
2612	int error;
2613	uint32_t flags;
2614
2615	flags = fuword32(&cv->c_flags);
2616	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2617		return (error);
2618
2619	umtxq_lock(&key);
2620	umtxq_busy(&key);
2621	umtxq_signal(&key, INT_MAX);
2622	umtxq_unlock(&key);
2623
2624	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2625
2626	umtxq_lock(&key);
2627	umtxq_unbusy(&key);
2628	umtxq_unlock(&key);
2629
2630	umtx_key_release(&key);
2631	return (error);
2632}
2633
2634static int
2635do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2636{
2637	struct abs_timeout timo;
2638	struct umtx_q *uq;
2639	uint32_t flags, wrflags;
2640	int32_t state, oldstate;
2641	int32_t blocked_readers;
2642	int error;
2643
2644	uq = td->td_umtxq;
2645	flags = fuword32(&rwlock->rw_flags);
2646	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2647	if (error != 0)
2648		return (error);
2649
2650	if (timeout != NULL)
2651		abs_timeout_init2(&timo, timeout);
2652
2653	wrflags = URWLOCK_WRITE_OWNER;
2654	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2655		wrflags |= URWLOCK_WRITE_WAITERS;
2656
2657	for (;;) {
2658		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2659		/* try to lock it */
2660		while (!(state & wrflags)) {
2661			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2662				umtx_key_release(&uq->uq_key);
2663				return (EAGAIN);
2664			}
2665			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2666			if (oldstate == state) {
2667				umtx_key_release(&uq->uq_key);
2668				return (0);
2669			}
2670			state = oldstate;
2671		}
2672
2673		if (error)
2674			break;
2675
2676		/* grab monitor lock */
2677		umtxq_lock(&uq->uq_key);
2678		umtxq_busy(&uq->uq_key);
2679		umtxq_unlock(&uq->uq_key);
2680
2681		/*
2682		 * re-read the state, in case it changed between the try-lock above
2683		 * and the check below
2684		 */
2685		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2686
2687		/* set read contention bit */
2688		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2689			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2690			if (oldstate == state)
2691				goto sleep;
2692			state = oldstate;
2693		}
2694
2695		/* The state changed while we were setting the flags; restart. */
2696		if (!(state & wrflags)) {
2697			umtxq_lock(&uq->uq_key);
2698			umtxq_unbusy(&uq->uq_key);
2699			umtxq_unlock(&uq->uq_key);
2700			continue;
2701		}
2702
2703sleep:
2704		/* The contention bit is set; increase the read waiter count before sleeping. */
2705		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2706		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2707
2708		while (state & wrflags) {
2709			umtxq_lock(&uq->uq_key);
2710			umtxq_insert(uq);
2711			umtxq_unbusy(&uq->uq_key);
2712
2713			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2714			    NULL : &timo);
2715
2716			umtxq_busy(&uq->uq_key);
2717			umtxq_remove(uq);
2718			umtxq_unlock(&uq->uq_key);
2719			if (error)
2720				break;
2721			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2722		}
2723
2724		/* Decrease the read waiter count; the last blocked reader clears the contention bit. */
2725		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2726		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2727		if (blocked_readers == 1) {
2728			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2729			for (;;) {
2730				oldstate = casuword32(&rwlock->rw_state, state,
2731					 state & ~URWLOCK_READ_WAITERS);
2732				if (oldstate == state)
2733					break;
2734				state = oldstate;
2735			}
2736		}
2737
2738		umtxq_lock(&uq->uq_key);
2739		umtxq_unbusy(&uq->uq_key);
2740		umtxq_unlock(&uq->uq_key);
2741	}
2742	umtx_key_release(&uq->uq_key);
2743	if (error == ERESTART)
2744		error = EINTR;
2745	return (error);
2746}
2747
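/*
 * The rw_state transitions used above, collected for reference (the
 * URWLOCK_* definitions in sys/umtx.h are authoritative):
 *
 *	state + 1			// acquire one reader slot
 *	state | URWLOCK_READ_WAITERS	// advertise blocked readers
 *	state & ~URWLOCK_READ_WAITERS	// last blocked reader clears it
 *
 * The reader count occupies the low bits, so "state + 1" is a valid
 * reader increment only while URWLOCK_WRITE_OWNER and (when writers
 * are preferred) URWLOCK_WRITE_WAITERS are clear, which is exactly
 * what the wrflags test guarantees.
 */
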
2748static int
2749do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2750{
2751	struct abs_timeout timo;
2752	struct umtx_q *uq;
2753	uint32_t flags;
2754	int32_t state, oldstate;
2755	int32_t blocked_writers;
2756	int32_t blocked_readers;
2757	int error;
2758
2759	uq = td->td_umtxq;
2760	flags = fuword32(&rwlock->rw_flags);
2761	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2762	if (error != 0)
2763		return (error);
2764
2765	if (timeout != NULL)
2766		abs_timeout_init2(&timo, timeout);
2767
2768	blocked_readers = 0;
2769	for (;;) {
2770		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2771		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2772			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2773			if (oldstate == state) {
2774				umtx_key_release(&uq->uq_key);
2775				return (0);
2776			}
2777			state = oldstate;
2778		}
2779
2780		if (error) {
2781			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2782			    blocked_readers != 0) {
2783				umtxq_lock(&uq->uq_key);
2784				umtxq_busy(&uq->uq_key);
2785				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2786				umtxq_unbusy(&uq->uq_key);
2787				umtxq_unlock(&uq->uq_key);
2788			}
2789
2790			break;
2791		}
2792
2793		/* grab monitor lock */
2794		umtxq_lock(&uq->uq_key);
2795		umtxq_busy(&uq->uq_key);
2796		umtxq_unlock(&uq->uq_key);
2797
2798		/*
2799		 * re-read the state, in case it changed between the try-lock above
2800		 * and the check below
2801		 */
2802		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2803
2804		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2805		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2806			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2807			if (oldstate == state)
2808				goto sleep;
2809			state = oldstate;
2810		}
2811
2812		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2813			umtxq_lock(&uq->uq_key);
2814			umtxq_unbusy(&uq->uq_key);
2815			umtxq_unlock(&uq->uq_key);
2816			continue;
2817		}
2818sleep:
2819		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2820		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2821
2822		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2823			umtxq_lock(&uq->uq_key);
2824			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2825			umtxq_unbusy(&uq->uq_key);
2826
2827			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2828			    NULL : &timo);
2829
2830			umtxq_busy(&uq->uq_key);
2831			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2832			umtxq_unlock(&uq->uq_key);
2833			if (error)
2834				break;
2835			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2836		}
2837
2838		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2839		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2840		if (blocked_writers == 1) {
2841			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2842			for (;;) {
2843				oldstate = casuword32(&rwlock->rw_state, state,
2844					 state & ~URWLOCK_WRITE_WAITERS);
2845				if (oldstate == state)
2846					break;
2847				state = oldstate;
2848			}
2849			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2850		} else
2851			blocked_readers = 0;
2852
2853		umtxq_lock(&uq->uq_key);
2854		umtxq_unbusy(&uq->uq_key);
2855		umtxq_unlock(&uq->uq_key);
2856	}
2857
2858	umtx_key_release(&uq->uq_key);
2859	if (error == ERESTART)
2860		error = EINTR;
2861	return (error);
2862}
2863
2864static int
2865do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2866{
2867	struct umtx_q *uq;
2868	uint32_t flags;
2869	int32_t state, oldstate;
2870	int error, q, count;
2871
2872	uq = td->td_umtxq;
2873	flags = fuword32(&rwlock->rw_flags);
2874	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2875	if (error != 0)
2876		return (error);
2877
2878	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2879	if (state & URWLOCK_WRITE_OWNER) {
2880		for (;;) {
2881			oldstate = casuword32(&rwlock->rw_state, state,
2882				state & ~URWLOCK_WRITE_OWNER);
2883			if (oldstate != state) {
2884				state = oldstate;
2885				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2886					error = EPERM;
2887					goto out;
2888				}
2889			} else
2890				break;
2891		}
2892	} else if (URWLOCK_READER_COUNT(state) != 0) {
2893		for (;;) {
2894			oldstate = casuword32(&rwlock->rw_state, state,
2895				state - 1);
2896			if (oldstate != state) {
2897				state = oldstate;
2898				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2899					error = EPERM;
2900					goto out;
2901				}
2902			}
2903			else
2904				break;
2905		}
2906	} else {
2907		error = EPERM;
2908		goto out;
2909	}
2910
2911	count = 0;
2912
2913	if (!(flags & URWLOCK_PREFER_READER)) {
2914		if (state & URWLOCK_WRITE_WAITERS) {
2915			count = 1;
2916			q = UMTX_EXCLUSIVE_QUEUE;
2917		} else if (state & URWLOCK_READ_WAITERS) {
2918			count = INT_MAX;
2919			q = UMTX_SHARED_QUEUE;
2920		}
2921	} else {
2922		if (state & URWLOCK_READ_WAITERS) {
2923			count = INT_MAX;
2924			q = UMTX_SHARED_QUEUE;
2925		} else if (state & URWLOCK_WRITE_WAITERS) {
2926			count = 1;
2927			q = UMTX_EXCLUSIVE_QUEUE;
2928		}
2929	}
2930
2931	if (count) {
2932		umtxq_lock(&uq->uq_key);
2933		umtxq_busy(&uq->uq_key);
2934		umtxq_signal_queue(&uq->uq_key, count, q);
2935		umtxq_unbusy(&uq->uq_key);
2936		umtxq_unlock(&uq->uq_key);
2937	}
2938out:
2939	umtx_key_release(&uq->uq_key);
2940	return (error);
2941}
2942
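/*
 * Wake-up policy note for do_rw_unlock() above: unless
 * URWLOCK_PREFER_READER is set, one blocked writer (count = 1 on the
 * exclusive queue) is preferred over waking every blocked reader
 * (INT_MAX on the shared queue); with the flag set, the preference is
 * reversed.
 */
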
2943static int
2944do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2945{
2946	struct abs_timeout timo;
2947	struct umtx_q *uq;
2948	uint32_t flags, count;
2949	int error;
2950
2951	uq = td->td_umtxq;
2952	flags = fuword32(&sem->_flags);
2953	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2954	if (error != 0)
2955		return (error);
2956
2957	if (timeout != NULL)
2958		abs_timeout_init2(&timo, timeout);
2959
2960	umtxq_lock(&uq->uq_key);
2961	umtxq_busy(&uq->uq_key);
2962	umtxq_insert(uq);
2963	umtxq_unlock(&uq->uq_key);
2964	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2965	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2966	if (count != 0) {
2967		umtxq_lock(&uq->uq_key);
2968		umtxq_unbusy(&uq->uq_key);
2969		umtxq_remove(uq);
2970		umtxq_unlock(&uq->uq_key);
2971		umtx_key_release(&uq->uq_key);
2972		return (0);
2973	}
2974	umtxq_lock(&uq->uq_key);
2975	umtxq_unbusy(&uq->uq_key);
2976
2977	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2978
2979	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2980		error = 0;
2981	else {
2982		umtxq_remove(uq);
2983		if (error == ERESTART)
2984			error = EINTR;
2985	}
2986	umtxq_unlock(&uq->uq_key);
2987	umtx_key_release(&uq->uq_key);
2988	return (error);
2989}
2990
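/*
 * Hedged userland sketch of the semaphore protocol that do_sem_wait()
 * above closes races for (the loop is illustrative; _count,
 * _has_waiters and UMTX_OP_SEM_WAIT are the real interface):
 *
 *	uint32_t c;
 *
 *	// Fast path: decrement a nonzero count without the kernel.
 *	while ((c = sem->_count) > 0)
 *		if (atomic_cmpset_acq_32(&sem->_count, c, c - 1))
 *			return (0);
 *	// Slow path: the kernel sets _has_waiters first and then
 *	// re-reads _count, so a post landing between the check above
 *	// and the sleep is never lost.
 *	return (_umtx_op(sem, UMTX_OP_SEM_WAIT, 0, NULL, tmo));
 */
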
2991/*
2992 * Wake up a waiter on a userland semaphore.
2993 */
2994static int
2995do_sem_wake(struct thread *td, struct _usem *sem)
2996{
2997	struct umtx_key key;
2998	int error, cnt;
2999	uint32_t flags;
3000
3001	flags = fuword32(&sem->_flags);
3002	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3003		return (error);
3004	umtxq_lock(&key);
3005	umtxq_busy(&key);
3006	cnt = umtxq_count(&key);
3007	if (cnt > 0) {
3008		umtxq_signal(&key, 1);
3009		/*
3010		 * A count greater than 0 means the memory is still being
3011		 * referenced by user code, so it is safe to update the
3012		 * _has_waiters flag; clear it if we woke the last waiter.
3013		 */
3014		if (cnt == 1) {
3015			umtxq_unlock(&key);
3016			error = suword32(
3017			    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
3018			umtxq_lock(&key);
3019		}
3020	}
3021	umtxq_unbusy(&key);
3022	umtxq_unlock(&key);
3023	umtx_key_release(&key);
3024	return (error);
3025}
3026
3027int
3028sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
3029    /* struct umtx *umtx */
3030{
3031	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
3032}
3033
3034int
3035sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
3036    /* struct umtx *umtx */
3037{
3038	return do_unlock_umtx(td, uap->umtx, td->td_tid);
3039}
3040
3041inline int
3042umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3043{
3044	int error;
3045
3046	error = copyin(addr, tsp, sizeof(struct timespec));
3047	if (error == 0) {
3048		if (tsp->tv_sec < 0 ||
3049		    tsp->tv_nsec >= 1000000000 ||
3050		    tsp->tv_nsec < 0)
3051			error = EINVAL;
3052	}
3053	return (error);
3054}
3055
3056static inline int
3057umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3058{
3059	int error;
3060
3061	if (size <= sizeof(struct timespec)) {
3062		tp->_clockid = CLOCK_REALTIME;
3063		tp->_flags = 0;
3064		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3065	} else
3066		error = copyin(addr, tp, sizeof(struct _umtx_time));
3067	if (error != 0)
3068		return (error);
3069	if (tp->_timeout.tv_sec < 0 ||
3070	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3071		return (EINVAL);
3072	return (0);
3073}
3074
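/*
 * Example of the size-tagged timeout convention decoded above (a
 * sketch; the ops really do pass the structure size through uaddr1,
 * see __umtx_op_wait() and friends below):
 *
 *	struct _umtx_time t = {
 *		._timeout = { .tv_sec = 1, .tv_nsec = 0 },
 *		._flags = 0,			// relative timeout
 *		._clockid = CLOCK_MONOTONIC,
 *	};
 *
 *	_umtx_op(obj, UMTX_OP_WAIT_UINT, val,
 *	    (void *)(uintptr_t)sizeof(t), &t);
 *
 * Passing sizeof(struct timespec) with a bare timespec instead selects
 * the legacy relative CLOCK_REALTIME interpretation.
 */
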
3075static int
3076__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
3077{
3078	struct timespec *ts, timeout;
3079	int error;
3080
3081	/* Allow a null timespec (wait forever). */
3082	if (uap->uaddr2 == NULL)
3083		ts = NULL;
3084	else {
3085		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3086		if (error != 0)
3087			return (error);
3088		ts = &timeout;
3089	}
3090	return (do_lock_umtx(td, uap->obj, uap->val, ts));
3091}
3092
3093static int
3094__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
3095{
3096	return (do_unlock_umtx(td, uap->obj, uap->val));
3097}
3098
3099static int
3100__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3101{
3102	struct _umtx_time timeout, *tm_p;
3103	int error;
3104
3105	if (uap->uaddr2 == NULL)
3106		tm_p = NULL;
3107	else {
3108		error = umtx_copyin_umtx_time(
3109		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3110		if (error != 0)
3111			return (error);
3112		tm_p = &timeout;
3113	}
3114	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3115}
3116
3117static int
3118__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3119{
3120	struct _umtx_time timeout, *tm_p;
3121	int error;
3122
3123	if (uap->uaddr2 == NULL)
3124		tm_p = NULL;
3125	else {
3126		error = umtx_copyin_umtx_time(
3127		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3128		if (error != 0)
3129			return (error);
3130		tm_p = &timeout;
3131	}
3132	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3133}
3134
3135static int
3136__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3137{
3138	struct _umtx_time *tm_p, timeout;
3139	int error;
3140
3141	if (uap->uaddr2 == NULL)
3142		tm_p = NULL;
3143	else {
3144		error = umtx_copyin_umtx_time(
3145		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3146		if (error != 0)
3147			return (error);
3148		tm_p = &timeout;
3149	}
3150	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3151}
3152
3153static int
3154__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3155{
3156	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3157}
3158
3159#define BATCH_SIZE	128
3160static int
3161__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3162{
3163	int count = uap->val;
3164	void *uaddrs[BATCH_SIZE];
3165	char **upp = (char **)uap->obj;
3166	int tocopy;
3167	int error = 0;
3168	int i, pos = 0;
3169
3170	while (count > 0) {
3171		tocopy = count;
3172		if (tocopy > BATCH_SIZE)
3173			tocopy = BATCH_SIZE;
3174		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3175		if (error != 0)
3176			break;
3177		for (i = 0; i < tocopy; ++i)
3178			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3179		count -= tocopy;
3180		pos += tocopy;
3181	}
3182	return (error);
3183}
3184
3185static int
3186__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3187{
3188	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3189}
3190
3191static int
3192__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3193{
3194	struct _umtx_time *tm_p, timeout;
3195	int error;
3196
3197	/* Allow a null timespec (wait forever). */
3198	if (uap->uaddr2 == NULL)
3199		tm_p = NULL;
3200	else {
3201		error = umtx_copyin_umtx_time(
3202		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3203		if (error != 0)
3204			return (error);
3205		tm_p = &timeout;
3206	}
3207	return do_lock_umutex(td, uap->obj, tm_p, 0);
3208}
3209
3210static int
3211__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3212{
3213	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3214}
3215
3216static int
3217__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3218{
3219	struct _umtx_time *tm_p, timeout;
3220	int error;
3221
3222	/* Allow a null timespec (wait forever). */
3223	if (uap->uaddr2 == NULL)
3224		tm_p = NULL;
3225	else {
3226		error = umtx_copyin_umtx_time(
3227		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3228		if (error != 0)
3229			return (error);
3230		tm_p = &timeout;
3231	}
3232	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3233}
3234
3235static int
3236__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3237{
3238	return do_wake_umutex(td, uap->obj);
3239}
3240
3241static int
3242__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3243{
3244	return do_unlock_umutex(td, uap->obj);
3245}
3246
3247static int
3248__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3249{
3250	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3251}
3252
3253static int
3254__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3255{
3256	struct timespec *ts, timeout;
3257	int error;
3258
3259	/* Allow a null timespec (wait forever). */
3260	if (uap->uaddr2 == NULL)
3261		ts = NULL;
3262	else {
3263		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3264		if (error != 0)
3265			return (error);
3266		ts = &timeout;
3267	}
3268	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3269}
3270
3271static int
3272__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3273{
3274	return do_cv_signal(td, uap->obj);
3275}
3276
3277static int
3278__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3279{
3280	return do_cv_broadcast(td, uap->obj);
3281}
3282
3283static int
3284__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3285{
3286	struct _umtx_time timeout;
3287	int error;
3288
3289	/* Allow a null timespec (wait forever). */
3290	if (uap->uaddr2 == NULL) {
3291		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3292	} else {
3293		error = umtx_copyin_umtx_time(uap->uaddr2,
3294		   (size_t)uap->uaddr1, &timeout);
3295		if (error != 0)
3296			return (error);
3297		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3298	}
3299	return (error);
3300}
3301
3302static int
3303__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3304{
3305	struct _umtx_time timeout;
3306	int error;
3307
3308	/* Allow a null timespec (wait forever). */
3309	if (uap->uaddr2 == NULL) {
3310		error = do_rw_wrlock(td, uap->obj, 0);
3311	} else {
3312		error = umtx_copyin_umtx_time(uap->uaddr2,
3313		   (size_t)uap->uaddr1, &timeout);
3314		if (error != 0)
3315			return (error);
3316
3317		error = do_rw_wrlock(td, uap->obj, &timeout);
3318	}
3319	return (error);
3320}
3321
3322static int
3323__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3324{
3325	return do_rw_unlock(td, uap->obj);
3326}
3327
3328static int
3329__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3330{
3331	struct _umtx_time *tm_p, timeout;
3332	int error;
3333
3334	/* Allow a null timespec (wait forever). */
3335	if (uap->uaddr2 == NULL)
3336		tm_p = NULL;
3337	else {
3338		error = umtx_copyin_umtx_time(
3339		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3340		if (error != 0)
3341			return (error);
3342		tm_p = &timeout;
3343	}
3344	return (do_sem_wait(td, uap->obj, tm_p));
3345}
3346
3347static int
3348__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3349{
3350	return do_sem_wake(td, uap->obj);
3351}
3352
3353static int
3354__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3355{
3356	return do_wake2_umutex(td, uap->obj, uap->val);
3357}
3358
3359typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3360
3361static _umtx_op_func op_table[] = {
3362	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3363	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3364	__umtx_op_wait,			/* UMTX_OP_WAIT */
3365	__umtx_op_wake,			/* UMTX_OP_WAKE */
3366	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3367	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3368	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3369	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3370	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3371	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3372	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3373	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3374	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3375	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3376	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3377	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3378	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3379	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3380	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3381	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3382	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3383	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3384	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3385};
3386
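/*
 * Note: op_table is indexed directly by the UMTX_OP_* constants, so
 * its entries must stay in exactly that numerical order;
 * sys__umtx_op() below only range-checks the index against
 * UMTX_OP_MAX.
 */
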
3387int
3388sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3389{
3390	if ((unsigned)uap->op < UMTX_OP_MAX)
3391		return (*op_table[uap->op])(td, uap);
3392	return (EINVAL);
3393}
3394
3395#ifdef COMPAT_FREEBSD32
3396int
3397freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3398    /* struct umtx *umtx */
3399{
3400	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3401}
3402
3403int
3404freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3405    /* struct umtx *umtx */
3406{
3407	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3408}
3409
3410struct timespec32 {
3411	int32_t tv_sec;
3412	int32_t tv_nsec;
3413};
3414
3415struct umtx_time32 {
3416	struct	timespec32	timeout;
3417	uint32_t		flags;
3418	uint32_t		clockid;
3419};
3420
3421static inline int
3422umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3423{
3424	struct timespec32 ts32;
3425	int error;
3426
3427	error = copyin(addr, &ts32, sizeof(struct timespec32));
3428	if (error == 0) {
3429		if (ts32.tv_sec < 0 ||
3430		    ts32.tv_nsec >= 1000000000 ||
3431		    ts32.tv_nsec < 0)
3432			error = EINVAL;
3433		else {
3434			tsp->tv_sec = ts32.tv_sec;
3435			tsp->tv_nsec = ts32.tv_nsec;
3436		}
3437	}
3438	return (error);
3439}
3440
3441static inline int
3442umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3443{
3444	struct umtx_time32 t32;
3445	int error;
3446
3447	t32.clockid = CLOCK_REALTIME;
3448	t32.flags   = 0;
3449	if (size <= sizeof(struct timespec32))
3450		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3451	else
3452		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3453	if (error != 0)
3454		return (error);
3455	if (t32.timeout.tv_sec < 0 ||
3456	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3457		return (EINVAL);
3458	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3459	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3460	tp->_flags = t32.flags;
3461	tp->_clockid = t32.clockid;
3462	return (0);
3463}
3464
3465static int
3466__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3467{
3468	struct timespec *ts, timeout;
3469	int error;
3470
3471	/* Allow a null timespec (wait forever). */
3472	if (uap->uaddr2 == NULL)
3473		ts = NULL;
3474	else {
3475		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3476		if (error != 0)
3477			return (error);
3478		ts = &timeout;
3479	}
3480	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3481}
3482
3483static int
3484__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3485{
3486	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3487}
3488
3489static int
3490__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3491{
3492	struct _umtx_time *tm_p, timeout;
3493	int error;
3494
3495	if (uap->uaddr2 == NULL)
3496		tm_p = NULL;
3497	else {
3498		error = umtx_copyin_umtx_time32(uap->uaddr2,
3499			(size_t)uap->uaddr1, &timeout);
3500		if (error != 0)
3501			return (error);
3502		tm_p = &timeout;
3503	}
3504	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3505}
3506
3507static int
3508__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3509{
3510	struct _umtx_time *tm_p, timeout;
3511	int error;
3512
3513	/* Allow a null timespec (wait forever). */
3514	if (uap->uaddr2 == NULL)
3515		tm_p = NULL;
3516	else {
3517		error = umtx_copyin_umtx_time32(uap->uaddr2,
3518			    (size_t)uap->uaddr1, &timeout);
3519		if (error != 0)
3520			return (error);
3521		tm_p = &timeout;
3522	}
3523	return do_lock_umutex(td, uap->obj, tm_p, 0);
3524}
3525
3526static int
3527__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3528{
3529	struct _umtx_time *tm_p, timeout;
3530	int error;
3531
3532	/* Allow a null timespec (wait forever). */
3533	if (uap->uaddr2 == NULL)
3534		tm_p = NULL;
3535	else {
3536		error = umtx_copyin_umtx_time32(uap->uaddr2,
3537		    (size_t)uap->uaddr1, &timeout);
3538		if (error != 0)
3539			return (error);
3540		tm_p = &timeout;
3541	}
3542	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3543}
3544
3545static int
3546__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3547{
3548	struct timespec *ts, timeout;
3549	int error;
3550
3551	/* Allow a null timespec (wait forever). */
3552	if (uap->uaddr2 == NULL)
3553		ts = NULL;
3554	else {
3555		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3556		if (error != 0)
3557			return (error);
3558		ts = &timeout;
3559	}
3560	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3561}
3562
3563static int
3564__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3565{
3566	struct _umtx_time timeout;
3567	int error;
3568
3569	/* Allow a null timespec (wait forever). */
3570	if (uap->uaddr2 == NULL) {
3571		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3572	} else {
3573		error = umtx_copyin_umtx_time32(uap->uaddr2,
3574		    (size_t)uap->uaddr1, &timeout);
3575		if (error != 0)
3576			return (error);
3577		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3578	}
3579	return (error);
3580}
3581
3582static int
3583__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3584{
3585	struct _umtx_time timeout;
3586	int error;
3587
3588	/* Allow a null timespec (wait forever). */
3589	if (uap->uaddr2 == NULL) {
3590		error = do_rw_wrlock(td, uap->obj, 0);
3591	} else {
3592		error = umtx_copyin_umtx_time32(uap->uaddr2,
3593		    (size_t)uap->uaddr1, &timeout);
3594		if (error != 0)
3595			return (error);
3596		error = do_rw_wrlock(td, uap->obj, &timeout);
3597	}
3598	return (error);
3599}
3600
3601static int
3602__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3603{
3604	struct _umtx_time *tm_p, timeout;
3605	int error;
3606
3607	if (uap->uaddr2 == NULL)
3608		tm_p = NULL;
3609	else {
3610		error = umtx_copyin_umtx_time32(
3611		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3612		if (error != 0)
3613			return (error);
3614		tm_p = &timeout;
3615	}
3616	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3617}
3618
3619static int
3620__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3621{
3622	struct _umtx_time *tm_p, timeout;
3623	int error;
3624
3625	/* Allow a null timespec (wait forever). */
3626	if (uap->uaddr2 == NULL)
3627		tm_p = NULL;
3628	else {
3629		error = umtx_copyin_umtx_time32(uap->uaddr2,
3630		    (size_t)uap->uaddr1, &timeout);
3631		if (error != 0)
3632			return (error);
3633		tm_p = &timeout;
3634	}
3635	return (do_sem_wait(td, uap->obj, tm_p));
3636}
3637
3638static int
3639__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3640{
3641	int count = uap->val;
3642	uint32_t uaddrs[BATCH_SIZE];
3643	uint32_t *upp = (uint32_t *)uap->obj;
3644	int tocopy;
3645	int error = 0;
3646	int i, pos = 0;
3647
3648	while (count > 0) {
3649		tocopy = count;
3650		if (tocopy > BATCH_SIZE)
3651			tocopy = BATCH_SIZE;
3652		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3653		if (error != 0)
3654			break;
3655		for (i = 0; i < tocopy; ++i)
3656			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3657				INT_MAX, 1);
3658		count -= tocopy;
3659		pos += tocopy;
3660	}
3661	return (error);
3662}
3663
3664static _umtx_op_func op_table_compat32[] = {
3665	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3666	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3667	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3668	__umtx_op_wake,			/* UMTX_OP_WAKE */
3669	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3670	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3671	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3672	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3673	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3674	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3675	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3676	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3677	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3678	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3679	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3680	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3681	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3682	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3683	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3684	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3685	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3686	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3687	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3688};
3689
3690int
3691freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3692{
3693	if ((unsigned)uap->op < UMTX_OP_MAX)
3694		return (*op_table_compat32[uap->op])(td,
3695			(struct _umtx_op_args *)uap);
3696	return (EINVAL);
3697}
3698#endif
3699
3700void
3701umtx_thread_init(struct thread *td)
3702{
3703	td->td_umtxq = umtxq_alloc();
3704	td->td_umtxq->uq_thread = td;
3705}
3706
3707void
3708umtx_thread_fini(struct thread *td)
3709{
3710	umtxq_free(td->td_umtxq);
3711}
3712
3713/*
3714 * Called when a new thread is created, e.g. by fork().
3715 */
3716void
3717umtx_thread_alloc(struct thread *td)
3718{
3719	struct umtx_q *uq;
3720
3721	uq = td->td_umtxq;
3722	uq->uq_inherited_pri = PRI_MAX;
3723
3724	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3725	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3726	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3727	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3728}
3729
3730/*
3731 * exec() hook.
3732 */
3733static void
3734umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3735	struct image_params *imgp __unused)
3736{
3737	umtx_thread_cleanup(curthread);
3738}
3739
3740/*
3741 * thread_exit() hook.
3742 */
3743void
3744umtx_thread_exit(struct thread *td)
3745{
3746	umtx_thread_cleanup(td);
3747}
3748
3749/*
3750 * Clean up the per-thread umtx data.
3751 */
3752static void
3753umtx_thread_cleanup(struct thread *td)
3754{
3755	struct umtx_q *uq;
3756	struct umtx_pi *pi;
3757
3758	if ((uq = td->td_umtxq) == NULL)
3759		return;
3760
3761	mtx_lock_spin(&umtx_lock);
3762	uq->uq_inherited_pri = PRI_MAX;
3763	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3764		pi->pi_owner = NULL;
3765		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3766	}
3767	mtx_unlock_spin(&umtx_lock);
3768	thread_lock(td);
3769	sched_lend_user_prio(td, PRI_MAX);
3770	thread_unlock(td);
3771}
3772