kern_umtx.c revision 278345
1/*-
2 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/10/sys/kern/kern_umtx.c 278345 2015-02-07 08:35:18Z kib $");
30
31#include "opt_compat.h"
32#include "opt_umtx_profiling.h"
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/limits.h>
37#include <sys/lock.h>
38#include <sys/malloc.h>
39#include <sys/mutex.h>
40#include <sys/priv.h>
41#include <sys/proc.h>
42#include <sys/sbuf.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/sysctl.h>
46#include <sys/sysent.h>
47#include <sys/systm.h>
48#include <sys/sysproto.h>
49#include <sys/syscallsubr.h>
50#include <sys/eventhandler.h>
51#include <sys/umtx.h>
52
53#include <vm/vm.h>
54#include <vm/vm_param.h>
55#include <vm/pmap.h>
56#include <vm/vm_map.h>
57#include <vm/vm_object.h>
58
59#include <machine/cpu.h>
60
61#ifdef COMPAT_FREEBSD32
62#include <compat/freebsd32/freebsd32_proto.h>
63#endif
64
65#define _UMUTEX_TRY		1
66#define _UMUTEX_WAIT		2
67
68#ifdef UMTX_PROFILING
69#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
70	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
71#endif
72
73/* Priority inheritance mutex info. */
74struct umtx_pi {
75	/* Owner thread */
76	struct thread		*pi_owner;
77
78	/* Reference count */
79	int			pi_refcount;
80
81 	/* List entry to link umtx objects held by a thread */
82	TAILQ_ENTRY(umtx_pi)	pi_link;
83
84	/* List entry in hash */
85	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
86
87	/* List for waiters */
88	TAILQ_HEAD(,umtx_q)	pi_blocked;
89
90	/* Identify a userland lock object */
91	struct umtx_key		pi_key;
92};
93
94/* A userland synchronization object user. */
95struct umtx_q {
96	/* Linked list for the hash. */
97	TAILQ_ENTRY(umtx_q)	uq_link;
98
99	/* Umtx key. */
100	struct umtx_key		uq_key;
101
102	/* Umtx flags. */
103	int			uq_flags;
104#define UQF_UMTXQ	0x0001
105
106	/* The thread that this entry belongs to (the waiter). */
107	struct thread		*uq_thread;
108
109	/*
110	 * The PI mutex this thread is blocked on.  Readers may hold
111	 * either the chain lock or umtx_lock; writers must hold both
112	 * the chain lock and umtx_lock.
113	 */
114	struct umtx_pi		*uq_pi_blocked;
115
116	/* On blocked list */
117	TAILQ_ENTRY(umtx_q)	uq_lockq;
118
119	/* PI mutexes owned by this thread that others contend for */
120	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
121
122	/* Inherited priority from PP mutex */
123	u_char			uq_inherited_pri;
124
125	/* Spare queue ready to be reused */
126	struct umtxq_queue	*uq_spare_queue;
127
128	/* The queue we are on */
129	struct umtxq_queue	*uq_cur_queue;
130};
131
132TAILQ_HEAD(umtxq_head, umtx_q);
133
134/* Per-key wait-queue */
135struct umtxq_queue {
136	struct umtxq_head	head;
137	struct umtx_key		key;
138	LIST_ENTRY(umtxq_queue)	link;
139	int			length;
140};
141
142LIST_HEAD(umtxq_list, umtxq_queue);
143
144/* Userland lock object's wait-queue chain */
145struct umtxq_chain {
146	/* Lock for this chain. */
147	struct mtx		uc_lock;
148
149	/* List of sleep queues. */
150	struct umtxq_list	uc_queue[2];
151#define UMTX_SHARED_QUEUE	0
152#define UMTX_EXCLUSIVE_QUEUE	1
153
154	LIST_HEAD(, umtxq_queue) uc_spare_queue;
155
156	/* Busy flag */
157	char			uc_busy;
158
159	/* Chain lock waiters */
160	int			uc_waiters;
161
162	/* All PI mutexes hashed to this chain */
163	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
164
165#ifdef UMTX_PROFILING
166	u_int 			length;
167	u_int			max_length;
168#endif
169};
170
171#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
172
173/*
174 * Don't propagate time-sharing priority; there is a security reason.
175 * A user could simply create a PI mutex, let thread A lock it, and
176 * let another thread B block on it.  Because B is sleeping, its
177 * priority would be boosted; priority propagation would then boost
178 * A's priority as well, and A's priority would never be lowered even
179 * while using 100% CPU, which is unfair to other processes.
180 */
181
182#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
183			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
184			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
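/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  It restates the UPRI() macro above as a function to make
 * the clamping visible: every time-sharing thread reports the worst
 * time-sharing priority, so sleeping can never boost it.
 */
#if 0
static u_char
upri_sketch(struct thread *td)
{
	/* Time-sharing threads are all treated as lowest priority. */
	if (td->td_user_pri >= PRI_MIN_TIMESHARE &&
	    td->td_user_pri <= PRI_MAX_TIMESHARE)
		return (PRI_MAX_TIMESHARE);
	/* Real-time and idle threads keep their true user priority. */
	return (td->td_user_pri);
}
#endif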
185
186#define	GOLDEN_RATIO_PRIME	2654404609U
187#define	UMTX_CHAINS		512
188#define	UMTX_SHIFTS		(__WORD_BIT - 9)
189
190#define	GET_SHARE(flags)	\
191    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
192
193#define BUSY_SPINS		200
194
195struct abs_timeout {
196	int clockid;
197	struct timespec cur;
198	struct timespec end;
199};
200
201static uma_zone_t		umtx_pi_zone;
202static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
203static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
204static int			umtx_pi_allocated;
205
206static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
207SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
208    &umtx_pi_allocated, 0, "Allocated umtx_pi");
209
210#ifdef UMTX_PROFILING
211static long max_length;
212SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
213static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
214#endif
215
216static void umtxq_sysinit(void *);
217static void umtxq_hash(struct umtx_key *key);
218static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
219static void umtxq_lock(struct umtx_key *key);
220static void umtxq_unlock(struct umtx_key *key);
221static void umtxq_busy(struct umtx_key *key);
222static void umtxq_unbusy(struct umtx_key *key);
223static void umtxq_insert_queue(struct umtx_q *uq, int q);
224static void umtxq_remove_queue(struct umtx_q *uq, int q);
225static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
226static int umtxq_count(struct umtx_key *key);
227static struct umtx_pi *umtx_pi_alloc(int);
228static void umtx_pi_free(struct umtx_pi *pi);
229static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
230static void umtx_thread_cleanup(struct thread *td);
231static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
232	struct image_params *imgp __unused);
233SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
234
235#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
236#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
237#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
238
239static struct mtx umtx_lock;
240
241#ifdef UMTX_PROFILING
242static void
243umtx_init_profiling(void)
244{
245	struct sysctl_oid *chain_oid;
246	char chain_name[10];
247	int i;
248
249	for (i = 0; i < UMTX_CHAINS; ++i) {
250		snprintf(chain_name, sizeof(chain_name), "%d", i);
251		chain_oid = SYSCTL_ADD_NODE(NULL,
252		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
253		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
254		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
255		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
256		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
257		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
258	}
259}
260
261static int
262sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
263{
264	char buf[512];
265	struct sbuf sb;
266	struct umtxq_chain *uc;
267	u_int fract, i, j, tot, whole;
268	u_int sf0, sf1, sf2, sf3, sf4;
269	u_int si0, si1, si2, si3, si4;
270	u_int sw0, sw1, sw2, sw3, sw4;
271
272	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
273	for (i = 0; i < 2; i++) {
274		tot = 0;
275		for (j = 0; j < UMTX_CHAINS; ++j) {
276			uc = &umtxq_chains[i][j];
277			mtx_lock(&uc->uc_lock);
278			tot += uc->max_length;
279			mtx_unlock(&uc->uc_lock);
280		}
281		if (tot == 0)
282			sbuf_printf(&sb, "%u) Empty ", i);
283		else {
284			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
285			si0 = si1 = si2 = si3 = si4 = 0;
286			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
287			for (j = 0; j < UMTX_CHAINS; j++) {
288				uc = &umtxq_chains[i][j];
289				mtx_lock(&uc->uc_lock);
290				whole = uc->max_length * 100;
291				mtx_unlock(&uc->uc_lock);
292				fract = (whole % tot) * 100;
293				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
294					sf0 = fract;
295					si0 = j;
296					sw0 = whole;
297				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
298				    sf1)) {
299					sf1 = fract;
300					si1 = j;
301					sw1 = whole;
302				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
303				    sf2)) {
304					sf2 = fract;
305					si2 = j;
306					sw2 = whole;
307				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
308				    sf3)) {
309					sf3 = fract;
310					si3 = j;
311					sw3 = whole;
312				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
313				    sf4)) {
314					sf4 = fract;
315					si4 = j;
316					sw4 = whole;
317				}
318			}
319			sbuf_printf(&sb, "queue %u:\n", i);
320			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
321			    sf0 / tot, si0);
322			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
323			    sf1 / tot, si1);
324			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
325			    sf2 / tot, si2);
326			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
327			    sf3 / tot, si3);
328			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
329			    sf4 / tot, si4);
330		}
331	}
332	sbuf_trim(&sb);
333	sbuf_finish(&sb);
334	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
335	sbuf_delete(&sb);
336	return (0);
337}
338
339static int
340sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
341{
342	struct umtxq_chain *uc;
343	u_int i, j;
344	int clear, error;
345
346	clear = 0;
347	error = sysctl_handle_int(oidp, &clear, 0, req);
348	if (error != 0 || req->newptr == NULL)
349		return (error);
350
351	if (clear != 0) {
352		for (i = 0; i < 2; ++i) {
353			for (j = 0; j < UMTX_CHAINS; ++j) {
354				uc = &umtxq_chains[i][j];
355				mtx_lock(&uc->uc_lock);
356				uc->length = 0;
357				uc->max_length = 0;
358				mtx_unlock(&uc->uc_lock);
359			}
360		}
361	}
362	return (0);
363}
364
365SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
366    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
367    sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
368SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
369    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
370    sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
371#endif
372
373static void
374umtxq_sysinit(void *arg __unused)
375{
376	int i, j;
377
378	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
379		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
380	for (i = 0; i < 2; ++i) {
381		for (j = 0; j < UMTX_CHAINS; ++j) {
382			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
383				 MTX_DEF | MTX_DUPOK);
384			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
385			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
386			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
387			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
388			umtxq_chains[i][j].uc_busy = 0;
389			umtxq_chains[i][j].uc_waiters = 0;
390#ifdef UMTX_PROFILING
391			umtxq_chains[i][j].length = 0;
392			umtxq_chains[i][j].max_length = 0;
393#endif
394		}
395	}
396#ifdef UMTX_PROFILING
397	umtx_init_profiling();
398#endif
399	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
400	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
401	    EVENTHANDLER_PRI_ANY);
402}
403
404struct umtx_q *
405umtxq_alloc(void)
406{
407	struct umtx_q *uq;
408
409	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
410	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
411	TAILQ_INIT(&uq->uq_spare_queue->head);
412	TAILQ_INIT(&uq->uq_pi_contested);
413	uq->uq_inherited_pri = PRI_MAX;
414	return (uq);
415}
416
417void
418umtxq_free(struct umtx_q *uq)
419{
420	MPASS(uq->uq_spare_queue != NULL);
421	free(uq->uq_spare_queue, M_UMTX);
422	free(uq, M_UMTX);
423}
424
425static inline void
426umtxq_hash(struct umtx_key *key)
427{
428	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
429	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
430}
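/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  It shows the multiplicative hash above applied to a
 * hypothetical key: multiply by a large prime, keep the high-order
 * bits (UMTX_SHIFTS), and reduce modulo UMTX_CHAINS.
 */
#if 0
static unsigned
umtxq_hash_sketch(void)
{
	/* Hypothetical key words; real keys come from umtx_key_get(). */
	unsigned n = (uintptr_t)0x800f1000 + 42;

	return (((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS);
}
#endif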
431
432static inline struct umtxq_chain *
433umtxq_getchain(struct umtx_key *key)
434{
435	if (key->type <= TYPE_SEM)
436		return (&umtxq_chains[1][key->hash]);
437	return (&umtxq_chains[0][key->hash]);
438}
439
440/*
441 * Lock a chain.
442 */
443static inline void
444umtxq_lock(struct umtx_key *key)
445{
446	struct umtxq_chain *uc;
447
448	uc = umtxq_getchain(key);
449	mtx_lock(&uc->uc_lock);
450}
451
452/*
453 * Unlock a chain.
454 */
455static inline void
456umtxq_unlock(struct umtx_key *key)
457{
458	struct umtxq_chain *uc;
459
460	uc = umtxq_getchain(key);
461	mtx_unlock(&uc->uc_lock);
462}
463
464/*
465 * Set the chain to the busy state when the following operation
466 * may block (a kernel mutex cannot be held across it).
467 */
468static inline void
469umtxq_busy(struct umtx_key *key)
470{
471	struct umtxq_chain *uc;
472
473	uc = umtxq_getchain(key);
474	mtx_assert(&uc->uc_lock, MA_OWNED);
475	if (uc->uc_busy) {
476#ifdef SMP
477		if (smp_cpus > 1) {
478			int count = BUSY_SPINS;
479			if (count > 0) {
480				umtxq_unlock(key);
481				while (uc->uc_busy && --count > 0)
482					cpu_spinwait();
483				umtxq_lock(key);
484			}
485		}
486#endif
487		while (uc->uc_busy) {
488			uc->uc_waiters++;
489			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
490			uc->uc_waiters--;
491		}
492	}
493	uc->uc_busy = 1;
494}
495
496/*
497 * Unbusy a chain.
498 */
499static inline void
500umtxq_unbusy(struct umtx_key *key)
501{
502	struct umtxq_chain *uc;
503
504	uc = umtxq_getchain(key);
505	mtx_assert(&uc->uc_lock, MA_OWNED);
506	KASSERT(uc->uc_busy != 0, ("not busy"));
507	uc->uc_busy = 0;
508	if (uc->uc_waiters)
509		wakeup_one(uc);
510}
511
512static inline void
513umtxq_unbusy_unlocked(struct umtx_key *key)
514{
515
516	umtxq_lock(key);
517	umtxq_unbusy(key);
518	umtxq_unlock(key);
519}
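/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  It shows the busy/unbusy bracket used throughout this
 * file: a chain is marked busy before an operation that may fault or
 * sleep, because uc_lock itself must not be held across such an
 * operation.
 */
#if 0
static void
umtxq_busy_sketch(struct umtx_key *key)
{
	umtxq_lock(key);
	umtxq_busy(key);	/* serialize against other sleepable ops */
	umtxq_unlock(key);

	/* ... a potentially faulting user-memory access goes here ... */

	umtxq_lock(key);
	umtxq_unbusy(key);	/* clear busy and wake one chain waiter */
	umtxq_unlock(key);
}
#endif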
520
521static struct umtxq_queue *
522umtxq_queue_lookup(struct umtx_key *key, int q)
523{
524	struct umtxq_queue *uh;
525	struct umtxq_chain *uc;
526
527	uc = umtxq_getchain(key);
528	UMTXQ_LOCKED_ASSERT(uc);
529	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
530		if (umtx_key_match(&uh->key, key))
531			return (uh);
532	}
533
534	return (NULL);
535}
536
537static inline void
538umtxq_insert_queue(struct umtx_q *uq, int q)
539{
540	struct umtxq_queue *uh;
541	struct umtxq_chain *uc;
542
543	uc = umtxq_getchain(&uq->uq_key);
544	UMTXQ_LOCKED_ASSERT(uc);
545	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
546	uh = umtxq_queue_lookup(&uq->uq_key, q);
547	if (uh != NULL) {
548		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
549	} else {
550		uh = uq->uq_spare_queue;
551		uh->key = uq->uq_key;
552		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
553#ifdef UMTX_PROFILING
554		uc->length++;
555		if (uc->length > uc->max_length) {
556			uc->max_length = uc->length;
557			if (uc->max_length > max_length)
558				max_length = uc->max_length;
559		}
560#endif
561	}
562	uq->uq_spare_queue = NULL;
563
564	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
565	uh->length++;
566	uq->uq_flags |= UQF_UMTXQ;
567	uq->uq_cur_queue = uh;
568	return;
569}
570
571static inline void
572umtxq_remove_queue(struct umtx_q *uq, int q)
573{
574	struct umtxq_chain *uc;
575	struct umtxq_queue *uh;
576
577	uc = umtxq_getchain(&uq->uq_key);
578	UMTXQ_LOCKED_ASSERT(uc);
579	if (uq->uq_flags & UQF_UMTXQ) {
580		uh = uq->uq_cur_queue;
581		TAILQ_REMOVE(&uh->head, uq, uq_link);
582		uh->length--;
583		uq->uq_flags &= ~UQF_UMTXQ;
584		if (TAILQ_EMPTY(&uh->head)) {
585			KASSERT(uh->length == 0,
586			    ("inconsistent umtxq_queue length"));
587#ifdef UMTX_PROFILING
588			uc->length--;
589#endif
590			LIST_REMOVE(uh, link);
591		} else {
592			uh = LIST_FIRST(&uc->uc_spare_queue);
593			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
594			LIST_REMOVE(uh, link);
595		}
596		uq->uq_spare_queue = uh;
597		uq->uq_cur_queue = NULL;
598	}
599}
600
601/*
602 * Return the number of waiters on the shared queue.
603 */
604static int
605umtxq_count(struct umtx_key *key)
606{
607	struct umtxq_chain *uc;
608	struct umtxq_queue *uh;
609
610	uc = umtxq_getchain(key);
611	UMTXQ_LOCKED_ASSERT(uc);
612	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
613	if (uh != NULL)
614		return (uh->length);
615	return (0);
616}
617
618/*
619 * Return the number of PI waiters and, via *first, the first
620 * waiter.
621 */
622static int
623umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
624{
625	struct umtxq_chain *uc;
626	struct umtxq_queue *uh;
627
628	*first = NULL;
629	uc = umtxq_getchain(key);
630	UMTXQ_LOCKED_ASSERT(uc);
631	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
632	if (uh != NULL) {
633		*first = TAILQ_FIRST(&uh->head);
634		return (uh->length);
635	}
636	return (0);
637}
638
639static int
640umtxq_check_susp(struct thread *td)
641{
642	struct proc *p;
643	int error;
644
645	/*
646	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
647	 * eventually break the lockstep loop.
648	 */
649	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
650		return (0);
651	error = 0;
652	p = td->td_proc;
653	PROC_LOCK(p);
654	if (P_SHOULDSTOP(p) ||
655	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
656		if (p->p_flag & P_SINGLE_EXIT)
657			error = EINTR;
658		else
659			error = ERESTART;
660	}
661	PROC_UNLOCK(p);
662	return (error);
663}
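/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  It shows the retry-loop shape in which the CAS-based
 * operations below call umtxq_check_susp(): without the check, two
 * threads in CAS lockstep could spin forever while a third thread is
 * trying to single-thread the process (e.g. for exit or exec).
 */
#if 0
static int
umtx_retry_sketch(struct thread *td)
{
	int error;

	for (;;) {
		/* ... a casueword32() attempt that may need a retry ... */
		error = umtxq_check_susp(td);
		if (error != 0)
			return (error);	/* let suspension or exit win */
	}
}
#endif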
664
665/*
666 * Wake up threads waiting on a userland object.
667 */
668
669static int
670umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
671{
672	struct umtxq_chain *uc;
673	struct umtxq_queue *uh;
674	struct umtx_q *uq;
675	int ret;
676
677	ret = 0;
678	uc = umtxq_getchain(key);
679	UMTXQ_LOCKED_ASSERT(uc);
680	uh = umtxq_queue_lookup(key, q);
681	if (uh != NULL) {
682		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
683			umtxq_remove_queue(uq, q);
684			wakeup(uq);
685			if (++ret >= n_wake)
686				return (ret);
687		}
688	}
689	return (ret);
690}
691
692
693/*
694 * Wake up specified thread.
695 */
696static inline void
697umtxq_signal_thread(struct umtx_q *uq)
698{
699	struct umtxq_chain *uc;
700
701	uc = umtxq_getchain(&uq->uq_key);
702	UMTXQ_LOCKED_ASSERT(uc);
703	umtxq_remove(uq);
704	wakeup(uq);
705}
706
707static inline int
708tstohz(const struct timespec *tsp)
709{
710	struct timeval tv;
711
712	TIMESPEC_TO_TIMEVAL(&tv, tsp);
713	return tvtohz(&tv);
714}
715
716static void
717abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
718	const struct timespec *timeout)
719{
720
721	timo->clockid = clockid;
722	if (!absolute) {
723		kern_clock_gettime(curthread, clockid, &timo->end);
724		timo->cur = timo->end;
725		timespecadd(&timo->end, timeout);
726	} else {
727		timo->end = *timeout;
728		kern_clock_gettime(curthread, clockid, &timo->cur);
729	}
730}
731
732static void
733abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
734{
735
736	abs_timeout_init(timo, umtxtime->_clockid,
737		(umtxtime->_flags & UMTX_ABSTIME) != 0,
738		&umtxtime->_timeout);
739}
740
741static inline void
742abs_timeout_update(struct abs_timeout *timo)
743{
744	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
745}
746
747static int
748abs_timeout_gethz(struct abs_timeout *timo)
749{
750	struct timespec tts;
751
752	if (timespeccmp(&timo->end, &timo->cur, <=))
753		return (-1);
754	tts = timo->end;
755	timespecsub(&tts, &timo->cur);
756	return (tstohz(&tts));
757}
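/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  It shows how the abs_timeout helpers are meant to wrap a
 * sleep loop: convert the remaining absolute deadline to ticks before
 * each sleep, and resample the clock afterwards.  The 1-second value
 * is hypothetical.
 */
#if 0
static int
abs_timeout_usage_sketch(void)
{
	struct abs_timeout timo;
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	int hz_left;

	/* A relative 1-second timeout against CLOCK_MONOTONIC. */
	abs_timeout_init(&timo, CLOCK_MONOTONIC, 0, &ts);
	for (;;) {
		hz_left = abs_timeout_gethz(&timo);
		if (hz_left < 0)
			return (ETIMEDOUT);	/* deadline has passed */
		/* ... sleep up to hz_left ticks here ... */
		abs_timeout_update(&timo);	/* resample the clock */
	}
}
#endif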
758
759/*
760 * Put the thread into a sleep state; before each sleep, check
761 * whether the thread has been removed from the umtx queue.
762 */
763static inline int
764umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
765{
766	struct umtxq_chain *uc;
767	int error, timo;
768
769	uc = umtxq_getchain(&uq->uq_key);
770	UMTXQ_LOCKED_ASSERT(uc);
771	for (;;) {
772		if (!(uq->uq_flags & UQF_UMTXQ))
773			return (0);
774		if (abstime != NULL) {
775			timo = abs_timeout_gethz(abstime);
776			if (timo < 0)
777				return (ETIMEDOUT);
778		} else
779			timo = 0;
780		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
781		if (error != EWOULDBLOCK) {
782			umtxq_lock(&uq->uq_key);
783			break;
784		}
785		if (abstime != NULL)
786			abs_timeout_update(abstime);
787		umtxq_lock(&uq->uq_key);
788	}
789	return (error);
790}
791
792/*
793 * Convert userspace address into unique logical address.
794 */
795int
796umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
797{
798	struct thread *td = curthread;
799	vm_map_t map;
800	vm_map_entry_t entry;
801	vm_pindex_t pindex;
802	vm_prot_t prot;
803	boolean_t wired;
804
805	key->type = type;
806	if (share == THREAD_SHARE) {
807		key->shared = 0;
808		key->info.private.vs = td->td_proc->p_vmspace;
809		key->info.private.addr = (uintptr_t)addr;
810	} else {
811		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
812		map = &td->td_proc->p_vmspace->vm_map;
813		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
814		    &entry, &key->info.shared.object, &pindex, &prot,
815		    &wired) != KERN_SUCCESS) {
816			return EFAULT;
817		}
818
819		if ((share == PROCESS_SHARE) ||
820		    (share == AUTO_SHARE &&
821		     VM_INHERIT_SHARE == entry->inheritance)) {
822			key->shared = 1;
823			key->info.shared.offset = entry->offset + entry->start -
824				(vm_offset_t)addr;
825			vm_object_reference(key->info.shared.object);
826		} else {
827			key->shared = 0;
828			key->info.private.vs = td->td_proc->p_vmspace;
829			key->info.private.addr = (uintptr_t)addr;
830		}
831		vm_map_lookup_done(map, entry);
832	}
833
834	umtxq_hash(key);
835	return (0);
836}
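/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  It shows what AUTO_SHARE resolves to: for memory inherited
 * as VM_INHERIT_SHARE the key is (object, offset), so two processes
 * mapping the same object derive the same key; otherwise the key is
 * (vmspace, address) and is private to the process.
 */
#if 0
static int
umtx_key_usage_sketch(void *uaddr)
{
	struct umtx_key key;
	int error;

	error = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE, &key);
	if (error != 0)
		return (error);
	/* ... hash/queue operations keyed by 'key' ... */
	umtx_key_release(&key);	/* drops the vm_object reference, if any */
	return (0);
}
#endif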
837
838/*
839 * Release key.
840 */
841void
842umtx_key_release(struct umtx_key *key)
843{
844	if (key->shared)
845		vm_object_deallocate(key->info.shared.object);
846}
847
848/*
849 * Lock a umtx object.
850 */
851static int
852do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
853	const struct timespec *timeout)
854{
855	struct abs_timeout timo;
856	struct umtx_q *uq;
857	u_long owner;
858	u_long old;
859	int error = 0;
860
861	uq = td->td_umtxq;
862	if (timeout != NULL)
863		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
864
865	/*
866	 * Care must be exercised when dealing with the umtx structure.  It
867	 * can fault on any access.
868	 */
869	for (;;) {
870		/*
871		 * Try the uncontested case.  This should be done in userland.
872		 */
873		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
874
875		/* The acquire succeeded. */
876		if (owner == UMTX_UNOWNED)
877			return (0);
878
879		/* The address was invalid. */
880		if (owner == -1)
881			return (EFAULT);
882
883		/* If no one owns it but it is contested, try to acquire it. */
884		if (owner == UMTX_CONTESTED) {
885			owner = casuword(&umtx->u_owner,
886			    UMTX_CONTESTED, id | UMTX_CONTESTED);
887
888			if (owner == UMTX_CONTESTED)
889				return (0);
890
891			/* The address was invalid. */
892			if (owner == -1)
893				return (EFAULT);
894
895			error = umtxq_check_susp(td);
896			if (error != 0)
897				break;
898
899			/* If this failed, the lock has changed; restart. */
900			continue;
901		}
902
903		/*
904		 * If we caught a signal, we have already retried and now
905		 * exit immediately.
906		 */
907		if (error != 0)
908			break;
909
910		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
911			AUTO_SHARE, &uq->uq_key)) != 0)
912			return (error);
913
914		umtxq_lock(&uq->uq_key);
915		umtxq_busy(&uq->uq_key);
916		umtxq_insert(uq);
917		umtxq_unbusy(&uq->uq_key);
918		umtxq_unlock(&uq->uq_key);
919
920		/*
921		 * Set the contested bit so that a release in user space
922		 * knows to use the system call for unlock.  If this fails,
923		 * either someone else has acquired the lock or it has been
924		 * released.
925		 */
926		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
927
928		/* The address was invalid. */
929		if (old == -1) {
930			umtxq_lock(&uq->uq_key);
931			umtxq_remove(uq);
932			umtxq_unlock(&uq->uq_key);
933			umtx_key_release(&uq->uq_key);
934			return (EFAULT);
935		}
936
937		/*
938		 * If we set the contested bit, sleep.  Otherwise the lock
939		 * changed and we need to retry, or we lost a race to the
940		 * thread unlocking the umtx.
941		 */
942		umtxq_lock(&uq->uq_key);
943		if (old == owner)
944			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
945			    &timo);
946		umtxq_remove(uq);
947		umtxq_unlock(&uq->uq_key);
948		umtx_key_release(&uq->uq_key);
949
950		if (error == 0)
951			error = umtxq_check_susp(td);
952	}
953
954	if (timeout == NULL) {
955		/* Mutex locking is restarted if it is interrupted. */
956		if (error == EINTR)
957			error = ERESTART;
958	} else {
959		/* Timed-locking is not restarted. */
960		if (error == ERESTART)
961			error = EINTR;
962	}
963	return (error);
964}
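/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  It shows the userland fast path this function assumes: a
 * library lock attempt performs the uncontested CAS itself and only
 * enters the kernel on contention.  umtx_lock_syscall() is a
 * hypothetical name for the slow-path system call wrapper.
 */
#if 0
static int
umtx_lock_fast_sketch(volatile u_long *owner, u_long id)
{
	if (atomic_cmpset_acq_long(owner, UMTX_UNOWNED, id))
		return (0);			/* acquired, no syscall */
	return (umtx_lock_syscall(owner, id));	/* hypothetical wrapper */
}
#endif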
965
966/*
967 * Unlock a umtx object.
968 */
969static int
970do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
971{
972	struct umtx_key key;
973	u_long owner;
974	u_long old;
975	int error;
976	int count;
977
978	/*
979	 * Make sure we own this mtx.
980	 */
981	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
982	if (owner == -1)
983		return (EFAULT);
984
985	if ((owner & ~UMTX_CONTESTED) != id)
986		return (EPERM);
987
988	/* This should be done in userland */
989	if ((owner & UMTX_CONTESTED) == 0) {
990		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
991		if (old == -1)
992			return (EFAULT);
993		if (old == owner)
994			return (0);
995		owner = old;
996	}
997
998	/* We should only ever be in here for contested locks */
999	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
1000		&key)) != 0)
1001		return (error);
1002
1003	umtxq_lock(&key);
1004	umtxq_busy(&key);
1005	count = umtxq_count(&key);
1006	umtxq_unlock(&key);
1007
1008	/*
1009	 * When unlocking the umtx, it must be marked as unowned if
1010	 * zero or only one thread is waiting for it.
1011	 * Otherwise, it must be marked as contested.
1012	 */
1013	old = casuword(&umtx->u_owner, owner,
1014		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
1015	umtxq_lock(&key);
1016	umtxq_signal(&key, 1);
1017	umtxq_unbusy(&key);
1018	umtxq_unlock(&key);
1019	umtx_key_release(&key);
1020	if (old == -1)
1021		return (EFAULT);
1022	if (old != owner)
1023		return (EINVAL);
1024	return (0);
1025}
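/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  The matching userland unlock fast path: an uncontested
 * unlock is a single release CAS; only a word with the contested bit
 * set needs the syscall, so that waiters get woken.
 * umtx_unlock_syscall() is a hypothetical name.
 */
#if 0
static int
umtx_unlock_fast_sketch(volatile u_long *owner, u_long id)
{
	if (atomic_cmpset_rel_long(owner, id, UMTX_UNOWNED))
		return (0);			/* nobody was waiting */
	return (umtx_unlock_syscall(owner, id));	/* hypothetical */
}
#endif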
1026
1027#ifdef COMPAT_FREEBSD32
1028
1029/*
1030 * Lock a umtx object.
1031 */
1032static int
1033do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
1034	const struct timespec *timeout)
1035{
1036	struct abs_timeout timo;
1037	struct umtx_q *uq;
1038	uint32_t owner;
1039	uint32_t old;
1040	int error = 0;
1041
1042	uq = td->td_umtxq;
1043
1044	if (timeout != NULL)
1045		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
1046
1047	/*
1048	 * Care must be exercised when dealing with the umtx structure.  It
1049	 * can fault on any access.
1050	 */
1051	for (;;) {
1052		/*
1053		 * Try the uncontested case.  This should be done in userland.
1054		 */
1055		owner = casuword32(m, UMUTEX_UNOWNED, id);
1056
1057		/* The acquire succeeded. */
1058		if (owner == UMUTEX_UNOWNED)
1059			return (0);
1060
1061		/* The address was invalid. */
1062		if (owner == -1)
1063			return (EFAULT);
1064
1065		/* If no one owns it but it is contested, try to acquire it. */
1066		if (owner == UMUTEX_CONTESTED) {
1067			owner = casuword32(m,
1068			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1069			if (owner == UMUTEX_CONTESTED)
1070				return (0);
1071
1072			/* The address was invalid. */
1073			if (owner == -1)
1074				return (EFAULT);
1075
1076			error = umtxq_check_susp(td);
1077			if (error != 0)
1078				break;
1079
1080			/* If this failed, the lock has changed; restart. */
1081			continue;
1082		}
1083
1084		/*
1085		 * If we caught a signal, we have already retried and now
1086		 * exit immediately.
1087		 */
1088		if (error != 0)
1089			return (error);
1090
1091		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
1092			AUTO_SHARE, &uq->uq_key)) != 0)
1093			return (error);
1094
1095		umtxq_lock(&uq->uq_key);
1096		umtxq_busy(&uq->uq_key);
1097		umtxq_insert(uq);
1098		umtxq_unbusy(&uq->uq_key);
1099		umtxq_unlock(&uq->uq_key);
1100
1101		/*
1102		 * Set the contested bit so that a release in user space
1103		 * knows to use the system call for unlock.  If this fails,
1104		 * either someone else has acquired the lock or it has been
1105		 * released.
1106		 */
1107		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
1108
1109		/* The address was invalid. */
1110		if (old == -1) {
1111			umtxq_lock(&uq->uq_key);
1112			umtxq_remove(uq);
1113			umtxq_unlock(&uq->uq_key);
1114			umtx_key_release(&uq->uq_key);
1115			return (EFAULT);
1116		}
1117
1118		/*
1119		 * If we set the contested bit, sleep.  Otherwise the lock
1120		 * changed and we need to retry, or we lost a race to the
1121		 * thread unlocking the umtx.
1122		 */
1123		umtxq_lock(&uq->uq_key);
1124		if (old == owner)
1125			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
1126			    NULL : &timo);
1127		umtxq_remove(uq);
1128		umtxq_unlock(&uq->uq_key);
1129		umtx_key_release(&uq->uq_key);
1130
1131		if (error == 0)
1132			error = umtxq_check_susp(td);
1133	}
1134
1135	if (timeout == NULL) {
1136		/* Mutex locking is restarted if it is interrupted. */
1137		if (error == EINTR)
1138			error = ERESTART;
1139	} else {
1140		/* Timed-locking is not restarted. */
1141		if (error == ERESTART)
1142			error = EINTR;
1143	}
1144	return (error);
1145}
1146
1147/*
1148 * Unlock a umtx object.
1149 */
1150static int
1151do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
1152{
1153	struct umtx_key key;
1154	uint32_t owner;
1155	uint32_t old;
1156	int error;
1157	int count;
1158
1159	/*
1160	 * Make sure we own this mtx.
1161	 */
1162	owner = fuword32(m);
1163	if (owner == -1)
1164		return (EFAULT);
1165
1166	if ((owner & ~UMUTEX_CONTESTED) != id)
1167		return (EPERM);
1168
1169	/* This should be done in userland */
1170	if ((owner & UMUTEX_CONTESTED) == 0) {
1171		old = casuword32(m, owner, UMUTEX_UNOWNED);
1172		if (old == -1)
1173			return (EFAULT);
1174		if (old == owner)
1175			return (0);
1176		owner = old;
1177	}
1178
1179	/* We should only ever be in here for contested locks */
1180	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
1181		&key)) != 0)
1182		return (error);
1183
1184	umtxq_lock(&key);
1185	umtxq_busy(&key);
1186	count = umtxq_count(&key);
1187	umtxq_unlock(&key);
1188
1189	/*
1190	 * When unlocking the umtx, it must be marked as unowned if
1191	 * zero or only one thread is waiting for it.
1192	 * Otherwise, it must be marked as contested.
1193	 */
1194	old = casuword32(m, owner,
1195		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1196	umtxq_lock(&key);
1197	umtxq_signal(&key, 1);
1198	umtxq_unbusy(&key);
1199	umtxq_unlock(&key);
1200	umtx_key_release(&key);
1201	if (old == -1)
1202		return (EFAULT);
1203	if (old != owner)
1204		return (EINVAL);
1205	return (0);
1206}
1207#endif
1208
1209/*
1210 * Fetch and compare value, sleep on the address if value is not changed.
1211 */
1212static int
1213do_wait(struct thread *td, void *addr, u_long id,
1214	struct _umtx_time *timeout, int compat32, int is_private)
1215{
1216	struct abs_timeout timo;
1217	struct umtx_q *uq;
1218	u_long tmp;
1219	uint32_t tmp32;
1220	int error = 0;
1221
1222	uq = td->td_umtxq;
1223	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1224		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1225		return (error);
1226
1227	if (timeout != NULL)
1228		abs_timeout_init2(&timo, timeout);
1229
1230	umtxq_lock(&uq->uq_key);
1231	umtxq_insert(uq);
1232	umtxq_unlock(&uq->uq_key);
1233	if (compat32 == 0) {
1234		error = fueword(addr, &tmp);
1235		if (error != 0)
1236			error = EFAULT;
1237	} else {
1238		error = fueword32(addr, &tmp32);
1239		if (error == 0)
1240			tmp = tmp32;
1241		else
1242			error = EFAULT;
1243	}
1244	umtxq_lock(&uq->uq_key);
1245	if (error == 0) {
1246		if (tmp == id)
1247			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
1248			    NULL : &timo);
1249		if ((uq->uq_flags & UQF_UMTXQ) == 0)
1250			error = 0;
1251		else
1252			umtxq_remove(uq);
1253	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
1254		umtxq_remove(uq);
1255	}
1256	umtxq_unlock(&uq->uq_key);
1257	umtx_key_release(&uq->uq_key);
1258	if (error == ERESTART)
1259		error = EINTR;
1260	return (error);
1261}
1262
1263/*
1264 * Wake up threads sleeping on the specified address.
1265 */
1266int
1267kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1268{
1269	struct umtx_key key;
1270	int ret;
1271
1272	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1273		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1274		return (ret);
1275	umtxq_lock(&key);
1276	ret = umtxq_signal(&key, n_wake);
1277	umtxq_unlock(&key);
1278	umtx_key_release(&key);
1279	return (0);
1280}
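/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  It shows the futex-style protocol built on do_wait() and
 * kern_umtx_wake(): a waiter sleeps only while the word still holds
 * the value it last saw, and a waker changes the word before waking.
 * umtx_wait()/umtx_wake() are hypothetical wrappers standing in for
 * the UMTX_OP_WAIT/UMTX_OP_WAKE paths of the _umtx_op() system call.
 */
#if 0
static void
umtx_waitwake_sketch(volatile u_long *state)
{
	/* Waiter: block while *state is still 0. */
	while (*state == 0)
		umtx_wait(state, 0);	/* hypothetical wrapper */

	/*
	 * Waker, in another thread: publish the new value first,
	 * then wake up to one waiter:
	 *
	 *	*state = 1;
	 *	umtx_wake(state, 1);
	 */
}
#endif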
1281
1282/*
1283 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1284 */
1285static int
1286do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1287	struct _umtx_time *timeout, int mode)
1288{
1289	struct abs_timeout timo;
1290	struct umtx_q *uq;
1291	uint32_t owner, old, id;
1292	int error, rv;
1293
1294	id = td->td_tid;
1295	uq = td->td_umtxq;
1296	error = 0;
1297	if (timeout != NULL)
1298		abs_timeout_init2(&timo, timeout);
1299
1300	/*
1301	 * Care must be exercised when dealing with the umtx structure.  It
1302	 * can fault on any access.
1303	 */
1304	for (;;) {
1305		rv = fueword32(&m->m_owner, &owner);
1306		if (rv == -1)
1307			return (EFAULT);
1308		if (mode == _UMUTEX_WAIT) {
1309			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1310				return (0);
1311		} else {
1312			/*
1313			 * Try the uncontested case.  This should be done in userland.
1314			 */
1315			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
1316			    &owner, id);
1317			/* The address was invalid. */
1318			if (rv == -1)
1319				return (EFAULT);
1320
1321			/* The acquire succeeded. */
1322			if (owner == UMUTEX_UNOWNED)
1323				return (0);
1324
1325			/* If no one owns it but it is contested, try to acquire it. */
1326			if (owner == UMUTEX_CONTESTED) {
1327				rv = casueword32(&m->m_owner,
1328				    UMUTEX_CONTESTED, &owner,
1329				    id | UMUTEX_CONTESTED);
1330				/* The address was invalid. */
1331				if (rv == -1)
1332					return (EFAULT);
1333
1334				if (owner == UMUTEX_CONTESTED)
1335					return (0);
1336
1337				rv = umtxq_check_susp(td);
1338				if (rv != 0)
1339					return (rv);
1340
1341				/* If this failed, the lock has changed; restart. */
1342				continue;
1343			}
1344		}
1345
1346		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1347		    (owner & ~UMUTEX_CONTESTED) == id)
1348			return (EDEADLK);
1349
1350		if (mode == _UMUTEX_TRY)
1351			return (EBUSY);
1352
1353		/*
1354		 * If we caught a signal, we have already retried and now
1355		 * exit immediately.
1356		 */
1357		if (error != 0)
1358			return (error);
1359
1360		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1361		    GET_SHARE(flags), &uq->uq_key)) != 0)
1362			return (error);
1363
1364		umtxq_lock(&uq->uq_key);
1365		umtxq_busy(&uq->uq_key);
1366		umtxq_insert(uq);
1367		umtxq_unlock(&uq->uq_key);
1368
1369		/*
1370		 * Set the contested bit so that a release in user space
1371		 * knows to use the system call for unlock.  If this fails,
1372		 * either someone else has acquired the lock or it has been
1373		 * released.
1374		 */
1375		rv = casueword32(&m->m_owner, owner, &old,
1376		    owner | UMUTEX_CONTESTED);
1377
1378		/* The address was invalid. */
1379		if (rv == -1) {
1380			umtxq_lock(&uq->uq_key);
1381			umtxq_remove(uq);
1382			umtxq_unbusy(&uq->uq_key);
1383			umtxq_unlock(&uq->uq_key);
1384			umtx_key_release(&uq->uq_key);
1385			return (EFAULT);
1386		}
1387
1388		/*
1389		 * If we set the contested bit, sleep.  Otherwise the lock
1390		 * changed and we need to retry, or we lost a race to the
1391		 * thread unlocking the umtx.
1392		 */
1393		umtxq_lock(&uq->uq_key);
1394		umtxq_unbusy(&uq->uq_key);
1395		if (old == owner)
1396			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1397			    NULL : &timo);
1398		umtxq_remove(uq);
1399		umtxq_unlock(&uq->uq_key);
1400		umtx_key_release(&uq->uq_key);
1401
1402		if (error == 0)
1403			error = umtxq_check_susp(td);
1404	}
1405
1406	return (0);
1407}
1408
1409/*
1410 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1411 */
1412static int
1413do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1414{
1415	struct umtx_key key;
1416	uint32_t owner, old, id;
1417	int error;
1418	int count;
1419
1420	id = td->td_tid;
1421	/*
1422	 * Make sure we own this mtx.
1423	 */
1424	error = fueword32(&m->m_owner, &owner);
1425	if (error == -1)
1426		return (EFAULT);
1427
1428	if ((owner & ~UMUTEX_CONTESTED) != id)
1429		return (EPERM);
1430
1431	if ((owner & UMUTEX_CONTESTED) == 0) {
1432		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1433		if (error == -1)
1434			return (EFAULT);
1435		if (old == owner)
1436			return (0);
1437		owner = old;
1438	}
1439
1440	/* We should only ever be in here for contested locks */
1441	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1442	    &key)) != 0)
1443		return (error);
1444
1445	umtxq_lock(&key);
1446	umtxq_busy(&key);
1447	count = umtxq_count(&key);
1448	umtxq_unlock(&key);
1449
1450	/*
1451	 * When unlocking the umtx, it must be marked as unowned if
1452	 * zero or only one thread is waiting for it.
1453	 * Otherwise, it must be marked as contested.
1454	 */
1455	error = casueword32(&m->m_owner, owner, &old,
1456	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1457	umtxq_lock(&key);
1458	umtxq_signal(&key, 1);
1459	umtxq_unbusy(&key);
1460	umtxq_unlock(&key);
1461	umtx_key_release(&key);
1462	if (error == -1)
1463		return (EFAULT);
1464	if (old != owner)
1465		return (EINVAL);
1466	return (0);
1467}
1468
1469/*
1470 * Check whether the mutex is available and wake up a waiter;
1471 * this applies only to a simple (normal) mutex.
1472 */
1473static int
1474do_wake_umutex(struct thread *td, struct umutex *m)
1475{
1476	struct umtx_key key;
1477	uint32_t owner;
1478	uint32_t flags;
1479	int error;
1480	int count;
1481
1482	error = fueword32(&m->m_owner, &owner);
1483	if (error == -1)
1484		return (EFAULT);
1485
1486	if ((owner & ~UMUTEX_CONTESTED) != 0)
1487		return (0);
1488
1489	error = fueword32(&m->m_flags, &flags);
1490	if (error == -1)
1491		return (EFAULT);
1492
1493	/* We should only ever be in here for contested locks */
1494	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1495	    &key)) != 0)
1496		return (error);
1497
1498	umtxq_lock(&key);
1499	umtxq_busy(&key);
1500	count = umtxq_count(&key);
1501	umtxq_unlock(&key);
1502
1503	if (count <= 1) {
1504		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1505		    UMUTEX_UNOWNED);
1506		if (error == -1)
1507			error = EFAULT;
1508	}
1509
1510	umtxq_lock(&key);
1511	if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1512		umtxq_signal(&key, 1);
1513	umtxq_unbusy(&key);
1514	umtxq_unlock(&key);
1515	umtx_key_release(&key);
1516	return (error);
1517}
1518
1519/*
1520 * Check whether the mutex has waiters and try to repair the contention bit.
1521 */
1522static int
1523do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1524{
1525	struct umtx_key key;
1526	uint32_t owner, old;
1527	int type;
1528	int error;
1529	int count;
1530
1531	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1532	case 0:
1533		type = TYPE_NORMAL_UMUTEX;
1534		break;
1535	case UMUTEX_PRIO_INHERIT:
1536		type = TYPE_PI_UMUTEX;
1537		break;
1538	case UMUTEX_PRIO_PROTECT:
1539		type = TYPE_PP_UMUTEX;
1540		break;
1541	default:
1542		return (EINVAL);
1543	}
1544	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1545	    &key)) != 0)
1546		return (error);
1547
1548	owner = 0;
1549	umtxq_lock(&key);
1550	umtxq_busy(&key);
1551	count = umtxq_count(&key);
1552	umtxq_unlock(&key);
1553	/*
1554	 * Only repair the contention bit if there is a waiter; that means
1555	 * the mutex is still being referenced by userland code.  Otherwise,
1556	 * don't update any memory.
1557	 */
1558	if (count > 1) {
1559		error = fueword32(&m->m_owner, &owner);
1560		if (error == -1)
1561			error = EFAULT;
1562		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1563			error = casueword32(&m->m_owner, owner, &old,
1564			    owner | UMUTEX_CONTESTED);
1565			if (error == -1) {
1566				error = EFAULT;
1567				break;
1568			}
1569			if (old == owner)
1570				break;
1571			owner = old;
1572			error = umtxq_check_susp(td);
1573			if (error != 0)
1574				break;
1575		}
1576	} else if (count == 1) {
1577		error = fueword32(&m->m_owner, &owner);
1578		if (error == -1)
1579			error = EFAULT;
1580		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1581		       (owner & UMUTEX_CONTESTED) == 0) {
1582			error = casueword32(&m->m_owner, owner, &old,
1583			    owner | UMUTEX_CONTESTED);
1584			if (error == -1) {
1585				error = EFAULT;
1586				break;
1587			}
1588			if (old == owner)
1589				break;
1590			owner = old;
1591			error = umtxq_check_susp(td);
1592			if (error != 0)
1593				break;
1594		}
1595	}
1596	umtxq_lock(&key);
1597	if (error == EFAULT) {
1598		umtxq_signal(&key, INT_MAX);
1599	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1600		umtxq_signal(&key, 1);
1601	umtxq_unbusy(&key);
1602	umtxq_unlock(&key);
1603	umtx_key_release(&key);
1604	return (error);
1605}
1606
1607static inline struct umtx_pi *
1608umtx_pi_alloc(int flags)
1609{
1610	struct umtx_pi *pi;
1611
1612	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1613	TAILQ_INIT(&pi->pi_blocked);
1614	atomic_add_int(&umtx_pi_allocated, 1);
1615	return (pi);
1616}
1617
1618static inline void
1619umtx_pi_free(struct umtx_pi *pi)
1620{
1621	uma_zfree(umtx_pi_zone, pi);
1622	atomic_add_int(&umtx_pi_allocated, -1);
1623}
1624
1625/*
1626 * Adjust the thread's position on a PI mutex's blocked list after
1627 * its priority has been changed.
1628 */
1629static int
1630umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1631{
1632	struct umtx_q *uq, *uq1, *uq2;
1633	struct thread *td1;
1634
1635	mtx_assert(&umtx_lock, MA_OWNED);
1636	if (pi == NULL)
1637		return (0);
1638
1639	uq = td->td_umtxq;
1640
1641	/*
1642	 * Check if the thread needs to be moved on the blocked chain.
1643	 * It needs to be moved if either its priority is lower than
1644	 * the previous thread's or higher than the next thread's.
1645	 */
1646	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1647	uq2 = TAILQ_NEXT(uq, uq_lockq);
1648	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1649	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1650		/*
1651		 * Remove thread from blocked chain and determine where
1652		 * it should be moved to.
1653		 */
1654		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1655		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1656			td1 = uq1->uq_thread;
1657			MPASS(td1->td_proc->p_magic == P_MAGIC);
1658			if (UPRI(td1) > UPRI(td))
1659				break;
1660		}
1661
1662		if (uq1 == NULL)
1663			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1664		else
1665			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1666	}
1667	return (1);
1668}
1669
1670static struct umtx_pi *
1671umtx_pi_next(struct umtx_pi *pi)
1672{
1673	struct umtx_q *uq_owner;
1674
1675	if (pi->pi_owner == NULL)
1676		return (NULL);
1677	uq_owner = pi->pi_owner->td_umtxq;
1678	if (uq_owner == NULL)
1679		return (NULL);
1680	return (uq_owner->uq_pi_blocked);
1681}
1682
1683/*
1684 * Floyd's Cycle-Finding Algorithm.
1685 */
1686static bool
1687umtx_pi_check_loop(struct umtx_pi *pi)
1688{
1689	struct umtx_pi *pi1;	/* fast iterator */
1690
1691	mtx_assert(&umtx_lock, MA_OWNED);
1692	if (pi == NULL)
1693		return (false);
1694	pi1 = pi;
1695	for (;;) {
1696		pi = umtx_pi_next(pi);
1697		if (pi == NULL)
1698			break;
1699		pi1 = umtx_pi_next(pi1);
1700		if (pi1 == NULL)
1701			break;
1702		pi1 = umtx_pi_next(pi1);
1703		if (pi1 == NULL)
1704			break;
1705		if (pi == pi1)
1706			return (true);
1707	}
1708	return (false);
1709}
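/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  The check above is the classic tortoise-and-hare cycle
 * test, restated here over an abstract next() step.  If the chain
 * (blocked thread -> PI mutex owner -> PI mutex that owner blocks
 * on -> ...) ever cycles, the fast iterator catches the slow one, so
 * a deadlocked ownership loop is detected in O(n) time and O(1)
 * space.  'struct pi_node' and next() are hypothetical.
 */
#if 0
static bool
pi_cycle_sketch(struct pi_node *n, struct pi_node *(*next)(struct pi_node *))
{
	struct pi_node *slow, *fast;

	slow = fast = n;
	while (fast != NULL && (fast = next(fast)) != NULL) {
		slow = next(slow);	/* one step per round */
		fast = next(fast);	/* two steps total per round */
		if (fast != NULL && slow == fast)
			return (true);	/* the iterators met: a cycle */
	}
	return (false);
}
#endif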
1710
1711/*
1712 * Propagate priority when a thread is blocked on a POSIX
1713 * PI mutex.
1714 */
1715static void
1716umtx_propagate_priority(struct thread *td)
1717{
1718	struct umtx_q *uq;
1719	struct umtx_pi *pi;
1720	int pri;
1721
1722	mtx_assert(&umtx_lock, MA_OWNED);
1723	pri = UPRI(td);
1724	uq = td->td_umtxq;
1725	pi = uq->uq_pi_blocked;
1726	if (pi == NULL)
1727		return;
1728	if (umtx_pi_check_loop(pi))
1729		return;
1730
1731	for (;;) {
1732		td = pi->pi_owner;
1733		if (td == NULL || td == curthread)
1734			return;
1735
1736		MPASS(td->td_proc != NULL);
1737		MPASS(td->td_proc->p_magic == P_MAGIC);
1738
1739		thread_lock(td);
1740		if (td->td_lend_user_pri > pri)
1741			sched_lend_user_prio(td, pri);
1742		else {
1743			thread_unlock(td);
1744			break;
1745		}
1746		thread_unlock(td);
1747
1748		/*
1749		 * Pick up the lock that td is blocked on.
1750		 */
1751		uq = td->td_umtxq;
1752		pi = uq->uq_pi_blocked;
1753		if (pi == NULL)
1754			break;
1755		/* Resort td on the list if needed. */
1756		umtx_pi_adjust_thread(pi, td);
1757	}
1758}
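/*
 * Editorial note: an illustrative sketch only, not part of revision
 * 278345.  It reduces the walk above to its skeleton: starting from
 * the blocked thread's priority, each owner along the chain of PI
 * mutexes is lent that priority whenever it improves (numerically
 * lowers) the owner's current lent priority.  Thread locking is
 * deliberately elided here.
 */
#if 0
static void
umtx_propagate_sketch(struct umtx_pi *pi, int pri)
{
	struct thread *owner;

	while (pi != NULL && (owner = pi->pi_owner) != NULL) {
		if (owner->td_lend_user_pri <= pri)
			break;			/* already as good or better */
		sched_lend_user_prio(owner, pri);
		pi = owner->td_umtxq->uq_pi_blocked;	/* follow the chain */
	}
}
#endif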
1759
1760/*
1761 * Unpropagate priority for a PI mutex when a thread blocked on
1762 * it is interrupted by a signal or resumed by another thread.
1763 */
1764static void
1765umtx_repropagate_priority(struct umtx_pi *pi)
1766{
1767	struct umtx_q *uq, *uq_owner;
1768	struct umtx_pi *pi2;
1769	int pri;
1770
1771	mtx_assert(&umtx_lock, MA_OWNED);
1772
1773	if (umtx_pi_check_loop(pi))
1774		return;
1775	while (pi != NULL && pi->pi_owner != NULL) {
1776		pri = PRI_MAX;
1777		uq_owner = pi->pi_owner->td_umtxq;
1778
1779		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1780			uq = TAILQ_FIRST(&pi2->pi_blocked);
1781			if (uq != NULL) {
1782				if (pri > UPRI(uq->uq_thread))
1783					pri = UPRI(uq->uq_thread);
1784			}
1785		}
1786
1787		if (pri > uq_owner->uq_inherited_pri)
1788			pri = uq_owner->uq_inherited_pri;
1789		thread_lock(pi->pi_owner);
1790		sched_lend_user_prio(pi->pi_owner, pri);
1791		thread_unlock(pi->pi_owner);
1792		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1793			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1794	}
1795}
1796
1797/*
1798 * Insert a PI mutex into its owner's contested list.
1799 */
1800static void
1801umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1802{
1803	struct umtx_q *uq_owner;
1804
1805	uq_owner = owner->td_umtxq;
1806	mtx_assert(&umtx_lock, MA_OWNED);
1807	if (pi->pi_owner != NULL)
1808		panic("pi_owner != NULL");
1809	pi->pi_owner = owner;
1810	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1811}
1812
1813/*
1814 * Claim ownership of a PI mutex.
1815 */
1816static int
1817umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1818{
1819	struct umtx_q *uq, *uq_owner;
1820
1821	uq_owner = owner->td_umtxq;
1822	mtx_lock_spin(&umtx_lock);
1823	if (pi->pi_owner == owner) {
1824		mtx_unlock_spin(&umtx_lock);
1825		return (0);
1826	}
1827
1828	if (pi->pi_owner != NULL) {
1829		/*
1830		 * Userland may have already messed up the mutex; sigh.
1831		 */
1832		mtx_unlock_spin(&umtx_lock);
1833		return (EPERM);
1834	}
1835	umtx_pi_setowner(pi, owner);
1836	uq = TAILQ_FIRST(&pi->pi_blocked);
1837	if (uq != NULL) {
1838		int pri;
1839
1840		pri = UPRI(uq->uq_thread);
1841		thread_lock(owner);
1842		if (pri < UPRI(owner))
1843			sched_lend_user_prio(owner, pri);
1844		thread_unlock(owner);
1845	}
1846	mtx_unlock_spin(&umtx_lock);
1847	return (0);
1848}
1849
1850/*
1851 * Adjust a thread's position on the blocked list of the PI mutex it
1852 * is blocked on; this may trigger a new round of priority propagation.
1853 */
1854void
1855umtx_pi_adjust(struct thread *td, u_char oldpri)
1856{
1857	struct umtx_q *uq;
1858	struct umtx_pi *pi;
1859
1860	uq = td->td_umtxq;
1861	mtx_lock_spin(&umtx_lock);
1862	/*
1863	 * Pick up the lock that td is blocked on.
1864	 */
1865	pi = uq->uq_pi_blocked;
1866	if (pi != NULL) {
1867		umtx_pi_adjust_thread(pi, td);
1868		umtx_repropagate_priority(pi);
1869	}
1870	mtx_unlock_spin(&umtx_lock);
1871}
1872
1873/*
1874 * Sleep on a PI mutex.
1875 */
1876static int
1877umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1878	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1879{
1880	struct umtxq_chain *uc;
1881	struct thread *td, *td1;
1882	struct umtx_q *uq1;
1883	int pri;
1884	int error = 0;
1885
1886	td = uq->uq_thread;
1887	KASSERT(td == curthread, ("inconsistent uq_thread"));
1888	uc = umtxq_getchain(&uq->uq_key);
1889	UMTXQ_LOCKED_ASSERT(uc);
1890	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1891	umtxq_insert(uq);
1892	mtx_lock_spin(&umtx_lock);
1893	if (pi->pi_owner == NULL) {
1894		mtx_unlock_spin(&umtx_lock);
1895		/* XXX Only look up thread in current process. */
1896		td1 = tdfind(owner, curproc->p_pid);
1897		mtx_lock_spin(&umtx_lock);
1898		if (td1 != NULL) {
1899			if (pi->pi_owner == NULL)
1900				umtx_pi_setowner(pi, td1);
1901			PROC_UNLOCK(td1->td_proc);
1902		}
1903	}
1904
1905	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1906		pri = UPRI(uq1->uq_thread);
1907		if (pri > UPRI(td))
1908			break;
1909	}
1910
1911	if (uq1 != NULL)
1912		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1913	else
1914		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1915
1916	uq->uq_pi_blocked = pi;
1917	thread_lock(td);
1918	td->td_flags |= TDF_UPIBLOCKED;
1919	thread_unlock(td);
1920	umtx_propagate_priority(td);
1921	mtx_unlock_spin(&umtx_lock);
1922	umtxq_unbusy(&uq->uq_key);
1923
1924	error = umtxq_sleep(uq, wmesg, timo);
1925	umtxq_remove(uq);
1926
1927	mtx_lock_spin(&umtx_lock);
1928	uq->uq_pi_blocked = NULL;
1929	thread_lock(td);
1930	td->td_flags &= ~TDF_UPIBLOCKED;
1931	thread_unlock(td);
1932	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1933	umtx_repropagate_priority(pi);
1934	mtx_unlock_spin(&umtx_lock);
1935	umtxq_unlock(&uq->uq_key);
1936
1937	return (error);
1938}
1939
1940/*
1941 * Increment the reference count of a PI mutex.
1942 */
1943static void
1944umtx_pi_ref(struct umtx_pi *pi)
1945{
1946	struct umtxq_chain *uc;
1947
1948	uc = umtxq_getchain(&pi->pi_key);
1949	UMTXQ_LOCKED_ASSERT(uc);
1950	pi->pi_refcount++;
1951}
1952
1953/*
1954 * Decrement the reference count of a PI mutex; when the count
1955 * drops to zero, its memory is freed.
1956 */
1957static void
1958umtx_pi_unref(struct umtx_pi *pi)
1959{
1960	struct umtxq_chain *uc;
1961
1962	uc = umtxq_getchain(&pi->pi_key);
1963	UMTXQ_LOCKED_ASSERT(uc);
1964	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1965	if (--pi->pi_refcount == 0) {
1966		mtx_lock_spin(&umtx_lock);
1967		if (pi->pi_owner != NULL) {
1968			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1969				pi, pi_link);
1970			pi->pi_owner = NULL;
1971		}
1972		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1973			("blocked queue not empty"));
1974		mtx_unlock_spin(&umtx_lock);
1975		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1976		umtx_pi_free(pi);
1977	}
1978}
1979
1980/*
1981 * Find a PI mutex in hash table.
1982 */
1983static struct umtx_pi *
1984umtx_pi_lookup(struct umtx_key *key)
1985{
1986	struct umtxq_chain *uc;
1987	struct umtx_pi *pi;
1988
1989	uc = umtxq_getchain(key);
1990	UMTXQ_LOCKED_ASSERT(uc);
1991
1992	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1993		if (umtx_key_match(&pi->pi_key, key)) {
1994			return (pi);
1995		}
1996	}
1997	return (NULL);
1998}
1999
2000/*
2001 * Insert a PI mutex into hash table.
2002 */
2003static inline void
2004umtx_pi_insert(struct umtx_pi *pi)
2005{
2006	struct umtxq_chain *uc;
2007
2008	uc = umtxq_getchain(&pi->pi_key);
2009	UMTXQ_LOCKED_ASSERT(uc);
2010	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
2011}
2012
2013/*
2014 * Lock a PI mutex.
2015 */
2016static int
2017do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
2018    struct _umtx_time *timeout, int try)
2019{
2020	struct abs_timeout timo;
2021	struct umtx_q *uq;
2022	struct umtx_pi *pi, *new_pi;
2023	uint32_t id, owner, old;
2024	int error, rv;
2025
2026	id = td->td_tid;
2027	uq = td->td_umtxq;
2028
2029	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
2030	    &uq->uq_key)) != 0)
2031		return (error);
2032
2033	if (timeout != NULL)
2034		abs_timeout_init2(&timo, timeout);
2035
2036	umtxq_lock(&uq->uq_key);
2037	pi = umtx_pi_lookup(&uq->uq_key);
2038	if (pi == NULL) {
2039		new_pi = umtx_pi_alloc(M_NOWAIT);
2040		if (new_pi == NULL) {
2041			umtxq_unlock(&uq->uq_key);
2042			new_pi = umtx_pi_alloc(M_WAITOK);
2043			umtxq_lock(&uq->uq_key);
2044			pi = umtx_pi_lookup(&uq->uq_key);
2045			if (pi != NULL) {
2046				umtx_pi_free(new_pi);
2047				new_pi = NULL;
2048			}
2049		}
2050		if (new_pi != NULL) {
2051			new_pi->pi_key = uq->uq_key;
2052			umtx_pi_insert(new_pi);
2053			pi = new_pi;
2054		}
2055	}
2056	umtx_pi_ref(pi);
2057	umtxq_unlock(&uq->uq_key);
2058
2059	/*
2060	 * Care must be exercised when dealing with the umtx structure.  It
2061	 * can fault on any access.
2062	 */
2063	for (;;) {
2064		/*
2065		 * Try the uncontested case.  This should be done in userland.
2066		 */
2067		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
2068		/* The address was invalid. */
2069		if (rv == -1) {
2070			error = EFAULT;
2071			break;
2072		}
2073
2074		/* The acquire succeeded. */
2075		if (owner == UMUTEX_UNOWNED) {
2076			error = 0;
2077			break;
2078		}
2079
2080		/* If no one owns it but it is contested, try to acquire it. */
2081		if (owner == UMUTEX_CONTESTED) {
2082			rv = casueword32(&m->m_owner,
2083			    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2084			/* The address was invalid. */
2085			if (rv == -1) {
2086				error = EFAULT;
2087				break;
2088			}
2089
2090			if (owner == UMUTEX_CONTESTED) {
2091				umtxq_lock(&uq->uq_key);
2092				umtxq_busy(&uq->uq_key);
2093				error = umtx_pi_claim(pi, td);
2094				umtxq_unbusy(&uq->uq_key);
2095				umtxq_unlock(&uq->uq_key);
2096				break;
2097			}
2098
2099			error = umtxq_check_susp(td);
2100			if (error != 0)
2101				break;
2102
2103			/* If this failed, the lock has changed; restart. */
2104			continue;
2105		}
2106
2107		if ((owner & ~UMUTEX_CONTESTED) == id) {
2108			error = EDEADLK;
2109			break;
2110		}
2111
2112		if (try != 0) {
2113			error = EBUSY;
2114			break;
2115		}
2116
2117		/*
2118		 * If we caught a signal, we have already retried and now
2119		 * exit immediately.
2120		 */
2121		if (error != 0)
2122			break;
2123
2124		umtxq_lock(&uq->uq_key);
2125		umtxq_busy(&uq->uq_key);
2126		umtxq_unlock(&uq->uq_key);
2127
2128		/*
2129		 * Set the contested bit so that a release in user space
2130		 * knows to use the system call for unlock.  If this fails,
2131		 * either someone else has acquired the lock or it has been
2132		 * released.
2133		 */
2134		rv = casueword32(&m->m_owner, owner, &old,
2135		    owner | UMUTEX_CONTESTED);
2136
2137		/* The address was invalid. */
2138		if (rv == -1) {
2139			umtxq_unbusy_unlocked(&uq->uq_key);
2140			error = EFAULT;
2141			break;
2142		}
2143
2144		umtxq_lock(&uq->uq_key);
2145		/*
2146		 * If we set the contested bit, sleep.  Otherwise the lock
2147		 * changed and we need to retry, or we lost a race to the
2148		 * thread unlocking the umtx.
2149		 */
2150		if (old == owner) {
2151			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
2152			    "umtxpi", timeout == NULL ? NULL : &timo);
2153			if (error != 0)
2154				continue;
2155		} else {
2156			umtxq_unbusy(&uq->uq_key);
2157			umtxq_unlock(&uq->uq_key);
2158		}
2159
2160		error = umtxq_check_susp(td);
2161		if (error != 0)
2162			break;
2163	}
2164
2165	umtxq_lock(&uq->uq_key);
2166	umtx_pi_unref(pi);
2167	umtxq_unlock(&uq->uq_key);
2168
2169	umtx_key_release(&uq->uq_key);
2170	return (error);
2171}
2172
2173/*
2174 * Unlock a PI mutex.
2175 */
2176static int
2177do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
2178{
2179	struct umtx_key key;
2180	struct umtx_q *uq_first, *uq_first2, *uq_me;
2181	struct umtx_pi *pi, *pi2;
2182	uint32_t owner, old, id;
2183	int error;
2184	int count;
2185	int pri;
2186
2187	id = td->td_tid;
2188	/*
2189	 * Make sure we own this mtx.
2190	 */
2191	error = fueword32(&m->m_owner, &owner);
2192	if (error == -1)
2193		return (EFAULT);
2194
2195	if ((owner & ~UMUTEX_CONTESTED) != id)
2196		return (EPERM);
2197
2198	/* This should be done in userland */
2199	if ((owner & UMUTEX_CONTESTED) == 0) {
2200		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
2201		if (error == -1)
2202			return (EFAULT);
2203		if (old == owner)
2204			return (0);
2205		owner = old;
2206	}
2207
2208	/* We should only ever be in here for contested locks */
2209	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
2210	    &key)) != 0)
2211		return (error);
2212
2213	umtxq_lock(&key);
2214	umtxq_busy(&key);
2215	count = umtxq_count_pi(&key, &uq_first);
2216	if (uq_first != NULL) {
2217		mtx_lock_spin(&umtx_lock);
2218		pi = uq_first->uq_pi_blocked;
2219		KASSERT(pi != NULL, ("pi == NULL?"));
2220		if (pi->pi_owner != curthread) {
2221			mtx_unlock_spin(&umtx_lock);
2222			umtxq_unbusy(&key);
2223			umtxq_unlock(&key);
2224			umtx_key_release(&key);
2225			/* userland messed up the mutex */
2226			return (EPERM);
2227		}
2228		uq_me = curthread->td_umtxq;
2229		pi->pi_owner = NULL;
2230		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
2231		/* Get the highest-priority thread which is still sleeping. */
2232		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2233		while (uq_first != NULL &&
2234		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2235			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2236		}
2237		pri = PRI_MAX;
2238		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2239			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2240			if (uq_first2 != NULL) {
2241				if (pri > UPRI(uq_first2->uq_thread))
2242					pri = UPRI(uq_first2->uq_thread);
2243			}
2244		}
2245		thread_lock(curthread);
2246		sched_lend_user_prio(curthread, pri);
2247		thread_unlock(curthread);
2248		mtx_unlock_spin(&umtx_lock);
2249		if (uq_first)
2250			umtxq_signal_thread(uq_first);
2251	}
2252	umtxq_unlock(&key);
2253
2254	/*
2255	 * When unlocking the umtx, it must be marked as unowned if
2256	 * zero or one thread is waiting for it; otherwise, it must
2257	 * be marked as contested.
2258	 */
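	/*
	 * count was sampled before the single wakeup above, so a count
	 * of zero or one means no waiter remains queued and the word
	 * may return to UMUTEX_UNOWNED; with more waiters it must stay
	 * UMUTEX_CONTESTED so the next unlock enters the kernel too.
	 */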
2259	error = casueword32(&m->m_owner, owner, &old,
2260	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
2261
2262	umtxq_unbusy_unlocked(&key);
2263	umtx_key_release(&key);
2264	if (error == -1)
2265		return (EFAULT);
2266	if (old != owner)
2267		return (EINVAL);
2268	return (0);
2269}
2270
2271/*
2272 * Lock a PP mutex.
2273 */
2274static int
2275do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2276    struct _umtx_time *timeout, int try)
2277{
2278	struct abs_timeout timo;
2279	struct umtx_q *uq, *uq2;
2280	struct umtx_pi *pi;
2281	uint32_t ceiling;
2282	uint32_t owner, id;
2283	int error, pri, old_inherited_pri, su, rv;
2284
2285	id = td->td_tid;
2286	uq = td->td_umtxq;
2287	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2288	    &uq->uq_key)) != 0)
2289		return (error);
2290
2291	if (timeout != NULL)
2292		abs_timeout_init2(&timo, timeout);
2293
2294	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2295	for (;;) {
2296		old_inherited_pri = uq->uq_inherited_pri;
2297		umtxq_lock(&uq->uq_key);
2298		umtxq_busy(&uq->uq_key);
2299		umtxq_unlock(&uq->uq_key);
2300
2301		rv = fueword32(&m->m_ceilings[0], &ceiling);
2302		if (rv == -1) {
2303			error = EFAULT;
2304			goto out;
2305		}
2306		ceiling = RTP_PRIO_MAX - ceiling;
2307		if (ceiling > RTP_PRIO_MAX) {
2308			error = EINVAL;
2309			goto out;
2310		}
2311
2312		mtx_lock_spin(&umtx_lock);
2313		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2314			mtx_unlock_spin(&umtx_lock);
2315			error = EINVAL;
2316			goto out;
2317		}
2318		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2319			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2320			thread_lock(td);
2321			if (uq->uq_inherited_pri < UPRI(td))
2322				sched_lend_user_prio(td, uq->uq_inherited_pri);
2323			thread_unlock(td);
2324		}
2325		mtx_unlock_spin(&umtx_lock);
2326
2327		rv = casueword32(&m->m_owner,
2328		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2329		/* The address was invalid. */
2330		if (rv == -1) {
2331			error = EFAULT;
2332			break;
2333		}
2334
2335		if (owner == UMUTEX_CONTESTED) {
2336			error = 0;
2337			break;
2338		}
2339
2340		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2341		    (owner & ~UMUTEX_CONTESTED) == id) {
2342			error = EDEADLK;
2343			break;
2344		}
2345
2346		if (try != 0) {
2347			error = EBUSY;
2348			break;
2349		}
2350
2351		/*
2352		 * If we caught a signal, we have already retried the
2353		 * lock and now exit immediately.
2354		 */
2355		if (error != 0)
2356			break;
2357
2358		umtxq_lock(&uq->uq_key);
2359		umtxq_insert(uq);
2360		umtxq_unbusy(&uq->uq_key);
2361		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2362		    NULL : &timo);
2363		umtxq_remove(uq);
2364		umtxq_unlock(&uq->uq_key);
2365
2366		mtx_lock_spin(&umtx_lock);
2367		uq->uq_inherited_pri = old_inherited_pri;
2368		pri = PRI_MAX;
2369		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2370			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2371			if (uq2 != NULL) {
2372				if (pri > UPRI(uq2->uq_thread))
2373					pri = UPRI(uq2->uq_thread);
2374			}
2375		}
2376		if (pri > uq->uq_inherited_pri)
2377			pri = uq->uq_inherited_pri;
2378		thread_lock(td);
2379		sched_lend_user_prio(td, pri);
2380		thread_unlock(td);
2381		mtx_unlock_spin(&umtx_lock);
2382	}
2383
2384	if (error != 0) {
2385		mtx_lock_spin(&umtx_lock);
2386		uq->uq_inherited_pri = old_inherited_pri;
2387		pri = PRI_MAX;
2388		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2389			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2390			if (uq2 != NULL) {
2391				if (pri > UPRI(uq2->uq_thread))
2392					pri = UPRI(uq2->uq_thread);
2393			}
2394		}
2395		if (pri > uq->uq_inherited_pri)
2396			pri = uq->uq_inherited_pri;
2397		thread_lock(td);
2398		sched_lend_user_prio(td, pri);
2399		thread_unlock(td);
2400		mtx_unlock_spin(&umtx_lock);
2401	}
2402
2403out:
2404	umtxq_unbusy_unlocked(&uq->uq_key);
2405	umtx_key_release(&uq->uq_key);
2406	return (error);
2407}
2408
2409/*
2410 * Unlock a PP mutex.
2411 */
2412static int
2413do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2414{
2415	struct umtx_key key;
2416	struct umtx_q *uq, *uq2;
2417	struct umtx_pi *pi;
2418	uint32_t owner, id;
2419	uint32_t rceiling;
2420	int error, pri, new_inherited_pri, su;
2421
2422	id = td->td_tid;
2423	uq = td->td_umtxq;
2424	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2425
2426	/*
2427	 * Make sure we own this mtx.
2428	 */
2429	error = fueword32(&m->m_owner, &owner);
2430	if (error == -1)
2431		return (EFAULT);
2432
2433	if ((owner & ~UMUTEX_CONTESTED) != id)
2434		return (EPERM);
2435
2436	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2437	if (error != 0)
2438		return (error);
2439
2440	if (rceiling == -1)
2441		new_inherited_pri = PRI_MAX;
2442	else {
2443		rceiling = RTP_PRIO_MAX - rceiling;
2444		if (rceiling > RTP_PRIO_MAX)
2445			return (EINVAL);
2446		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2447	}
2448
2449	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2450	    &key)) != 0)
2451		return (error);
2452	umtxq_lock(&key);
2453	umtxq_busy(&key);
2454	umtxq_unlock(&key);
2455	/*
2456	 * For a priority-protected mutex, always set the unlocked state
2457	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2458	 * to lock the mutex.  This is necessary because the thread
2459	 * priority has to be adjusted for such a mutex.
2460	 */
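	/*
	 * As a consequence, a PP mutex has no userland fast path at
	 * all: do_lock_pp() only ever CASes the owner word from
	 * UMUTEX_CONTESTED, so every lock and unlock passes through
	 * the kernel and the priority ceiling stays consistent.
	 */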
2461	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
2462
2463	umtxq_lock(&key);
2464	if (error == 0)
2465		umtxq_signal(&key, 1);
2466	umtxq_unbusy(&key);
2467	umtxq_unlock(&key);
2468
2469	if (error == -1)
2470		error = EFAULT;
2471	else {
2472		mtx_lock_spin(&umtx_lock);
2473		if (su != 0)
2474			uq->uq_inherited_pri = new_inherited_pri;
2475		pri = PRI_MAX;
2476		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2477			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2478			if (uq2 != NULL) {
2479				if (pri > UPRI(uq2->uq_thread))
2480					pri = UPRI(uq2->uq_thread);
2481			}
2482		}
2483		if (pri > uq->uq_inherited_pri)
2484			pri = uq->uq_inherited_pri;
2485		thread_lock(td);
2486		sched_lend_user_prio(td, pri);
2487		thread_unlock(td);
2488		mtx_unlock_spin(&umtx_lock);
2489	}
2490	umtx_key_release(&key);
2491	return (error);
2492}
2493
2494static int
2495do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2496	uint32_t *old_ceiling)
2497{
2498	struct umtx_q *uq;
2499	uint32_t save_ceiling;
2500	uint32_t owner, id;
2501	uint32_t flags;
2502	int error, rv;
2503
2504	error = fueword32(&m->m_flags, &flags);
2505	if (error == -1)
2506		return (EFAULT);
2507	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2508		return (EINVAL);
2509	if (ceiling > RTP_PRIO_MAX)
2510		return (EINVAL);
2511	id = td->td_tid;
2512	uq = td->td_umtxq;
2513	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2514	   &uq->uq_key)) != 0)
2515		return (error);
2516	for (;;) {
2517		umtxq_lock(&uq->uq_key);
2518		umtxq_busy(&uq->uq_key);
2519		umtxq_unlock(&uq->uq_key);
2520
2521		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2522		if (rv == -1) {
2523			error = EFAULT;
2524			break;
2525		}
2526
2527		rv = casueword32(&m->m_owner,
2528		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2529		if (rv == -1) {
2530			error = EFAULT;
2531			break;
2532		}
2533
2534		if (owner == UMUTEX_CONTESTED) {
2535			suword32(&m->m_ceilings[0], ceiling);
2536			suword32(&m->m_owner, UMUTEX_CONTESTED);
2537			error = 0;
2538			break;
2539		}
2540
2541		if ((owner & ~UMUTEX_CONTESTED) == id) {
2542			suword32(&m->m_ceilings[0], ceiling);
2543			error = 0;
2544			break;
2545		}
2546
2547		/*
2548		 * If we caught a signal, we have already retried the
2549		 * lock and now exit immediately.
2550		 */
2551		if (error != 0)
2552			break;
2553
2554		/*
2555		 * We set the contested bit, sleep. Otherwise the lock changed
2556		 * and we need to retry or we lost a race to the thread
2557		 * unlocking the umtx.
2558		 */
2559		umtxq_lock(&uq->uq_key);
2560		umtxq_insert(uq);
2561		umtxq_unbusy(&uq->uq_key);
2562		error = umtxq_sleep(uq, "umtxpp", NULL);
2563		umtxq_remove(uq);
2564		umtxq_unlock(&uq->uq_key);
2565	}
2566	umtxq_lock(&uq->uq_key);
2567	if (error == 0)
2568		umtxq_signal(&uq->uq_key, INT_MAX);
2569	umtxq_unbusy(&uq->uq_key);
2570	umtxq_unlock(&uq->uq_key);
2571	umtx_key_release(&uq->uq_key);
2572	if (error == 0 && old_ceiling != NULL)
2573		suword32(old_ceiling, save_ceiling);
2574	return (error);
2575}
2576
2577/*
2578 * Lock a userland POSIX mutex.
2579 */
2580static int
2581do_lock_umutex(struct thread *td, struct umutex *m,
2582    struct _umtx_time *timeout, int mode)
2583{
2584	uint32_t flags;
2585	int error;
2586
2587	error = fueword32(&m->m_flags, &flags);
2588	if (error == -1)
2589		return (EFAULT);
2590
2591	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2592	case 0:
2593		error = do_lock_normal(td, m, flags, timeout, mode);
2594		break;
2595	case UMUTEX_PRIO_INHERIT:
2596		error = do_lock_pi(td, m, flags, timeout, mode);
2597		break;
2598	case UMUTEX_PRIO_PROTECT:
2599		error = do_lock_pp(td, m, flags, timeout, mode);
2600		break;
2601	default:
2602		return (EINVAL);
2603	}
2604	if (timeout == NULL) {
2605		if (error == EINTR && mode != _UMUTEX_WAIT)
2606			error = ERESTART;
2607	} else {
2608		/* Timed-locking is not restarted. */
2609		if (error == ERESTART)
2610			error = EINTR;
2611	}
2612	return (error);
2613}
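/*
 * A hypothetical userland call, shown only to illustrate the argument
 * plumbing (the real wrappers live in libthr): a timed lock passes the
 * structure size through uaddr1 and the structure itself through
 * uaddr2, matching umtx_copyin_umtx_time() below:
 *
 *	struct _umtx_time tmo = { ... };
 *	_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0,
 *	    (void *)(uintptr_t)sizeof(tmo), &tmo);
 */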
2614
2615/*
2616 * Unlock a userland POSIX mutex.
2617 */
2618static int
2619do_unlock_umutex(struct thread *td, struct umutex *m)
2620{
2621	uint32_t flags;
2622	int error;
2623
2624	error = fueword32(&m->m_flags, &flags);
2625	if (error == -1)
2626		return (EFAULT);
2627
2628	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2629	case 0:
2630		return (do_unlock_normal(td, m, flags));
2631	case UMUTEX_PRIO_INHERIT:
2632		return (do_unlock_pi(td, m, flags));
2633	case UMUTEX_PRIO_PROTECT:
2634		return (do_unlock_pp(td, m, flags));
2635	}
2636
2637	return (EINVAL);
2638}
2639
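/*
 * Wait on a userland condition variable.  The thread is queued and the
 * chain busied before the mutex is unlocked, so a signal sent between
 * the unlock and the sleep cannot be lost.
 */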
2640static int
2641do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2642	struct timespec *timeout, u_long wflags)
2643{
2644	struct abs_timeout timo;
2645	struct umtx_q *uq;
2646	uint32_t flags, clockid, hasw;
2647	int error;
2648
2649	uq = td->td_umtxq;
2650	error = fueword32(&cv->c_flags, &flags);
2651	if (error == -1)
2652		return (EFAULT);
2653	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2654	if (error != 0)
2655		return (error);
2656
2657	if ((wflags & CVWAIT_CLOCKID) != 0) {
2658		error = fueword32(&cv->c_clockid, &clockid);
2659		if (error == -1) {
2660			umtx_key_release(&uq->uq_key);
2661			return (EFAULT);
2662		}
2663		if (clockid < CLOCK_REALTIME ||
2664		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2665			/* Only the predefined system clocks are accepted. */
2666			umtx_key_release(&uq->uq_key);
2667			return (EINVAL);
2668		}
2669	} else {
2670		clockid = CLOCK_REALTIME;
2671	}
2672
2673	umtxq_lock(&uq->uq_key);
2674	umtxq_busy(&uq->uq_key);
2675	umtxq_insert(uq);
2676	umtxq_unlock(&uq->uq_key);
2677
2678	/*
2679	 * Set c_has_waiters to 1 before releasing the user mutex; avoid
2680	 * dirtying the cache line when the flag is already set.
2681	 */
2682	error = fueword32(&cv->c_has_waiters, &hasw);
2683	if (error == 0 && hasw == 0)
2684		suword32(&cv->c_has_waiters, 1);
2685
2686	umtxq_unbusy_unlocked(&uq->uq_key);
2687
2688	error = do_unlock_umutex(td, m);
2689
2690	if (timeout != NULL)
2691		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2692			timeout);
2693
2694	umtxq_lock(&uq->uq_key);
2695	if (error == 0) {
2696		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2697		    NULL : &timo);
2698	}
2699
2700	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2701		error = 0;
2702	else {
2703		/*
2704		 * This must be a timeout, an interruption by a signal, or
2705		 * a spurious wakeup; clear the c_has_waiters flag when
2706		 * necessary.
2707		 */
2708		umtxq_busy(&uq->uq_key);
2709		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2710			int oldlen = uq->uq_cur_queue->length;
2711			umtxq_remove(uq);
2712			if (oldlen == 1) {
2713				umtxq_unlock(&uq->uq_key);
2714				suword32(&cv->c_has_waiters, 0);
2715				umtxq_lock(&uq->uq_key);
2716			}
2717		}
2718		umtxq_unbusy(&uq->uq_key);
2719		if (error == ERESTART)
2720			error = EINTR;
2721	}
2722
2723	umtxq_unlock(&uq->uq_key);
2724	umtx_key_release(&uq->uq_key);
2725	return (error);
2726}
2727
2728/*
2729 * Signal a userland condition variable.
2730 */
2731static int
2732do_cv_signal(struct thread *td, struct ucond *cv)
2733{
2734	struct umtx_key key;
2735	int error, cnt, nwake;
2736	uint32_t flags;
2737
2738	error = fueword32(&cv->c_flags, &flags);
2739	if (error == -1)
2740		return (EFAULT);
2741	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2742		return (error);
2743	umtxq_lock(&key);
2744	umtxq_busy(&key);
2745	cnt = umtxq_count(&key);
2746	nwake = umtxq_signal(&key, 1);
2747	if (cnt <= nwake) {
2748		umtxq_unlock(&key);
2749		error = suword32(&cv->c_has_waiters, 0);
2750		if (error == -1)
2751			error = EFAULT;
2752		umtxq_lock(&key);
2753	}
2754	umtxq_unbusy(&key);
2755	umtxq_unlock(&key);
2756	umtx_key_release(&key);
2757	return (error);
2758}
2759
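/*
 * Broadcast a userland condition variable, waking all waiters.
 */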
2760static int
2761do_cv_broadcast(struct thread *td, struct ucond *cv)
2762{
2763	struct umtx_key key;
2764	int error;
2765	uint32_t flags;
2766
2767	error = fueword32(&cv->c_flags, &flags);
2768	if (error == -1)
2769		return (EFAULT);
2770	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2771		return (error);
2772
2773	umtxq_lock(&key);
2774	umtxq_busy(&key);
2775	umtxq_signal(&key, INT_MAX);
2776	umtxq_unlock(&key);
2777
2778	error = suword32(&cv->c_has_waiters, 0);
2779	if (error == -1)
2780		error = EFAULT;
2781
2782	umtxq_unbusy_unlocked(&key);
2783
2784	umtx_key_release(&key);
2785	return (error);
2786}
2787
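/*
 * Lock a userland rwlock for reading.  The rw_state word packs the
 * reader count in its low bits together with the URWLOCK_WRITE_OWNER,
 * URWLOCK_WRITE_WAITERS and URWLOCK_READ_WAITERS bits; unless reader
 * preference was requested, readers also yield to waiting writers.
 */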
2788static int
2789do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2790{
2791	struct abs_timeout timo;
2792	struct umtx_q *uq;
2793	uint32_t flags, wrflags;
2794	int32_t state, oldstate;
2795	int32_t blocked_readers;
2796	int error, rv;
2797
2798	uq = td->td_umtxq;
2799	error = fueword32(&rwlock->rw_flags, &flags);
2800	if (error == -1)
2801		return (EFAULT);
2802	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2803	if (error != 0)
2804		return (error);
2805
2806	if (timeout != NULL)
2807		abs_timeout_init2(&timo, timeout);
2808
2809	wrflags = URWLOCK_WRITE_OWNER;
2810	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2811		wrflags |= URWLOCK_WRITE_WAITERS;
2812
2813	for (;;) {
2814		rv = fueword32(&rwlock->rw_state, &state);
2815		if (rv == -1) {
2816			umtx_key_release(&uq->uq_key);
2817			return (EFAULT);
2818		}
2819
2820		/* try to lock it */
2821		while (!(state & wrflags)) {
2822			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2823				umtx_key_release(&uq->uq_key);
2824				return (EAGAIN);
2825			}
2826			rv = casueword32(&rwlock->rw_state, state,
2827			    &oldstate, state + 1);
2828			if (rv == -1) {
2829				umtx_key_release(&uq->uq_key);
2830				return (EFAULT);
2831			}
2832			if (oldstate == state) {
2833				umtx_key_release(&uq->uq_key);
2834				return (0);
2835			}
2836			error = umtxq_check_susp(td);
2837			if (error != 0)
2838				break;
2839			state = oldstate;
2840		}
2841
2842		if (error)
2843			break;
2844
2845		/* grab monitor lock */
2846		umtxq_lock(&uq->uq_key);
2847		umtxq_busy(&uq->uq_key);
2848		umtxq_unlock(&uq->uq_key);
2849
2850		/*
2851		 * re-read the state, in case it changed between the try-lock above
2852		 * and the check below
2853		 */
2854		rv = fueword32(&rwlock->rw_state, &state);
2855		if (rv == -1)
2856			error = EFAULT;
2857
2858		/* set read contention bit */
2859		while (error == 0 && (state & wrflags) &&
2860		    !(state & URWLOCK_READ_WAITERS)) {
2861			rv = casueword32(&rwlock->rw_state, state,
2862			    &oldstate, state | URWLOCK_READ_WAITERS);
2863			if (rv == -1) {
2864				error = EFAULT;
2865				break;
2866			}
2867			if (oldstate == state)
2868				goto sleep;
2869			state = oldstate;
2870			error = umtxq_check_susp(td);
2871			if (error != 0)
2872				break;
2873		}
2874		if (error != 0) {
2875			umtxq_unbusy_unlocked(&uq->uq_key);
2876			break;
2877		}
2878
2879		/* The state changed while we were setting flags; restart. */
2880		if (!(state & wrflags)) {
2881			umtxq_unbusy_unlocked(&uq->uq_key);
2882			error = umtxq_check_susp(td);
2883			if (error != 0)
2884				break;
2885			continue;
2886		}
2887
2888sleep:
2889		/* Contention bit is set; increase the read waiter count before sleeping. */
2890		rv = fueword32(&rwlock->rw_blocked_readers,
2891		    &blocked_readers);
2892		if (rv == -1) {
2893			umtxq_unbusy_unlocked(&uq->uq_key);
2894			error = EFAULT;
2895			break;
2896		}
2897		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2898
2899		while (state & wrflags) {
2900			umtxq_lock(&uq->uq_key);
2901			umtxq_insert(uq);
2902			umtxq_unbusy(&uq->uq_key);
2903
2904			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2905			    NULL : &timo);
2906
2907			umtxq_busy(&uq->uq_key);
2908			umtxq_remove(uq);
2909			umtxq_unlock(&uq->uq_key);
2910			if (error)
2911				break;
2912			rv = fueword32(&rwlock->rw_state, &state);
2913			if (rv == -1) {
2914				error = EFAULT;
2915				break;
2916			}
2917		}
2918
2919		/* Decrease the read waiter count and possibly clear the read contention bit. */
2920		rv = fueword32(&rwlock->rw_blocked_readers,
2921		    &blocked_readers);
2922		if (rv == -1) {
2923			umtxq_unbusy_unlocked(&uq->uq_key);
2924			error = EFAULT;
2925			break;
2926		}
2927		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2928		if (blocked_readers == 1) {
2929			rv = fueword32(&rwlock->rw_state, &state);
2930			if (rv == -1)
2931				error = EFAULT;
2932			while (error == 0) {
2933				rv = casueword32(&rwlock->rw_state, state,
2934				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2935				if (rv == -1) {
2936					error = EFAULT;
2937					break;
2938				}
2939				if (oldstate == state)
2940					break;
2941				state = oldstate;
2942				error = umtxq_check_susp(td);
2943			}
2944		}
2945
2946		umtxq_unbusy_unlocked(&uq->uq_key);
2947		if (error != 0)
2948			break;
2949	}
2950	umtx_key_release(&uq->uq_key);
2951	if (error == ERESTART)
2952		error = EINTR;
2953	return (error);
2954}
2955
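/*
 * Lock a userland rwlock for writing.  The lock can be taken only when
 * there is no write owner and the reader count is zero.
 */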
2956static int
2957do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2958{
2959	struct abs_timeout timo;
2960	struct umtx_q *uq;
2961	uint32_t flags;
2962	int32_t state, oldstate;
2963	int32_t blocked_writers;
2964	int32_t blocked_readers;
2965	int error, rv;
2966
2967	uq = td->td_umtxq;
2968	error = fueword32(&rwlock->rw_flags, &flags);
2969	if (error == -1)
2970		return (EFAULT);
2971	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2972	if (error != 0)
2973		return (error);
2974
2975	if (timeout != NULL)
2976		abs_timeout_init2(&timo, timeout);
2977
2978	blocked_readers = 0;
2979	for (;;) {
2980		rv = fueword32(&rwlock->rw_state, &state);
2981		if (rv == -1) {
2982			umtx_key_release(&uq->uq_key);
2983			return (EFAULT);
2984		}
2985		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2986			rv = casueword32(&rwlock->rw_state, state,
2987			    &oldstate, state | URWLOCK_WRITE_OWNER);
2988			if (rv == -1) {
2989				umtx_key_release(&uq->uq_key);
2990				return (EFAULT);
2991			}
2992			if (oldstate == state) {
2993				umtx_key_release(&uq->uq_key);
2994				return (0);
2995			}
2996			state = oldstate;
2997			error = umtxq_check_susp(td);
2998			if (error != 0)
2999				break;
3000		}
3001
3002		if (error) {
3003			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
3004			    blocked_readers != 0) {
3005				umtxq_lock(&uq->uq_key);
3006				umtxq_busy(&uq->uq_key);
3007				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
3008				umtxq_unbusy(&uq->uq_key);
3009				umtxq_unlock(&uq->uq_key);
3010			}
3011
3012			break;
3013		}
3014
3015		/* grab monitor lock */
3016		umtxq_lock(&uq->uq_key);
3017		umtxq_busy(&uq->uq_key);
3018		umtxq_unlock(&uq->uq_key);
3019
3020		/*
3021		 * re-read the state, in case it changed between the try-lock above
3022		 * and the check below
3023		 */
3024		rv = fueword32(&rwlock->rw_state, &state);
3025		if (rv == -1)
3026			error = EFAULT;
3027
3028		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
3029		    URWLOCK_READER_COUNT(state) != 0) &&
3030		    (state & URWLOCK_WRITE_WAITERS) == 0) {
3031			rv = casueword32(&rwlock->rw_state, state,
3032			    &oldstate, state | URWLOCK_WRITE_WAITERS);
3033			if (rv == -1) {
3034				error = EFAULT;
3035				break;
3036			}
3037			if (oldstate == state)
3038				goto sleep;
3039			state = oldstate;
3040			error = umtxq_check_susp(td);
3041			if (error != 0)
3042				break;
3043		}
3044		if (error != 0) {
3045			umtxq_unbusy_unlocked(&uq->uq_key);
3046			break;
3047		}
3048
3049		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
3050			umtxq_unbusy_unlocked(&uq->uq_key);
3051			error = umtxq_check_susp(td);
3052			if (error != 0)
3053				break;
3054			continue;
3055		}
3056sleep:
3057		rv = fueword32(&rwlock->rw_blocked_writers,
3058		    &blocked_writers);
3059		if (rv == -1) {
3060			umtxq_unbusy_unlocked(&uq->uq_key);
3061			error = EFAULT;
3062			break;
3063		}
3064		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
3065
3066		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
3067			umtxq_lock(&uq->uq_key);
3068			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
3069			umtxq_unbusy(&uq->uq_key);
3070
3071			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
3072			    NULL : &timo);
3073
3074			umtxq_busy(&uq->uq_key);
3075			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
3076			umtxq_unlock(&uq->uq_key);
3077			if (error)
3078				break;
3079			rv = fueword32(&rwlock->rw_state, &state);
3080			if (rv == -1) {
3081				error = EFAULT;
3082				break;
3083			}
3084		}
3085
3086		rv = fueword32(&rwlock->rw_blocked_writers,
3087		    &blocked_writers);
3088		if (rv == -1) {
3089			umtxq_unbusy_unlocked(&uq->uq_key);
3090			error = EFAULT;
3091			break;
3092		}
3093		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
3094		if (blocked_writers == 1) {
3095			rv = fueword32(&rwlock->rw_state, &state);
3096			if (rv == -1) {
3097				umtxq_unbusy_unlocked(&uq->uq_key);
3098				error = EFAULT;
3099				break;
3100			}
3101			for (;;) {
3102				rv = casueword32(&rwlock->rw_state, state,
3103				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
3104				if (rv == -1) {
3105					error = EFAULT;
3106					break;
3107				}
3108				if (oldstate == state)
3109					break;
3110				state = oldstate;
3111				error = umtxq_check_susp(td);
3112				/*
3113				 * We are leaving the URWLOCK_WRITE_WAITERS
3114				 * flag behind, but this should not harm
3115				 * correctness.
3116				 */
3117				if (error != 0)
3118					break;
3119			}
3120			rv = fueword32(&rwlock->rw_blocked_readers,
3121			    &blocked_readers);
3122			if (rv == -1) {
3123				umtxq_unbusy_unlocked(&uq->uq_key);
3124				error = EFAULT;
3125				break;
3126			}
3127		} else
3128			blocked_readers = 0;
3129
3130		umtxq_unbusy_unlocked(&uq->uq_key);
3131	}
3132
3133	umtx_key_release(&uq->uq_key);
3134	if (error == ERESTART)
3135		error = EINTR;
3136	return (error);
3137}
3138
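/*
 * Unlock a userland rwlock, then wake either one writer or all readers,
 * depending on the waiter bits and the reader-preference flag.
 */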
3139static int
3140do_rw_unlock(struct thread *td, struct urwlock *rwlock)
3141{
3142	struct umtx_q *uq;
3143	uint32_t flags;
3144	int32_t state, oldstate;
3145	int error, rv, q, count;
3146
3147	uq = td->td_umtxq;
3148	error = fueword32(&rwlock->rw_flags, &flags);
3149	if (error == -1)
3150		return (EFAULT);
3151	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
3152	if (error != 0)
3153		return (error);
3154
3155	error = fueword32(&rwlock->rw_state, &state);
3156	if (error == -1) {
3157		error = EFAULT;
3158		goto out;
3159	}
3160	if (state & URWLOCK_WRITE_OWNER) {
3161		for (;;) {
3162			rv = casueword32(&rwlock->rw_state, state,
3163			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
3164			if (rv == -1) {
3165				error = EFAULT;
3166				goto out;
3167			}
3168			if (oldstate != state) {
3169				state = oldstate;
3170				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
3171					error = EPERM;
3172					goto out;
3173				}
3174				error = umtxq_check_susp(td);
3175				if (error != 0)
3176					goto out;
3177			} else
3178				break;
3179		}
3180	} else if (URWLOCK_READER_COUNT(state) != 0) {
3181		for (;;) {
3182			rv = casueword32(&rwlock->rw_state, state,
3183			    &oldstate, state - 1);
3184			if (rv == -1) {
3185				error = EFAULT;
3186				goto out;
3187			}
3188			if (oldstate != state) {
3189				state = oldstate;
3190				if (URWLOCK_READER_COUNT(oldstate) == 0) {
3191					error = EPERM;
3192					goto out;
3193				}
3194				error = umtxq_check_susp(td);
3195				if (error != 0)
3196					goto out;
3197			} else
3198				break;
3199		}
3200	} else {
3201		error = EPERM;
3202		goto out;
3203	}
3204
3205	count = 0;
3206
3207	if (!(flags & URWLOCK_PREFER_READER)) {
3208		if (state & URWLOCK_WRITE_WAITERS) {
3209			count = 1;
3210			q = UMTX_EXCLUSIVE_QUEUE;
3211		} else if (state & URWLOCK_READ_WAITERS) {
3212			count = INT_MAX;
3213			q = UMTX_SHARED_QUEUE;
3214		}
3215	} else {
3216		if (state & URWLOCK_READ_WAITERS) {
3217			count = INT_MAX;
3218			q = UMTX_SHARED_QUEUE;
3219		} else if (state & URWLOCK_WRITE_WAITERS) {
3220			count = 1;
3221			q = UMTX_EXCLUSIVE_QUEUE;
3222		}
3223	}
3224
3225	if (count) {
3226		umtxq_lock(&uq->uq_key);
3227		umtxq_busy(&uq->uq_key);
3228		umtxq_signal_queue(&uq->uq_key, count, q);
3229		umtxq_unbusy(&uq->uq_key);
3230		umtxq_unlock(&uq->uq_key);
3231	}
3232out:
3233	umtx_key_release(&uq->uq_key);
3234	return (error);
3235}
3236
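/*
 * Wait on a userland semaphore.  _has_waiters is set before _count is
 * re-checked, which closes the race with a post happening between the
 * userland decrement attempt and the sleep below.
 */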
3237static int
3238do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
3239{
3240	struct abs_timeout timo;
3241	struct umtx_q *uq;
3242	uint32_t flags, count, count1;
3243	int error, rv;
3244
3245	uq = td->td_umtxq;
3246	error = fueword32(&sem->_flags, &flags);
3247	if (error == -1)
3248		return (EFAULT);
3249	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3250	if (error != 0)
3251		return (error);
3252
3253	if (timeout != NULL)
3254		abs_timeout_init2(&timo, timeout);
3255
3256	umtxq_lock(&uq->uq_key);
3257	umtxq_busy(&uq->uq_key);
3258	umtxq_insert(uq);
3259	umtxq_unlock(&uq->uq_key);
3260	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
3261	if (rv == 0)
3262		rv = fueword32(&sem->_count, &count);
3263	if (rv == -1 || count != 0) {
3264		umtxq_lock(&uq->uq_key);
3265		umtxq_unbusy(&uq->uq_key);
3266		umtxq_remove(uq);
3267		umtxq_unlock(&uq->uq_key);
3268		umtx_key_release(&uq->uq_key);
3269		return (rv == -1 ? EFAULT : 0);
3270	}
3271	umtxq_lock(&uq->uq_key);
3272	umtxq_unbusy(&uq->uq_key);
3273
3274	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3275
3276	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3277		error = 0;
3278	else {
3279		umtxq_remove(uq);
3280		/* A relative timeout cannot be restarted. */
3281		if (error == ERESTART && timeout != NULL &&
3282		    (timeout->_flags & UMTX_ABSTIME) == 0)
3283			error = EINTR;
3284	}
3285	umtxq_unlock(&uq->uq_key);
3286	umtx_key_release(&uq->uq_key);
3287	return (error);
3288}
3289
3290/*
3291 * Wake up one waiter on a userland semaphore.
3292 */
3293static int
3294do_sem_wake(struct thread *td, struct _usem *sem)
3295{
3296	struct umtx_key key;
3297	int error, cnt;
3298	uint32_t flags;
3299
3300	error = fueword32(&sem->_flags, &flags);
3301	if (error == -1)
3302		return (EFAULT);
3303	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3304		return (error);
3305	umtxq_lock(&key);
3306	umtxq_busy(&key);
3307	cnt = umtxq_count(&key);
3308	if (cnt > 0) {
3309		umtxq_signal(&key, 1);
3310		/*
3311		 * The count is greater than 0: the memory is still being
3312		 * referenced by user code, so we can safely update the
3313		 * _has_waiters flag.
3314		 */
3315		if (cnt == 1) {
3316			umtxq_unlock(&key);
3317			error = suword32(&sem->_has_waiters, 0);
3318			umtxq_lock(&key);
3319			if (error == -1)
3320				error = EFAULT;
3321		}
3322	}
3323	umtxq_unbusy(&key);
3324	umtxq_unlock(&key);
3325	umtx_key_release(&key);
3326	return (error);
3327}
3328
3329int
3330sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
3331    /* struct umtx *umtx */
3332{
3333	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
3334}
3335
3336int
3337sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
3338    /* struct umtx *umtx */
3339{
3340	return do_unlock_umtx(td, uap->umtx, td->td_tid);
3341}
3342
3343inline int
3344umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3345{
3346	int error;
3347
3348	error = copyin(addr, tsp, sizeof(struct timespec));
3349	if (error == 0) {
3350		if (tsp->tv_sec < 0 ||
3351		    tsp->tv_nsec >= 1000000000 ||
3352		    tsp->tv_nsec < 0)
3353			error = EINVAL;
3354	}
3355	return (error);
3356}
3357
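/*
 * Copy a timeout in from userland.  The caller passes the size of the
 * userland object: a plain timespec selects a relative CLOCK_REALTIME
 * timeout, while a full _umtx_time carries its own flags and clock id.
 */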
3358static inline int
3359umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3360{
3361	int error;
3362
3363	if (size <= sizeof(struct timespec)) {
3364		tp->_clockid = CLOCK_REALTIME;
3365		tp->_flags = 0;
3366		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3367	} else
3368		error = copyin(addr, tp, sizeof(struct _umtx_time));
3369	if (error != 0)
3370		return (error);
3371	if (tp->_timeout.tv_sec < 0 ||
3372	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3373		return (EINVAL);
3374	return (0);
3375}
3376
3377static int
3378__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
3379{
3380	struct timespec *ts, timeout;
3381	int error;
3382
3383	/* Allow a null timespec (wait forever). */
3384	if (uap->uaddr2 == NULL)
3385		ts = NULL;
3386	else {
3387		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3388		if (error != 0)
3389			return (error);
3390		ts = &timeout;
3391	}
3392	return (do_lock_umtx(td, uap->obj, uap->val, ts));
3393}
3394
3395static int
3396__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
3397{
3398	return (do_unlock_umtx(td, uap->obj, uap->val));
3399}
3400
3401static int
3402__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3403{
3404	struct _umtx_time timeout, *tm_p;
3405	int error;
3406
3407	if (uap->uaddr2 == NULL)
3408		tm_p = NULL;
3409	else {
3410		error = umtx_copyin_umtx_time(
3411		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3412		if (error != 0)
3413			return (error);
3414		tm_p = &timeout;
3415	}
3416	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3417}
3418
3419static int
3420__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3421{
3422	struct _umtx_time timeout, *tm_p;
3423	int error;
3424
3425	if (uap->uaddr2 == NULL)
3426		tm_p = NULL;
3427	else {
3428		error = umtx_copyin_umtx_time(
3429		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3430		if (error != 0)
3431			return (error);
3432		tm_p = &timeout;
3433	}
3434	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3435}
3436
3437static int
3438__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3439{
3440	struct _umtx_time *tm_p, timeout;
3441	int error;
3442
3443	if (uap->uaddr2 == NULL)
3444		tm_p = NULL;
3445	else {
3446		error = umtx_copyin_umtx_time(
3447		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3448		if (error != 0)
3449			return (error);
3450		tm_p = &timeout;
3451	}
3452	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3453}
3454
3455static int
3456__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3457{
3458	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3459}
3460
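/*
 * Wake a userland-supplied array of private words, copying in at most
 * BATCH_SIZE pointers at a time to bound the on-stack buffer.
 */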
3461#define BATCH_SIZE	128
3462static int
3463__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3464{
3465	int count = uap->val;
3466	void *uaddrs[BATCH_SIZE];
3467	char **upp = (char **)uap->obj;
3468	int tocopy;
3469	int error = 0;
3470	int i, pos = 0;
3471
3472	while (count > 0) {
3473		tocopy = count;
3474		if (tocopy > BATCH_SIZE)
3475			tocopy = BATCH_SIZE;
3476		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3477		if (error != 0)
3478			break;
3479		for (i = 0; i < tocopy; ++i)
3480			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3481		count -= tocopy;
3482		pos += tocopy;
3483	}
3484	return (error);
3485}
3486
3487static int
3488__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3489{
3490	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3491}
3492
3493static int
3494__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3495{
3496	struct _umtx_time *tm_p, timeout;
3497	int error;
3498
3499	/* Allow a null timespec (wait forever). */
3500	if (uap->uaddr2 == NULL)
3501		tm_p = NULL;
3502	else {
3503		error = umtx_copyin_umtx_time(
3504		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3505		if (error != 0)
3506			return (error);
3507		tm_p = &timeout;
3508	}
3509	return do_lock_umutex(td, uap->obj, tm_p, 0);
3510}
3511
3512static int
3513__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3514{
3515	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3516}
3517
3518static int
3519__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3520{
3521	struct _umtx_time *tm_p, timeout;
3522	int error;
3523
3524	/* Allow a null timespec (wait forever). */
3525	if (uap->uaddr2 == NULL)
3526		tm_p = NULL;
3527	else {
3528		error = umtx_copyin_umtx_time(
3529		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3530		if (error != 0)
3531			return (error);
3532		tm_p = &timeout;
3533	}
3534	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3535}
3536
3537static int
3538__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3539{
3540	return do_wake_umutex(td, uap->obj);
3541}
3542
3543static int
3544__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3545{
3546	return do_unlock_umutex(td, uap->obj);
3547}
3548
3549static int
3550__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3551{
3552	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3553}
3554
3555static int
3556__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3557{
3558	struct timespec *ts, timeout;
3559	int error;
3560
3561	/* Allow a null timespec (wait forever). */
3562	if (uap->uaddr2 == NULL)
3563		ts = NULL;
3564	else {
3565		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3566		if (error != 0)
3567			return (error);
3568		ts = &timeout;
3569	}
3570	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3571}
3572
3573static int
3574__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3575{
3576	return do_cv_signal(td, uap->obj);
3577}
3578
3579static int
3580__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3581{
3582	return do_cv_broadcast(td, uap->obj);
3583}
3584
3585static int
3586__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3587{
3588	struct _umtx_time timeout;
3589	int error;
3590
3591	/* Allow a null timespec (wait forever). */
3592	if (uap->uaddr2 == NULL) {
3593		error = do_rw_rdlock(td, uap->obj, uap->val, NULL);
3594	} else {
3595		error = umtx_copyin_umtx_time(uap->uaddr2,
3596		   (size_t)uap->uaddr1, &timeout);
3597		if (error != 0)
3598			return (error);
3599		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3600	}
3601	return (error);
3602}
3603
3604static int
3605__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3606{
3607	struct _umtx_time timeout;
3608	int error;
3609
3610	/* Allow a null timespec (wait forever). */
3611	if (uap->uaddr2 == NULL) {
3612		error = do_rw_wrlock(td, uap->obj, NULL);
3613	} else {
3614		error = umtx_copyin_umtx_time(uap->uaddr2,
3615		   (size_t)uap->uaddr1, &timeout);
3616		if (error != 0)
3617			return (error);
3618
3619		error = do_rw_wrlock(td, uap->obj, &timeout);
3620	}
3621	return (error);
3622}
3623
3624static int
3625__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3626{
3627	return do_rw_unlock(td, uap->obj);
3628}
3629
3630static int
3631__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3632{
3633	struct _umtx_time *tm_p, timeout;
3634	int error;
3635
3636	/* Allow a null timespec (wait forever). */
3637	if (uap->uaddr2 == NULL)
3638		tm_p = NULL;
3639	else {
3640		error = umtx_copyin_umtx_time(
3641		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3642		if (error != 0)
3643			return (error);
3644		tm_p = &timeout;
3645	}
3646	return (do_sem_wait(td, uap->obj, tm_p));
3647}
3648
3649static int
3650__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3651{
3652	return do_sem_wake(td, uap->obj);
3653}
3654
3655static int
3656__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3657{
3658	return do_wake2_umutex(td, uap->obj, uap->val);
3659}
3660
3661typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3662
3663static _umtx_op_func op_table[] = {
3664	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3665	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3666	__umtx_op_wait,			/* UMTX_OP_WAIT */
3667	__umtx_op_wake,			/* UMTX_OP_WAKE */
3668	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3669	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3670	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3671	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3672	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3673	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3674	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3675	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3676	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3677	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3678	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3679	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3680	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3681	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3682	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3683	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3684	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3685	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3686	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3687};
3688
3689int
3690sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3691{
3692	if ((unsigned)uap->op < UMTX_OP_MAX)
3693		return (*op_table[uap->op])(td, uap);
3694	return (EINVAL);
3695}
3696
3697#ifdef COMPAT_FREEBSD32
3698int
3699freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3700    /* struct umtx *umtx */
3701{
3702	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3703}
3704
3705int
3706freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3707    /* struct umtx *umtx */
3708{
3709	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3710}
3711
3712struct timespec32 {
3713	int32_t tv_sec;
3714	int32_t tv_nsec;
3715};
3716
3717struct umtx_time32 {
3718	struct	timespec32	timeout;
3719	uint32_t		flags;
3720	uint32_t		clockid;
3721};
3722
3723static inline int
3724umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3725{
3726	struct timespec32 ts32;
3727	int error;
3728
3729	error = copyin(addr, &ts32, sizeof(struct timespec32));
3730	if (error == 0) {
3731		if (ts32.tv_sec < 0 ||
3732		    ts32.tv_nsec >= 1000000000 ||
3733		    ts32.tv_nsec < 0)
3734			error = EINVAL;
3735		else {
3736			tsp->tv_sec = ts32.tv_sec;
3737			tsp->tv_nsec = ts32.tv_nsec;
3738		}
3739	}
3740	return (error);
3741}
3742
3743static inline int
3744umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3745{
3746	struct umtx_time32 t32;
3747	int error;
3748
3749	t32.clockid = CLOCK_REALTIME;
3750	t32.flags   = 0;
3751	if (size <= sizeof(struct timespec32))
3752		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3753	else
3754		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3755	if (error != 0)
3756		return (error);
3757	if (t32.timeout.tv_sec < 0 ||
3758	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3759		return (EINVAL);
3760	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3761	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3762	tp->_flags = t32.flags;
3763	tp->_clockid = t32.clockid;
3764	return (0);
3765}
3766
3767static int
3768__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3769{
3770	struct timespec *ts, timeout;
3771	int error;
3772
3773	/* Allow a null timespec (wait forever). */
3774	if (uap->uaddr2 == NULL)
3775		ts = NULL;
3776	else {
3777		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3778		if (error != 0)
3779			return (error);
3780		ts = &timeout;
3781	}
3782	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3783}
3784
3785static int
3786__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3787{
3788	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3789}
3790
3791static int
3792__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3793{
3794	struct _umtx_time *tm_p, timeout;
3795	int error;
3796
3797	if (uap->uaddr2 == NULL)
3798		tm_p = NULL;
3799	else {
3800		error = umtx_copyin_umtx_time32(uap->uaddr2,
3801			(size_t)uap->uaddr1, &timeout);
3802		if (error != 0)
3803			return (error);
3804		tm_p = &timeout;
3805	}
3806	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3807}
3808
3809static int
3810__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3811{
3812	struct _umtx_time *tm_p, timeout;
3813	int error;
3814
3815	/* Allow a null timespec (wait forever). */
3816	if (uap->uaddr2 == NULL)
3817		tm_p = NULL;
3818	else {
3819		error = umtx_copyin_umtx_time32(uap->uaddr2,
3820		    (size_t)uap->uaddr1, &timeout);
3821		if (error != 0)
3822			return (error);
3823		tm_p = &timeout;
3824	}
3825	return do_lock_umutex(td, uap->obj, tm_p, 0);
3826}
3827
3828static int
3829__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3830{
3831	struct _umtx_time *tm_p, timeout;
3832	int error;
3833
3834	/* Allow a null timespec (wait forever). */
3835	if (uap->uaddr2 == NULL)
3836		tm_p = NULL;
3837	else {
3838		error = umtx_copyin_umtx_time32(uap->uaddr2,
3839		    (size_t)uap->uaddr1, &timeout);
3840		if (error != 0)
3841			return (error);
3842		tm_p = &timeout;
3843	}
3844	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3845}
3846
3847static int
3848__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3849{
3850	struct timespec *ts, timeout;
3851	int error;
3852
3853	/* Allow a null timespec (wait forever). */
3854	if (uap->uaddr2 == NULL)
3855		ts = NULL;
3856	else {
3857		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3858		if (error != 0)
3859			return (error);
3860		ts = &timeout;
3861	}
3862	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3863}
3864
3865static int
3866__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3867{
3868	struct _umtx_time timeout;
3869	int error;
3870
3871	/* Allow a null timespec (wait forever). */
3872	if (uap->uaddr2 == NULL) {
3873		error = do_rw_rdlock(td, uap->obj, uap->val, NULL);
3874	} else {
3875		error = umtx_copyin_umtx_time32(uap->uaddr2,
3876		    (size_t)uap->uaddr1, &timeout);
3877		if (error != 0)
3878			return (error);
3879		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3880	}
3881	return (error);
3882}
3883
3884static int
3885__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3886{
3887	struct _umtx_time timeout;
3888	int error;
3889
3890	/* Allow a null timespec (wait forever). */
3891	if (uap->uaddr2 == NULL) {
3892		error = do_rw_wrlock(td, uap->obj, NULL);
3893	} else {
3894		error = umtx_copyin_umtx_time32(uap->uaddr2,
3895		    (size_t)uap->uaddr1, &timeout);
3896		if (error != 0)
3897			return (error);
3898		error = do_rw_wrlock(td, uap->obj, &timeout);
3899	}
3900	return (error);
3901}
3902
3903static int
3904__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3905{
3906	struct _umtx_time *tm_p, timeout;
3907	int error;
3908
3909	if (uap->uaddr2 == NULL)
3910		tm_p = NULL;
3911	else {
3912		error = umtx_copyin_umtx_time32(
3913		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3914		if (error != 0)
3915			return (error);
3916		tm_p = &timeout;
3917	}
3918	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3919}
3920
3921static int
3922__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3923{
3924	struct _umtx_time *tm_p, timeout;
3925	int error;
3926
3927	/* Allow a null timespec (wait forever). */
3928	if (uap->uaddr2 == NULL)
3929		tm_p = NULL;
3930	else {
3931		error = umtx_copyin_umtx_time32(uap->uaddr2,
3932		    (size_t)uap->uaddr1, &timeout);
3933		if (error != 0)
3934			return (error);
3935		tm_p = &timeout;
3936	}
3937	return (do_sem_wait(td, uap->obj, tm_p));
3938}
3939
3940static int
3941__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3942{
3943	int count = uap->val;
3944	uint32_t uaddrs[BATCH_SIZE];
3945	uint32_t **upp = (uint32_t **)uap->obj;
3946	int tocopy;
3947	int error = 0;
3948	int i, pos = 0;
3949
3950	while (count > 0) {
3951		tocopy = count;
3952		if (tocopy > BATCH_SIZE)
3953			tocopy = BATCH_SIZE;
3954		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3955		if (error != 0)
3956			break;
3957		for (i = 0; i < tocopy; ++i)
3958			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3959				INT_MAX, 1);
3960		count -= tocopy;
3961		pos += tocopy;
3962	}
3963	return (error);
3964}
3965
3966static _umtx_op_func op_table_compat32[] = {
3967	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3968	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3969	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3970	__umtx_op_wake,			/* UMTX_OP_WAKE */
3971	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3972	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3973	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3974	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3975	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3976	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3977	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3978	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3979	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3980	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3981	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3982	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3983	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3984	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3985	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3986	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3987	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3988	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3989	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3990};
3991
3992int
3993freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3994{
3995	if ((unsigned)uap->op < UMTX_OP_MAX)
3996		return (*op_table_compat32[uap->op])(td,
3997			(struct _umtx_op_args *)uap);
3998	return (EINVAL);
3999}
4000#endif
4001
4002void
4003umtx_thread_init(struct thread *td)
4004{
4005	td->td_umtxq = umtxq_alloc();
4006	td->td_umtxq->uq_thread = td;
4007}
4008
4009void
4010umtx_thread_fini(struct thread *td)
4011{
4012	umtxq_free(td->td_umtxq);
4013}
4014
4015/*
4016 * Called when a new thread is created, e.g. by fork().
4017 */
4018void
4019umtx_thread_alloc(struct thread *td)
4020{
4021	struct umtx_q *uq;
4022
4023	uq = td->td_umtxq;
4024	uq->uq_inherited_pri = PRI_MAX;
4025
4026	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4027	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4028	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
4029	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
4030}
4031
4032/*
4033 * exec() hook.
4034 */
4035static void
4036umtx_exec_hook(void *arg __unused, struct proc *p __unused,
4037	struct image_params *imgp __unused)
4038{
4039	umtx_thread_cleanup(curthread);
4040}
4041
4042/*
4043 * thread_exit() hook.
4044 */
4045void
4046umtx_thread_exit(struct thread *td)
4047{
4048	umtx_thread_cleanup(td);
4049}
4050
4051/*
4052 * Clean up umtx data: disown contested PI mutexes and drop lent priority.
4053 */
4054static void
4055umtx_thread_cleanup(struct thread *td)
4056{
4057	struct umtx_q *uq;
4058	struct umtx_pi *pi;
4059
4060	if ((uq = td->td_umtxq) == NULL)
4061		return;
4062
4063	mtx_lock_spin(&umtx_lock);
4064	uq->uq_inherited_pri = PRI_MAX;
4065	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4066		pi->pi_owner = NULL;
4067		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4068	}
4069	mtx_unlock_spin(&umtx_lock);
4070	thread_lock(td);
4071	sched_lend_user_prio(td, PRI_MAX);
4072	thread_unlock(td);
4073}
4074