kern_umtx.c revision 233691
1/*-
2 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 233691 2012-03-30 05:49:32Z davidxu $");
30
31#include "opt_compat.h"
32#include "opt_umtx_profiling.h"
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/limits.h>
37#include <sys/lock.h>
38#include <sys/malloc.h>
39#include <sys/mutex.h>
40#include <sys/priv.h>
41#include <sys/proc.h>
42#include <sys/sched.h>
43#include <sys/smp.h>
44#include <sys/sysctl.h>
45#include <sys/sysent.h>
46#include <sys/systm.h>
47#include <sys/sysproto.h>
48#include <sys/syscallsubr.h>
49#include <sys/eventhandler.h>
50#include <sys/umtx.h>
51
52#include <vm/vm.h>
53#include <vm/vm_param.h>
54#include <vm/pmap.h>
55#include <vm/vm_map.h>
56#include <vm/vm_object.h>
57
58#include <machine/cpu.h>
59
60#ifdef COMPAT_FREEBSD32
61#include <compat/freebsd32/freebsd32_proto.h>
62#endif
63
64#define _UMUTEX_TRY		1
65#define _UMUTEX_WAIT		2
66
67/* Priority inheritance mutex info. */
68struct umtx_pi {
69	/* Owner thread */
70	struct thread		*pi_owner;
71
72	/* Reference count */
73	int			pi_refcount;
74
75	/* List entry linking umtx objects held by a thread */
76	TAILQ_ENTRY(umtx_pi)	pi_link;
77
78	/* List entry in hash */
79	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
80
81	/* List for waiters */
82	TAILQ_HEAD(,umtx_q)	pi_blocked;
83
84	/* Identify a userland lock object */
85	struct umtx_key		pi_key;
86};
87
88/* A user of a userland synchronization object. */
89struct umtx_q {
90	/* Linked list for the hash. */
91	TAILQ_ENTRY(umtx_q)	uq_link;
92
93	/* Umtx key. */
94	struct umtx_key		uq_key;
95
96	/* Umtx flags. */
97	int			uq_flags;
98#define UQF_UMTXQ	0x0001
99
100	/* The thread that is waiting. */
101	struct thread		*uq_thread;
102
103	/*
104	 * Blocked on a PI mutex.  Reads may hold either the chain
105	 * lock or umtx_lock; writes must hold both the chain lock
106	 * and umtx_lock.
107	 */
108	struct umtx_pi		*uq_pi_blocked;
109
110	/* On blocked list */
111	TAILQ_ENTRY(umtx_q)	uq_lockq;
112
113	/* PI mutexes we own that other threads contend for */
114	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
115
116	/* Inherited priority from PP mutex */
117	u_char			uq_inherited_pri;
118
119	/* Spare queue ready to be reused */
120	struct umtxq_queue	*uq_spare_queue;
121
122	/* The queue we are on */
123	struct umtxq_queue	*uq_cur_queue;
124};
125
126TAILQ_HEAD(umtxq_head, umtx_q);
127
128/* Per-key wait-queue */
129struct umtxq_queue {
130	struct umtxq_head	head;
131	struct umtx_key		key;
132	LIST_ENTRY(umtxq_queue)	link;
133	int			length;
134};
135
136LIST_HEAD(umtxq_list, umtxq_queue);
137
138/* Userland lock object's wait-queue chain */
139struct umtxq_chain {
140	/* Lock for this chain. */
141	struct mtx		uc_lock;
142
143	/* List of sleep queues. */
144	struct umtxq_list	uc_queue[2];
145#define UMTX_SHARED_QUEUE	0
146#define UMTX_EXCLUSIVE_QUEUE	1
147
148	LIST_HEAD(, umtxq_queue) uc_spare_queue;
149
150	/* Busy flag */
151	char			uc_busy;
152
153	/* Chain lock waiters */
154	int			uc_waiters;
155
156	/* All PI mutexes hashed to this chain */
157	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
158
159#ifdef UMTX_PROFILING
160	int 			length;
161	int			max_length;
162#endif
163};
164
165#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
166#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
167
168/*
169 * Don't propagate time-sharing priority; there is a security reason:
170 * a user can simply create a PI-mutex, let thread A lock it, and let
171 * another thread B block on it.  Because B is sleeping, its priority
172 * would be boosted, and priority propagation would boost A's priority
173 * too; A's priority would then never be lowered, even if it were
174 * using 100% CPU, which is unfair to other processes.
175 */
176
177#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
178			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
179			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
180
181#define	GOLDEN_RATIO_PRIME	2654404609U
182#define	UMTX_CHAINS		512
183#define	UMTX_SHIFTS		(__WORD_BIT - 9)
184
185#define	GET_SHARE(flags)	\
186    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
187
188#define BUSY_SPINS		200
189
190struct abs_timeout {
191	int clockid;
192	struct timespec cur;
193	struct timespec end;
194};
195
196static uma_zone_t		umtx_pi_zone;
197static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
198static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
199static int			umtx_pi_allocated;
200
201static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
202SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
203    &umtx_pi_allocated, 0, "Allocated umtx_pi");
204
205#ifdef UMTX_PROFILING
206static long max_length;
207SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
208static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
209#endif
210
211static void umtxq_sysinit(void *);
212static void umtxq_hash(struct umtx_key *key);
213static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
214static void umtxq_lock(struct umtx_key *key);
215static void umtxq_unlock(struct umtx_key *key);
216static void umtxq_busy(struct umtx_key *key);
217static void umtxq_unbusy(struct umtx_key *key);
218static void umtxq_insert_queue(struct umtx_q *uq, int q);
219static void umtxq_remove_queue(struct umtx_q *uq, int q);
220static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
221static int umtxq_count(struct umtx_key *key);
222static struct umtx_pi *umtx_pi_alloc(int);
223static void umtx_pi_free(struct umtx_pi *pi);
224static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
225static void umtx_thread_cleanup(struct thread *td);
226static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
227	struct image_params *imgp __unused);
228SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
229
230#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
231#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
232#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
233
234static struct mtx umtx_lock;
235
236#ifdef UMTX_PROFILING
237static void
238umtx_init_profiling(void)
239{
240	struct sysctl_oid *chain_oid;
241	char chain_name[10];
242	int i;
243
244	for (i = 0; i < UMTX_CHAINS; ++i) {
245		snprintf(chain_name, sizeof(chain_name), "%d", i);
246		chain_oid = SYSCTL_ADD_NODE(NULL,
247		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
248		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
249		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
250		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
251		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
252		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
253	}
254}
255#endif
256
257static void
258umtxq_sysinit(void *arg __unused)
259{
260	int i, j;
261
262	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
263		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
264	for (i = 0; i < 2; ++i) {
265		for (j = 0; j < UMTX_CHAINS; ++j) {
266			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
267				 MTX_DEF | MTX_DUPOK);
268			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
269			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
270			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
271			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
272			umtxq_chains[i][j].uc_busy = 0;
273			umtxq_chains[i][j].uc_waiters = 0;
274			#ifdef UMTX_PROFILING
275			umtxq_chains[i][j].length = 0;
276			umtxq_chains[i][j].max_length = 0;
277			#endif
278		}
279	}
280	#ifdef UMTX_PROFILING
281	umtx_init_profiling();
282	#endif
283	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
284	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
285	    EVENTHANDLER_PRI_ANY);
286}
287
288struct umtx_q *
289umtxq_alloc(void)
290{
291	struct umtx_q *uq;
292
293	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
294	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
295	TAILQ_INIT(&uq->uq_spare_queue->head);
296	TAILQ_INIT(&uq->uq_pi_contested);
297	uq->uq_inherited_pri = PRI_MAX;
298	return (uq);
299}
300
301void
302umtxq_free(struct umtx_q *uq)
303{
304	MPASS(uq->uq_spare_queue != NULL);
305	free(uq->uq_spare_queue, M_UMTX);
306	free(uq, M_UMTX);
307}
308
309static inline void
310umtxq_hash(struct umtx_key *key)
311{
312	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
313	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
314}
315
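/*
 * Worked example of the hash above (illustrative values only): with
 * __WORD_BIT == 32, UMTX_SHIFTS is 23, so the top nine bits of the
 * 32-bit product pick one of the UMTX_CHAINS (512) buckets and the
 * final modulo is only defensive:
 *
 *	n    = 0x00801000 + 0x100;
 *	hash = ((n * 2654404609U) >> 23) % 512;
 */
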
316static inline struct umtxq_chain *
317umtxq_getchain(struct umtx_key *key)
318{
319	if (key->type <= TYPE_SEM)
320		return (&umtxq_chains[1][key->hash]);
321	return (&umtxq_chains[0][key->hash]);
322}
323
324/*
325 * Lock a chain.
326 */
327static inline void
328umtxq_lock(struct umtx_key *key)
329{
330	struct umtxq_chain *uc;
331
332	uc = umtxq_getchain(key);
333	mtx_lock(&uc->uc_lock);
334}
335
336/*
337 * Unlock a chain.
338 */
339static inline void
340umtxq_unlock(struct umtx_key *key)
341{
342	struct umtxq_chain *uc;
343
344	uc = umtxq_getchain(key);
345	mtx_unlock(&uc->uc_lock);
346}
347
348/*
349 * Set the chain to the busy state when the following operation
350 * may block (so a kernel mutex cannot be used).
351 */
352static inline void
353umtxq_busy(struct umtx_key *key)
354{
355	struct umtxq_chain *uc;
356
357	uc = umtxq_getchain(key);
358	mtx_assert(&uc->uc_lock, MA_OWNED);
359	if (uc->uc_busy) {
360#ifdef SMP
361		if (smp_cpus > 1) {
362			int count = BUSY_SPINS;
363			if (count > 0) {
364				umtxq_unlock(key);
365				while (uc->uc_busy && --count > 0)
366					cpu_spinwait();
367				umtxq_lock(key);
368			}
369		}
370#endif
371		while (uc->uc_busy) {
372			uc->uc_waiters++;
373			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
374			uc->uc_waiters--;
375		}
376	}
377	uc->uc_busy = 1;
378}
379
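/*
 * Sketch of the busy-bit protocol used by the lock/unlock paths in
 * this file: the chain mutex cannot be held across a faulting
 * user-memory access, so the busy bit keeps the key serialized while
 * the mutex is dropped:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);
 *	umtxq_unlock(&key);
 *	old = casuword32(...);		(may fault and sleep)
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);
 *	umtxq_unlock(&key);
 */
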
380/*
381 * Unbusy a chain.
382 */
383static inline void
384umtxq_unbusy(struct umtx_key *key)
385{
386	struct umtxq_chain *uc;
387
388	uc = umtxq_getchain(key);
389	mtx_assert(&uc->uc_lock, MA_OWNED);
390	KASSERT(uc->uc_busy != 0, ("not busy"));
391	uc->uc_busy = 0;
392	if (uc->uc_waiters)
393		wakeup_one(uc);
394}
395
396static struct umtxq_queue *
397umtxq_queue_lookup(struct umtx_key *key, int q)
398{
399	struct umtxq_queue *uh;
400	struct umtxq_chain *uc;
401
402	uc = umtxq_getchain(key);
403	UMTXQ_LOCKED_ASSERT(uc);
404	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
405		if (umtx_key_match(&uh->key, key))
406			return (uh);
407	}
408
409	return (NULL);
410}
411
412static inline void
413umtxq_insert_queue(struct umtx_q *uq, int q)
414{
415	struct umtxq_queue *uh;
416	struct umtxq_chain *uc;
417
418	uc = umtxq_getchain(&uq->uq_key);
419	UMTXQ_LOCKED_ASSERT(uc);
420	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
421	uh = umtxq_queue_lookup(&uq->uq_key, q);
422	if (uh != NULL) {
423		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
424	} else {
425		uh = uq->uq_spare_queue;
426		uh->key = uq->uq_key;
427		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
428	}
429	uq->uq_spare_queue = NULL;
430
431	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
432	uh->length++;
433	#ifdef UMTX_PROFILING
434	uc->length++;
435	if (uc->length > uc->max_length) {
436		uc->max_length = uc->length;
437		if (uc->max_length > max_length)
438			max_length = uc->max_length;
439	}
440	#endif
441	uq->uq_flags |= UQF_UMTXQ;
442	uq->uq_cur_queue = uh;
443	return;
444}
445
446static inline void
447umtxq_remove_queue(struct umtx_q *uq, int q)
448{
449	struct umtxq_chain *uc;
450	struct umtxq_queue *uh;
451
452	uc = umtxq_getchain(&uq->uq_key);
453	UMTXQ_LOCKED_ASSERT(uc);
454	if (uq->uq_flags & UQF_UMTXQ) {
455		uh = uq->uq_cur_queue;
456		TAILQ_REMOVE(&uh->head, uq, uq_link);
457		uh->length--;
458		#ifdef UMTX_PROFILING
459		uc->length--;
460		#endif
461		uq->uq_flags &= ~UQF_UMTXQ;
462		if (TAILQ_EMPTY(&uh->head)) {
463			KASSERT(uh->length == 0,
464			    ("inconsistent umtxq_queue length"));
465			LIST_REMOVE(uh, link);
466		} else {
467			uh = LIST_FIRST(&uc->uc_spare_queue);
468			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
469			LIST_REMOVE(uh, link);
470		}
471		uq->uq_spare_queue = uh;
472		uq->uq_cur_queue = NULL;
473	}
474}
475
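/*
 * Note on the spare-queue handling in the two functions above: every
 * umtx_q owns exactly one umtxq_queue.  The first sleeper's queue
 * becomes the per-key queue and later sleepers park theirs on
 * uc_spare_queue; on removal, a waiter takes back either the emptied
 * per-key queue or a spare, preserving the one-queue-per-waiter
 * invariant without allocating memory at wait time.
 */
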
476/*
477 * Return the number of waiters on the shared queue.
478 */
479static int
480umtxq_count(struct umtx_key *key)
481{
482	struct umtxq_chain *uc;
483	struct umtxq_queue *uh;
484
485	uc = umtxq_getchain(key);
486	UMTXQ_LOCKED_ASSERT(uc);
487	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
488	if (uh != NULL)
489		return (uh->length);
490	return (0);
491}
492
493/*
494 * Return the number of PI waiters and a pointer to the first
495 * waiter.
496 */
497static int
498umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
499{
500	struct umtxq_chain *uc;
501	struct umtxq_queue *uh;
502
503	*first = NULL;
504	uc = umtxq_getchain(key);
505	UMTXQ_LOCKED_ASSERT(uc);
506	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
507	if (uh != NULL) {
508		*first = TAILQ_FIRST(&uh->head);
509		return (uh->length);
510	}
511	return (0);
512}
513
514/*
515 * Wake up threads waiting on a userland object.
516 */
517
518static int
519umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
520{
521	struct umtxq_chain *uc;
522	struct umtxq_queue *uh;
523	struct umtx_q *uq;
524	int ret;
525
526	ret = 0;
527	uc = umtxq_getchain(key);
528	UMTXQ_LOCKED_ASSERT(uc);
529	uh = umtxq_queue_lookup(key, q);
530	if (uh != NULL) {
531		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
532			umtxq_remove_queue(uq, q);
533			wakeup(uq);
534			if (++ret >= n_wake)
535				return (ret);
536		}
537	}
538	return (ret);
539}
540
541
543 * Wake up specified thread.
544 */
545static inline void
546umtxq_signal_thread(struct umtx_q *uq)
547{
548	struct umtxq_chain *uc;
549
550	uc = umtxq_getchain(&uq->uq_key);
551	UMTXQ_LOCKED_ASSERT(uc);
552	umtxq_remove(uq);
553	wakeup(uq);
554}
555
556static inline int
557tstohz(const struct timespec *tsp)
558{
559	struct timeval tv;
560
561	TIMESPEC_TO_TIMEVAL(&tv, tsp);
562	return tvtohz(&tv);
563}
564
565static void
566abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
567	const struct timespec *timeout)
568{
569
570	timo->clockid = clockid;
571	if (!absolute) {
572		kern_clock_gettime(curthread, clockid, &timo->end);
573		timo->cur = timo->end;
574		timespecadd(&timo->end, timeout);
575	} else {
576		timo->end = *timeout;
577		kern_clock_gettime(curthread, clockid, &timo->cur);
578	}
579}
580
581static void
582abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
583{
584
585	abs_timeout_init(timo, umtxtime->_clockid,
586		(umtxtime->_flags & UMTX_ABSTIME) != 0,
587		&umtxtime->_timeout);
588}
589
590static int
591abs_timeout_update(struct abs_timeout *timo)
592{
593	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
594	return (timespeccmp(&timo->cur, &timo->end, >=));
595}
596
597static int
598abs_timeout_gethz(struct abs_timeout *timo)
599{
600	struct timespec tts;
601
602	tts = timo->end;
603	timespecsub(&tts, &timo->cur);
604	return (tstohz(&tts));
605}
606
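/*
 * Sketch of the abs_timeout life cycle as driven by umtxq_sleep()
 * below, assuming a hypothetical 1.5 second relative CLOCK_MONOTONIC
 * timeout:
 *
 *	struct abs_timeout timo;
 *	struct timespec ts = { 1, 500000000 };
 *
 *	abs_timeout_init(&timo, CLOCK_MONOTONIC, 0, &ts);
 *	for (;;) {
 *		error = msleep(chan, lock, PCATCH, wmesg,
 *		    abs_timeout_gethz(&timo));
 *		if (error != EWOULDBLOCK)
 *			break;
 *		if (abs_timeout_update(&timo)) {
 *			error = ETIMEDOUT;
 *			break;
 *		}
 *	}
 */
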
607/*
608 * Put the thread into a sleep state; before sleeping, check if
609 * the thread was removed from the umtx queue.
610 */
611static inline int
612umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *timo)
613{
614	struct umtxq_chain *uc;
615	int error;
616
617	uc = umtxq_getchain(&uq->uq_key);
618	UMTXQ_LOCKED_ASSERT(uc);
619	for (;;) {
620		if (!(uq->uq_flags & UQF_UMTXQ))
621			return (0);
622		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg,
623		    timo == NULL ? 0 : abs_timeout_gethz(timo));
624		if (error != EWOULDBLOCK)
625			break;
626		umtxq_unlock(&uq->uq_key);
627		if (abs_timeout_update(timo)) {
628			error = ETIMEDOUT;
629			umtxq_lock(&uq->uq_key);
630			break;
631		}
632		umtxq_lock(&uq->uq_key);
633	}
634	return (error);
635}
636
637/*
638 * Convert a userspace address into a unique logical address.
639 */
640int
641umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
642{
643	struct thread *td = curthread;
644	vm_map_t map;
645	vm_map_entry_t entry;
646	vm_pindex_t pindex;
647	vm_prot_t prot;
648	boolean_t wired;
649
650	key->type = type;
651	if (share == THREAD_SHARE) {
652		key->shared = 0;
653		key->info.private.vs = td->td_proc->p_vmspace;
654		key->info.private.addr = (uintptr_t)addr;
655	} else {
656		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
657		map = &td->td_proc->p_vmspace->vm_map;
658		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
659		    &entry, &key->info.shared.object, &pindex, &prot,
660		    &wired) != KERN_SUCCESS) {
661			return (EFAULT);
662		}
663
664		if ((share == PROCESS_SHARE) ||
665		    (share == AUTO_SHARE &&
666		     VM_INHERIT_SHARE == entry->inheritance)) {
667			key->shared = 1;
668			key->info.shared.offset = entry->offset + entry->start -
669				(vm_offset_t)addr;
670			vm_object_reference(key->info.shared.object);
671		} else {
672			key->shared = 0;
673			key->info.private.vs = td->td_proc->p_vmspace;
674			key->info.private.addr = (uintptr_t)addr;
675		}
676		vm_map_lookup_done(map, entry);
677	}
678
679	umtxq_hash(key);
680	return (0);
681}
682
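/*
 * Typical usage, mirroring kern_umtx_wake() below: translate the user
 * address into a key, operate under the chain lock, then drop the
 * reference the lookup may have taken on a shared VM object:
 *
 *	struct umtx_key key;
 *
 *	if ((error = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
 *	    &key)) != 0)
 *		return (error);
 *	umtxq_lock(&key);
 *	...
 *	umtxq_unlock(&key);
 *	umtx_key_release(&key);
 */
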
683/*
684 * Release key.
685 */
686void
687umtx_key_release(struct umtx_key *key)
688{
689	if (key->shared)
690		vm_object_deallocate(key->info.shared.object);
691}
692
693/*
694 * Lock a umtx object.
695 */
696static int
697do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
698	const struct timespec *timeout)
699{
700	struct abs_timeout timo;
701	struct umtx_q *uq;
702	u_long owner;
703	u_long old;
704	int error = 0;
705
706	uq = td->td_umtxq;
707	if (timeout != NULL)
708		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
709
710	/*
711	 * Care must be exercised when dealing with the umtx structure.  It
712	 * can fault on any access.
713	 */
714	for (;;) {
715		/*
716		 * Try the uncontested case.  This should be done in userland.
717		 */
718		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
719
720		/* The acquire succeeded. */
721		if (owner == UMTX_UNOWNED)
722			return (0);
723
724		/* The address was invalid. */
725		if (owner == -1)
726			return (EFAULT);
727
728		/* If no one owns it but it is contested try to acquire it. */
729		if (owner == UMTX_CONTESTED) {
730			owner = casuword(&umtx->u_owner,
731			    UMTX_CONTESTED, id | UMTX_CONTESTED);
732
733			if (owner == UMTX_CONTESTED)
734				return (0);
735
736			/* The address was invalid. */
737			if (owner == -1)
738				return (EFAULT);
739
740			/* If this failed the lock has changed, restart. */
741			continue;
742		}
743
744		/*
745		 * If we caught a signal, we have already retried;
746		 * exit immediately now.
747		 */
748		if (error != 0)
749			break;
750
751		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
752			AUTO_SHARE, &uq->uq_key)) != 0)
753			return (error);
754
755		umtxq_lock(&uq->uq_key);
756		umtxq_busy(&uq->uq_key);
757		umtxq_insert(uq);
758		umtxq_unbusy(&uq->uq_key);
759		umtxq_unlock(&uq->uq_key);
760
761		/*
762		 * Set the contested bit so that a release in user space
763		 * knows to use the system call for unlock.  If this fails
764		 * either some one else has acquired the lock or it has been
765		 * released.
766		 */
767		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
768
769		/* The address was invalid. */
770		if (old == -1) {
771			umtxq_lock(&uq->uq_key);
772			umtxq_remove(uq);
773			umtxq_unlock(&uq->uq_key);
774			umtx_key_release(&uq->uq_key);
775			return (EFAULT);
776		}
777
778		/*
779		 * If we set the contested bit, sleep.  Otherwise the lock
780		 * changed and we need to retry, or we lost a race with the
781		 * thread unlocking the umtx.
782		 */
783		umtxq_lock(&uq->uq_key);
784		if (old == owner)
785			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
786			    &timo);
787		umtxq_remove(uq);
788		umtxq_unlock(&uq->uq_key);
789		umtx_key_release(&uq->uq_key);
790	}
791
792	if (timeout == NULL) {
793		/* Mutex locking is restarted if it is interrupted. */
794		if (error == EINTR)
795			error = ERESTART;
796	} else {
797		/* Timed-locking is not restarted. */
798		if (error == ERESTART)
799			error = EINTR;
800	}
801	return (error);
802}
803
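/*
 * For reference, the userland fast path that do_lock_umtx() backs up
 * looks roughly like this (a sketch, not the actual libthr code):
 *
 *	if (atomic_cmpset_acq_long(&mtx->u_owner, UMTX_UNOWNED,
 *	    (u_long)tid))
 *		return (0);		(uncontested, no syscall)
 *	return (_umtx_lock(mtx));	(contested, enter the kernel)
 */
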
804/*
805 * Unlock a umtx object.
806 */
807static int
808do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
809{
810	struct umtx_key key;
811	u_long owner;
812	u_long old;
813	int error;
814	int count;
815
816	/*
817	 * Make sure we own this mtx.
818	 */
819	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
820	if (owner == -1)
821		return (EFAULT);
822
823	if ((owner & ~UMTX_CONTESTED) != id)
824		return (EPERM);
825
826	/* This should be done in userland */
827	if ((owner & UMTX_CONTESTED) == 0) {
828		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
829		if (old == -1)
830			return (EFAULT);
831		if (old == owner)
832			return (0);
833		owner = old;
834	}
835
836	/* We should only ever be in here for contested locks */
837	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
838		&key)) != 0)
839		return (error);
840
841	umtxq_lock(&key);
842	umtxq_busy(&key);
843	count = umtxq_count(&key);
844	umtxq_unlock(&key);
845
846	/*
847	 * When unlocking the umtx, it must be marked as unowned if
848	 * there is zero or one thread only waiting for it.
849	 * Otherwise, it must be marked as contested.
850	 */
851	old = casuword(&umtx->u_owner, owner,
852		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
853	umtxq_lock(&key);
854	umtxq_signal(&key,1);
855	umtxq_unbusy(&key);
856	umtxq_unlock(&key);
857	umtx_key_release(&key);
858	if (old == -1)
859		return (EFAULT);
860	if (old != owner)
861		return (EINVAL);
862	return (0);
863}
864
865#ifdef COMPAT_FREEBSD32
866
867/*
868 * Lock a umtx object.
869 */
870static int
871do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
872	const struct timespec *timeout)
873{
874	struct abs_timeout timo;
875	struct umtx_q *uq;
876	uint32_t owner;
877	uint32_t old;
878	int error = 0;
879
880	uq = td->td_umtxq;
881
882	if (timeout != NULL)
883		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
884
885	/*
886	 * Care must be exercised when dealing with the umtx structure.  It
887	 * can fault on any access.
888	 */
889	for (;;) {
890		/*
891		 * Try the uncontested case.  This should be done in userland.
892		 */
893		owner = casuword32(m, UMUTEX_UNOWNED, id);
894
895		/* The acquire succeeded. */
896		if (owner == UMUTEX_UNOWNED)
897			return (0);
898
899		/* The address was invalid. */
900		if (owner == -1)
901			return (EFAULT);
902
903		/* If no one owns it but it is contested try to acquire it. */
904		if (owner == UMUTEX_CONTESTED) {
905			owner = casuword32(m,
906			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
907			if (owner == UMUTEX_CONTESTED)
908				return (0);
909
910			/* The address was invalid. */
911			if (owner == -1)
912				return (EFAULT);
913
914			/* If this failed the lock has changed, restart. */
915			continue;
916		}
917
918		/*
919		 * If we caught a signal, we have already retried;
920		 * exit immediately now.
921		 */
922		if (error != 0)
923			return (error);
924
925		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
926			AUTO_SHARE, &uq->uq_key)) != 0)
927			return (error);
928
929		umtxq_lock(&uq->uq_key);
930		umtxq_busy(&uq->uq_key);
931		umtxq_insert(uq);
932		umtxq_unbusy(&uq->uq_key);
933		umtxq_unlock(&uq->uq_key);
934
935		/*
936		 * Set the contested bit so that a release in user space
937		 * knows to use the system call for unlock.  If this fails
938		 * either some one else has acquired the lock or it has been
939		 * released.
940		 */
941		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
942
943		/* The address was invalid. */
944		if (old == -1) {
945			umtxq_lock(&uq->uq_key);
946			umtxq_remove(uq);
947			umtxq_unlock(&uq->uq_key);
948			umtx_key_release(&uq->uq_key);
949			return (EFAULT);
950		}
951
952		/*
953		 * If we set the contested bit, sleep.  Otherwise the lock
954		 * changed and we need to retry, or we lost a race with the
955		 * thread unlocking the umtx.
956		 */
957		umtxq_lock(&uq->uq_key);
958		if (old == owner)
959			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
960			    NULL : &timo);
961		umtxq_remove(uq);
962		umtxq_unlock(&uq->uq_key);
963		umtx_key_release(&uq->uq_key);
964	}
965
966	if (timeout == NULL) {
967		/* Mutex locking is restarted if it is interrupted. */
968		if (error == EINTR)
969			error = ERESTART;
970	} else {
971		/* Timed-locking is not restarted. */
972		if (error == ERESTART)
973			error = EINTR;
974	}
975	return (error);
976}
977
978/*
979 * Unlock a umtx object.
980 */
981static int
982do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
983{
984	struct umtx_key key;
985	uint32_t owner;
986	uint32_t old;
987	int error;
988	int count;
989
990	/*
991	 * Make sure we own this mtx.
992	 */
993	owner = fuword32(m);
994	if (owner == -1)
995		return (EFAULT);
996
997	if ((owner & ~UMUTEX_CONTESTED) != id)
998		return (EPERM);
999
1000	/* This should be done in userland */
1001	if ((owner & UMUTEX_CONTESTED) == 0) {
1002		old = casuword32(m, owner, UMUTEX_UNOWNED);
1003		if (old == -1)
1004			return (EFAULT);
1005		if (old == owner)
1006			return (0);
1007		owner = old;
1008	}
1009
1010	/* We should only ever be in here for contested locks */
1011	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
1012		&key)) != 0)
1013		return (error);
1014
1015	umtxq_lock(&key);
1016	umtxq_busy(&key);
1017	count = umtxq_count(&key);
1018	umtxq_unlock(&key);
1019
1020	/*
1021	 * When unlocking the umtx, it must be marked as unowned if
1022	 * there is zero or one thread only waiting for it.
1023	 * Otherwise, it must be marked as contested.
1024	 */
1025	old = casuword32(m, owner,
1026		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1027	umtxq_lock(&key);
1028	umtxq_signal(&key,1);
1029	umtxq_unbusy(&key);
1030	umtxq_unlock(&key);
1031	umtx_key_release(&key);
1032	if (old == -1)
1033		return (EFAULT);
1034	if (old != owner)
1035		return (EINVAL);
1036	return (0);
1037}
1038#endif
1039
1040/*
1041 * Fetch and compare the value; sleep on the address if it is unchanged.
1042 */
1043static int
1044do_wait(struct thread *td, void *addr, u_long id,
1045	struct _umtx_time *timeout, int compat32, int is_private)
1046{
1047	struct abs_timeout timo;
1048	struct umtx_q *uq;
1049	u_long tmp;
1050	int error = 0;
1051
1052	uq = td->td_umtxq;
1053	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1054		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1055		return (error);
1056
1057	if (timeout != NULL)
1058		abs_timeout_init2(&timo, timeout);
1059
1060	umtxq_lock(&uq->uq_key);
1061	umtxq_insert(uq);
1062	umtxq_unlock(&uq->uq_key);
1063	if (compat32 == 0)
1064		tmp = fuword(addr);
1065	else
1066		tmp = (unsigned int)fuword32(addr);
1067	umtxq_lock(&uq->uq_key);
1068	if (tmp == id)
1069		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
1070		    NULL : &timo);
1071	if ((uq->uq_flags & UQF_UMTXQ) == 0)
1072		error = 0;
1073	else
1074		umtxq_remove(uq);
1075	umtxq_unlock(&uq->uq_key);
1076	umtx_key_release(&uq->uq_key);
1077	if (error == ERESTART)
1078		error = EINTR;
1079	return (error);
1080}
1081
1082/*
1083 * Wake up threads sleeping on the specified address.
1084 */
1085int
1086kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1087{
1088	struct umtx_key key;
1089	int ret;
1090
1091	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1092		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1093		return (ret);
1094	umtxq_lock(&key);
1095	ret = umtxq_signal(&key, n_wake);
1096	umtxq_unlock(&key);
1097	umtx_key_release(&key);
1098	return (0);
1099}
1100
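/*
 * do_wait() and kern_umtx_wake() form the futex-style primitive
 * behind the UMTX_OP_WAIT/UMTX_OP_WAKE operations.  A minimal
 * userland pairing might look like this (a sketch; LOCKED and
 * UNLOCKED are placeholder values and error handling is omitted):
 *
 *	while (atomic_load_acq_long(&word) == LOCKED)
 *		_umtx_op(&word, UMTX_OP_WAIT, LOCKED, NULL, NULL);
 *	...
 *	atomic_store_rel_long(&word, UNLOCKED);
 *	_umtx_op(&word, UMTX_OP_WAKE, 1, NULL, NULL);
 */
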
1101/*
1102 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1103 */
1104static int
1105do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1106	struct _umtx_time *timeout, int mode)
1107{
1108	struct abs_timeout timo;
1109	struct umtx_q *uq;
1110	uint32_t owner, old, id;
1111	int error = 0;
1112
1113	id = td->td_tid;
1114	uq = td->td_umtxq;
1115
1116	if (timeout != NULL)
1117		abs_timeout_init2(&timo, timeout);
1118
1119	/*
1120	 * Care must be exercised when dealing with the umtx structure.  It
1121	 * can fault on any access.
1122	 */
1123	for (;;) {
1124		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1125		if (mode == _UMUTEX_WAIT) {
1126			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1127				return (0);
1128		} else {
1129			/*
1130			 * Try the uncontested case.  This should be done in userland.
1131			 */
1132			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1133
1134			/* The acquire succeeded. */
1135			if (owner == UMUTEX_UNOWNED)
1136				return (0);
1137
1138			/* The address was invalid. */
1139			if (owner == -1)
1140				return (EFAULT);
1141
1142			/* If no one owns it but it is contested try to acquire it. */
1143			if (owner == UMUTEX_CONTESTED) {
1144				owner = casuword32(&m->m_owner,
1145				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1146
1147				if (owner == UMUTEX_CONTESTED)
1148					return (0);
1149
1150				/* The address was invalid. */
1151				if (owner == -1)
1152					return (EFAULT);
1153
1154				/* If this failed the lock has changed, restart. */
1155				continue;
1156			}
1157		}
1158
1159		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1160		    (owner & ~UMUTEX_CONTESTED) == id)
1161			return (EDEADLK);
1162
1163		if (mode == _UMUTEX_TRY)
1164			return (EBUSY);
1165
1166		/*
1167		 * If we caught a signal, we have already retried;
1168		 * exit immediately now.
1169		 */
1170		if (error != 0)
1171			return (error);
1172
1173		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1174		    GET_SHARE(flags), &uq->uq_key)) != 0)
1175			return (error);
1176
1177		umtxq_lock(&uq->uq_key);
1178		umtxq_busy(&uq->uq_key);
1179		umtxq_insert(uq);
1180		umtxq_unlock(&uq->uq_key);
1181
1182		/*
1183		 * Set the contested bit so that a release in user space
1184		 * knows to use the system call for unlock.  If this fails
1185		 * either some one else has acquired the lock or it has been
1186		 * released.
1187		 */
1188		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1189
1190		/* The address was invalid. */
1191		if (old == -1) {
1192			umtxq_lock(&uq->uq_key);
1193			umtxq_remove(uq);
1194			umtxq_unbusy(&uq->uq_key);
1195			umtxq_unlock(&uq->uq_key);
1196			umtx_key_release(&uq->uq_key);
1197			return (EFAULT);
1198		}
1199
1200		/*
1201		 * If we set the contested bit, sleep.  Otherwise the lock
1202		 * changed and we need to retry, or we lost a race with the
1203		 * thread unlocking the umtx.
1204		 */
1205		umtxq_lock(&uq->uq_key);
1206		umtxq_unbusy(&uq->uq_key);
1207		if (old == owner)
1208			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1209			    NULL : &timo);
1210		umtxq_remove(uq);
1211		umtxq_unlock(&uq->uq_key);
1212		umtx_key_release(&uq->uq_key);
1213	}
1214
1215	return (0);
1216}
1217
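/*
 * The m_owner word encodes both the owner and the waiter state.  With
 * an illustrative tid of 100123: an uncontested lock stores 100123;
 * once a waiter queues it becomes (100123 | UMUTEX_CONTESTED), which
 * forces the owner through do_unlock_normal() so the waiter is woken.
 */
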
1221/*
1222 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1223 */
1224static int
1225do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1226{
1227	struct umtx_key key;
1228	uint32_t owner, old, id;
1229	int error;
1230	int count;
1231
1232	id = td->td_tid;
1233	/*
1234	 * Make sure we own this mtx.
1235	 */
1236	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1237	if (owner == -1)
1238		return (EFAULT);
1239
1240	if ((owner & ~UMUTEX_CONTESTED) != id)
1241		return (EPERM);
1242
1243	if ((owner & UMUTEX_CONTESTED) == 0) {
1244		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1245		if (old == -1)
1246			return (EFAULT);
1247		if (old == owner)
1248			return (0);
1249		owner = old;
1250	}
1251
1252	/* We should only ever be in here for contested locks */
1253	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1254	    &key)) != 0)
1255		return (error);
1256
1257	umtxq_lock(&key);
1258	umtxq_busy(&key);
1259	count = umtxq_count(&key);
1260	umtxq_unlock(&key);
1261
1262	/*
1263	 * When unlocking the umtx, it must be marked as unowned if
1264	 * there is zero or one thread only waiting for it.
1265	 * Otherwise, it must be marked as contested.
1266	 */
1267	old = casuword32(&m->m_owner, owner,
1268		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1269	umtxq_lock(&key);
1270	umtxq_signal(&key,1);
1271	umtxq_unbusy(&key);
1272	umtxq_unlock(&key);
1273	umtx_key_release(&key);
1274	if (old == -1)
1275		return (EFAULT);
1276	if (old != owner)
1277		return (EINVAL);
1278	return (0);
1279}
1280
1281/*
1282 * Check if the mutex is available and wake up a waiter;
1283 * this is only for simple (PTHREAD_PRIO_NONE) mutexes.
1284 */
1285static int
1286do_wake_umutex(struct thread *td, struct umutex *m)
1287{
1288	struct umtx_key key;
1289	uint32_t owner;
1290	uint32_t flags;
1291	int error;
1292	int count;
1293
1294	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1295	if (owner == -1)
1296		return (EFAULT);
1297
1298	if ((owner & ~UMUTEX_CONTESTED) != 0)
1299		return (0);
1300
1301	flags = fuword32(&m->m_flags);
1302
1303	/* We should only ever be in here for contested locks */
1304	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1305	    &key)) != 0)
1306		return (error);
1307
1308	umtxq_lock(&key);
1309	umtxq_busy(&key);
1310	count = umtxq_count(&key);
1311	umtxq_unlock(&key);
1312
1313	if (count <= 1)
1314		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1315
1316	umtxq_lock(&key);
1317	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1318		umtxq_signal(&key, 1);
1319	umtxq_unbusy(&key);
1320	umtxq_unlock(&key);
1321	umtx_key_release(&key);
1322	return (0);
1323}
1324
1325static inline struct umtx_pi *
1326umtx_pi_alloc(int flags)
1327{
1328	struct umtx_pi *pi;
1329
1330	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1331	TAILQ_INIT(&pi->pi_blocked);
1332	atomic_add_int(&umtx_pi_allocated, 1);
1333	return (pi);
1334}
1335
1336static inline void
1337umtx_pi_free(struct umtx_pi *pi)
1338{
1339	uma_zfree(umtx_pi_zone, pi);
1340	atomic_add_int(&umtx_pi_allocated, -1);
1341}
1342
1343/*
1344 * Adjust the thread's position on the PI mutex's blocked list after
1345 * its priority has been changed.
1346 */
1347static int
1348umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1349{
1350	struct umtx_q *uq, *uq1, *uq2;
1351	struct thread *td1;
1352
1353	mtx_assert(&umtx_lock, MA_OWNED);
1354	if (pi == NULL)
1355		return (0);
1356
1357	uq = td->td_umtxq;
1358
1359	/*
1360	 * Check if the thread needs to be moved on the blocked chain.
1361	 * It needs to be moved if either its priority is lower than
1362	 * the previous thread or higher than the next thread.
1363	 */
1364	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1365	uq2 = TAILQ_NEXT(uq, uq_lockq);
1366	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1367	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1368		/*
1369		 * Remove thread from blocked chain and determine where
1370		 * it should be moved to.
1371		 */
1372		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1373		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1374			td1 = uq1->uq_thread;
1375			MPASS(td1->td_proc->p_magic == P_MAGIC);
1376			if (UPRI(td1) > UPRI(td))
1377				break;
1378		}
1379
1380		if (uq1 == NULL)
1381			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1382		else
1383			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1384	}
1385	return (1);
1386}
1387
1388/*
1389 * Propagate priority when a thread is blocked on a POSIX
1390 * PI mutex.
1391 */
1392static void
1393umtx_propagate_priority(struct thread *td)
1394{
1395	struct umtx_q *uq;
1396	struct umtx_pi *pi;
1397	int pri;
1398
1399	mtx_assert(&umtx_lock, MA_OWNED);
1400	pri = UPRI(td);
1401	uq = td->td_umtxq;
1402	pi = uq->uq_pi_blocked;
1403	if (pi == NULL)
1404		return;
1405
1406	for (;;) {
1407		td = pi->pi_owner;
1408		if (td == NULL || td == curthread)
1409			return;
1410
1411		MPASS(td->td_proc != NULL);
1412		MPASS(td->td_proc->p_magic == P_MAGIC);
1413
1414		thread_lock(td);
1415		if (td->td_lend_user_pri > pri)
1416			sched_lend_user_prio(td, pri);
1417		else {
1418			thread_unlock(td);
1419			break;
1420		}
1421		thread_unlock(td);
1422
1423		/*
1424		 * Pick up the lock that td is blocked on.
1425		 */
1426		uq = td->td_umtxq;
1427		pi = uq->uq_pi_blocked;
1428		if (pi == NULL)
1429			break;
1430		/* Resort td on the list if needed. */
1431		umtx_pi_adjust_thread(pi, td);
1432	}
1433}
1434
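/*
 * Worked example with hypothetical priorities: thread C (UPRI 120)
 * blocks on M1, owned by A (UPRI 150), while A is itself blocked on
 * M2, owned by B (UPRI 160).  The loop above lends 120 to A, follows
 * A's uq_pi_blocked link to M2 and lends 120 to B, stopping once an
 * owner is not blocked or already runs at 120 or better.
 */
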
1435/*
1436 * Unpropagate priority for a PI mutex when a thread blocked on
1437 * it is interrupted by a signal or resumed by other threads.
1438 */
1439static void
1440umtx_repropagate_priority(struct umtx_pi *pi)
1441{
1442	struct umtx_q *uq, *uq_owner;
1443	struct umtx_pi *pi2;
1444	int pri;
1445
1446	mtx_assert(&umtx_lock, MA_OWNED);
1447
1448	while (pi != NULL && pi->pi_owner != NULL) {
1449		pri = PRI_MAX;
1450		uq_owner = pi->pi_owner->td_umtxq;
1451
1452		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1453			uq = TAILQ_FIRST(&pi2->pi_blocked);
1454			if (uq != NULL) {
1455				if (pri > UPRI(uq->uq_thread))
1456					pri = UPRI(uq->uq_thread);
1457			}
1458		}
1459
1460		if (pri > uq_owner->uq_inherited_pri)
1461			pri = uq_owner->uq_inherited_pri;
1462		thread_lock(pi->pi_owner);
1463		sched_lend_user_prio(pi->pi_owner, pri);
1464		thread_unlock(pi->pi_owner);
1465		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1466			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1467	}
1468}
1469
1470/*
1471 * Insert a PI mutex into the owner's contested list.
1472 */
1473static void
1474umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1475{
1476	struct umtx_q *uq_owner;
1477
1478	uq_owner = owner->td_umtxq;
1479	mtx_assert(&umtx_lock, MA_OWNED);
1480	if (pi->pi_owner != NULL)
1481		panic("pi_owner != NULL");
1482	pi->pi_owner = owner;
1483	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1484}
1485
1486/*
1487 * Claim ownership of a PI mutex.
1488 */
1489static int
1490umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1491{
1492	struct umtx_q *uq, *uq_owner;
1493
1494	uq_owner = owner->td_umtxq;
1495	mtx_lock_spin(&umtx_lock);
1496	if (pi->pi_owner == owner) {
1497		mtx_unlock_spin(&umtx_lock);
1498		return (0);
1499	}
1500
1501	if (pi->pi_owner != NULL) {
1502		/*
1503		 * userland may have already messed up the mutex, sigh.
1504		 */
1505		mtx_unlock_spin(&umtx_lock);
1506		return (EPERM);
1507	}
1508	umtx_pi_setowner(pi, owner);
1509	uq = TAILQ_FIRST(&pi->pi_blocked);
1510	if (uq != NULL) {
1511		int pri;
1512
1513		pri = UPRI(uq->uq_thread);
1514		thread_lock(owner);
1515		if (pri < UPRI(owner))
1516			sched_lend_user_prio(owner, pri);
1517		thread_unlock(owner);
1518	}
1519	mtx_unlock_spin(&umtx_lock);
1520	return (0);
1521}
1522
1523/*
1524 * Adjust a thread's position on the blocked list of the PI mutex
1525 * it is sleeping on; this may start a new round of priority propagation.
1526 */
1527void
1528umtx_pi_adjust(struct thread *td, u_char oldpri)
1529{
1530	struct umtx_q *uq;
1531	struct umtx_pi *pi;
1532
1533	uq = td->td_umtxq;
1534	mtx_lock_spin(&umtx_lock);
1535	/*
1536	 * Pick up the lock that td is blocked on.
1537	 */
1538	pi = uq->uq_pi_blocked;
1539	if (pi != NULL) {
1540		umtx_pi_adjust_thread(pi, td);
1541		umtx_repropagate_priority(pi);
1542	}
1543	mtx_unlock_spin(&umtx_lock);
1544}
1545
1546/*
1547 * Sleep on a PI mutex.
1548 */
1549static int
1550umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1551	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1552{
1553	struct umtxq_chain *uc;
1554	struct thread *td, *td1;
1555	struct umtx_q *uq1;
1556	int pri;
1557	int error = 0;
1558
1559	td = uq->uq_thread;
1560	KASSERT(td == curthread, ("inconsistent uq_thread"));
1561	uc = umtxq_getchain(&uq->uq_key);
1562	UMTXQ_LOCKED_ASSERT(uc);
1563	UMTXQ_BUSY_ASSERT(uc);
1564	umtxq_insert(uq);
1565	mtx_lock_spin(&umtx_lock);
1566	if (pi->pi_owner == NULL) {
1567		mtx_unlock_spin(&umtx_lock);
1568		/* XXX Only look up thread in current process. */
1569		td1 = tdfind(owner, curproc->p_pid);
1570		mtx_lock_spin(&umtx_lock);
1571		if (td1 != NULL) {
1572			if (pi->pi_owner == NULL)
1573				umtx_pi_setowner(pi, td1);
1574			PROC_UNLOCK(td1->td_proc);
1575		}
1576	}
1577
1578	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1579		pri = UPRI(uq1->uq_thread);
1580		if (pri > UPRI(td))
1581			break;
1582	}
1583
1584	if (uq1 != NULL)
1585		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1586	else
1587		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1588
1589	uq->uq_pi_blocked = pi;
1590	thread_lock(td);
1591	td->td_flags |= TDF_UPIBLOCKED;
1592	thread_unlock(td);
1593	umtx_propagate_priority(td);
1594	mtx_unlock_spin(&umtx_lock);
1595	umtxq_unbusy(&uq->uq_key);
1596
1597	error = umtxq_sleep(uq, wmesg, timo);
1598	umtxq_remove(uq);
1599
1600	mtx_lock_spin(&umtx_lock);
1601	uq->uq_pi_blocked = NULL;
1602	thread_lock(td);
1603	td->td_flags &= ~TDF_UPIBLOCKED;
1604	thread_unlock(td);
1605	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1606	umtx_repropagate_priority(pi);
1607	mtx_unlock_spin(&umtx_lock);
1608	umtxq_unlock(&uq->uq_key);
1609
1610	return (error);
1611}
1612
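/*
 * Lock ordering in umtxq_sleep_pi() and the other PI paths: the chain
 * mutex is acquired first, the global umtx_lock spin mutex nests
 * inside it, and thread_lock() nests innermost; umtx_lock is always
 * dropped before sleeping.
 */
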
1613/*
1614 * Increment the reference count of a PI mutex.
1615 */
1616static void
1617umtx_pi_ref(struct umtx_pi *pi)
1618{
1619	struct umtxq_chain *uc;
1620
1621	uc = umtxq_getchain(&pi->pi_key);
1622	UMTXQ_LOCKED_ASSERT(uc);
1623	pi->pi_refcount++;
1624}
1625
1626/*
1627 * Decrease the reference count of a PI mutex; if the counter
1628 * drops to zero, its memory is freed.
1629 */
1630static void
1631umtx_pi_unref(struct umtx_pi *pi)
1632{
1633	struct umtxq_chain *uc;
1634
1635	uc = umtxq_getchain(&pi->pi_key);
1636	UMTXQ_LOCKED_ASSERT(uc);
1637	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1638	if (--pi->pi_refcount == 0) {
1639		mtx_lock_spin(&umtx_lock);
1640		if (pi->pi_owner != NULL) {
1641			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1642				pi, pi_link);
1643			pi->pi_owner = NULL;
1644		}
1645		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1646			("blocked queue not empty"));
1647		mtx_unlock_spin(&umtx_lock);
1648		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1649		umtx_pi_free(pi);
1650	}
1651}
1652
1653/*
1654 * Find a PI mutex in the hash table.
1655 */
1656static struct umtx_pi *
1657umtx_pi_lookup(struct umtx_key *key)
1658{
1659	struct umtxq_chain *uc;
1660	struct umtx_pi *pi;
1661
1662	uc = umtxq_getchain(key);
1663	UMTXQ_LOCKED_ASSERT(uc);
1664
1665	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1666		if (umtx_key_match(&pi->pi_key, key)) {
1667			return (pi);
1668		}
1669	}
1670	return (NULL);
1671}
1672
1673/*
1674 * Insert a PI mutex into the hash table.
1675 */
1676static inline void
1677umtx_pi_insert(struct umtx_pi *pi)
1678{
1679	struct umtxq_chain *uc;
1680
1681	uc = umtxq_getchain(&pi->pi_key);
1682	UMTXQ_LOCKED_ASSERT(uc);
1683	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1684}
1685
1686/*
1687 * Lock a PI mutex.
1688 */
1689static int
1690do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1691    struct _umtx_time *timeout, int try)
1692{
1693	struct abs_timeout timo;
1694	struct umtx_q *uq;
1695	struct umtx_pi *pi, *new_pi;
1696	uint32_t id, owner, old;
1697	int error;
1698
1699	id = td->td_tid;
1700	uq = td->td_umtxq;
1701
1702	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1703	    &uq->uq_key)) != 0)
1704		return (error);
1705
1706	if (timeout != NULL)
1707		abs_timeout_init2(&timo, timeout);
1708
1709	umtxq_lock(&uq->uq_key);
1710	pi = umtx_pi_lookup(&uq->uq_key);
1711	if (pi == NULL) {
1712		new_pi = umtx_pi_alloc(M_NOWAIT);
1713		if (new_pi == NULL) {
1714			umtxq_unlock(&uq->uq_key);
1715			new_pi = umtx_pi_alloc(M_WAITOK);
1716			umtxq_lock(&uq->uq_key);
1717			pi = umtx_pi_lookup(&uq->uq_key);
1718			if (pi != NULL) {
1719				umtx_pi_free(new_pi);
1720				new_pi = NULL;
1721			}
1722		}
1723		if (new_pi != NULL) {
1724			new_pi->pi_key = uq->uq_key;
1725			umtx_pi_insert(new_pi);
1726			pi = new_pi;
1727		}
1728	}
1729	umtx_pi_ref(pi);
1730	umtxq_unlock(&uq->uq_key);
1731
1732	/*
1733	 * Care must be exercised when dealing with the umtx structure.  It
1734	 * can fault on any access.
1735	 */
1736	for (;;) {
1737		/*
1738		 * Try the uncontested case.  This should be done in userland.
1739		 */
1740		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1741
1742		/* The acquire succeeded. */
1743		if (owner == UMUTEX_UNOWNED) {
1744			error = 0;
1745			break;
1746		}
1747
1748		/* The address was invalid. */
1749		if (owner == -1) {
1750			error = EFAULT;
1751			break;
1752		}
1753
1754		/* If no one owns it but it is contested try to acquire it. */
1755		if (owner == UMUTEX_CONTESTED) {
1756			owner = casuword32(&m->m_owner,
1757			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1758
1759			if (owner == UMUTEX_CONTESTED) {
1760				umtxq_lock(&uq->uq_key);
1761				umtxq_busy(&uq->uq_key);
1762				error = umtx_pi_claim(pi, td);
1763				umtxq_unbusy(&uq->uq_key);
1764				umtxq_unlock(&uq->uq_key);
1765				break;
1766			}
1767
1768			/* The address was invalid. */
1769			if (owner == -1) {
1770				error = EFAULT;
1771				break;
1772			}
1773
1774			/* If this failed the lock has changed, restart. */
1775			continue;
1776		}
1777
1778		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1779		    (owner & ~UMUTEX_CONTESTED) == id) {
1780			error = EDEADLK;
1781			break;
1782		}
1783
1784		if (try != 0) {
1785			error = EBUSY;
1786			break;
1787		}
1788
1789		/*
1790		 * If we caught a signal, we have already retried;
1791		 * exit immediately now.
1792		 */
1793		if (error != 0)
1794			break;
1795
1796		umtxq_lock(&uq->uq_key);
1797		umtxq_busy(&uq->uq_key);
1798		umtxq_unlock(&uq->uq_key);
1799
1800		/*
1801		 * Set the contested bit so that a release in user space
1802		 * knows to use the system call for unlock.  If this fails
1803		 * either some one else has acquired the lock or it has been
1804		 * released.
1805		 */
1806		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1807
1808		/* The address was invalid. */
1809		if (old == -1) {
1810			umtxq_lock(&uq->uq_key);
1811			umtxq_unbusy(&uq->uq_key);
1812			umtxq_unlock(&uq->uq_key);
1813			error = EFAULT;
1814			break;
1815		}
1816
1817		umtxq_lock(&uq->uq_key);
1818		/*
1819		 * If we set the contested bit, sleep.  Otherwise the lock
1820		 * changed and we need to retry, or we lost a race with the
1821		 * thread unlocking the umtx.
1822		 */
1823		if (old == owner)
1824			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1825			    "umtxpi", timeout == NULL ? NULL : &timo);
1826		else {
1827			umtxq_unbusy(&uq->uq_key);
1828			umtxq_unlock(&uq->uq_key);
1829		}
1830	}
1831
1832	umtxq_lock(&uq->uq_key);
1833	umtx_pi_unref(pi);
1834	umtxq_unlock(&uq->uq_key);
1835
1836	umtx_key_release(&uq->uq_key);
1837	return (error);
1838}
1839
1840/*
1841 * Unlock a PI mutex.
1842 */
1843static int
1844do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1845{
1846	struct umtx_key key;
1847	struct umtx_q *uq_first, *uq_first2, *uq_me;
1848	struct umtx_pi *pi, *pi2;
1849	uint32_t owner, old, id;
1850	int error;
1851	int count;
1852	int pri;
1853
1854	id = td->td_tid;
1855	/*
1856	 * Make sure we own this mtx.
1857	 */
1858	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1859	if (owner == -1)
1860		return (EFAULT);
1861
1862	if ((owner & ~UMUTEX_CONTESTED) != id)
1863		return (EPERM);
1864
1865	/* This should be done in userland */
1866	if ((owner & UMUTEX_CONTESTED) == 0) {
1867		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1868		if (old == -1)
1869			return (EFAULT);
1870		if (old == owner)
1871			return (0);
1872		owner = old;
1873	}
1874
1875	/* We should only ever be in here for contested locks */
1876	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1877	    &key)) != 0)
1878		return (error);
1879
1880	umtxq_lock(&key);
1881	umtxq_busy(&key);
1882	count = umtxq_count_pi(&key, &uq_first);
1883	if (uq_first != NULL) {
1884		mtx_lock_spin(&umtx_lock);
1885		pi = uq_first->uq_pi_blocked;
1886		KASSERT(pi != NULL, ("pi == NULL?"));
1887		if (pi->pi_owner != curthread) {
1888			mtx_unlock_spin(&umtx_lock);
1889			umtxq_unbusy(&key);
1890			umtxq_unlock(&key);
1891			umtx_key_release(&key);
1892			/* userland messed up the mutex */
1893			return (EPERM);
1894		}
1895		uq_me = curthread->td_umtxq;
1896		pi->pi_owner = NULL;
1897		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1898		/* Get the highest-priority thread which is still sleeping. */
1899		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1900		while (uq_first != NULL &&
1901		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1902			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1903		}
1904		pri = PRI_MAX;
1905		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1906			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1907			if (uq_first2 != NULL) {
1908				if (pri > UPRI(uq_first2->uq_thread))
1909					pri = UPRI(uq_first2->uq_thread);
1910			}
1911		}
1912		thread_lock(curthread);
1913		sched_lend_user_prio(curthread, pri);
1914		thread_unlock(curthread);
1915		mtx_unlock_spin(&umtx_lock);
1916		if (uq_first)
1917			umtxq_signal_thread(uq_first);
1918	}
1919	umtxq_unlock(&key);
1920
1921	/*
1922	 * When unlocking the umtx, it must be marked as unowned if
1923	 * there is zero or one thread only waiting for it.
1924	 * Otherwise, it must be marked as contested.
1925	 */
1926	old = casuword32(&m->m_owner, owner,
1927		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1928
1929	umtxq_lock(&key);
1930	umtxq_unbusy(&key);
1931	umtxq_unlock(&key);
1932	umtx_key_release(&key);
1933	if (old == -1)
1934		return (EFAULT);
1935	if (old != owner)
1936		return (EINVAL);
1937	return (0);
1938}
1939
1940/*
1941 * Lock a PP mutex.
1942 */
1943static int
1944do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
1945    struct _umtx_time *timeout, int try)
1946{
1947	struct abs_timeout timo;
1948	struct umtx_q *uq, *uq2;
1949	struct umtx_pi *pi;
1950	uint32_t ceiling;
1951	uint32_t owner, id;
1952	int error, pri, old_inherited_pri, su;
1953
1954	id = td->td_tid;
1955	uq = td->td_umtxq;
1956	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1957	    &uq->uq_key)) != 0)
1958		return (error);
1959
1960	if (timeout != NULL)
1961		abs_timeout_init2(&timo, timeout);
1962
1963	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1964	for (;;) {
1965		old_inherited_pri = uq->uq_inherited_pri;
1966		umtxq_lock(&uq->uq_key);
1967		umtxq_busy(&uq->uq_key);
1968		umtxq_unlock(&uq->uq_key);
1969
1970		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1971		if (ceiling > RTP_PRIO_MAX) {
1972			error = EINVAL;
1973			goto out;
1974		}
1975
1976		mtx_lock_spin(&umtx_lock);
1977		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1978			mtx_unlock_spin(&umtx_lock);
1979			error = EINVAL;
1980			goto out;
1981		}
1982		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1983			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1984			thread_lock(td);
1985			if (uq->uq_inherited_pri < UPRI(td))
1986				sched_lend_user_prio(td, uq->uq_inherited_pri);
1987			thread_unlock(td);
1988		}
1989		mtx_unlock_spin(&umtx_lock);
1990
1991		owner = casuword32(&m->m_owner,
1992		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1993
1994		if (owner == UMUTEX_CONTESTED) {
1995			error = 0;
1996			break;
1997		}
1998
1999		/* The address was invalid. */
2000		if (owner == -1) {
2001			error = EFAULT;
2002			break;
2003		}
2004
2005		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2006		    (owner & ~UMUTEX_CONTESTED) == id) {
2007			error = EDEADLK;
2008			break;
2009		}
2010
2011		if (try != 0) {
2012			error = EBUSY;
2013			break;
2014		}
2015
2016		/*
2017		 * If we caught a signal, we have already retried;
2018		 * exit immediately now.
2019		 */
2020		if (error != 0)
2021			break;
2022
2023		umtxq_lock(&uq->uq_key);
2024		umtxq_insert(uq);
2025		umtxq_unbusy(&uq->uq_key);
2026		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2027		    NULL : &timo);
2028		umtxq_remove(uq);
2029		umtxq_unlock(&uq->uq_key);
2030
2031		mtx_lock_spin(&umtx_lock);
2032		uq->uq_inherited_pri = old_inherited_pri;
2033		pri = PRI_MAX;
2034		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2035			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2036			if (uq2 != NULL) {
2037				if (pri > UPRI(uq2->uq_thread))
2038					pri = UPRI(uq2->uq_thread);
2039			}
2040		}
2041		if (pri > uq->uq_inherited_pri)
2042			pri = uq->uq_inherited_pri;
2043		thread_lock(td);
2044		sched_lend_user_prio(td, pri);
2045		thread_unlock(td);
2046		mtx_unlock_spin(&umtx_lock);
2047	}
2048
2049	if (error != 0) {
2050		mtx_lock_spin(&umtx_lock);
2051		uq->uq_inherited_pri = old_inherited_pri;
2052		pri = PRI_MAX;
2053		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2054			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2055			if (uq2 != NULL) {
2056				if (pri > UPRI(uq2->uq_thread))
2057					pri = UPRI(uq2->uq_thread);
2058			}
2059		}
2060		if (pri > uq->uq_inherited_pri)
2061			pri = uq->uq_inherited_pri;
2062		thread_lock(td);
2063		sched_lend_user_prio(td, pri);
2064		thread_unlock(td);
2065		mtx_unlock_spin(&umtx_lock);
2066	}
2067
2068out:
2069	umtxq_lock(&uq->uq_key);
2070	umtxq_unbusy(&uq->uq_key);
2071	umtxq_unlock(&uq->uq_key);
2072	umtx_key_release(&uq->uq_key);
2073	return (error);
2074}
2075
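/*
 * Ceiling arithmetic example (illustrative): a userland ceiling of 10
 * stored in m_ceilings[0] maps to the kernel priority
 * PRI_MIN_REALTIME + (RTP_PRIO_MAX - 10), so a larger userland
 * ceiling yields a numerically smaller, i.e. stronger, kernel
 * priority.
 */
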
2076/*
2077 * Unlock a PP mutex.
2078 */
2079static int
2080do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2081{
2082	struct umtx_key key;
2083	struct umtx_q *uq, *uq2;
2084	struct umtx_pi *pi;
2085	uint32_t owner, id;
2086	uint32_t rceiling;
2087	int error, pri, new_inherited_pri, su;
2088
2089	id = td->td_tid;
2090	uq = td->td_umtxq;
2091	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2092
2093	/*
2094	 * Make sure we own this mtx.
2095	 */
2096	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2097	if (owner == -1)
2098		return (EFAULT);
2099
2100	if ((owner & ~UMUTEX_CONTESTED) != id)
2101		return (EPERM);
2102
2103	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2104	if (error != 0)
2105		return (error);
2106
2107	if (rceiling == -1)
2108		new_inherited_pri = PRI_MAX;
2109	else {
2110		rceiling = RTP_PRIO_MAX - rceiling;
2111		if (rceiling > RTP_PRIO_MAX)
2112			return (EINVAL);
2113		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2114	}
2115
2116	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2117	    &key)) != 0)
2118		return (error);
2119	umtxq_lock(&key);
2120	umtxq_busy(&key);
2121	umtxq_unlock(&key);
2122	/*
2123	 * For a priority-protected mutex, always set the unlocked state
2124	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2125	 * to lock the mutex; this is necessary because the thread
2126	 * priority has to be adjusted for such mutexes.
2127	 */
2128	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2129		UMUTEX_CONTESTED);
2130
2131	umtxq_lock(&key);
2132	if (error == 0)
2133		umtxq_signal(&key, 1);
2134	umtxq_unbusy(&key);
2135	umtxq_unlock(&key);
2136
2137	if (error == -1)
2138		error = EFAULT;
2139	else {
2140		mtx_lock_spin(&umtx_lock);
2141		if (su != 0)
2142			uq->uq_inherited_pri = new_inherited_pri;
2143		pri = PRI_MAX;
2144		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2145			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2146			if (uq2 != NULL) {
2147				if (pri > UPRI(uq2->uq_thread))
2148					pri = UPRI(uq2->uq_thread);
2149			}
2150		}
2151		if (pri > uq->uq_inherited_pri)
2152			pri = uq->uq_inherited_pri;
2153		thread_lock(td);
2154		sched_lend_user_prio(td, pri);
2155		thread_unlock(td);
2156		mtx_unlock_spin(&umtx_lock);
2157	}
2158	umtx_key_release(&key);
2159	return (error);
2160}
2161
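/*
 * Set the priority ceiling of a PP mutex, returning the previous
 * ceiling through old_ceiling when it is non-NULL.
 */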
2162static int
2163do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2164	uint32_t *old_ceiling)
2165{
2166	struct umtx_q *uq;
2167	uint32_t save_ceiling;
2168	uint32_t owner, id;
2169	uint32_t flags;
2170	int error;
2171
2172	flags = fuword32(&m->m_flags);
2173	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2174		return (EINVAL);
2175	if (ceiling > RTP_PRIO_MAX)
2176		return (EINVAL);
2177	id = td->td_tid;
2178	uq = td->td_umtxq;
2179	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2180	   &uq->uq_key)) != 0)
2181		return (error);
2182	for (;;) {
2183		umtxq_lock(&uq->uq_key);
2184		umtxq_busy(&uq->uq_key);
2185		umtxq_unlock(&uq->uq_key);
2186
2187		save_ceiling = fuword32(&m->m_ceilings[0]);
2188
2189		owner = casuword32(&m->m_owner,
2190		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2191
2192		if (owner == UMUTEX_CONTESTED) {
2193			suword32(&m->m_ceilings[0], ceiling);
2194			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2195				UMUTEX_CONTESTED);
2196			error = 0;
2197			break;
2198		}
2199
2200		/* The address was invalid. */
2201		if (owner == -1) {
2202			error = EFAULT;
2203			break;
2204		}
2205
2206		if ((owner & ~UMUTEX_CONTESTED) == id) {
2207			suword32(&m->m_ceilings[0], ceiling);
2208			error = 0;
2209			break;
2210		}
2211
2212		/*
2213	 * If we caught a signal during the previous sleep, we have
2214	 * already retried the lock above; exit immediately.
2215		 */
2216		if (error != 0)
2217			break;
2218
2219		/*
2220		 * The mutex is held by another thread; sleep until we
2221		 * are signaled by the unlocking thread, then retry the
2222		 * lock attempt above.
2223		 */
2224		umtxq_lock(&uq->uq_key);
2225		umtxq_insert(uq);
2226		umtxq_unbusy(&uq->uq_key);
2227		error = umtxq_sleep(uq, "umtxpp", NULL);
2228		umtxq_remove(uq);
2229		umtxq_unlock(&uq->uq_key);
2230	}
2231	umtxq_lock(&uq->uq_key);
2232	if (error == 0)
2233		umtxq_signal(&uq->uq_key, INT_MAX);
2234	umtxq_unbusy(&uq->uq_key);
2235	umtxq_unlock(&uq->uq_key);
2236	umtx_key_release(&uq->uq_key);
2237	if (error == 0 && old_ceiling != NULL)
2238		suword32(old_ceiling, save_ceiling);
2239	return (error);
2240}
2241
2242/*
2243 * Lock a userland POSIX mutex.
2244 */
2245static int
2246do_lock_umutex(struct thread *td, struct umutex *m,
2247    struct _umtx_time *timeout, int mode)
2248{
2249	uint32_t flags;
2250	int error;
2251
2252	flags = fuword32(&m->m_flags);
2253	if (flags == -1)
2254		return (EFAULT);
2255
2256	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2257	case 0:
2258		error = do_lock_normal(td, m, flags, timeout, mode);
2259		break;
2260	case UMUTEX_PRIO_INHERIT:
2261		error = do_lock_pi(td, m, flags, timeout, mode);
2262		break;
2263	case UMUTEX_PRIO_PROTECT:
2264		error = do_lock_pp(td, m, flags, timeout, mode);
2265		break;
2266	default:
2267		return (EINVAL);
2268	}
2269	if (timeout == NULL) {
2270		if (error == EINTR && mode != _UMUTEX_WAIT)
2271			error = ERESTART;
2272	} else {
2273		/* Timed-locking is not restarted. */
2274		if (error == ERESTART)
2275			error = EINTR;
2276	}
2277	return (error);
2278}
2279
2280/*
2281 * Unlock a userland POSIX mutex.
2282 */
2283static int
2284do_unlock_umutex(struct thread *td, struct umutex *m)
2285{
2286	uint32_t flags;
2287
2288	flags = fuword32(&m->m_flags);
2289	if (flags == -1)
2290		return (EFAULT);
2291
2292	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2293	case 0:
2294		return (do_unlock_normal(td, m, flags));
2295	case UMUTEX_PRIO_INHERIT:
2296		return (do_unlock_pi(td, m, flags));
2297	case UMUTEX_PRIO_PROTECT:
2298		return (do_unlock_pp(td, m, flags));
2299	}
2300
2301	return (EINVAL);
2302}
2303
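/*
 * Wait on a userland condition variable: queue the thread, mark the
 * condvar as having waiters, unlock the associated mutex and sleep.
 */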
2304static int
2305do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2306	struct timespec *timeout, u_long wflags)
2307{
2308	struct abs_timeout timo;
2309	struct umtx_q *uq;
2310	uint32_t flags;
2311	uint32_t clockid;
2312	int error;
2313
2314	uq = td->td_umtxq;
2315	flags = fuword32(&cv->c_flags);
2316	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2317	if (error != 0)
2318		return (error);
2319
2320	if ((wflags & CVWAIT_CLOCKID) != 0) {
2321		clockid = fuword32(&cv->c_clockid);
2322		if (clockid < CLOCK_REALTIME ||
2323		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2324			/* Only predefined clock ids are allowed. */
			umtx_key_release(&uq->uq_key);
2325			return (EINVAL);
2326		}
2327	} else {
2328		clockid = CLOCK_REALTIME;
2329	}
2330
2331	umtxq_lock(&uq->uq_key);
2332	umtxq_busy(&uq->uq_key);
2333	umtxq_insert(uq);
2334	umtxq_unlock(&uq->uq_key);
2335
2336	/*
2337	 * Set c_has_waiters to 1 before releasing the user mutex, but
2338	 * avoid dirtying the cache line when it is already set.
2339	 */
2340	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2341		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2342
2343	umtxq_lock(&uq->uq_key);
2344	umtxq_unbusy(&uq->uq_key);
2345	umtxq_unlock(&uq->uq_key);
2346
2347	error = do_unlock_umutex(td, m);
2348
2349	if (timeout != NULL)
2350		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2351			timeout);
2352
2353	umtxq_lock(&uq->uq_key);
2354	if (error == 0) {
2355		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2356		    NULL : &timo);
2357	}
2358
2359	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2360		error = 0;
2361	else {
2362		/*
2363		 * This must be a timeout, an interruption by a signal, or
2364		 * a spurious wakeup; clear the c_has_waiters flag when
2365		 * necessary.
2366		 */
2367		umtxq_busy(&uq->uq_key);
2368		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2369			int oldlen = uq->uq_cur_queue->length;
2370			umtxq_remove(uq);
2371			if (oldlen == 1) {
2372				umtxq_unlock(&uq->uq_key);
2373				suword32(
2374				    __DEVOLATILE(uint32_t *,
2375					 &cv->c_has_waiters), 0);
2376				umtxq_lock(&uq->uq_key);
2377			}
2378		}
2379		umtxq_unbusy(&uq->uq_key);
2380		if (error == ERESTART)
2381			error = EINTR;
2382	}
2383
2384	umtxq_unlock(&uq->uq_key);
2385	umtx_key_release(&uq->uq_key);
2386	return (error);
2387}
2388
2389/*
2390 * Signal a userland condition variable.
2391 */
2392static int
2393do_cv_signal(struct thread *td, struct ucond *cv)
2394{
2395	struct umtx_key key;
2396	int error, cnt, nwake;
2397	uint32_t flags;
2398
2399	flags = fuword32(&cv->c_flags);
2400	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2401		return (error);
2402	umtxq_lock(&key);
2403	umtxq_busy(&key);
2404	cnt = umtxq_count(&key);
2405	nwake = umtxq_signal(&key, 1);
2406	if (cnt <= nwake) {
2407		umtxq_unlock(&key);
2408		error = suword32(
2409		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2410		umtxq_lock(&key);
2411	}
2412	umtxq_unbusy(&key);
2413	umtxq_unlock(&key);
2414	umtx_key_release(&key);
2415	return (error);
2416}
2417
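/*
 * Broadcast a userland condition variable, waking all of its waiters.
 */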
2418static int
2419do_cv_broadcast(struct thread *td, struct ucond *cv)
2420{
2421	struct umtx_key key;
2422	int error;
2423	uint32_t flags;
2424
2425	flags = fuword32(&cv->c_flags);
2426	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2427		return (error);
2428
2429	umtxq_lock(&key);
2430	umtxq_busy(&key);
2431	umtxq_signal(&key, INT_MAX);
2432	umtxq_unlock(&key);
2433
2434	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2435
2436	umtxq_lock(&key);
2437	umtxq_unbusy(&key);
2438	umtxq_unlock(&key);
2439
2440	umtx_key_release(&key);
2441	return (error);
2442}
2443
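/*
 * Acquire a read lock on a userland rwlock, sleeping while a writer
 * owns it or, unless reader-preferring, while writers are waiting.
 */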
2444static int
2445do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2446{
2447	struct abs_timeout timo;
2448	struct umtx_q *uq;
2449	uint32_t flags, wrflags;
2450	int32_t state, oldstate;
2451	int32_t blocked_readers;
2452	int error;
2453
2454	uq = td->td_umtxq;
2455	flags = fuword32(&rwlock->rw_flags);
2456	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2457	if (error != 0)
2458		return (error);
2459
2460	if (timeout != NULL)
2461		abs_timeout_init2(&timo, timeout);
2462
2463	wrflags = URWLOCK_WRITE_OWNER;
2464	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2465		wrflags |= URWLOCK_WRITE_WAITERS;
2466
2467	for (;;) {
2468		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2469		/* try to lock it */
2470		while (!(state & wrflags)) {
2471			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2472				umtx_key_release(&uq->uq_key);
2473				return (EAGAIN);
2474			}
2475			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2476			if (oldstate == state) {
2477				umtx_key_release(&uq->uq_key);
2478				return (0);
2479			}
2480			state = oldstate;
2481		}
2482
2483		if (error)
2484			break;
2485
2486		/* grab monitor lock */
2487		umtxq_lock(&uq->uq_key);
2488		umtxq_busy(&uq->uq_key);
2489		umtxq_unlock(&uq->uq_key);
2490
2491		/*
2492		 * re-read the state, in case it changed between the try-lock above
2493		 * and the check below
2494		 */
2495		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2496
2497		/* set read contention bit */
2498		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2499			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2500			if (oldstate == state)
2501				goto sleep;
2502			state = oldstate;
2503		}
2504
2505		/* The state changed while we were setting the flag; restart. */
2506		if (!(state & wrflags)) {
2507			umtxq_lock(&uq->uq_key);
2508			umtxq_unbusy(&uq->uq_key);
2509			umtxq_unlock(&uq->uq_key);
2510			continue;
2511		}
2512
2513sleep:
2514		/* The contention bit is set; increase the read waiter count before sleeping. */
2515		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2516		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2517
2518		while (state & wrflags) {
2519			umtxq_lock(&uq->uq_key);
2520			umtxq_insert(uq);
2521			umtxq_unbusy(&uq->uq_key);
2522
2523			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2524			    NULL : &timo);
2525
2526			umtxq_busy(&uq->uq_key);
2527			umtxq_remove(uq);
2528			umtxq_unlock(&uq->uq_key);
2529			if (error)
2530				break;
2531			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2532		}
2533
2534		/* Decrease the read waiter count; the last waiter clears the read contention bit. */
2535		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2536		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2537		if (blocked_readers == 1) {
2538			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2539			for (;;) {
2540				oldstate = casuword32(&rwlock->rw_state, state,
2541					 state & ~URWLOCK_READ_WAITERS);
2542				if (oldstate == state)
2543					break;
2544				state = oldstate;
2545			}
2546		}
2547
2548		umtxq_lock(&uq->uq_key);
2549		umtxq_unbusy(&uq->uq_key);
2550		umtxq_unlock(&uq->uq_key);
2551	}
2552	umtx_key_release(&uq->uq_key);
2553	if (error == ERESTART)
2554		error = EINTR;
2555	return (error);
2556}
2557
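/*
 * Acquire the write lock on a userland rwlock, sleeping while it is
 * owned by another writer or held by readers.
 */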
2558static int
2559do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2560{
2561	struct abs_timeout timo;
2562	struct umtx_q *uq;
2563	uint32_t flags;
2564	int32_t state, oldstate;
2565	int32_t blocked_writers;
2566	int32_t blocked_readers;
2567	int error;
2568
2569	uq = td->td_umtxq;
2570	flags = fuword32(&rwlock->rw_flags);
2571	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2572	if (error != 0)
2573		return (error);
2574
2575	if (timeout != NULL)
2576		abs_timeout_init2(&timo, timeout);
2577
2578	blocked_readers = 0;
2579	for (;;) {
2580		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2581		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2582			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2583			if (oldstate == state) {
2584				umtx_key_release(&uq->uq_key);
2585				return (0);
2586			}
2587			state = oldstate;
2588		}
2589
2590		if (error) {
2591			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2592			    blocked_readers != 0) {
2593				umtxq_lock(&uq->uq_key);
2594				umtxq_busy(&uq->uq_key);
2595				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2596				umtxq_unbusy(&uq->uq_key);
2597				umtxq_unlock(&uq->uq_key);
2598			}
2599
2600			break;
2601		}
2602
2603		/* grab monitor lock */
2604		umtxq_lock(&uq->uq_key);
2605		umtxq_busy(&uq->uq_key);
2606		umtxq_unlock(&uq->uq_key);
2607
2608		/*
2609		 * re-read the state, in case it changed between the try-lock above
2610		 * and the check below
2611		 */
2612		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2613
2614		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2615		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2616			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2617			if (oldstate == state)
2618				goto sleep;
2619			state = oldstate;
2620		}
2621
2622		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2623			umtxq_lock(&uq->uq_key);
2624			umtxq_unbusy(&uq->uq_key);
2625			umtxq_unlock(&uq->uq_key);
2626			continue;
2627		}
2628sleep:
2629		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2630		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2631
2632		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2633			umtxq_lock(&uq->uq_key);
2634			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2635			umtxq_unbusy(&uq->uq_key);
2636
2637			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2638			    NULL : &timo);
2639
2640			umtxq_busy(&uq->uq_key);
2641			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2642			umtxq_unlock(&uq->uq_key);
2643			if (error)
2644				break;
2645			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2646		}
2647
2648		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2649		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2650		if (blocked_writers == 1) {
2651			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2652			for (;;) {
2653				oldstate = casuword32(&rwlock->rw_state, state,
2654					 state & ~URWLOCK_WRITE_WAITERS);
2655				if (oldstate == state)
2656					break;
2657				state = oldstate;
2658			}
2659			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2660		} else
2661			blocked_readers = 0;
2662
2663		umtxq_lock(&uq->uq_key);
2664		umtxq_unbusy(&uq->uq_key);
2665		umtxq_unlock(&uq->uq_key);
2666	}
2667
2668	umtx_key_release(&uq->uq_key);
2669	if (error == ERESTART)
2670		error = EINTR;
2671	return (error);
2672}
2673
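/*
 * Release a userland rwlock held by the caller and wake the
 * appropriate queue of waiters.
 */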
2674static int
2675do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2676{
2677	struct umtx_q *uq;
2678	uint32_t flags;
2679	int32_t state, oldstate;
2680	int error, q, count;
2681
2682	uq = td->td_umtxq;
2683	flags = fuword32(&rwlock->rw_flags);
2684	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2685	if (error != 0)
2686		return (error);
2687
2688	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2689	if (state & URWLOCK_WRITE_OWNER) {
2690		for (;;) {
2691			oldstate = casuword32(&rwlock->rw_state, state,
2692				state & ~URWLOCK_WRITE_OWNER);
2693			if (oldstate != state) {
2694				state = oldstate;
2695				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2696					error = EPERM;
2697					goto out;
2698				}
2699			} else
2700				break;
2701		}
2702	} else if (URWLOCK_READER_COUNT(state) != 0) {
2703		for (;;) {
2704			oldstate = casuword32(&rwlock->rw_state, state,
2705				state - 1);
2706			if (oldstate != state) {
2707				state = oldstate;
2708				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2709					error = EPERM;
2710					goto out;
2711				}
2712			}
2713			else
2714				break;
2715		}
2716	} else {
2717		error = EPERM;
2718		goto out;
2719	}
2720
2721	count = 0;
2722
2723	if (!(flags & URWLOCK_PREFER_READER)) {
2724		if (state & URWLOCK_WRITE_WAITERS) {
2725			count = 1;
2726			q = UMTX_EXCLUSIVE_QUEUE;
2727		} else if (state & URWLOCK_READ_WAITERS) {
2728			count = INT_MAX;
2729			q = UMTX_SHARED_QUEUE;
2730		}
2731	} else {
2732		if (state & URWLOCK_READ_WAITERS) {
2733			count = INT_MAX;
2734			q = UMTX_SHARED_QUEUE;
2735		} else if (state & URWLOCK_WRITE_WAITERS) {
2736			count = 1;
2737			q = UMTX_EXCLUSIVE_QUEUE;
2738		}
2739	}
2740
2741	if (count) {
2742		umtxq_lock(&uq->uq_key);
2743		umtxq_busy(&uq->uq_key);
2744		umtxq_signal_queue(&uq->uq_key, count, q);
2745		umtxq_unbusy(&uq->uq_key);
2746		umtxq_unlock(&uq->uq_key);
2747	}
2748out:
2749	umtx_key_release(&uq->uq_key);
2750	return (error);
2751}
2752
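/*
 * Wait on a userland semaphore: mark it as having waiters and sleep
 * until the count is non-zero or the wait is interrupted.
 */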
2753static int
2754do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2755{
2756	struct abs_timeout timo;
2757	struct umtx_q *uq;
2758	uint32_t flags, count;
2759	int error;
2760
2761	uq = td->td_umtxq;
2762	flags = fuword32(&sem->_flags);
2763	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2764	if (error != 0)
2765		return (error);
2766
2767	if (timeout != NULL)
2768		abs_timeout_init2(&timo, timeout);
2769
2770	umtxq_lock(&uq->uq_key);
2771	umtxq_busy(&uq->uq_key);
2772	umtxq_insert(uq);
2773	umtxq_unlock(&uq->uq_key);
2774
2775	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2776	rmb();
2777	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2778	if (count != 0) {
2779		umtxq_lock(&uq->uq_key);
2780		umtxq_unbusy(&uq->uq_key);
2781		umtxq_remove(uq);
2782		umtxq_unlock(&uq->uq_key);
2783		umtx_key_release(&uq->uq_key);
2784		return (0);
2785	}
2786	umtxq_lock(&uq->uq_key);
2787	umtxq_unbusy(&uq->uq_key);
2788
2789	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2790
2791	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2792		error = 0;
2793	else {
2794		umtxq_remove(uq);
2795		if (error == ERESTART)
2796			error = EINTR;
2797	}
2798	umtxq_unlock(&uq->uq_key);
2799	umtx_key_release(&uq->uq_key);
2800	return (error);
2801}
2802
2803/*
2804 * Wake up threads sleeping on a userland semaphore.
2805 */
2806static int
2807do_sem_wake(struct thread *td, struct _usem *sem)
2808{
2809	struct umtx_key key;
2810	int error, cnt, nwake;
2811	uint32_t flags;
2812
2813	flags = fuword32(&sem->_flags);
2814	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2815		return (error);
2816	umtxq_lock(&key);
2817	umtxq_busy(&key);
2818	cnt = umtxq_count(&key);
2819	nwake = umtxq_signal(&key, 1);
2820	if (cnt <= nwake) {
2821		umtxq_unlock(&key);
2822		error = suword32(
2823		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2824		umtxq_lock(&key);
2825	}
2826	umtxq_unbusy(&key);
2827	umtxq_unlock(&key);
2828	umtx_key_release(&key);
2829	return (error);
2830}
2831
2832int
2833sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2834    /* struct umtx *umtx */
2835{
2836	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2837}
2838
2839int
2840sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2841    /* struct umtx *umtx */
2842{
2843	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2844}
2845
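/*
 * Copy a timespec from userland and validate it.
 */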
2846inline int
2847umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2848{
2849	int error;
2850
2851	error = copyin(addr, tsp, sizeof(struct timespec));
2852	if (error == 0) {
2853		if (tsp->tv_sec < 0 ||
2854		    tsp->tv_nsec >= 1000000000 ||
2855		    tsp->tv_nsec < 0)
2856			error = EINVAL;
2857	}
2858	return (error);
2859}
2860
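/*
 * Copy in either a bare timespec or a full _umtx_time, as selected by
 * the size argument; a bare timespec implies CLOCK_REALTIME, relative.
 */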
2861static inline int
2862umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2863{
2864	int error;
2865
2866	if (size <= sizeof(struct timespec)) {
2867		tp->_clockid = CLOCK_REALTIME;
2868		tp->_flags = 0;
2869		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2870	} else
2871		error = copyin(addr, tp, sizeof(struct _umtx_time));
2872	if (error != 0)
2873		return (error);
2874	if (tp->_timeout.tv_sec < 0 ||
2875	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
2876		return (EINVAL);
2877	return (0);
2878}
2879
2880static int
2881__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2882{
2883	struct timespec *ts, timeout;
2884	int error;
2885
2886	/* Allow a null timespec (wait forever). */
2887	if (uap->uaddr2 == NULL)
2888		ts = NULL;
2889	else {
2890		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
2891		if (error != 0)
2892			return (error);
2893		ts = &timeout;
2894	}
2895	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2896}
2897
2898static int
2899__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2900{
2901	return (do_unlock_umtx(td, uap->obj, uap->val));
2902}
2903
2904static int
2905__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2906{
2907	struct _umtx_time timeout, *tm_p;
2908	int error;
2909
2910	if (uap->uaddr2 == NULL)
2911		tm_p = NULL;
2912	else {
2913		error = umtx_copyin_umtx_time(
2914		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2915		if (error != 0)
2916			return (error);
2917		tm_p = &timeout;
2918	}
2919	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
2920}
2921
2922static int
2923__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2924{
2925	struct _umtx_time timeout, *tm_p;
2926	int error;
2927
2928	if (uap->uaddr2 == NULL)
2929		tm_p = NULL;
2930	else {
2931		error = umtx_copyin_umtx_time(
2932		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2933		if (error != 0)
2934			return (error);
2935		tm_p = &timeout;
2936	}
2937	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
2938}
2939
2940static int
2941__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
2942{
2943	struct _umtx_time *tm_p, timeout;
2944	int error;
2945
2946	if (uap->uaddr2 == NULL)
2947		tm_p = NULL;
2948	else {
2949		error = umtx_copyin_umtx_time(
2950		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2951		if (error != 0)
2952			return (error);
2953		tm_p = &timeout;
2954	}
2955	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
2956}
2957
2958static int
2959__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2960{
2961	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
2962}
2963
2964#define BATCH_SIZE	128
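/*
 * Wake all waiters on each of the uap->val private umtx words whose
 * addresses are passed in the user array at uap->obj, copying the
 * pointers in batches of BATCH_SIZE.
 */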
2965static int
2966__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
2967{
2968	int count = uap->val;
2969	void *uaddrs[BATCH_SIZE];
2970	char **upp = (char **)uap->obj;
2971	int tocopy;
2972	int error = 0;
2973	int i, pos = 0;
2974
2975	while (count > 0) {
2976		tocopy = count;
2977		if (tocopy > BATCH_SIZE)
2978			tocopy = BATCH_SIZE;
2979		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
2980		if (error != 0)
2981			break;
2982		for (i = 0; i < tocopy; ++i)
2983			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
2984		count -= tocopy;
2985		pos += tocopy;
2986	}
2987	return (error);
2988}
2989
2990static int
2991__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
2992{
2993	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
2994}
2995
2996static int
2997__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2998{
2999	struct _umtx_time *tm_p, timeout;
3000	int error;
3001
3002	/* Allow a null timespec (wait forever). */
3003	if (uap->uaddr2 == NULL)
3004		tm_p = NULL;
3005	else {
3006		error = umtx_copyin_umtx_time(
3007		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3008		if (error != 0)
3009			return (error);
3010		tm_p = &timeout;
3011	}
3012	return do_lock_umutex(td, uap->obj, tm_p, 0);
3013}
3014
3015static int
3016__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3017{
3018	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3019}
3020
3021static int
3022__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3023{
3024	struct _umtx_time *tm_p, timeout;
3025	int error;
3026
3027	/* Allow a null timespec (wait forever). */
3028	if (uap->uaddr2 == NULL)
3029		tm_p = NULL;
3030	else {
3031		error = umtx_copyin_umtx_time(
3032		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3033		if (error != 0)
3034			return (error);
3035		tm_p = &timeout;
3036	}
3037	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3038}
3039
3040static int
3041__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3042{
3043	return do_wake_umutex(td, uap->obj);
3044}
3045
3046static int
3047__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3048{
3049	return do_unlock_umutex(td, uap->obj);
3050}
3051
3052static int
3053__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3054{
3055	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3056}
3057
3058static int
3059__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3060{
3061	struct timespec *ts, timeout;
3062	int error;
3063
3064	/* Allow a null timespec (wait forever). */
3065	if (uap->uaddr2 == NULL)
3066		ts = NULL;
3067	else {
3068		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3069		if (error != 0)
3070			return (error);
3071		ts = &timeout;
3072	}
3073	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3074}
3075
3076static int
3077__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3078{
3079	return do_cv_signal(td, uap->obj);
3080}
3081
3082static int
3083__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3084{
3085	return do_cv_broadcast(td, uap->obj);
3086}
3087
3088static int
3089__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3090{
3091	struct _umtx_time timeout;
3092	int error;
3093
3094	/* Allow a null timespec (wait forever). */
3095	if (uap->uaddr2 == NULL) {
3096		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3097	} else {
3098		error = umtx_copyin_umtx_time(uap->uaddr2,
3099		   (size_t)uap->uaddr1, &timeout);
3100		if (error != 0)
3101			return (error);
3102		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3103	}
3104	return (error);
3105}
3106
3107static int
3108__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3109{
3110	struct _umtx_time timeout;
3111	int error;
3112
3113	/* Allow a null timespec (wait forever). */
3114	if (uap->uaddr2 == NULL) {
3115		error = do_rw_wrlock(td, uap->obj, 0);
3116	} else {
3117		error = umtx_copyin_umtx_time(uap->uaddr2,
3118		   (size_t)uap->uaddr1, &timeout);
3119		if (error != 0)
3120			return (error);
3121
3122		error = do_rw_wrlock(td, uap->obj, &timeout);
3123	}
3124	return (error);
3125}
3126
3127static int
3128__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3129{
3130	return do_rw_unlock(td, uap->obj);
3131}
3132
3133static int
3134__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3135{
3136	struct _umtx_time *tm_p, timeout;
3137	int error;
3138
3139	/* Allow a null timespec (wait forever). */
3140	if (uap->uaddr2 == NULL)
3141		tm_p = NULL;
3142	else {
3143		error = umtx_copyin_umtx_time(
3144		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3145		if (error != 0)
3146			return (error);
3147		tm_p = &timeout;
3148	}
3149	return (do_sem_wait(td, uap->obj, tm_p));
3150}
3151
3152static int
3153__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3154{
3155	return do_sem_wake(td, uap->obj);
3156}
3157
3158typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3159
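/* Operation dispatch table, indexed by the UMTX_OP_* constants. */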
3160static _umtx_op_func op_table[] = {
3161	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3162	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3163	__umtx_op_wait,			/* UMTX_OP_WAIT */
3164	__umtx_op_wake,			/* UMTX_OP_WAKE */
3165	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3166	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3167	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3168	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3169	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3170	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3171	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3172	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3173	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3174	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3175	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3176	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3177	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3178	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3179	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3180	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3181	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3182	__umtx_op_nwake_private		/* UMTX_OP_NWAKE_PRIVATE */
3183};
3184
3185int
3186sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3187{
3188	if ((unsigned)uap->op < UMTX_OP_MAX)
3189		return (*op_table[uap->op])(td, uap);
3190	return (EINVAL);
3191}
3192
3193#ifdef COMPAT_FREEBSD32
3194int
3195freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3196    /* struct umtx *umtx */
3197{
3198	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3199}
3200
3201int
3202freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3203    /* struct umtx *umtx */
3204{
3205	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3206}
3207
3208struct timespec32 {
3209	int32_t tv_sec;		/* signed, so the negative-value checks work */
3210	int32_t tv_nsec;
3211};
3212
3213struct umtx_time32 {
3214	struct	timespec32	timeout;
3215	uint32_t		flags;
3216	uint32_t		clockid;
3217};
3218
3219static inline int
3220umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3221{
3222	struct timespec32 ts32;
3223	int error;
3224
3225	error = copyin(addr, &ts32, sizeof(struct timespec32));
3226	if (error == 0) {
3227		if (ts32.tv_sec < 0 ||
3228		    ts32.tv_nsec >= 1000000000 ||
3229		    ts32.tv_nsec < 0)
3230			error = EINVAL;
3231		else {
3232			tsp->tv_sec = ts32.tv_sec;
3233			tsp->tv_nsec = ts32.tv_nsec;
3234		}
3235	}
3236	return (error);
3237}
3238
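/*
 * 32-bit variant of umtx_copyin_umtx_time: accepts either a bare
 * timespec32 or a full umtx_time32, as selected by the size argument.
 */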
3239static inline int
3240umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3241{
3242	struct umtx_time32 t32;
3243	int error;
3244
3245	t32.clockid = CLOCK_REALTIME;
3246	t32.flags   = 0;
3247	if (size <= sizeof(struct timespec32))
3248		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3249	else
3250		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3251	if (error != 0)
3252		return (error);
3253	if (t32.timeout.tv_sec < 0 ||
3254	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3255		return (EINVAL);
3256	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3257	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3258	tp->_flags = t32.flags;
3259	tp->_clockid = t32.clockid;
3260	return (0);
3261}
3262
3263static int
3264__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3265{
3266	struct timespec *ts, timeout;
3267	int error;
3268
3269	/* Allow a null timespec (wait forever). */
3270	if (uap->uaddr2 == NULL)
3271		ts = NULL;
3272	else {
3273		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3274		if (error != 0)
3275			return (error);
3276		ts = &timeout;
3277	}
3278	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3279}
3280
3281static int
3282__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3283{
3284	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3285}
3286
3287static int
3288__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3289{
3290	struct _umtx_time *tm_p, timeout;
3291	int error;
3292
3293	if (uap->uaddr2 == NULL)
3294		tm_p = NULL;
3295	else {
3296		error = umtx_copyin_umtx_time32(uap->uaddr2,
3297			(size_t)uap->uaddr1, &timeout);
3298		if (error != 0)
3299			return (error);
3300		tm_p = &timeout;
3301	}
3302	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3303}
3304
3305static int
3306__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3307{
3308	struct _umtx_time *tm_p, timeout;
3309	int error;
3310
3311	/* Allow a null timespec (wait forever). */
3312	if (uap->uaddr2 == NULL)
3313		tm_p = NULL;
3314	else {
3315		error = umtx_copyin_umtx_time32(uap->uaddr2,
3316		    (size_t)uap->uaddr1, &timeout);
3317		if (error != 0)
3318			return (error);
3319		tm_p = &timeout;
3320	}
3321	return do_lock_umutex(td, uap->obj, tm_p, 0);
3322}
3323
3324static int
3325__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3326{
3327	struct _umtx_time *tm_p, timeout;
3328	int error;
3329
3330	/* Allow a null timespec (wait forever). */
3331	if (uap->uaddr2 == NULL)
3332		tm_p = NULL;
3333	else {
3334		error = umtx_copyin_umtx_time32(uap->uaddr2,
3335		    (size_t)uap->uaddr1, &timeout);
3336		if (error != 0)
3337			return (error);
3338		tm_p = &timeout;
3339	}
3340	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3341}
3342
3343static int
3344__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3345{
3346	struct timespec *ts, timeout;
3347	int error;
3348
3349	/* Allow a null timespec (wait forever). */
3350	if (uap->uaddr2 == NULL)
3351		ts = NULL;
3352	else {
3353		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3354		if (error != 0)
3355			return (error);
3356		ts = &timeout;
3357	}
3358	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3359}
3360
3361static int
3362__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3363{
3364	struct _umtx_time timeout;
3365	int error;
3366
3367	/* Allow a null timespec (wait forever). */
3368	if (uap->uaddr2 == NULL) {
3369		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3370	} else {
3371		error = umtx_copyin_umtx_time32(uap->uaddr2,
3372		    (size_t)uap->uaddr1, &timeout);
3373		if (error != 0)
3374			return (error);
3375		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3376	}
3377	return (error);
3378}
3379
3380static int
3381__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3382{
3383	struct _umtx_time timeout;
3384	int error;
3385
3386	/* Allow a null timespec (wait forever). */
3387	if (uap->uaddr2 == NULL) {
3388		error = do_rw_wrlock(td, uap->obj, 0);
3389	} else {
3390		error = umtx_copyin_umtx_time32(uap->uaddr2,
3391		    (size_t)uap->uaddr1, &timeout);
3392		if (error != 0)
3393			return (error);
3394		error = do_rw_wrlock(td, uap->obj, &timeout);
3395	}
3396	return (error);
3397}
3398
3399static int
3400__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3401{
3402	struct _umtx_time *tm_p, timeout;
3403	int error;
3404
3405	if (uap->uaddr2 == NULL)
3406		tm_p = NULL;
3407	else {
3408		error = umtx_copyin_umtx_time32(
3409		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3410		if (error != 0)
3411			return (error);
3412		tm_p = &timeout;
3413	}
3414	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3415}
3416
3417static int
3418__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3419{
3420	struct _umtx_time *tm_p, timeout;
3421	int error;
3422
3423	/* Allow a null timespec (wait forever). */
3424	if (uap->uaddr2 == NULL)
3425		tm_p = NULL;
3426	else {
3427		error = umtx_copyin_umtx_time32(uap->uaddr2,
3428		    (size_t)uap->uaddr1, &timeout);
3429		if (error != 0)
3430			return (error);
3431		tm_p = &timeout;
3432	}
3433	return (do_sem_wait(td, uap->obj, tm_p));
3434}
3435
3436static int
3437__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3438{
3439	int count = uap->val;
3440	uint32_t uaddrs[BATCH_SIZE];
3441	uint32_t *upp = (uint32_t *)uap->obj;	/* array of 32-bit addresses */
3442	int tocopy;
3443	int error = 0;
3444	int i, pos = 0;
3445
3446	while (count > 0) {
3447		tocopy = count;
3448		if (tocopy > BATCH_SIZE)
3449			tocopy = BATCH_SIZE;
3450		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3451		if (error != 0)
3452			break;
3453		for (i = 0; i < tocopy; ++i)
3454			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3455				INT_MAX, 1);
3456		count -= tocopy;
3457		pos += tocopy;
3458	}
3459	return (error);
3460}
3461
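/*
 * Dispatch table for 32-bit processes; operations whose argument
 * layout is unchanged reuse the native handlers.
 */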
3462static _umtx_op_func op_table_compat32[] = {
3463	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3464	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3465	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3466	__umtx_op_wake,			/* UMTX_OP_WAKE */
3467	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3468	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3469	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3470	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3471	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3472	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3473	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3474	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3475	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3476	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3477	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3478	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3479	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3480	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3481	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3482	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3483	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3484	__umtx_op_nwake_private32	/* UMTX_OP_NWAKE_PRIVATE */
3485};
3486
3487int
3488freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3489{
3490	if ((unsigned)uap->op < UMTX_OP_MAX)
3491		return (*op_table_compat32[uap->op])(td,
3492			(struct _umtx_op_args *)uap);
3493	return (EINVAL);
3494}
3495#endif
3496
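/*
 * Allocate the per-thread umtx state at thread creation time.
 */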
3497void
3498umtx_thread_init(struct thread *td)
3499{
3500	td->td_umtxq = umtxq_alloc();
3501	td->td_umtxq->uq_thread = td;
3502}
3503
3504void
3505umtx_thread_fini(struct thread *td)
3506{
3507	umtxq_free(td->td_umtxq);
3508}
3509
3510/*
3511 * Called when a new thread is created, e.g. by fork().
3512 */
3513void
3514umtx_thread_alloc(struct thread *td)
3515{
3516	struct umtx_q *uq;
3517
3518	uq = td->td_umtxq;
3519	uq->uq_inherited_pri = PRI_MAX;
3520
3521	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3522	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3523	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3524	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3525}
3526
3527/*
3528 * exec() hook.
3529 */
3530static void
3531umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3532	struct image_params *imgp __unused)
3533{
3534	umtx_thread_cleanup(curthread);
3535}
3536
3537/*
3538 * thread_exit() hook.
3539 */
3540void
3541umtx_thread_exit(struct thread *td)
3542{
3543	umtx_thread_cleanup(td);
3544}
3545
3546/*
3547 * Clean up the thread's umtx state: disown contested PI mutexes and drop lent priority.
3548 */
3549static void
3550umtx_thread_cleanup(struct thread *td)
3551{
3552	struct umtx_q *uq;
3553	struct umtx_pi *pi;
3554
3555	if ((uq = td->td_umtxq) == NULL)
3556		return;
3557
3558	mtx_lock_spin(&umtx_lock);
3559	uq->uq_inherited_pri = PRI_MAX;
3560	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3561		pi->pi_owner = NULL;
3562		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3563	}
3564	mtx_unlock_spin(&umtx_lock);
3565	thread_lock(td);
3566	sched_lend_user_prio(td, PRI_MAX);
3567	thread_unlock(td);
3568}
3569