kern_umtx.c revision 205014
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 205014 2010-03-11 14:49:06Z nwhitehorn $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>

#include <machine/cpu.h>

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

enum {
	TYPE_SIMPLE_WAIT,
	TYPE_CV,
	TYPE_SEM,
	TYPE_SIMPLE_LOCK,
	TYPE_NORMAL_UMUTEX,
	TYPE_PI_UMUTEX,
	TYPE_PP_UMUTEX,
	TYPE_RWLOCK
};

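/*
 * Internal lock modes for _do_lock_normal(): mode 0 is a normal blocking
 * lock request, _UMUTEX_TRY fails with EBUSY instead of sleeping, and
 * _UMUTEX_WAIT only waits until the mutex looks ownable (UMUTEX_UNOWNED
 * or UMUTEX_CONTESTED) without actually acquiring it.
 */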
#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry linking the PI mutexes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object user. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread waiting on this entry. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this thread is blocked on.  Readers may hold
	 * either the chain lock or umtx_lock; writers must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes we own that other threads are contending for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;

	/* Spare queue ready to be reused */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are on */
	struct umtxq_queue	*uq_cur_queue;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Per-key wait-queue */
struct umtxq_queue {
	struct umtxq_head	head;
	struct umtx_key		key;
	LIST_ENTRY(umtxq_queue)	link;
	int			length;
};

LIST_HEAD(umtxq_list, umtxq_queue);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI mutexes hashed to this chain */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;

};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy, ("umtx chain is not busy"))

/*
 * Don't propagate time-sharing priority; there is a security reason:
 * a user can simply create a PI mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * is boosted, and A's priority is boosted along with it via priority
 * propagation.  A's priority would then never be lowered even if it
 * were using 100% CPU, which is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

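/*
 * Constants for the wait-queue hash: a key is hashed with a
 * multiplicative (golden-ratio style) hash and folded into one of
 * UMTX_CHAINS buckets; see umtxq_hash() below.
 */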
#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

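/*
 * How many times umtxq_busy() spins waiting for a busy chain to be
 * released before falling back to sleeping.  Spinning only helps on
 * SMP, where the current holder may be running on another CPU.
 */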
#define BUSY_SPINS		200

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
		}
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_spare_queue->head);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	MPASS(uq->uq_spare_queue != NULL);
	free(uq->uq_spare_queue, M_UMTX);
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

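/*
 * Select the wait-queue chain for a key.  Wait, condvar and semaphore
 * objects (type <= TYPE_SEM) live in the second chain array, mutexes
 * and rwlocks in the first, so e.g. a condition variable and its
 * associated mutex never contend for the same chain lock even if they
 * hash to the same bucket.
 */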
static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_SEM)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set the chain to the busy state when a following operation
 * may block (the chain's kernel mutex cannot be held across it).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
		if (umtx_key_match(&uh->key, key))
			return (uh);
	}

	return (NULL);
}

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
	uh = umtxq_queue_lookup(&uq->uq_key, q);
	if (uh != NULL) {
		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
	} else {
		uh = uq->uq_spare_queue;
		uh->key = uq->uq_key;
		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
	}
	uq->uq_spare_queue = NULL;

	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
	uh->length++;
	uq->uq_flags |= UQF_UMTXQ;
	uq->uq_cur_queue = uh;
	return;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		uh = uq->uq_cur_queue;
		TAILQ_REMOVE(&uh->head, uq, uq_link);
		uh->length--;
		uq->uq_flags &= ~UQF_UMTXQ;
		if (TAILQ_EMPTY(&uh->head)) {
			KASSERT(uh->length == 0,
			    ("inconsistent umtxq_queue length"));
			LIST_REMOVE(uh, link);
		} else {
			uh = LIST_FIRST(&uc->uc_spare_queue);
			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
			LIST_REMOVE(uh, link);
		}
		uq->uq_spare_queue = uh;
		uq->uq_cur_queue = NULL;
	}
}

/*
 * Return the number of waiters on a key's shared queue.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL)
		return (uh->length);
	return (0);
}

/*
 * Return the number of PI waiters and pass back the first
 * waiter through *first.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL) {
		*first = TAILQ_FIRST(&uh->head);
		return (uh->length);
	}
	return (0);
}

/*
 * Wake up threads waiting on a userland object.
 */

static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;
	struct umtx_q *uq;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, q);
	if (uh != NULL) {
		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				return (ret);
		}
	}
	return (ret);
}


/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state.  Before sleeping, check whether
 * the thread was already removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert userspace address into unique logical address.
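 *
 * Process-shared objects are keyed on the backing vm_object and the
 * offset within it, so every mapping of the same object resolves to the
 * same key; process-private objects are keyed on the vmspace and the
 * virtual address.  For AUTO_SHARE, the map entry's inheritance flag
 * decides which form is used.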
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
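 *
 * The protocol: CAS the owner word from UMTX_UNOWNED to our id in a
 * loop.  On contention, set the UMTX_CONTESTED bit so the owner's
 * unlock goes through the kernel, then sleep on the wait queue keyed
 * by the umtx address, re-checking after every wakeup because the CAS
 * may have raced with an unlock.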
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * any access can fault.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object, with an optional timeout.
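 *
 * The timeout is relative; an absolute deadline is computed from the
 * current uptime and, after each ETIMEDOUT wakeup, the remaining time
 * is recomputed and the lock retried.  Untimed locks restart the
 * syscall when interrupted (ERESTART); timed locks return EINTR
 * instead, since restarting with the original relative timeout would
 * extend the total wait.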
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
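 *
 * Only contested locks reach the kernel.  The owner word is CASed back
 * to UMTX_UNOWNED when at most one waiter remains, or to
 * UMTX_CONTESTED when more remain (so the next lock operation still
 * enters the kernel), and one waiter is woken.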
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

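/*
 * For illustration, the userland fast path that the comments above
 * allude to looks roughly like the sketch below.  This is a minimal,
 * hypothetical example, not code taken from libthr; it assumes the
 * UMTX_OP_LOCK/UMTX_OP_UNLOCK operations of the _umtx_op(2) syscall and
 * the atomic_cmpset_acq_long()/atomic_cmpset_rel_long() primitives from
 * <machine/atomic.h>:
 *
 *	static int
 *	my_umtx_lock(struct umtx *mtx, u_long id)
 *	{
 *		// Uncontested case: CAS UMTX_UNOWNED -> id in userland.
 *		if (atomic_cmpset_acq_long(&mtx->u_owner, UMTX_UNOWNED, id))
 *			return (0);
 *		// Contested: let the kernel queue us and sleep.
 *		return (_umtx_op(mtx, UMTX_OP_LOCK, id, NULL, NULL));
 *	}
 *
 *	static int
 *	my_umtx_unlock(struct umtx *mtx, u_long id)
 *	{
 *		// Uncontested case: CAS id -> UMTX_UNOWNED.  If the
 *		// contested bit is set, this fails and the kernel must
 *		// hand the lock off to a waiter.
 *		if (atomic_cmpset_rel_long(&mtx->u_owner, id, UMTX_UNOWNED))
 *			return (0);
 *		return (_umtx_op(mtx, UMTX_OP_UNLOCK, id, NULL, NULL));
 *	}
 */
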
#ifdef COMPAT_FREEBSD32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * any access can fault.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object, with an optional timeout.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value has not
 * changed.
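 *
 * The thread is put on the wait queue before the userland value is
 * read, so a wakeup sent after the caller observed the old value
 * cannot be lost: if the value already changed, the thread simply
 * removes itself and returns.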
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32, int is_private)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = (unsigned int)fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
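 *
 * This uses the same CAS protocol as _do_lock_umtx(), with two extra
 * modes: _UMUTEX_WAIT returns as soon as the mutex looks ownable
 * without taking it, and _UMUTEX_TRY returns EBUSY rather than
 * sleeping.  UMUTEX_ERROR_CHECK mutexes additionally fail with EDEADLK
 * on self-relock.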
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int mode)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * any access can fault.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If no one owns it but it is contested try to acquire it. */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/* If this failed the lock has changed, restart. */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Check whether the mutex is available and wake up a waiter;
 * this applies to simple (non-PI, non-PP) mutexes only.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	flags = fuword32(&m->m_flags);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1)
		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);

	umtxq_lock(&key);
	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on a pi_state after its priority has been
 * changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
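 *
 * Walk the chain of lock owners starting from the mutex the thread is
 * blocked on, lending the blocked thread's priority to each owner with
 * a lower priority and re-sorting each owner in the waiter queue of
 * the mutex it is itself blocked on, until reaching an owner that is
 * not blocked or already runs at a high enough priority.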
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
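 *
 * The owner's priority is recomputed from scratch as the minimum of
 * its inherited (PP ceiling) priority and the priorities of the top
 * waiters of all PI mutexes it still owns; the adjustment then
 * cascades up the ownership chain while the owners are themselves
 * blocked.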
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri, oldpri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		oldpri = pi->pi_owner->td_user_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		if (uq_owner->uq_pi_blocked != NULL)
			umtx_pi_adjust_locked(pi->pi_owner, oldpri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

static void
umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Adjust a thread's position in the queue of the PI mutex it is
 * blocked on; this may trigger a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL)
		umtx_pi_adjust_locked(td, oldpri);
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
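 *
 * The waiter is inserted into the mutex's priority-sorted blocked list
 * and marked TDF_UPIBLOCKED, its priority is propagated to the owner,
 * and then it sleeps.  If the umtx_pi has no owner recorded yet, the
 * owner thread is first looked up from the tid stored in the userland
 * owner word, so that propagation has a target.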
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI mutexes;
		 * we need a faster way to find the owner thread of a
		 * process-shared mutex (not available yet).
		 */
		mtx_unlock_spin(&umtx_lock);
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_remove(uq);
		}
	}
	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference to a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Drop a reference to a PI mutex; when the count reaches
 * zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		umtx_pi_free(pi);
	}
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
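 *
 * A kernel umtx_pi is looked up (or created) for the key first.  Note
 * the allocation dance: an M_NOWAIT allocation is tried under the
 * chain lock, and if it fails the lock is dropped for an M_WAITOK
 * allocation, after which the lookup must be redone because another
 * thread may have inserted a PI in the meantime.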
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * any access can fault.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
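 *
 * Ownership of the kernel umtx_pi is released, the unlocking thread's
 * priority is recomputed from the PI mutexes it still holds, and the
 * highest-priority waiter that is still on the wait queue is woken;
 * the userland owner word is then CASed to unowned or contested
 * depending on the remaining waiter count, as for normal mutexes.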
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock_spin(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		if (pi->pi_owner != curthread) {
			mtx_unlock_spin(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* userland messed up the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		/* Get the highest-priority thread that is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_unlend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
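 *
 * A priority-protected mutex keeps its owner word at UMUTEX_CONTESTED
 * whenever it is unlocked, so locking always enters the kernel.  The
 * ceiling in m_ceilings[0] is an rtprio value, converted to a kernel
 * priority as PRI_MIN_REALTIME + (RTP_PRIO_MAX - m_ceilings[0]); a
 * sufficiently privileged locker has its priority raised to the
 * ceiling before the CAS from UMUTEX_CONTESTED to owned is attempted,
 * and any lent priority is recomputed if the attempt fails.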
1969 */
1970static int
1971_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1972	int try)
1973{
1974	struct umtx_q *uq, *uq2;
1975	struct umtx_pi *pi;
1976	uint32_t ceiling;
1977	uint32_t owner, id;
1978	int error, pri, old_inherited_pri, su;
1979
1980	id = td->td_tid;
1981	uq = td->td_umtxq;
1982	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1983	    &uq->uq_key)) != 0)
1984		return (error);
1985	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1986	for (;;) {
1987		old_inherited_pri = uq->uq_inherited_pri;
1988		umtxq_lock(&uq->uq_key);
1989		umtxq_busy(&uq->uq_key);
1990		umtxq_unlock(&uq->uq_key);
1991
1992		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1993		if (ceiling > RTP_PRIO_MAX) {
1994			error = EINVAL;
1995			goto out;
1996		}
1997
1998		mtx_lock_spin(&umtx_lock);
1999		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2000			mtx_unlock_spin(&umtx_lock);
2001			error = EINVAL;
2002			goto out;
2003		}
2004		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2005			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2006			thread_lock(td);
2007			if (uq->uq_inherited_pri < UPRI(td))
2008				sched_lend_user_prio(td, uq->uq_inherited_pri);
2009			thread_unlock(td);
2010		}
2011		mtx_unlock_spin(&umtx_lock);
2012
2013		owner = casuword32(&m->m_owner,
2014		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2015
2016		if (owner == UMUTEX_CONTESTED) {
2017			error = 0;
2018			break;
2019		}
2020
2021		/* The address was invalid. */
2022		if (owner == -1) {
2023			error = EFAULT;
2024			break;
2025		}
2026
2027		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2028		    (owner & ~UMUTEX_CONTESTED) == id) {
2029			error = EDEADLK;
2030			break;
2031		}
2032
2033		if (try != 0) {
2034			error = EBUSY;
2035			break;
2036		}
2037
2038		/*
2039		 * If we caught a signal, we have retried and now
2040		 * exit immediately.
2041		 */
2042		if (error != 0)
2043			break;
2044
2045		umtxq_lock(&uq->uq_key);
2046		umtxq_insert(uq);
2047		umtxq_unbusy(&uq->uq_key);
2048		error = umtxq_sleep(uq, "umtxpp", timo);
2049		umtxq_remove(uq);
2050		umtxq_unlock(&uq->uq_key);
2051
2052		mtx_lock_spin(&umtx_lock);
2053		uq->uq_inherited_pri = old_inherited_pri;
2054		pri = PRI_MAX;
2055		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2056			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2057			if (uq2 != NULL) {
2058				if (pri > UPRI(uq2->uq_thread))
2059					pri = UPRI(uq2->uq_thread);
2060			}
2061		}
2062		if (pri > uq->uq_inherited_pri)
2063			pri = uq->uq_inherited_pri;
2064		thread_lock(td);
2065		sched_unlend_user_prio(td, pri);
2066		thread_unlock(td);
2067		mtx_unlock_spin(&umtx_lock);
2068	}
2069
2070	if (error != 0) {
2071		mtx_lock_spin(&umtx_lock);
2072		uq->uq_inherited_pri = old_inherited_pri;
2073		pri = PRI_MAX;
2074		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2075			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2076			if (uq2 != NULL) {
2077				if (pri > UPRI(uq2->uq_thread))
2078					pri = UPRI(uq2->uq_thread);
2079			}
2080		}
2081		if (pri > uq->uq_inherited_pri)
2082			pri = uq->uq_inherited_pri;
2083		thread_lock(td);
2084		sched_unlend_user_prio(td, pri);
2085		thread_unlock(td);
2086		mtx_unlock_spin(&umtx_lock);
2087	}
2088
2089out:
2090	umtxq_lock(&uq->uq_key);
2091	umtxq_unbusy(&uq->uq_key);
2092	umtxq_unlock(&uq->uq_key);
2093	umtx_key_release(&uq->uq_key);
2094	return (error);
2095}
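
/*
 * Worked example of the ceiling mapping used above (a sketch; the
 * numeric values of RTP_PRIO_MAX and PRI_MIN_REALTIME are machine
 * constants from <sys/rtprio.h> and <sys/priority.h>):
 *
 *	user ceiling stored in m_ceilings[0]:	c, in 0..RTP_PRIO_MAX
 *	kernel-side ceiling:			RTP_PRIO_MAX - c
 *	priority lent while owning the mutex:	PRI_MIN_REALTIME +
 *						    (RTP_PRIO_MAX - c)
 *
 * A larger user ceiling c therefore maps to a numerically smaller,
 * i.e. stronger, kernel priority; c == RTP_PRIO_MAX maps to
 * PRI_MIN_REALTIME itself.  The UPRI(td) < PRI_MIN_REALTIME + ceiling
 * test rejects a caller whose priority is already stronger than the
 * ceiling, and because the subtraction is unsigned, any out-of-range
 * user value wraps and fails the ceiling > RTP_PRIO_MAX check.
 */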
2096
2097/*
2098 * Unlock a PP mutex.
2099 */
2100static int
2101do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2102{
2103	struct umtx_key key;
2104	struct umtx_q *uq, *uq2;
2105	struct umtx_pi *pi;
2106	uint32_t owner, id;
2107	uint32_t rceiling;
2108	int error, pri, new_inherited_pri, su;
2109
2110	id = td->td_tid;
2111	uq = td->td_umtxq;
2112	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2113
2114	/*
2115	 * Make sure we own this mtx.
2116	 */
2117	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2118	if (owner == -1)
2119		return (EFAULT);
2120
2121	if ((owner & ~UMUTEX_CONTESTED) != id)
2122		return (EPERM);
2123
2124	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2125	if (error != 0)
2126		return (error);
2127
2128	if (rceiling == -1)
2129		new_inherited_pri = PRI_MAX;
2130	else {
2131		rceiling = RTP_PRIO_MAX - rceiling;
2132		if (rceiling > RTP_PRIO_MAX)
2133			return (EINVAL);
2134		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2135	}
2136
2137	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2138	    &key)) != 0)
2139		return (error);
2140	umtxq_lock(&key);
2141	umtxq_busy(&key);
2142	umtxq_unlock(&key);
2143	/*
2144	 * For a priority-protected mutex, always set the unlocked state
2145	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2146	 * to lock the mutex.  This is necessary because the thread's
2147	 * priority must be adjusted when it holds such a mutex.
2148	 */
2149	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2150		UMUTEX_CONTESTED);
2151
2152	umtxq_lock(&key);
2153	if (error == 0)
2154		umtxq_signal(&key, 1);
2155	umtxq_unbusy(&key);
2156	umtxq_unlock(&key);
2157
2158	if (error == -1)
2159		error = EFAULT;
2160	else {
2161		mtx_lock_spin(&umtx_lock);
2162		if (su != 0)
2163			uq->uq_inherited_pri = new_inherited_pri;
2164		pri = PRI_MAX;
2165		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2166			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2167			if (uq2 != NULL) {
2168				if (pri > UPRI(uq2->uq_thread))
2169					pri = UPRI(uq2->uq_thread);
2170			}
2171		}
2172		if (pri > uq->uq_inherited_pri)
2173			pri = uq->uq_inherited_pri;
2174		thread_lock(td);
2175		sched_unlend_user_prio(td, pri);
2176		thread_unlock(td);
2177		mtx_unlock_spin(&umtx_lock);
2178	}
2179	umtx_key_release(&key);
2180	return (error);
2181}
2182
2183static int
2184do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2185	uint32_t *old_ceiling)
2186{
2187	struct umtx_q *uq;
2188	uint32_t save_ceiling;
2189	uint32_t owner, id;
2190	uint32_t flags;
2191	int error;
2192
2193	flags = fuword32(&m->m_flags);
2194	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2195		return (EINVAL);
2196	if (ceiling > RTP_PRIO_MAX)
2197		return (EINVAL);
2198	id = td->td_tid;
2199	uq = td->td_umtxq;
2200	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2201	   &uq->uq_key)) != 0)
2202		return (error);
2203	for (;;) {
2204		umtxq_lock(&uq->uq_key);
2205		umtxq_busy(&uq->uq_key);
2206		umtxq_unlock(&uq->uq_key);
2207
2208		save_ceiling = fuword32(&m->m_ceilings[0]);
2209
2210		owner = casuword32(&m->m_owner,
2211		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2212
2213		if (owner == UMUTEX_CONTESTED) {
2214			suword32(&m->m_ceilings[0], ceiling);
2215			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2216				UMUTEX_CONTESTED);
2217			error = 0;
2218			break;
2219		}
2220
2221		/* The address was invalid. */
2222		if (owner == -1) {
2223			error = EFAULT;
2224			break;
2225		}
2226
2227		if ((owner & ~UMUTEX_CONTESTED) == id) {
2228			suword32(&m->m_ceilings[0], ceiling);
2229			error = 0;
2230			break;
2231		}
2232
2233		/*
2234		 * If we caught a signal, we have already retried
2235		 * once; exit immediately now.
2236		 */
2237		if (error != 0)
2238			break;
2239
2240		/*
2241		 * We set the contested bit, so sleep.  Otherwise the lock
2242		 * changed and we need to retry, or we lost a race to the
2243		 * thread unlocking the umtx.
2244		 */
2245		umtxq_lock(&uq->uq_key);
2246		umtxq_insert(uq);
2247		umtxq_unbusy(&uq->uq_key);
2248		error = umtxq_sleep(uq, "umtxpp", 0);
2249		umtxq_remove(uq);
2250		umtxq_unlock(&uq->uq_key);
2251	}
2252	umtxq_lock(&uq->uq_key);
2253	if (error == 0)
2254		umtxq_signal(&uq->uq_key, INT_MAX);
2255	umtxq_unbusy(&uq->uq_key);
2256	umtxq_unlock(&uq->uq_key);
2257	umtx_key_release(&uq->uq_key);
2258	if (error == 0 && old_ceiling != NULL)
2259		suword32(old_ceiling, save_ceiling);
2260	return (error);
2261}
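
/*
 * Note on the INT_MAX signal above: once the ceiling has changed,
 * every thread sleeping in _do_lock_pp() is woken so that it re-reads
 * m_ceilings[0] and repeats the ceiling check against its own
 * priority instead of sleeping on with a stale value.
 */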
2262
2263static int
2264_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2265	int mode)
2266{
2267	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2268	case 0:
2269		return (_do_lock_normal(td, m, flags, timo, mode));
2270	case UMUTEX_PRIO_INHERIT:
2271		return (_do_lock_pi(td, m, flags, timo, mode));
2272	case UMUTEX_PRIO_PROTECT:
2273		return (_do_lock_pp(td, m, flags, timo, mode));
2274	}
2275	return (EINVAL);
2276}
2277
2278/*
2279 * Lock a userland POSIX mutex.
2280 */
2281static int
2282do_lock_umutex(struct thread *td, struct umutex *m,
2283	struct timespec *timeout, int mode)
2284{
2285	struct timespec ts, ts2, ts3;
2286	struct timeval tv;
2287	uint32_t flags;
2288	int error;
2289
2290	flags = fuword32(&m->m_flags);
2291	if (flags == -1)
2292		return (EFAULT);
2293
2294	if (timeout == NULL) {
2295		error = _do_lock_umutex(td, m, flags, 0, mode);
2296		/* Mutex locking is restarted if it is interrupted. */
2297		if (error == EINTR && mode != _UMUTEX_WAIT)
2298			error = ERESTART;
2299	} else {
2300		getnanouptime(&ts);
2301		timespecadd(&ts, timeout);
2302		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2303		for (;;) {
2304			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2305			if (error != ETIMEDOUT)
2306				break;
2307			getnanouptime(&ts2);
2308			if (timespeccmp(&ts2, &ts, >=)) {
2309				error = ETIMEDOUT;
2310				break;
2311			}
2312			ts3 = ts;
2313			timespecsub(&ts3, &ts2);
2314			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2315		}
2316		/* Timed-locking is not restarted. */
2317		if (error == ERESTART)
2318			error = EINTR;
2319	}
2320	return (error);
2321}
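
/*
 * A condensed sketch (pseudocode) of the timeout pattern used above
 * and repeated by the other timed operations in this file: a relative
 * timeout is turned into an absolute deadline on the uptime clock,
 * and the remaining interval is recomputed after every ETIMEDOUT
 * wakeup, since tvtohz() rounds to scheduler ticks and a sleep may
 * expire slightly before the real deadline:
 *
 *	deadline = getnanouptime() + timeout;
 *	for (;;) {
 *		error = sleep(remaining);
 *		if (error != ETIMEDOUT)
 *			break;
 *		if (now >= deadline)
 *			break;			really timed out
 *		remaining = deadline - now;	retry with remainder
 *	}
 */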
2322
2323/*
2324 * Unlock a userland POSIX mutex.
2325 */
2326static int
2327do_unlock_umutex(struct thread *td, struct umutex *m)
2328{
2329	uint32_t flags;
2330
2331	flags = fuword32(&m->m_flags);
2332	if (flags == -1)
2333		return (EFAULT);
2334
2335	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2336	case 0:
2337		return (do_unlock_normal(td, m, flags));
2338	case UMUTEX_PRIO_INHERIT:
2339		return (do_unlock_pi(td, m, flags));
2340	case UMUTEX_PRIO_PROTECT:
2341		return (do_unlock_pp(td, m, flags));
2342	}
2343
2344	return (EINVAL);
2345}
2346
2347static int
2348do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2349	struct timespec *timeout, u_long wflags)
2350{
2351	struct umtx_q *uq;
2352	struct timeval tv;
2353	struct timespec cts, ets, tts;
2354	uint32_t flags;
2355	int error;
2356
2357	uq = td->td_umtxq;
2358	flags = fuword32(&cv->c_flags);
2359	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2360	if (error != 0)
2361		return (error);
2362	umtxq_lock(&uq->uq_key);
2363	umtxq_busy(&uq->uq_key);
2364	umtxq_insert(uq);
2365	umtxq_unlock(&uq->uq_key);
2366
2367	/*
2368	 * c_has_waiters must be set to 1 before the user mutex is
2369	 * released, or a signaller may see no waiters and skip the wakeup.
2370	 */
2371	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2372
2373	umtxq_lock(&uq->uq_key);
2374	umtxq_unbusy(&uq->uq_key);
2375	umtxq_unlock(&uq->uq_key);
2376
2377	error = do_unlock_umutex(td, m);
2378
2379	umtxq_lock(&uq->uq_key);
2380	if (error == 0) {
2381		if ((wflags & UMTX_CHECK_UNPARKING) &&
2382		    (td->td_pflags & TDP_WAKEUP)) {
2383			td->td_pflags &= ~TDP_WAKEUP;
2384			error = EINTR;
2385		} else if (timeout == NULL) {
2386			error = umtxq_sleep(uq, "ucond", 0);
2387		} else {
2388			getnanouptime(&ets);
2389			timespecadd(&ets, timeout);
2390			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2391			for (;;) {
2392				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2393				if (error != ETIMEDOUT)
2394					break;
2395				getnanouptime(&cts);
2396				if (timespeccmp(&cts, &ets, >=)) {
2397					error = ETIMEDOUT;
2398					break;
2399				}
2400				tts = ets;
2401				timespecsub(&tts, &cts);
2402				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2403			}
2404		}
2405	}
2406
2407	if (error != 0) {
2408		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2409			/*
2410			 * If do_cv_signal() fired concurrently and we are
2411			 * returning with an error, a UNIX signal or a
2412			 * timeout, perform another umtxq_signal() to avoid
2413			 * consuming the wakeup.  This may cause a spurious
2414			 * wakeup for another thread which was just queued,
2415			 * but SUSv3 explicitly allows spurious wakeups to
2416			 * occur, and indeed a kernel-based implementation
2417			 * cannot avoid them.
2418			 */
2419			if (!umtxq_signal(&uq->uq_key, 1))
2420				error = 0;
2421		}
2422		if (error == ERESTART)
2423			error = EINTR;
2424	}
2425	umtxq_remove(uq);
2426	umtxq_unlock(&uq->uq_key);
2427	umtx_key_release(&uq->uq_key);
2428	return (error);
2429}
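
/*
 * Setting c_has_waiters before the mutex is released pairs with the
 * fast path a userland implementation is expected to use on the
 * signalling side, roughly (a sketch, not the actual libthr code):
 *
 *	if (cv->c_has_waiters)
 *		_umtx_op(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
 *
 * Were the flag set only after the mutex release, a signaller running
 * in that window would see no waiters, skip the kernel call, and the
 * wakeup would be lost.
 */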
2430
2431/*
2432 * Signal a userland condition variable.
2433 */
2434static int
2435do_cv_signal(struct thread *td, struct ucond *cv)
2436{
2437	struct umtx_key key;
2438	int error, cnt, nwake;
2439	uint32_t flags;
2440
2441	flags = fuword32(&cv->c_flags);
2442	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2443		return (error);
2444	umtxq_lock(&key);
2445	umtxq_busy(&key);
2446	cnt = umtxq_count(&key);
2447	nwake = umtxq_signal(&key, 1);
2448	if (cnt <= nwake) {
2449		umtxq_unlock(&key);
2450		error = suword32(
2451		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2452		umtxq_lock(&key);
2453	}
2454	umtxq_unbusy(&key);
2455	umtxq_unlock(&key);
2456	umtx_key_release(&key);
2457	return (error);
2458}
2459
2460static int
2461do_cv_broadcast(struct thread *td, struct ucond *cv)
2462{
2463	struct umtx_key key;
2464	int error;
2465	uint32_t flags;
2466
2467	flags = fuword32(&cv->c_flags);
2468	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2469		return (error);
2470
2471	umtxq_lock(&key);
2472	umtxq_busy(&key);
2473	umtxq_signal(&key, INT_MAX);
2474	umtxq_unlock(&key);
2475
2476	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2477
2478	umtxq_lock(&key);
2479	umtxq_unbusy(&key);
2480	umtxq_unlock(&key);
2481
2482	umtx_key_release(&key);
2483	return (error);
2484}
2485
2486static int
2487do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2488{
2489	struct umtx_q *uq;
2490	uint32_t flags, wrflags;
2491	int32_t state, oldstate;
2492	int32_t blocked_readers;
2493	int error;
2494
2495	uq = td->td_umtxq;
2496	flags = fuword32(&rwlock->rw_flags);
2497	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2498	if (error != 0)
2499		return (error);
2500
2501	wrflags = URWLOCK_WRITE_OWNER;
2502	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2503		wrflags |= URWLOCK_WRITE_WAITERS;
2504
2505	for (;;) {
2506		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2507		/* try to lock it */
2508		while (!(state & wrflags)) {
2509			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2510				umtx_key_release(&uq->uq_key);
2511				return (EAGAIN);
2512			}
2513			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2514			if (oldstate == state) {
2515				umtx_key_release(&uq->uq_key);
2516				return (0);
2517			}
2518			state = oldstate;
2519		}
2520
2521		if (error)
2522			break;
2523
2524		/* grab monitor lock */
2525		umtxq_lock(&uq->uq_key);
2526		umtxq_busy(&uq->uq_key);
2527		umtxq_unlock(&uq->uq_key);
2528
2529		/*
2530		 * re-read the state, in case it changed between the try-lock above
2531		 * and the check below
2532		 */
2533		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2534
2535		/* set read contention bit */
2536		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2537			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2538			if (oldstate == state)
2539				goto sleep;
2540			state = oldstate;
2541		}
2542
2543		/* state changed while we were setting flags; restart */
2544		if (!(state & wrflags)) {
2545			umtxq_lock(&uq->uq_key);
2546			umtxq_unbusy(&uq->uq_key);
2547			umtxq_unlock(&uq->uq_key);
2548			continue;
2549		}
2550
2551sleep:
2552		/* contention bit is set; increase the read-waiter count before sleeping */
2553		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2554		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2555
2556		while (state & wrflags) {
2557			umtxq_lock(&uq->uq_key);
2558			umtxq_insert(uq);
2559			umtxq_unbusy(&uq->uq_key);
2560
2561			error = umtxq_sleep(uq, "urdlck", timo);
2562
2563			umtxq_busy(&uq->uq_key);
2564			umtxq_remove(uq);
2565			umtxq_unlock(&uq->uq_key);
2566			if (error)
2567				break;
2568			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2569		}
2570
2571		/* decrease the read-waiter count; the last waiter clears the read-contention bit */
2572		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2573		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2574		if (blocked_readers == 1) {
2575			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2576			for (;;) {
2577				oldstate = casuword32(&rwlock->rw_state, state,
2578					 state & ~URWLOCK_READ_WAITERS);
2579				if (oldstate == state)
2580					break;
2581				state = oldstate;
2582			}
2583		}
2584
2585		umtxq_lock(&uq->uq_key);
2586		umtxq_unbusy(&uq->uq_key);
2587		umtxq_unlock(&uq->uq_key);
2588	}
2589	umtx_key_release(&uq->uq_key);
2590	return (error);
2591}
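
/*
 * The rw_state word packs the entire lock state: URWLOCK_WRITE_OWNER,
 * URWLOCK_WRITE_WAITERS and URWLOCK_READ_WAITERS are flag bits at the
 * top of the word, while the low bits hold the reader count extracted
 * by URWLOCK_READER_COUNT() and bounded by URWLOCK_MAX_READERS.  This
 * layout is why a read lock is taken with the bare "state + 1" CAS
 * above: as long as none of the wrflags bits is set, incrementing the
 * word is exactly incrementing the reader count.
 */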
2592
2593static int
2594do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2595{
2596	struct timespec ts, ts2, ts3;
2597	struct timeval tv;
2598	int error;
2599
2600	getnanouptime(&ts);
2601	timespecadd(&ts, timeout);
2602	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2603	for (;;) {
2604		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2605		if (error != ETIMEDOUT)
2606			break;
2607		getnanouptime(&ts2);
2608		if (timespeccmp(&ts2, &ts, >=)) {
2609			error = ETIMEDOUT;
2610			break;
2611		}
2612		ts3 = ts;
2613		timespecsub(&ts3, &ts2);
2614		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2615	}
2616	if (error == ERESTART)
2617		error = EINTR;
2618	return (error);
2619}
2620
2621static int
2622do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2623{
2624	struct umtx_q *uq;
2625	uint32_t flags;
2626	int32_t state, oldstate;
2627	int32_t blocked_writers;
2628	int32_t blocked_readers;
2629	int error;
2630
2631	uq = td->td_umtxq;
2632	flags = fuword32(&rwlock->rw_flags);
2633	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2634	if (error != 0)
2635		return (error);
2636
2637	blocked_readers = 0;
2638	for (;;) {
2639		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2640		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2641			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2642			if (oldstate == state) {
2643				umtx_key_release(&uq->uq_key);
2644				return (0);
2645			}
2646			state = oldstate;
2647		}
2648
2649		if (error) {
2650			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2651			    blocked_readers != 0) {
2652				umtxq_lock(&uq->uq_key);
2653				umtxq_busy(&uq->uq_key);
2654				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2655				umtxq_unbusy(&uq->uq_key);
2656				umtxq_unlock(&uq->uq_key);
2657			}
2658
2659			break;
2660		}
2661
2662		/* grab monitor lock */
2663		umtxq_lock(&uq->uq_key);
2664		umtxq_busy(&uq->uq_key);
2665		umtxq_unlock(&uq->uq_key);
2666
2667		/*
2668		 * re-read the state, in case it changed between the try-lock above
2669		 * and the check below
2670		 */
2671		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2672
2673		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2674		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2675			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2676			if (oldstate == state)
2677				goto sleep;
2678			state = oldstate;
2679		}
2680
2681		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2682			umtxq_lock(&uq->uq_key);
2683			umtxq_unbusy(&uq->uq_key);
2684			umtxq_unlock(&uq->uq_key);
2685			continue;
2686		}
2687sleep:
2688		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2689		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2690
2691		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2692			umtxq_lock(&uq->uq_key);
2693			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2694			umtxq_unbusy(&uq->uq_key);
2695
2696			error = umtxq_sleep(uq, "uwrlck", timo);
2697
2698			umtxq_busy(&uq->uq_key);
2699			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2700			umtxq_unlock(&uq->uq_key);
2701			if (error)
2702				break;
2703			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2704		}
2705
2706		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2707		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2708		if (blocked_writers == 1) {
2709			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2710			for (;;) {
2711				oldstate = casuword32(&rwlock->rw_state, state,
2712					 state & ~URWLOCK_WRITE_WAITERS);
2713				if (oldstate == state)
2714					break;
2715				state = oldstate;
2716			}
2717			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2718		} else
2719			blocked_readers = 0;
2720
2721		umtxq_lock(&uq->uq_key);
2722		umtxq_unbusy(&uq->uq_key);
2723		umtxq_unlock(&uq->uq_key);
2724	}
2725
2726	umtx_key_release(&uq->uq_key);
2727	return (error);
2728}
2729
2730static int
2731do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2732{
2733	struct timespec ts, ts2, ts3;
2734	struct timeval tv;
2735	int error;
2736
2737	getnanouptime(&ts);
2738	timespecadd(&ts, timeout);
2739	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2740	for (;;) {
2741		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2742		if (error != ETIMEDOUT)
2743			break;
2744		getnanouptime(&ts2);
2745		if (timespeccmp(&ts2, &ts, >=)) {
2746			error = ETIMEDOUT;
2747			break;
2748		}
2749		ts3 = ts;
2750		timespecsub(&ts3, &ts2);
2751		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2752	}
2753	if (error == ERESTART)
2754		error = EINTR;
2755	return (error);
2756}
2757
2758static int
2759do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2760{
2761	struct umtx_q *uq;
2762	uint32_t flags;
2763	int32_t state, oldstate;
2764	int error, q, count;
2765
2766	uq = td->td_umtxq;
2767	flags = fuword32(&rwlock->rw_flags);
2768	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2769	if (error != 0)
2770		return (error);
2771
2772	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2773	if (state & URWLOCK_WRITE_OWNER) {
2774		for (;;) {
2775			oldstate = casuword32(&rwlock->rw_state, state,
2776				state & ~URWLOCK_WRITE_OWNER);
2777			if (oldstate != state) {
2778				state = oldstate;
2779				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2780					error = EPERM;
2781					goto out;
2782				}
2783			} else
2784				break;
2785		}
2786	} else if (URWLOCK_READER_COUNT(state) != 0) {
2787		for (;;) {
2788			oldstate = casuword32(&rwlock->rw_state, state,
2789				state - 1);
2790			if (oldstate != state) {
2791				state = oldstate;
2792				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2793					error = EPERM;
2794					goto out;
2795				}
2796			}
2797			else
2798				break;
2799		}
2800	} else {
2801		error = EPERM;
2802		goto out;
2803	}
2804
2805	count = 0;
2806
2807	if (!(flags & URWLOCK_PREFER_READER)) {
2808		if (state & URWLOCK_WRITE_WAITERS) {
2809			count = 1;
2810			q = UMTX_EXCLUSIVE_QUEUE;
2811		} else if (state & URWLOCK_READ_WAITERS) {
2812			count = INT_MAX;
2813			q = UMTX_SHARED_QUEUE;
2814		}
2815	} else {
2816		if (state & URWLOCK_READ_WAITERS) {
2817			count = INT_MAX;
2818			q = UMTX_SHARED_QUEUE;
2819		} else if (state & URWLOCK_WRITE_WAITERS) {
2820			count = 1;
2821			q = UMTX_EXCLUSIVE_QUEUE;
2822		}
2823	}
2824
2825	if (count) {
2826		umtxq_lock(&uq->uq_key);
2827		umtxq_busy(&uq->uq_key);
2828		umtxq_signal_queue(&uq->uq_key, count, q);
2829		umtxq_unbusy(&uq->uq_key);
2830		umtxq_unlock(&uq->uq_key);
2831	}
2832out:
2833	umtx_key_release(&uq->uq_key);
2834	return (error);
2835}
2836
2837static int
2838do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
2839{
2840	struct umtx_q *uq;
2841	struct timeval tv;
2842	struct timespec cts, ets, tts;
2843	uint32_t flags, count;
2844	int error;
2845
2846	uq = td->td_umtxq;
2847	flags = fuword32(&sem->_flags);
2848	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2849	if (error != 0)
2850		return (error);
2851	umtxq_lock(&uq->uq_key);
2852	umtxq_busy(&uq->uq_key);
2853	umtxq_insert(uq);
2854	umtxq_unlock(&uq->uq_key);
2855
2856	suword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 1);
2857
2858	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2859	if (count != 0) {
2860		umtxq_lock(&uq->uq_key);
2861		umtxq_unbusy(&uq->uq_key);
2862		umtxq_remove(uq);
2863		umtxq_unlock(&uq->uq_key);
2864		umtx_key_release(&uq->uq_key);
2865		return (0);
2866	}
2867
2868	umtxq_lock(&uq->uq_key);
2869	umtxq_unbusy(&uq->uq_key);
2870	umtxq_unlock(&uq->uq_key);
2871
2872	umtxq_lock(&uq->uq_key);
2873	if (timeout == NULL) {
2874		error = umtxq_sleep(uq, "usem", 0);
2875	} else {
2876		getnanouptime(&ets);
2877		timespecadd(&ets, timeout);
2878		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2879		for (;;) {
2880			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
2881			if (error != ETIMEDOUT)
2882				break;
2883			getnanouptime(&cts);
2884			if (timespeccmp(&cts, &ets, >=)) {
2885				error = ETIMEDOUT;
2886				break;
2887			}
2888			tts = ets;
2889			timespecsub(&tts, &cts);
2890			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2891		}
2892	}
2893
2894	if (error != 0) {
2895		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2896			if (!umtxq_signal(&uq->uq_key, 1))
2897				error = 0;
2898		}
2899		if (error == ERESTART)
2900			error = EINTR;
2901	}
2902	umtxq_remove(uq);
2903	umtxq_unlock(&uq->uq_key);
2904	umtx_key_release(&uq->uq_key);
2905	return (error);
2906}
2907
2908/*
2909 * Wake up a userland semaphore.
2910 */
2911static int
2912do_sem_wake(struct thread *td, struct _usem *sem)
2913{
2914	struct umtx_key key;
2915	int error, cnt, nwake;
2916	uint32_t flags;
2917
2918	flags = fuword32(&sem->_flags);
2919	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2920		return (error);
2921	umtxq_lock(&key);
2922	umtxq_busy(&key);
2923	cnt = umtxq_count(&key);
2924	nwake = umtxq_signal(&key, 1);
2925	if (cnt <= nwake) {
2926		umtxq_unlock(&key);
2927		error = suword32(
2928		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2929		umtxq_lock(&key);
2930	}
2931	umtxq_unbusy(&key);
2932	umtxq_unlock(&key);
2933	umtx_key_release(&key);
2934	return (error);
2935}
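
/*
 * As in do_cv_signal(), _has_waiters is cleared only once the wakeup
 * emptied the queue (cnt <= nwake).  The userland post side is
 * expected to look roughly like this sketch (illustrative only, not
 * the actual libc code):
 *
 *	atomic_add_rel_int(&sem->_count, 1);
 *	if (sem->_has_waiters)
 *		_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL);
 *
 * so that it keeps entering the kernel while sleepers may remain.
 */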
2936
2937int
2938_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2939    /* struct umtx *umtx */
2940{
2941	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2942}
2943
2944int
2945_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2946    /* struct umtx *umtx */
2947{
2948	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2949}
2950
2951static int
2952__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2953{
2954	struct timespec *ts, timeout;
2955	int error;
2956
2957	/* Allow a null timespec (wait forever). */
2958	if (uap->uaddr2 == NULL)
2959		ts = NULL;
2960	else {
2961		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2962		if (error != 0)
2963			return (error);
2964		if (timeout.tv_nsec >= 1000000000 ||
2965		    timeout.tv_nsec < 0) {
2966			return (EINVAL);
2967		}
2968		ts = &timeout;
2969	}
2970	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2971}
2972
2973static int
2974__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2975{
2976	return (do_unlock_umtx(td, uap->obj, uap->val));
2977}
2978
2979static int
2980__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2981{
2982	struct timespec *ts, timeout;
2983	int error;
2984
2985	if (uap->uaddr2 == NULL)
2986		ts = NULL;
2987	else {
2988		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2989		if (error != 0)
2990			return (error);
2991		if (timeout.tv_nsec >= 1000000000 ||
2992		    timeout.tv_nsec < 0)
2993			return (EINVAL);
2994		ts = &timeout;
2995	}
2996	return do_wait(td, uap->obj, uap->val, ts, 0, 0);
2997}
2998
2999static int
3000__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3001{
3002	struct timespec *ts, timeout;
3003	int error;
3004
3005	if (uap->uaddr2 == NULL)
3006		ts = NULL;
3007	else {
3008		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3009		if (error != 0)
3010			return (error);
3011		if (timeout.tv_nsec >= 1000000000 ||
3012		    timeout.tv_nsec < 0)
3013			return (EINVAL);
3014		ts = &timeout;
3015	}
3016	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3017}
3018
3019static int
3020__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3021{
3022	struct timespec *ts, timeout;
3023	int error;
3024
3025	if (uap->uaddr2 == NULL)
3026		ts = NULL;
3027	else {
3028		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3029		if (error != 0)
3030			return (error);
3031		if (timeout.tv_nsec >= 1000000000 ||
3032		    timeout.tv_nsec < 0)
3033			return (EINVAL);
3034		ts = &timeout;
3035	}
3036	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3037}
3038
3039static int
3040__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3041{
3042	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3043}
3044
3045static int
3046__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3047{
3048	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3049}
3050
3051static int
3052__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3053{
3054	struct timespec *ts, timeout;
3055	int error;
3056
3057	/* Allow a null timespec (wait forever). */
3058	if (uap->uaddr2 == NULL)
3059		ts = NULL;
3060	else {
3061		error = copyin(uap->uaddr2, &timeout,
3062		    sizeof(timeout));
3063		if (error != 0)
3064			return (error);
3065		if (timeout.tv_nsec >= 1000000000 ||
3066		    timeout.tv_nsec < 0) {
3067			return (EINVAL);
3068		}
3069		ts = &timeout;
3070	}
3071	return do_lock_umutex(td, uap->obj, ts, 0);
3072}
3073
3074static int
3075__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3076{
3077	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3078}
3079
3080static int
3081__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3082{
3083	struct timespec *ts, timeout;
3084	int error;
3085
3086	/* Allow a null timespec (wait forever). */
3087	if (uap->uaddr2 == NULL)
3088		ts = NULL;
3089	else {
3090		error = copyin(uap->uaddr2, &timeout,
3091		    sizeof(timeout));
3092		if (error != 0)
3093			return (error);
3094		if (timeout.tv_nsec >= 1000000000 ||
3095		    timeout.tv_nsec < 0) {
3096			return (EINVAL);
3097		}
3098		ts = &timeout;
3099	}
3100	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3101}
3102
3103static int
3104__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3105{
3106	return do_wake_umutex(td, uap->obj);
3107}
3108
3109static int
3110__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3111{
3112	return do_unlock_umutex(td, uap->obj);
3113}
3114
3115static int
3116__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3117{
3118	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3119}
3120
3121static int
3122__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3123{
3124	struct timespec *ts, timeout;
3125	int error;
3126
3127	/* Allow a null timespec (wait forever). */
3128	if (uap->uaddr2 == NULL)
3129		ts = NULL;
3130	else {
3131		error = copyin(uap->uaddr2, &timeout,
3132		    sizeof(timeout));
3133		if (error != 0)
3134			return (error);
3135		if (timeout.tv_nsec >= 1000000000 ||
3136		    timeout.tv_nsec < 0) {
3137			return (EINVAL);
3138		}
3139		ts = &timeout;
3140	}
3141	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3142}
3143
3144static int
3145__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3146{
3147	return do_cv_signal(td, uap->obj);
3148}
3149
3150static int
3151__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3152{
3153	return do_cv_broadcast(td, uap->obj);
3154}
3155
3156static int
3157__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3158{
3159	struct timespec timeout;
3160	int error;
3161
3162	/* Allow a null timespec (wait forever). */
3163	if (uap->uaddr2 == NULL) {
3164		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3165	} else {
3166		error = copyin(uap->uaddr2, &timeout,
3167		    sizeof(timeout));
3168		if (error != 0)
3169			return (error);
3170		if (timeout.tv_nsec >= 1000000000 ||
3171		    timeout.tv_nsec < 0) {
3172			return (EINVAL);
3173		}
3174		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3175	}
3176	return (error);
3177}
3178
3179static int
3180__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3181{
3182	struct timespec timeout;
3183	int error;
3184
3185	/* Allow a null timespec (wait forever). */
3186	if (uap->uaddr2 == NULL) {
3187		error = do_rw_wrlock(td, uap->obj, 0);
3188	} else {
3189		error = copyin(uap->uaddr2, &timeout,
3190		    sizeof(timeout));
3191		if (error != 0)
3192			return (error);
3193		if (timeout.tv_nsec >= 1000000000 ||
3194		    timeout.tv_nsec < 0) {
3195			return (EINVAL);
3196		}
3197
3198		error = do_rw_wrlock2(td, uap->obj, &timeout);
3199	}
3200	return (error);
3201}
3202
3203static int
3204__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3205{
3206	return do_rw_unlock(td, uap->obj);
3207}
3208
3209static int
3210__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3211{
3212	struct timespec *ts, timeout;
3213	int error;
3214
3215	/* Allow a null timespec (wait forever). */
3216	if (uap->uaddr2 == NULL)
3217		ts = NULL;
3218	else {
3219		error = copyin(uap->uaddr2, &timeout,
3220		    sizeof(timeout));
3221		if (error != 0)
3222			return (error);
3223		if (timeout.tv_nsec >= 1000000000 ||
3224		    timeout.tv_nsec < 0) {
3225			return (EINVAL);
3226		}
3227		ts = &timeout;
3228	}
3229	return (do_sem_wait(td, uap->obj, ts));
3230}
3231
3232static int
3233__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3234{
3235	return do_sem_wake(td, uap->obj);
3236}
3237
3238typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3239
3240static _umtx_op_func op_table[] = {
3241	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3242	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3243	__umtx_op_wait,			/* UMTX_OP_WAIT */
3244	__umtx_op_wake,			/* UMTX_OP_WAKE */
3245	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3246	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3247	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3248	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3249	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3250	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3251	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3252	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3253	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3254	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3255	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3256	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3257	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3258	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3259	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3260	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3261	__umtx_op_sem_wake		/* UMTX_OP_SEM_WAKE */
3262};
3263
3264int
3265_umtx_op(struct thread *td, struct _umtx_op_args *uap)
3266{
3267	if ((unsigned)uap->op < UMTX_OP_MAX)
3268		return (*op_table[uap->op])(td, uap);
3269	return (EINVAL);
3270}
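
/*
 * Dispatch note: uap->op indexes op_table directly, so the order of
 * the entries above must match the UMTX_OP_* numbering in
 * <sys/umtx.h>.  A userland call such as this sketch, using the
 * _umtx_op(2) wrapper declared in <sys/umtx.h>:
 *
 *	struct timespec ts = { 1, 0 };
 *	error = _umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, expected,
 *	    NULL, &ts);
 *
 * arrives in __umtx_op_wait_uint_private() with the timeout pointer
 * passed as uaddr2.
 */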
3271
3272#ifdef COMPAT_FREEBSD32
3273int
3274freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3275    /* struct umtx *umtx */
3276{
3277	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3278}
3279
3280int
3281freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3282    /* struct umtx *umtx */
3283{
3284	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3285}
3286
3287struct timespec32 {
3288	u_int32_t tv_sec;
3289	u_int32_t tv_nsec;
3290};
3291
3292static inline int
3293copyin_timeout32(void *addr, struct timespec *tsp)
3294{
3295	struct timespec32 ts32;
3296	int error;
3297
3298	error = copyin(addr, &ts32, sizeof(struct timespec32));
3299	if (error == 0) {
3300		tsp->tv_sec = ts32.tv_sec;
3301		tsp->tv_nsec = ts32.tv_nsec;
3302	}
3303	return (error);
3304}
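
/*
 * Every compat32 entry point that accepts a timeout must come through
 * copyin_timeout32() rather than a raw copyin(): a 32-bit process
 * lays struct timespec out as the two 32-bit fields of struct
 * timespec32 above, half the size of the native structure.
 */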
3305
3306static int
3307__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3308{
3309	struct timespec *ts, timeout;
3310	int error;
3311
3312	/* Allow a null timespec (wait forever). */
3313	if (uap->uaddr2 == NULL)
3314		ts = NULL;
3315	else {
3316		error = copyin_timeout32(uap->uaddr2, &timeout);
3317		if (error != 0)
3318			return (error);
3319		if (timeout.tv_nsec >= 1000000000 ||
3320		    timeout.tv_nsec < 0) {
3321			return (EINVAL);
3322		}
3323		ts = &timeout;
3324	}
3325	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3326}
3327
3328static int
3329__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3330{
3331	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3332}
3333
3334static int
3335__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3336{
3337	struct timespec *ts, timeout;
3338	int error;
3339
3340	if (uap->uaddr2 == NULL)
3341		ts = NULL;
3342	else {
3343		error = copyin_timeout32(uap->uaddr2, &timeout);
3344		if (error != 0)
3345			return (error);
3346		if (timeout.tv_nsec >= 1000000000 ||
3347		    timeout.tv_nsec < 0)
3348			return (EINVAL);
3349		ts = &timeout;
3350	}
3351	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3352}
3353
3354static int
3355__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3356{
3357	struct timespec *ts, timeout;
3358	int error;
3359
3360	/* Allow a null timespec (wait forever). */
3361	if (uap->uaddr2 == NULL)
3362		ts = NULL;
3363	else {
3364		error = copyin_timeout32(uap->uaddr2, &timeout);
3365		if (error != 0)
3366			return (error);
3367		if (timeout.tv_nsec >= 1000000000 ||
3368		    timeout.tv_nsec < 0)
3369			return (EINVAL);
3370		ts = &timeout;
3371	}
3372	return do_lock_umutex(td, uap->obj, ts, 0);
3373}
3374
3375static int
3376__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3377{
3378	struct timespec *ts, timeout;
3379	int error;
3380
3381	/* Allow a null timespec (wait forever). */
3382	if (uap->uaddr2 == NULL)
3383		ts = NULL;
3384	else {
3385		error = copyin_timeout32(uap->uaddr2, &timeout);
3386		if (error != 0)
3387			return (error);
3388		if (timeout.tv_nsec >= 1000000000 ||
3389		    timeout.tv_nsec < 0)
3390			return (EINVAL);
3391		ts = &timeout;
3392	}
3393	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3394}
3395
3396static int
3397__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3398{
3399	struct timespec *ts, timeout;
3400	int error;
3401
3402	/* Allow a null timespec (wait forever). */
3403	if (uap->uaddr2 == NULL)
3404		ts = NULL;
3405	else {
3406		error = copyin_timeout32(uap->uaddr2, &timeout);
3407		if (error != 0)
3408			return (error);
3409		if (timeout.tv_nsec >= 1000000000 ||
3410		    timeout.tv_nsec < 0)
3411			return (EINVAL);
3412		ts = &timeout;
3413	}
3414	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3415}
3416
3417static int
3418__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3419{
3420	struct timespec timeout;
3421	int error;
3422
3423	/* Allow a null timespec (wait forever). */
3424	if (uap->uaddr2 == NULL) {
3425		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3426	} else {
3427		error = copyin_timeout32(uap->uaddr2, &timeout);
3429		if (error != 0)
3430			return (error);
3431		if (timeout.tv_nsec >= 1000000000 ||
3432		    timeout.tv_nsec < 0) {
3433			return (EINVAL);
3434		}
3435		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3436	}
3437	return (error);
3438}
3439
3440static int
3441__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3442{
3443	struct timespec timeout;
3444	int error;
3445
3446	/* Allow a null timespec (wait forever). */
3447	if (uap->uaddr2 == NULL) {
3448		error = do_rw_wrlock(td, uap->obj, 0);
3449	} else {
3450		error = copyin_timeout32(uap->uaddr2, &timeout);
3451		if (error != 0)
3452			return (error);
3453		if (timeout.tv_nsec >= 1000000000 ||
3454		    timeout.tv_nsec < 0) {
3455			return (EINVAL);
3456		}
3457
3458		error = do_rw_wrlock2(td, uap->obj, &timeout);
3459	}
3460	return (error);
3461}
3462
3463static int
3464__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3465{
3466	struct timespec *ts, timeout;
3467	int error;
3468
3469	if (uap->uaddr2 == NULL)
3470		ts = NULL;
3471	else {
3472		error = copyin_timeout32(uap->uaddr2, &timeout);
3473		if (error != 0)
3474			return (error);
3475		if (timeout.tv_nsec >= 1000000000 ||
3476		    timeout.tv_nsec < 0)
3477			return (EINVAL);
3478		ts = &timeout;
3479	}
3480	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3481}
3482
3483static int
3484__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3485{
3486	struct timespec *ts, timeout;
3487	int error;
3488
3489	/* Allow a null timespec (wait forever). */
3490	if (uap->uaddr2 == NULL)
3491		ts = NULL;
3492	else {
3493		error = copyin_timeout32(uap->uaddr2, &timeout);
3494		if (error != 0)
3495			return (error);
3496		if (timeout.tv_nsec >= 1000000000 ||
3497		    timeout.tv_nsec < 0)
3498			return (EINVAL);
3499		ts = &timeout;
3500	}
3501	return (do_sem_wait(td, uap->obj, ts));
3502}
3503
3504static _umtx_op_func op_table_compat32[] = {
3505	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3506	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3507	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3508	__umtx_op_wake,			/* UMTX_OP_WAKE */
3509	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3510	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3511	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3512	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3513	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3514	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3515	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3516	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3517	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3518	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3519	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3520	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3521	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3522	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3523	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3524	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3525	__umtx_op_sem_wake		/* UMTX_OP_SEM_WAKE */
3526};
3527
3528int
3529freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3530{
3531	if ((unsigned)uap->op < UMTX_OP_MAX)
3532		return (*op_table_compat32[uap->op])(td,
3533			(struct _umtx_op_args *)uap);
3534	return (EINVAL);
3535}
3536#endif
3537
3538void
3539umtx_thread_init(struct thread *td)
3540{
3541	td->td_umtxq = umtxq_alloc();
3542	td->td_umtxq->uq_thread = td;
3543}
3544
3545void
3546umtx_thread_fini(struct thread *td)
3547{
3548	umtxq_free(td->td_umtxq);
3549}
3550
3551/*
3552 * Called when a new thread is created, e.g. by fork().
3553 */
3554void
3555umtx_thread_alloc(struct thread *td)
3556{
3557	struct umtx_q *uq;
3558
3559	uq = td->td_umtxq;
3560	uq->uq_inherited_pri = PRI_MAX;
3561
3562	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3563	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3564	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3565	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3566}
3567
3568/*
3569 * exec() hook.
3570 */
3571static void
3572umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3573	struct image_params *imgp __unused)
3574{
3575	umtx_thread_cleanup(curthread);
3576}
3577
3578/*
3579 * thread_exit() hook.
3580 */
3581void
3582umtx_thread_exit(struct thread *td)
3583{
3584	umtx_thread_cleanup(td);
3585}
3586
3587/*
3588 * Clean up umtx data.
3589 */
3590static void
3591umtx_thread_cleanup(struct thread *td)
3592{
3593	struct umtx_q *uq;
3594	struct umtx_pi *pi;
3595
3596	if ((uq = td->td_umtxq) == NULL)
3597		return;
3598
3599	mtx_lock_spin(&umtx_lock);
3600	uq->uq_inherited_pri = PRI_MAX;
3601	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3602		pi->pi_owner = NULL;
3603		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3604	}
3605	thread_lock(td);
3606	td->td_flags &= ~TDF_UBORROWING;
3607	thread_unlock(td);
3608	mtx_unlock_spin(&umtx_lock);
3609}
3610