kern_umtx.c revision 216791
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 216791 2010-12-29 09:26:46Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>

#include <machine/cpu.h>

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

enum {
	TYPE_SIMPLE_WAIT,
	TYPE_CV,
	TYPE_SEM,
	TYPE_SIMPLE_LOCK,
	TYPE_NORMAL_UMUTEX,
	TYPE_PI_UMUTEX,
	TYPE_PP_UMUTEX,
	TYPE_RWLOCK
};

#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};
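
/*
 * A key identifies a userland object either by (vm_object, offset) when
 * the object is shared across processes, or by (vmspace, address) when
 * it is private to one process.  For example (illustrative only), two
 * processes mapping the same file both resolve a process-shared lock at
 * a given file offset to the same (object, offset) pair and thus queue
 * on the same chain, while a per-process lock at the same virtual
 * address in two processes yields two distinct keys.
 */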

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link umtx held by thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object user. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The waiting thread. */
	struct thread		*uq_thread;

	/*
	 * Blocked on PI mutex.  Readers may hold either the chain lock
	 * or umtx_lock; writers must hold both the chain lock and
	 * umtx_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes we own on which other threads are contending */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;

	/* Spare queue ready to be reused */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are on */
	struct umtxq_queue	*uq_cur_queue;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Per-key wait-queue */
struct umtxq_queue {
	struct umtxq_head	head;
	struct umtx_key		key;
	LIST_ENTRY(umtxq_queue)	link;
	int			length;
};

LIST_HEAD(umtxq_list, umtxq_queue);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))

/*
 * Don't propagate time-sharing priority; there is a security reason:
 * a user could simply create a PI-mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * would be boosted, which would boost A's priority via priority
 * propagation too, and A's priority would then never be lowered even
 * if it were using 100% CPU, which is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		512
#define	UMTX_SHIFTS		(__WORD_BIT - 9)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

#define BUSY_SPINS		200

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
		}
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_spare_queue->head);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	MPASS(uq->uq_spare_queue != NULL);
	free(uq->uq_spare_queue, M_UMTX);
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
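
/*
 * The hash above is multiplicative (Fibonacci-style) hashing: the sum
 * of the two key words is multiplied by a constant derived from the
 * golden ratio, and the shift selects the top nine bits of the 32-bit
 * product, which fall in 0..511; the modulo by UMTX_CHAINS (512) is
 * then just a safety net.  A sketch of the same computation in
 * isolation (illustrative only):
 *
 *	unsigned n = (uintptr_t)obj + offset;
 *	hash = ((n * 2654404609U) >> (__WORD_BIT - 9)) % 512;
 *
 * The multiply scrambles the low-entropy bits of typical addresses
 * into the high bits that the shift selects.
 */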

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_SEM)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}
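
/*
 * Two chain arrays keep the purely sleepable object types
 * (TYPE_SIMPLE_WAIT, TYPE_CV and TYPE_SEM, i.e. types <= TYPE_SEM)
 * apart from the mutex-like types, so that a wait/signal pair and a
 * mutex that happen to hash to the same bucket do not contend on the
 * same chain lock.
 */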

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set chain to busy state when the following operation
 * may block (a kernel mutex cannot be used).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}
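
/*
 * The busy bit serializes a whole lock/unlock protocol on a chain
 * while still allowing the chain mutex itself to be dropped around
 * operations that may fault or sleep (e.g. casuword32() on a userland
 * address).  The callers below all follow the same shape; a minimal
 * sketch (illustrative only):
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);		(may sleep for the busy bit)
 *	umtxq_unlock(&key);
 *	... access userland memory, which may fault ...
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);		(wakes one busy waiter)
 *	umtxq_unlock(&key);
 */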

static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
		if (umtx_key_match(&uh->key, key))
			return (uh);
	}

	return (NULL);
}

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
	uh = umtxq_queue_lookup(&uq->uq_key, q);
	if (uh != NULL) {
		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
	} else {
		uh = uq->uq_spare_queue;
		uh->key = uq->uq_key;
		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
	}
	uq->uq_spare_queue = NULL;

	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
	uh->length++;
	uq->uq_flags |= UQF_UMTXQ;
	uq->uq_cur_queue = uh;
	return;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		uh = uq->uq_cur_queue;
		TAILQ_REMOVE(&uh->head, uq, uq_link);
		uh->length--;
		uq->uq_flags &= ~UQF_UMTXQ;
		if (TAILQ_EMPTY(&uh->head)) {
			KASSERT(uh->length == 0,
			    ("inconsistent umtxq_queue length"));
			LIST_REMOVE(uh, link);
		} else {
			uh = LIST_FIRST(&uc->uc_spare_queue);
			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
			LIST_REMOVE(uh, link);
		}
		uq->uq_spare_queue = uh;
		uq->uq_cur_queue = NULL;
	}
}
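
/*
 * Queue recycling invariant: every umtx_q owns exactly one spare
 * umtxq_queue.  On insert, the spare is either donated to the chain's
 * spare list (when a per-key queue already exists) or becomes the
 * per-key queue itself; on remove, the thread takes back either the
 * now-empty per-key queue or any entry from the spare list.  This
 * keeps the counts balanced without calling malloc/free on the sleep
 * path.
 */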

/*
 * Check if there are multiple waiters
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL)
		return (uh->length);
	return (0);
}

/*
 * Check if there are multiple PI waiters and return the first waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL) {
		*first = TAILQ_FIRST(&uh->head);
		return (uh->length);
	}
	return (0);
}

/*
 * Wake up threads waiting on a userland object.
 */

static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;
	struct umtx_q *uq;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, q);
	if (uh != NULL) {
		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				return (ret);
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state; before sleeping, check if the
 * thread was already removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical key.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}
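
/*
 * Every successful umtx_key_get() must be paired with a
 * umtx_key_release(), since a shared key holds a reference on the
 * backing vm_object.  The usual shape in the callers below
 * (illustrative only):
 *
 *	if ((error = umtx_key_get(addr, type, share, &key)) != 0)
 *		return (error);
 *	... queue operations under umtxq_lock(&key) ...
 *	umtx_key_release(&key);
 */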

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal on the previous iteration, we
		 * have already retried; exit immediately now.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep; otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
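
/*
 * The function above is only the contended slow path.  Userland is
 * expected to try the lock with a single atomic compare-and-swap first
 * and enter the kernel only on failure; a sketch of such a fast path
 * (illustrative only, not the actual libthr code):
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id))
 *		return (0);		(acquired with no syscall)
 *	return (_umtx_lock(umtx));	(fall back to the kernel)
 */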

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_FREEBSD32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal on the previous iteration, we
		 * have already retried; exit immediately now.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep; otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare value; sleep on the address if the value
 * has not changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32, int is_private)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = (unsigned int)fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ)) {
				error = 0;
				break;
			}
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
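
/*
 * do_wait() and kern_umtx_wake() form a futex-style pair: a waiter
 * sleeps only if the word still holds the value it expected, and a
 * waker wakes up to n_wake sleepers on the same address.  A typical
 * userland pattern (illustrative only):
 *
 *	while (atomic_load_acq_int(&flag) == 0)
 *		_umtx_op(&flag, UMTX_OP_WAIT_UINT, 0, NULL, NULL);
 *	...
 *	atomic_store_rel_int(&flag, 1);
 *	_umtx_op(&flag, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
 *
 * Inserting the waiter on the queue before re-reading the word closes
 * the lost-wakeup window: a wake issued after the insert finds the
 * waiter on the queue.
 */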

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int mode)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If no one owns it but it is contested, try to acquire it. */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/* If this failed the lock has changed, restart. */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal on the previous iteration, we
		 * have already retried; exit immediately now.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep; otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
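
/*
 * The mode argument selects between three behaviours: 0 blocks until
 * the lock is acquired, _UMUTEX_TRY returns EBUSY instead of sleeping,
 * and _UMUTEX_WAIT never acquires the lock at all; it only sleeps
 * until the mutex looks lockable.  The latter is how UMTX_OP_MUTEX_WAIT
 * implements the sleeping half of an adaptive userland mutex, paired
 * with do_wake_umutex() below.
 */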

/*
 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Check if the mutex is available and wake up a waiter;
 * used only for simple (PTHREAD_PRIO_NONE) mutexes.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	flags = fuword32(&m->m_flags);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1)
		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);

	umtxq_lock(&key);
	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	if (pi == NULL)
		return (NULL);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on a pi_state after its priority has been
 * changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread's or higher than the next thread's.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL || td == curthread)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		thread_lock(td);
		if (td->td_lend_user_pri > pri)
			sched_lend_user_prio(td, pri);
		else {
			thread_unlock(td);
			break;
		}
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		if (pi == NULL)
			break;
		/* Resort td on the list if needed. */
		umtx_pi_adjust_thread(pi, td);
	}
}
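
/*
 * Example of the walk above (illustrative only): if T1 blocks on M1
 * owned by T2 while T2 is itself blocked on M2 owned by T3, the loop
 * lends T1's priority first to T2, then follows T2's uq_pi_blocked to
 * M2 and lends the same priority to T3, re-sorting each blocked list
 * along the way.  The walk stops at the first owner whose lent
 * priority is already at least as strong.
 */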

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_repropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		sched_lend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		if ((pi = uq_owner->uq_pi_blocked) != NULL)
			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
	}
}

/*
 * Insert a PI mutex into owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * Userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

/*
 * Adjust a thread's position in the wait queue of the PI mutex it is
 * blocked on; this may trigger a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL) {
		umtx_pi_adjust_thread(pi, td);
		umtx_repropagate_priority(pi);
	}
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		mtx_unlock_spin(&umtx_lock);
		/* XXX Only look up thread in current process. */
		td1 = tdfind(owner, curproc->p_pid);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			PROC_UNLOCK(td1->td_proc);
		}
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_remove(uq);
		}
	}
	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_repropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference to a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count of a PI mutex; if the count
 * drops to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		umtx_pi_free(pi);
	}
}

/*
 * Find a PI mutex in hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal on the previous iteration, we
		 * have already retried; exit immediately now.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep; otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock_spin(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		if (pi->pi_owner != curthread) {
			mtx_unlock_spin(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* Userland messed up the mutex. */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		/* Get the highest-priority thread which is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_lend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
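
/*
 * Ownership hand-off above: the current owner removes itself from the
 * umtx_pi, recomputes its own lent priority from the PI mutexes it
 * still owns, and signals the highest-priority waiter that is still
 * sleeping.  The userland word is rewritten only afterwards, while the
 * chain is still busy, so the wakeup and the word update appear
 * atomic to other threads entering the kernel for this key.
 */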

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal on the previous iteration, we
		 * have already retried; exit immediately now.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock the mutex.  This is necessary because thread priority
	 * has to be adjusted for such mutexes.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}
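
/*
 * Worked example of the ceiling arithmetic above: POSIX ceilings grow
 * upwards (RTP_PRIO_MAX is the strongest), while kernel priority
 * values grow downwards (smaller is stronger), so a ceiling c maps to
 * PRI_MIN_REALTIME + (RTP_PRIO_MAX - c).  With RTP_PRIO_MAX == 31, a
 * ceiling of 31 yields PRI_MIN_REALTIME itself, and a stored rceiling
 * of -1 means "no ceiling", i.e. new_inherited_pri = PRI_MAX.
 */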
2157
2158static int
2159do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2160	uint32_t *old_ceiling)
2161{
2162	struct umtx_q *uq;
2163	uint32_t save_ceiling;
2164	uint32_t owner, id;
2165	uint32_t flags;
2166	int error;
2167
2168	flags = fuword32(&m->m_flags);
2169	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2170		return (EINVAL);
2171	if (ceiling > RTP_PRIO_MAX)
2172		return (EINVAL);
2173	id = td->td_tid;
2174	uq = td->td_umtxq;
2175	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2176	   &uq->uq_key)) != 0)
2177		return (error);
2178	for (;;) {
2179		umtxq_lock(&uq->uq_key);
2180		umtxq_busy(&uq->uq_key);
2181		umtxq_unlock(&uq->uq_key);
2182
2183		save_ceiling = fuword32(&m->m_ceilings[0]);
2184
2185		owner = casuword32(&m->m_owner,
2186		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2187
2188		if (owner == UMUTEX_CONTESTED) {
2189			suword32(&m->m_ceilings[0], ceiling);
2190			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2191				UMUTEX_CONTESTED);
2192			error = 0;
2193			break;
2194		}
2195
2196		/* The address was invalid. */
2197		if (owner == -1) {
2198			error = EFAULT;
2199			break;
2200		}
2201
2202		if ((owner & ~UMUTEX_CONTESTED) == id) {
2203			suword32(&m->m_ceilings[0], ceiling);
2204			error = 0;
2205			break;
2206		}
2207
2208		/*
2209		 * If we caught a signal, we have already retried, so
2210		 * exit immediately now.
2211		 */
2212		if (error != 0)
2213			break;
2214
2215		/*
2216		 * If we set the contested bit, sleep.  Otherwise the lock
2217		 * changed and we need to retry, or we lost a race to the
2218		 * thread unlocking the umtx.
2219		 */
2220		umtxq_lock(&uq->uq_key);
2221		umtxq_insert(uq);
2222		umtxq_unbusy(&uq->uq_key);
2223		error = umtxq_sleep(uq, "umtxpp", 0);
2224		umtxq_remove(uq);
2225		umtxq_unlock(&uq->uq_key);
2226	}
2227	umtxq_lock(&uq->uq_key);
2228	if (error == 0)
2229		umtxq_signal(&uq->uq_key, INT_MAX);
2230	umtxq_unbusy(&uq->uq_key);
2231	umtxq_unlock(&uq->uq_key);
2232	umtx_key_release(&uq->uq_key);
2233	if (error == 0 && old_ceiling != NULL)
2234		suword32(old_ceiling, save_ceiling);
2235	return (error);
2236}
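
/*
 * Illustrative userland usage (a sketch; the variables are assumed,
 * not part of this file): set the protocol ceiling of a
 * priority-protected mutex to 10 and fetch the previous value.
 *
 *	uint32_t old_ceiling;
 *	_umtx_op(&m, UMTX_OP_SET_CEILING, 10, &old_ceiling, NULL);
 *
 * The kernel either briefly acquires the mutex or requires the caller
 * to own it before storing the new ceiling, so lockers never observe
 * a torn update.
 */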
2237
2238static int
2239_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2240	int mode)
2241{
2242	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2243	case 0:
2244		return (_do_lock_normal(td, m, flags, timo, mode));
2245	case UMUTEX_PRIO_INHERIT:
2246		return (_do_lock_pi(td, m, flags, timo, mode));
2247	case UMUTEX_PRIO_PROTECT:
2248		return (_do_lock_pp(td, m, flags, timo, mode));
2249	}
2250	return (EINVAL);
2251}
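
/*
 * Note that the two protocol flags are mutually exclusive: a umutex
 * whose m_flags carries both UMUTEX_PRIO_INHERIT and
 * UMUTEX_PRIO_PROTECT matches none of the cases above and is
 * rejected with EINVAL.
 */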
2252
2253/*
2254 * Lock a userland POSIX mutex.
2255 */
2256static int
2257do_lock_umutex(struct thread *td, struct umutex *m,
2258	struct timespec *timeout, int mode)
2259{
2260	struct timespec ts, ts2, ts3;
2261	struct timeval tv;
2262	uint32_t flags;
2263	int error;
2264
2265	flags = fuword32(&m->m_flags);
2266	if (flags == -1)
2267		return (EFAULT);
2268
2269	if (timeout == NULL) {
2270		error = _do_lock_umutex(td, m, flags, 0, mode);
2271		/* Mutex locking is restarted if it is interrupted. */
2272		if (error == EINTR && mode != _UMUTEX_WAIT)
2273			error = ERESTART;
2274	} else {
2275		getnanouptime(&ts);
2276		timespecadd(&ts, timeout);
2277		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2278		for (;;) {
2279			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2280			if (error != ETIMEDOUT)
2281				break;
2282			getnanouptime(&ts2);
2283			if (timespeccmp(&ts2, &ts, >=)) {
2284				error = ETIMEDOUT;
2285				break;
2286			}
2287			ts3 = ts;
2288			timespecsub(&ts3, &ts2);
2289			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2290		}
2291		/* Timed-locking is not restarted. */
2292		if (error == ERESTART)
2293			error = EINTR;
2294	}
2295	return (error);
2296}
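
/*
 * Minimal userland sketch of a timed lock (illustrative; the mutex
 * and the timeout value are assumed):
 *
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	error = _umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, &ts);
 *
 * The timeout is relative; as noted above, a timed lock that is
 * interrupted returns EINTR instead of being transparently restarted,
 * since a restart would rearm the full relative timeout.
 */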
2297
2298/*
2299 * Unlock a userland POSIX mutex.
2300 */
2301static int
2302do_unlock_umutex(struct thread *td, struct umutex *m)
2303{
2304	uint32_t flags;
2305
2306	flags = fuword32(&m->m_flags);
2307	if (flags == -1)
2308		return (EFAULT);
2309
2310	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2311	case 0:
2312		return (do_unlock_normal(td, m, flags));
2313	case UMUTEX_PRIO_INHERIT:
2314		return (do_unlock_pi(td, m, flags));
2315	case UMUTEX_PRIO_PROTECT:
2316		return (do_unlock_pp(td, m, flags));
2317	}
2318
2319	return (EINVAL);
2320}
2321
2322static int
2323do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2324	struct timespec *timeout, u_long wflags)
2325{
2326	struct umtx_q *uq;
2327	struct timeval tv;
2328	struct timespec cts, ets, tts;
2329	uint32_t flags;
2330	uint32_t clockid;
2331	int error;
2332
2333	uq = td->td_umtxq;
2334	flags = fuword32(&cv->c_flags);
2335	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2336	if (error != 0)
2337		return (error);
2338
2339	if ((wflags & CVWAIT_CLOCKID) != 0) {
2340		clockid = fuword32(&cv->c_clockid);
2341		if (clockid < CLOCK_REALTIME ||
2342		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2343			/* Only the predefined clock ids are usable. */
			umtx_key_release(&uq->uq_key);
2344			return (EINVAL);
2345		}
2346	} else {
2347		clockid = CLOCK_REALTIME;
2348	}
2349
2350	umtxq_lock(&uq->uq_key);
2351	umtxq_busy(&uq->uq_key);
2352	umtxq_insert(uq);
2353	umtxq_unlock(&uq->uq_key);
2354
2355	/*
2356	 * Set c_has_waiters to 1 before releasing the user mutex; also
2357	 * avoid dirtying the cache line when it is already set.
2358	 */
2359	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2360		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2361
2362	umtxq_lock(&uq->uq_key);
2363	umtxq_unbusy(&uq->uq_key);
2364	umtxq_unlock(&uq->uq_key);
2365
2366	error = do_unlock_umutex(td, m);
2367
2368	umtxq_lock(&uq->uq_key);
2369	if (error == 0) {
2370		if (timeout == NULL) {
2371			error = umtxq_sleep(uq, "ucond", 0);
2372		} else {
2373			if ((wflags & CVWAIT_ABSTIME) == 0) {
2374				kern_clock_gettime(td, clockid, &ets);
2375				timespecadd(&ets, timeout);
2376				tts = *timeout;
2377			} else { /* absolute time */
2378				ets = *timeout;
2379				tts = *timeout;
2380				kern_clock_gettime(td, clockid, &cts);
2381				timespecsub(&tts, &cts);
2382			}
2383			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2384			for (;;) {
2385				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2386				if (error != ETIMEDOUT)
2387					break;
2388				kern_clock_gettime(td, clockid, &cts);
2389				if (timespeccmp(&cts, &ets, >=)) {
2390					error = ETIMEDOUT;
2391					break;
2392				}
2393				tts = ets;
2394				timespecsub(&tts, &cts);
2395				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2396			}
2397		}
2398	}
2399
2400	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2401		error = 0;
2402	else {
2403		/*
2404		 * We must have timed out, been interrupted by a signal,
2405		 * or seen a spurious wakeup; clear the c_has_waiters
2406		 * flag when necessary.
2407		 */
2408		umtxq_busy(&uq->uq_key);
2409		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2410			int oldlen = uq->uq_cur_queue->length;
2411			umtxq_remove(uq);
2412			if (oldlen == 1) {
2413				umtxq_unlock(&uq->uq_key);
2414				suword32(
2415				    __DEVOLATILE(uint32_t *,
2416					 &cv->c_has_waiters), 0);
2417				umtxq_lock(&uq->uq_key);
2418			}
2419		}
2420		umtxq_unbusy(&uq->uq_key);
2421		if (error == ERESTART)
2422			error = EINTR;
2423	}
2424
2425	umtxq_unlock(&uq->uq_key);
2426	umtx_key_release(&uq->uq_key);
2427	return (error);
2428}
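
/*
 * The wait above follows the usual condition-variable protocol: the
 * waiter is queued and c_has_waiters is set before the user mutex is
 * unlocked, so a wakeup arriving between the unlock and the sleep
 * cannot be lost.  A userland sketch (names assumed, error handling
 * omitted):
 *
 *	lock(&m);
 *	while (!predicate) {
 *		_umtx_op(&cv, UMTX_OP_CV_WAIT, 0, &m, NULL);
 *		lock(&m);	// the kernel does not re-lock it
 *	}
 *	unlock(&m);
 */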
2429
2430/*
2431 * Signal a userland condition variable.
2432 */
2433static int
2434do_cv_signal(struct thread *td, struct ucond *cv)
2435{
2436	struct umtx_key key;
2437	int error, cnt, nwake;
2438	uint32_t flags;
2439
2440	flags = fuword32(&cv->c_flags);
2441	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2442		return (error);
2443	umtxq_lock(&key);
2444	umtxq_busy(&key);
2445	cnt = umtxq_count(&key);
2446	nwake = umtxq_signal(&key, 1);
2447	if (cnt <= nwake) {
2448		umtxq_unlock(&key);
2449		error = suword32(
2450		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2451		umtxq_lock(&key);
2452	}
2453	umtxq_unbusy(&key);
2454	umtxq_unlock(&key);
2455	umtx_key_release(&key);
2456	return (error);
2457}
2458
2459static int
2460do_cv_broadcast(struct thread *td, struct ucond *cv)
2461{
2462	struct umtx_key key;
2463	int error;
2464	uint32_t flags;
2465
2466	flags = fuword32(&cv->c_flags);
2467	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2468		return (error);
2469
2470	umtxq_lock(&key);
2471	umtxq_busy(&key);
2472	umtxq_signal(&key, INT_MAX);
2473	umtxq_unlock(&key);
2474
2475	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2476
2477	umtxq_lock(&key);
2478	umtxq_unbusy(&key);
2479	umtxq_unlock(&key);
2480
2481	umtx_key_release(&key);
2482	return (error);
2483}
2484
2485static int
2486do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2487{
2488	struct umtx_q *uq;
2489	uint32_t flags, wrflags;
2490	int32_t state, oldstate;
2491	int32_t blocked_readers;
2492	int error;
2493
2494	uq = td->td_umtxq;
2495	flags = fuword32(&rwlock->rw_flags);
2496	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2497	if (error != 0)
2498		return (error);
2499
2500	wrflags = URWLOCK_WRITE_OWNER;
2501	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2502		wrflags |= URWLOCK_WRITE_WAITERS;
2503
2504	for (;;) {
2505		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2506		/* try to lock it */
2507		while (!(state & wrflags)) {
2508			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2509				umtx_key_release(&uq->uq_key);
2510				return (EAGAIN);
2511			}
2512			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2513			if (oldstate == state) {
2514				umtx_key_release(&uq->uq_key);
2515				return (0);
2516			}
2517			state = oldstate;
2518		}
2519
2520		if (error)
2521			break;
2522
2523		/* grab monitor lock */
2524		umtxq_lock(&uq->uq_key);
2525		umtxq_busy(&uq->uq_key);
2526		umtxq_unlock(&uq->uq_key);
2527
2528		/*
2529		 * re-read the state, in case it changed between the try-lock above
2530		 * and the check below
2531		 */
2532		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2533
2534		/* set read contention bit */
2535		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2536			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2537			if (oldstate == state)
2538				goto sleep;
2539			state = oldstate;
2540		}
2541
2542		/* The state changed while we set the flags; restart. */
2543		if (!(state & wrflags)) {
2544			umtxq_lock(&uq->uq_key);
2545			umtxq_unbusy(&uq->uq_key);
2546			umtxq_unlock(&uq->uq_key);
2547			continue;
2548		}
2549
2550sleep:
2551		/* Contention bit set; bump the read-waiter count before sleeping. */
2552		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2553		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2554
2555		while (state & wrflags) {
2556			umtxq_lock(&uq->uq_key);
2557			umtxq_insert(uq);
2558			umtxq_unbusy(&uq->uq_key);
2559
2560			error = umtxq_sleep(uq, "urdlck", timo);
2561
2562			umtxq_busy(&uq->uq_key);
2563			umtxq_remove(uq);
2564			umtxq_unlock(&uq->uq_key);
2565			if (error)
2566				break;
2567			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2568		}
2569
2570		/* Decrease the read-waiter count; maybe clear the contention bit. */
2571		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2572		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2573		if (blocked_readers == 1) {
2574			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2575			for (;;) {
2576				oldstate = casuword32(&rwlock->rw_state, state,
2577					 state & ~URWLOCK_READ_WAITERS);
2578				if (oldstate == state)
2579					break;
2580				state = oldstate;
2581			}
2582		}
2583
2584		umtxq_lock(&uq->uq_key);
2585		umtxq_unbusy(&uq->uq_key);
2586		umtxq_unlock(&uq->uq_key);
2587	}
2588	umtx_key_release(&uq->uq_key);
2589	return (error);
2590}
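
/*
 * rw_state encodes the whole lock in one word: the low bits hold the
 * reader count (extracted by URWLOCK_READER_COUNT), and
 * URWLOCK_WRITE_OWNER, URWLOCK_WRITE_WAITERS and URWLOCK_READ_WAITERS
 * are flag bits above it.  That is why the read-lock fast path above
 * is a single CAS, e.g. (illustrative values):
 *
 *	state = 2;				// two readers, no flags
 *	casuword32(&rw->rw_state, state, state + 1);
 *						// three readers on success
 */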
2591
2592static int
2593do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2594{
2595	struct timespec ts, ts2, ts3;
2596	struct timeval tv;
2597	int error;
2598
2599	getnanouptime(&ts);
2600	timespecadd(&ts, timeout);
2601	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2602	for (;;) {
2603		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2604		if (error != ETIMEDOUT)
2605			break;
2606		getnanouptime(&ts2);
2607		if (timespeccmp(&ts2, &ts, >=)) {
2608			error = ETIMEDOUT;
2609			break;
2610		}
2611		ts3 = ts;
2612		timespecsub(&ts3, &ts2);
2613		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2614	}
2615	if (error == ERESTART)
2616		error = EINTR;
2617	return (error);
2618}
2619
2620static int
2621do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2622{
2623	struct umtx_q *uq;
2624	uint32_t flags;
2625	int32_t state, oldstate;
2626	int32_t blocked_writers;
2627	int32_t blocked_readers;
2628	int error;
2629
2630	uq = td->td_umtxq;
2631	flags = fuword32(&rwlock->rw_flags);
2632	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2633	if (error != 0)
2634		return (error);
2635
2636	blocked_readers = 0;
2637	for (;;) {
2638		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2639		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2640			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2641			if (oldstate == state) {
2642				umtx_key_release(&uq->uq_key);
2643				return (0);
2644			}
2645			state = oldstate;
2646		}
2647
2648		if (error) {
2649			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2650			    blocked_readers != 0) {
2651				umtxq_lock(&uq->uq_key);
2652				umtxq_busy(&uq->uq_key);
2653				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2654				umtxq_unbusy(&uq->uq_key);
2655				umtxq_unlock(&uq->uq_key);
2656			}
2657
2658			break;
2659		}
2660
2661		/* grab monitor lock */
2662		umtxq_lock(&uq->uq_key);
2663		umtxq_busy(&uq->uq_key);
2664		umtxq_unlock(&uq->uq_key);
2665
2666		/*
2667		 * re-read the state, in case it changed between the try-lock above
2668		 * and the check below
2669		 */
2670		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2671
2672		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2673		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2674			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2675			if (oldstate == state)
2676				goto sleep;
2677			state = oldstate;
2678		}
2679
2680		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2681			umtxq_lock(&uq->uq_key);
2682			umtxq_unbusy(&uq->uq_key);
2683			umtxq_unlock(&uq->uq_key);
2684			continue;
2685		}
2686sleep:
2687		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2688		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2689
2690		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2691			umtxq_lock(&uq->uq_key);
2692			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2693			umtxq_unbusy(&uq->uq_key);
2694
2695			error = umtxq_sleep(uq, "uwrlck", timo);
2696
2697			umtxq_busy(&uq->uq_key);
2698			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2699			umtxq_unlock(&uq->uq_key);
2700			if (error)
2701				break;
2702			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2703		}
2704
2705		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2706		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2707		if (blocked_writers == 1) {
2708			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2709			for (;;) {
2710				oldstate = casuword32(&rwlock->rw_state, state,
2711					 state & ~URWLOCK_WRITE_WAITERS);
2712				if (oldstate == state)
2713					break;
2714				state = oldstate;
2715			}
2716			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2717		} else
2718			blocked_readers = 0;
2719
2720		umtxq_lock(&uq->uq_key);
2721		umtxq_unbusy(&uq->uq_key);
2722		umtxq_unlock(&uq->uq_key);
2723	}
2724
2725	umtx_key_release(&uq->uq_key);
2726	return (error);
2727}
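
/*
 * Note the error path above: a writer that gives up (timeout or
 * signal) and was the last blocked writer has already cleared
 * URWLOCK_WRITE_WAITERS, which may make the lock acquirable by
 * readers again, so it re-wakes every blocked reader rather than
 * leaving them asleep.
 */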
2728
2729static int
2730do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2731{
2732	struct timespec ts, ts2, ts3;
2733	struct timeval tv;
2734	int error;
2735
2736	getnanouptime(&ts);
2737	timespecadd(&ts, timeout);
2738	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2739	for (;;) {
2740		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2741		if (error != ETIMEDOUT)
2742			break;
2743		getnanouptime(&ts2);
2744		if (timespeccmp(&ts2, &ts, >=)) {
2745			error = ETIMEDOUT;
2746			break;
2747		}
2748		ts3 = ts;
2749		timespecsub(&ts3, &ts2);
2750		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2751	}
2752	if (error == ERESTART)
2753		error = EINTR;
2754	return (error);
2755}
2756
2757static int
2758do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2759{
2760	struct umtx_q *uq;
2761	uint32_t flags;
2762	int32_t state, oldstate;
2763	int error, q, count;
2764
2765	uq = td->td_umtxq;
2766	flags = fuword32(&rwlock->rw_flags);
2767	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2768	if (error != 0)
2769		return (error);
2770
2771	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2772	if (state & URWLOCK_WRITE_OWNER) {
2773		for (;;) {
2774			oldstate = casuword32(&rwlock->rw_state, state,
2775				state & ~URWLOCK_WRITE_OWNER);
2776			if (oldstate != state) {
2777				state = oldstate;
2778				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2779					error = EPERM;
2780					goto out;
2781				}
2782			} else
2783				break;
2784		}
2785	} else if (URWLOCK_READER_COUNT(state) != 0) {
2786		for (;;) {
2787			oldstate = casuword32(&rwlock->rw_state, state,
2788				state - 1);
2789			if (oldstate != state) {
2790				state = oldstate;
2791				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2792					error = EPERM;
2793					goto out;
2794				}
2795			}
2796			else
2797				break;
2798		}
2799	} else {
2800		error = EPERM;
2801		goto out;
2802	}
2803
2804	count = 0;
2805
2806	if (!(flags & URWLOCK_PREFER_READER)) {
2807		if (state & URWLOCK_WRITE_WAITERS) {
2808			count = 1;
2809			q = UMTX_EXCLUSIVE_QUEUE;
2810		} else if (state & URWLOCK_READ_WAITERS) {
2811			count = INT_MAX;
2812			q = UMTX_SHARED_QUEUE;
2813		}
2814	} else {
2815		if (state & URWLOCK_READ_WAITERS) {
2816			count = INT_MAX;
2817			q = UMTX_SHARED_QUEUE;
2818		} else if (state & URWLOCK_WRITE_WAITERS) {
2819			count = 1;
2820			q = UMTX_EXCLUSIVE_QUEUE;
2821		}
2822	}
2823
2824	if (count) {
2825		umtxq_lock(&uq->uq_key);
2826		umtxq_busy(&uq->uq_key);
2827		umtxq_signal_queue(&uq->uq_key, count, q);
2828		umtxq_unbusy(&uq->uq_key);
2829		umtxq_unlock(&uq->uq_key);
2830	}
2831out:
2832	umtx_key_release(&uq->uq_key);
2833	return (error);
2834}
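
/*
 * Wake-up policy of the unlock above: by default writers are
 * preferred, so one writer (UMTX_EXCLUSIVE_QUEUE, count 1) is woken
 * when any is waiting, and all readers (UMTX_SHARED_QUEUE, count
 * INT_MAX) are woken only otherwise; URWLOCK_PREFER_READER simply
 * reverses the order of the two checks.
 */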
2835
2836static int
2837do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
2838{
2839	struct umtx_q *uq;
2840	struct timeval tv;
2841	struct timespec cts, ets, tts;
2842	uint32_t flags, count;
2843	int error;
2844
2845	uq = td->td_umtxq;
2846	flags = fuword32(&sem->_flags);
2847	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2848	if (error != 0)
2849		return (error);
2850	umtxq_lock(&uq->uq_key);
2851	umtxq_busy(&uq->uq_key);
2852	umtxq_insert(uq);
2853	umtxq_unlock(&uq->uq_key);
2854
2855	if (fuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters)) == 0)
2856		casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2857
2858	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2859	if (count != 0) {
2860		umtxq_lock(&uq->uq_key);
2861		umtxq_unbusy(&uq->uq_key);
2862		umtxq_remove(uq);
2863		umtxq_unlock(&uq->uq_key);
2864		umtx_key_release(&uq->uq_key);
2865		return (0);
2866	}
2867
2868	umtxq_lock(&uq->uq_key);
2869	umtxq_unbusy(&uq->uq_key);
2870	umtxq_unlock(&uq->uq_key);
2871
2872	umtxq_lock(&uq->uq_key);
2873	if (timeout == NULL) {
2874		error = umtxq_sleep(uq, "usem", 0);
2875	} else {
2876		getnanouptime(&ets);
2877		timespecadd(&ets, timeout);
2878		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2879		for (;;) {
2880			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
2881			if (error != ETIMEDOUT)
2882				break;
2883			getnanouptime(&cts);
2884			if (timespeccmp(&cts, &ets, >=)) {
2885				error = ETIMEDOUT;
2886				break;
2887			}
2888			tts = ets;
2889			timespecsub(&tts, &cts);
2890			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2891		}
2892	}
2893
2894	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2895		error = 0;
2896	else {
2897		umtxq_remove(uq);
2898		if (error == ERESTART)
2899			error = EINTR;
2900	}
2901	umtxq_unlock(&uq->uq_key);
2902	umtx_key_release(&uq->uq_key);
2903	return (error);
2904}
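
/*
 * Userland reaches this slow path only after failing to decrement
 * _count atomically; the kernel re-checks _count with the queue
 * busied, so a wakeup from a racing do_sem_wake() cannot be lost.
 * A sketch of the caller's loop (illustrative; try_decrement is an
 * assumed CAS helper, not part of this file):
 *
 *	while (!try_decrement(&sem->_count))	// fails when _count == 0
 *		_umtx_op(sem, UMTX_OP_SEM_WAIT, 0, NULL, NULL);
 */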
2905
2906/*
2907 * Wake up waiters on a userland semaphore.
2908 */
2909static int
2910do_sem_wake(struct thread *td, struct _usem *sem)
2911{
2912	struct umtx_key key;
2913	int error, cnt, nwake;
2914	uint32_t flags;
2915
2916	flags = fuword32(&sem->_flags);
2917	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2918		return (error);
2919	umtxq_lock(&key);
2920	umtxq_busy(&key);
2921	cnt = umtxq_count(&key);
2922	nwake = umtxq_signal(&key, 1);
2923	if (cnt <= nwake) {
2924		umtxq_unlock(&key);
2925		error = suword32(
2926		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2927		umtxq_lock(&key);
2928	}
2929	umtxq_unbusy(&key);
2930	umtxq_unlock(&key);
2931	umtx_key_release(&key);
2932	return (error);
2933}
2934
2935int
2936_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2937    /* struct umtx *umtx */
2938{
2939	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2940}
2941
2942int
2943_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2944    /* struct umtx *umtx */
2945{
2946	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2947}
2948
2949static int
2950__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2951{
2952	struct timespec *ts, timeout;
2953	int error;
2954
2955	/* Allow a null timespec (wait forever). */
2956	if (uap->uaddr2 == NULL)
2957		ts = NULL;
2958	else {
2959		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2960		if (error != 0)
2961			return (error);
2962		if (timeout.tv_nsec >= 1000000000 ||
2963		    timeout.tv_nsec < 0) {
2964			return (EINVAL);
2965		}
2966		ts = &timeout;
2967	}
2968	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2969}
2970
2971static int
2972__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2973{
2974	return (do_unlock_umtx(td, uap->obj, uap->val));
2975}
2976
2977static int
2978__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2979{
2980	struct timespec *ts, timeout;
2981	int error;
2982
2983	if (uap->uaddr2 == NULL)
2984		ts = NULL;
2985	else {
2986		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2987		if (error != 0)
2988			return (error);
2989		if (timeout.tv_nsec >= 1000000000 ||
2990		    timeout.tv_nsec < 0)
2991			return (EINVAL);
2992		ts = &timeout;
2993	}
2994	return do_wait(td, uap->obj, uap->val, ts, 0, 0);
2995}
2996
2997static int
2998__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2999{
3000	struct timespec *ts, timeout;
3001	int error;
3002
3003	if (uap->uaddr2 == NULL)
3004		ts = NULL;
3005	else {
3006		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3007		if (error != 0)
3008			return (error);
3009		if (timeout.tv_nsec >= 1000000000 ||
3010		    timeout.tv_nsec < 0)
3011			return (EINVAL);
3012		ts = &timeout;
3013	}
3014	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3015}
3016
3017static int
3018__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3019{
3020	struct timespec *ts, timeout;
3021	int error;
3022
3023	if (uap->uaddr2 == NULL)
3024		ts = NULL;
3025	else {
3026		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3027		if (error != 0)
3028			return (error);
3029		if (timeout.tv_nsec >= 1000000000 ||
3030		    timeout.tv_nsec < 0)
3031			return (EINVAL);
3032		ts = &timeout;
3033	}
3034	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3035}
3036
3037static int
3038__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3039{
3040	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3041}
3042
3043#define BATCH_SIZE	128
3044static int
3045__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3046{
3047	int count = uap->val;
3048	void *uaddrs[BATCH_SIZE];
3049	char **upp = (char **)uap->obj;
3050	int tocopy;
3051	int error = 0;
3052	int i, pos = 0;
3053
3054	while (count > 0) {
3055		tocopy = count;
3056		if (tocopy > BATCH_SIZE)
3057			tocopy = BATCH_SIZE;
3058		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3059		if (error != 0)
3060			break;
3061		for (i = 0; i < tocopy; ++i)
3062			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3063		count -= tocopy;
3064		pos += tocopy;
3065	}
3066	return (error);
3067}
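
/*
 * This allows a single system call to wake many private words, e.g.
 * a condvar broadcast implementation that has requeued its waiters.
 * Illustrative call (addresses assumed):
 *
 *	void *addrs[2] = { &w1, &w2 };
 *	_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, 2, NULL, NULL);
 *
 * Addresses are copied in and consumed in BATCH_SIZE chunks to keep
 * the on-stack array bounded.
 */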
3068
3069static int
3070__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3071{
3072	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3073}
3074
3075static int
3076__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3077{
3078	struct timespec *ts, timeout;
3079	int error;
3080
3081	/* Allow a null timespec (wait forever). */
3082	if (uap->uaddr2 == NULL)
3083		ts = NULL;
3084	else {
3085		error = copyin(uap->uaddr2, &timeout,
3086		    sizeof(timeout));
3087		if (error != 0)
3088			return (error);
3089		if (timeout.tv_nsec >= 1000000000 ||
3090		    timeout.tv_nsec < 0) {
3091			return (EINVAL);
3092		}
3093		ts = &timeout;
3094	}
3095	return do_lock_umutex(td, uap->obj, ts, 0);
3096}
3097
3098static int
3099__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3100{
3101	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3102}
3103
3104static int
3105__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3106{
3107	struct timespec *ts, timeout;
3108	int error;
3109
3110	/* Allow a null timespec (wait forever). */
3111	if (uap->uaddr2 == NULL)
3112		ts = NULL;
3113	else {
3114		error = copyin(uap->uaddr2, &timeout,
3115		    sizeof(timeout));
3116		if (error != 0)
3117			return (error);
3118		if (timeout.tv_nsec >= 1000000000 ||
3119		    timeout.tv_nsec < 0) {
3120			return (EINVAL);
3121		}
3122		ts = &timeout;
3123	}
3124	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3125}
3126
3127static int
3128__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3129{
3130	return do_wake_umutex(td, uap->obj);
3131}
3132
3133static int
3134__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3135{
3136	return do_unlock_umutex(td, uap->obj);
3137}
3138
3139static int
3140__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3141{
3142	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3143}
3144
3145static int
3146__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3147{
3148	struct timespec *ts, timeout;
3149	int error;
3150
3151	/* Allow a null timespec (wait forever). */
3152	if (uap->uaddr2 == NULL)
3153		ts = NULL;
3154	else {
3155		error = copyin(uap->uaddr2, &timeout,
3156		    sizeof(timeout));
3157		if (error != 0)
3158			return (error);
3159		if (timeout.tv_nsec >= 1000000000 ||
3160		    timeout.tv_nsec < 0) {
3161			return (EINVAL);
3162		}
3163		ts = &timeout;
3164	}
3165	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3166}
3167
3168static int
3169__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3170{
3171	return do_cv_signal(td, uap->obj);
3172}
3173
3174static int
3175__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3176{
3177	return do_cv_broadcast(td, uap->obj);
3178}
3179
3180static int
3181__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3182{
3183	struct timespec timeout;
3184	int error;
3185
3186	/* Allow a null timespec (wait forever). */
3187	if (uap->uaddr2 == NULL) {
3188		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3189	} else {
3190		error = copyin(uap->uaddr2, &timeout,
3191		    sizeof(timeout));
3192		if (error != 0)
3193			return (error);
3194		if (timeout.tv_nsec >= 1000000000 ||
3195		    timeout.tv_nsec < 0) {
3196			return (EINVAL);
3197		}
3198		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3199	}
3200	return (error);
3201}
3202
3203static int
3204__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3205{
3206	struct timespec timeout;
3207	int error;
3208
3209	/* Allow a null timespec (wait forever). */
3210	if (uap->uaddr2 == NULL) {
3211		error = do_rw_wrlock(td, uap->obj, 0);
3212	} else {
3213		error = copyin(uap->uaddr2, &timeout,
3214		    sizeof(timeout));
3215		if (error != 0)
3216			return (error);
3217		if (timeout.tv_nsec >= 1000000000 ||
3218		    timeout.tv_nsec < 0) {
3219			return (EINVAL);
3220		}
3221
3222		error = do_rw_wrlock2(td, uap->obj, &timeout);
3223	}
3224	return (error);
3225}
3226
3227static int
3228__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3229{
3230	return do_rw_unlock(td, uap->obj);
3231}
3232
3233static int
3234__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3235{
3236	struct timespec *ts, timeout;
3237	int error;
3238
3239	/* Allow a null timespec (wait forever). */
3240	if (uap->uaddr2 == NULL)
3241		ts = NULL;
3242	else {
3243		error = copyin(uap->uaddr2, &timeout,
3244		    sizeof(timeout));
3245		if (error != 0)
3246			return (error);
3247		if (timeout.tv_nsec >= 1000000000 ||
3248		    timeout.tv_nsec < 0) {
3249			return (EINVAL);
3250		}
3251		ts = &timeout;
3252	}
3253	return (do_sem_wait(td, uap->obj, ts));
3254}
3255
3256static int
3257__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3258{
3259	return do_sem_wake(td, uap->obj);
3260}
3261
3262typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3263
3264static _umtx_op_func op_table[] = {
3265	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3266	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3267	__umtx_op_wait,			/* UMTX_OP_WAIT */
3268	__umtx_op_wake,			/* UMTX_OP_WAKE */
3269	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3270	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3271	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3272	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3273	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
3274	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3275	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3276	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3277	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3278	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3279	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3280	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3281	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3282	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3283	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3284	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3285	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3286	__umtx_op_nwake_private		/* UMTX_OP_NWAKE_PRIVATE */
3287};
3288
3289int
3290_umtx_op(struct thread *td, struct _umtx_op_args *uap)
3291{
3292	if ((unsigned)uap->op < UMTX_OP_MAX)
3293		return (*op_table[uap->op])(td, uap);
3294	return (EINVAL);
3295}
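
/*
 * Every operation above funnels through this one system call; its
 * userland prototype (declared in <sys/umtx.h>) is:
 *
 *	int _umtx_op(void *obj, int op, u_long val,
 *	    void *uaddr, void *uaddr2);
 *
 * The op value indexes op_table directly, which the bounds check
 * above relies on; e.g. _umtx_op(&word, UMTX_OP_WAKE, 1, NULL, NULL)
 * wakes at most one thread sleeping on that address.
 */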
3296
3297#ifdef COMPAT_FREEBSD32
3298int
3299freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3300    /* struct umtx *umtx */
3301{
3302	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3303}
3304
3305int
3306freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3307    /* struct umtx *umtx */
3308{
3309	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3310}
3311
3312struct timespec32 {
3313	uint32_t tv_sec;
3314	uint32_t tv_nsec;
3315};
3316
3317static inline int
3318copyin_timeout32(void *addr, struct timespec *tsp)
3319{
3320	struct timespec32 ts32;
3321	int error;
3322
3323	error = copyin(addr, &ts32, sizeof(struct timespec32));
3324	if (error == 0) {
3325		tsp->tv_sec = ts32.tv_sec;
3326		tsp->tv_nsec = ts32.tv_nsec;
3327	}
3328	return (error);
3329}
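
/*
 * This widens a timespec copied from a 32-bit process to the native
 * layout; the callers below validate tv_nsec only after the copy, so
 * no range check is needed here.
 */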
3330
3331static int
3332__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3333{
3334	struct timespec *ts, timeout;
3335	int error;
3336
3337	/* Allow a null timespec (wait forever). */
3338	if (uap->uaddr2 == NULL)
3339		ts = NULL;
3340	else {
3341		error = copyin_timeout32(uap->uaddr2, &timeout);
3342		if (error != 0)
3343			return (error);
3344		if (timeout.tv_nsec >= 1000000000 ||
3345		    timeout.tv_nsec < 0) {
3346			return (EINVAL);
3347		}
3348		ts = &timeout;
3349	}
3350	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3351}
3352
3353static int
3354__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3355{
3356	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3357}
3358
3359static int
3360__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3361{
3362	struct timespec *ts, timeout;
3363	int error;
3364
3365	if (uap->uaddr2 == NULL)
3366		ts = NULL;
3367	else {
3368		error = copyin_timeout32(uap->uaddr2, &timeout);
3369		if (error != 0)
3370			return (error);
3371		if (timeout.tv_nsec >= 1000000000 ||
3372		    timeout.tv_nsec < 0)
3373			return (EINVAL);
3374		ts = &timeout;
3375	}
3376	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3377}
3378
3379static int
3380__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3381{
3382	struct timespec *ts, timeout;
3383	int error;
3384
3385	/* Allow a null timespec (wait forever). */
3386	if (uap->uaddr2 == NULL)
3387		ts = NULL;
3388	else {
3389		error = copyin_timeout32(uap->uaddr2, &timeout);
3390		if (error != 0)
3391			return (error);
3392		if (timeout.tv_nsec >= 1000000000 ||
3393		    timeout.tv_nsec < 0)
3394			return (EINVAL);
3395		ts = &timeout;
3396	}
3397	return do_lock_umutex(td, uap->obj, ts, 0);
3398}
3399
3400static int
3401__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3402{
3403	struct timespec *ts, timeout;
3404	int error;
3405
3406	/* Allow a null timespec (wait forever). */
3407	if (uap->uaddr2 == NULL)
3408		ts = NULL;
3409	else {
3410		error = copyin_timeout32(uap->uaddr2, &timeout);
3411		if (error != 0)
3412			return (error);
3413		if (timeout.tv_nsec >= 1000000000 ||
3414		    timeout.tv_nsec < 0)
3415			return (EINVAL);
3416		ts = &timeout;
3417	}
3418	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3419}
3420
3421static int
3422__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3423{
3424	struct timespec *ts, timeout;
3425	int error;
3426
3427	/* Allow a null timespec (wait forever). */
3428	if (uap->uaddr2 == NULL)
3429		ts = NULL;
3430	else {
3431		error = copyin_timeout32(uap->uaddr2, &timeout);
3432		if (error != 0)
3433			return (error);
3434		if (timeout.tv_nsec >= 1000000000 ||
3435		    timeout.tv_nsec < 0)
3436			return (EINVAL);
3437		ts = &timeout;
3438	}
3439	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3440}
3441
3442static int
3443__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3444{
3445	struct timespec timeout;
3446	int error;
3447
3448	/* Allow a null timespec (wait forever). */
3449	if (uap->uaddr2 == NULL) {
3450		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3451	} else {
3452		error = copyin_timeout32(uap->uaddr2, &timeout);
3453		if (error != 0)
3454			return (error);
3455		if (timeout.tv_nsec >= 1000000000 ||
3456		    timeout.tv_nsec < 0) {
3457			return (EINVAL);
3458		}
3459		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3460	}
3461	return (error);
3462}
3463
3464static int
3465__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3466{
3467	struct timespec timeout;
3468	int error;
3469
3470	/* Allow a null timespec (wait forever). */
3471	if (uap->uaddr2 == NULL) {
3472		error = do_rw_wrlock(td, uap->obj, 0);
3473	} else {
3474		error = copyin_timeout32(uap->uaddr2, &timeout);
3475		if (error != 0)
3476			return (error);
3477		if (timeout.tv_nsec >= 1000000000 ||
3478		    timeout.tv_nsec < 0) {
3479			return (EINVAL);
3480		}
3481
3482		error = do_rw_wrlock2(td, uap->obj, &timeout);
3483	}
3484	return (error);
3485}
3486
3487static int
3488__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3489{
3490	struct timespec *ts, timeout;
3491	int error;
3492
3493	if (uap->uaddr2 == NULL)
3494		ts = NULL;
3495	else {
3496		error = copyin_timeout32(uap->uaddr2, &timeout);
3497		if (error != 0)
3498			return (error);
3499		if (timeout.tv_nsec >= 1000000000 ||
3500		    timeout.tv_nsec < 0)
3501			return (EINVAL);
3502		ts = &timeout;
3503	}
3504	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3505}
3506
3507static int
3508__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3509{
3510	struct timespec *ts, timeout;
3511	int error;
3512
3513	/* Allow a null timespec (wait forever). */
3514	if (uap->uaddr2 == NULL)
3515		ts = NULL;
3516	else {
3517		error = copyin_timeout32(uap->uaddr2, &timeout);
3518		if (error != 0)
3519			return (error);
3520		if (timeout.tv_nsec >= 1000000000 ||
3521		    timeout.tv_nsec < 0)
3522			return (EINVAL);
3523		ts = &timeout;
3524	}
3525	return (do_sem_wait(td, uap->obj, ts));
3526}
3527
3528static int
3529__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3530{
3531	int count = uap->val;
3532	uint32_t uaddrs[BATCH_SIZE];
3533	uint32_t *upp = (uint32_t *)uap->obj;	/* array of 32-bit uaddrs */
3534	int tocopy;
3535	int error = 0;
3536	int i, pos = 0;
3537
3538	while (count > 0) {
3539		tocopy = count;
3540		if (tocopy > BATCH_SIZE)
3541			tocopy = BATCH_SIZE;
3542		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3543		if (error != 0)
3544			break;
3545		for (i = 0; i < tocopy; ++i)
3546			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3547				INT_MAX, 1);
3548		count -= tocopy;
3549		pos += tocopy;
3550	}
3551	return (error);
3552}
3553
3554static _umtx_op_func op_table_compat32[] = {
3555	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3556	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3557	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3558	__umtx_op_wake,			/* UMTX_OP_WAKE */
3559	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3560	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3561	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3562	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3563	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
3564	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3565	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3566	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3567	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3568	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3569	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3570	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3571	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3572	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3573	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3574	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3575	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3576	__umtx_op_nwake_private32	/* UMTX_OP_NWAKE_PRIVATE */
3577};
3578
3579int
3580freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3581{
3582	if ((unsigned)uap->op < UMTX_OP_MAX)
3583		return (*op_table_compat32[uap->op])(td,
3584			(struct _umtx_op_args *)uap);
3585	return (EINVAL);
3586}
3587#endif
3588
3589void
3590umtx_thread_init(struct thread *td)
3591{
3592	td->td_umtxq = umtxq_alloc();
3593	td->td_umtxq->uq_thread = td;
3594}
3595
3596void
3597umtx_thread_fini(struct thread *td)
3598{
3599	umtxq_free(td->td_umtxq);
3600}
3601
3602/*
3603 * Called when a new thread is created, e.g. by fork().
3604 */
3605void
3606umtx_thread_alloc(struct thread *td)
3607{
3608	struct umtx_q *uq;
3609
3610	uq = td->td_umtxq;
3611	uq->uq_inherited_pri = PRI_MAX;
3612
3613	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3614	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3615	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3616	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3617}
3618
3619/*
3620 * exec() hook.
3621 */
3622static void
3623umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3624	struct image_params *imgp __unused)
3625{
3626	umtx_thread_cleanup(curthread);
3627}
3628
3629/*
3630 * thread_exit() hook.
3631 */
3632void
3633umtx_thread_exit(struct thread *td)
3634{
3635	umtx_thread_cleanup(td);
3636}
3637
3638/*
3639 * Clean up the thread's umtx data.
3640 */
3641static void
3642umtx_thread_cleanup(struct thread *td)
3643{
3644	struct umtx_q *uq;
3645	struct umtx_pi *pi;
3646
3647	if ((uq = td->td_umtxq) == NULL)
3648		return;
3649
3650	mtx_lock_spin(&umtx_lock);
3651	uq->uq_inherited_pri = PRI_MAX;
3652	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3653		pi->pi_owner = NULL;
3654		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3655	}
3656	mtx_unlock_spin(&umtx_lock);
3657	thread_lock(td);
3658	sched_lend_user_prio(td, PRI_MAX);
3659	thread_unlock(td);
3660}
3661