kern_umtx.c revision 161678
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 161678 2006-08-28 04:24:51Z davidxu $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>

#define TYPE_SIMPLE_LOCK	0
#define TYPE_SIMPLE_WAIT	1
#define TYPE_NORMAL_UMUTEX	2
#define TYPE_PI_UMUTEX		3
#define TYPE_PP_UMUTEX		4
#define TYPE_CV			5

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link umtx held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object user. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread waiting on this entry. */
	struct thread		*uq_thread;

	/*
	 * Blocked on PI mutex. Reads can use either the chain lock
	 * or sched_lock; writes must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes owned by this thread that other threads contend for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI mutexes hashed to this chain */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority. There is a security reason:
 * a user can simply create a PI mutex, let thread A lock it, and let
 * another thread B block on it. Because B is sleeping, its priority
 * will be boosted; this would boost A's priority via priority
 * propagation too, and A's priority would never be lowered even if it
 * were using 100% CPU, which is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_ksegrp->kg_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_ksegrp->kg_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_ksegrp->kg_user_pri)
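
/*
 * Note how UPRI() pairs with the comment above: any thread whose user
 * priority lies in the time-sharing range is reported as
 * PRI_MAX_TIMESHARE, the numerically largest (i.e. weakest) value in
 * that range.  All time-sharing threads therefore look equally
 * unimportant to the PI code, so blocking one of them on a PI mutex
 * can never lend a boosted priority to the owner; only real-time
 * priorities propagate.
 */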

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert(struct umtx_q *uq);
static void umtxq_remove(struct umtx_q *uq);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(void);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
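
/*
 * The hash above is multiplicative (Fibonacci) hashing: the key bits
 * are multiplied by a prime close to 2^32/phi so that nearby inputs
 * scatter, and the high-order bits of the product are taken via
 * UMTX_SHIFTS before reducing modulo UMTX_CHAINS.  For example, two
 * umtx addresses only a few words apart will usually land on different
 * chains, which keeps independent locks from contending on one chain
 * lock.
 */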

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}

/*
 * Set the chain to busy state when the following operation
 * may block (a kernel mutex cannot be held while sleeping).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	while (uc->uc_busy != 0) {
		uc->uc_waiters++;
		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
		uc->uc_waiters--;
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}
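
/*
 * The busy flag acts as a sleepable lock layered over the chain mutex.
 * The typical pattern, used throughout this file, is:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);	(may sleep until the chain is unbusied)
 *	umtxq_unlock(&key);
 *	... access userland memory, which may fault and sleep ...
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);
 *	umtxq_unlock(&key);
 *
 * This keeps wake-ups and queue scans atomic with respect to the
 * userland compare-and-swap, while never holding uc_lock across an
 * operation that can block.
 */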

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Insert a thread onto the umtx queue.
 */
static inline void
umtxq_insert(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

/*
 * Remove thread from the umtx queue.
 */
static inline void
umtxq_remove(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Check if there are multiple waiters.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

/*
 * Check if there are multiple PI waiters and return the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal(struct umtx_key *key, int n_wake)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove(uq);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into sleep state; before sleeping, check if the
 * thread was removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical key.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else if (share == PROCESS_SHARE || share == AUTO_SHARE) {
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
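
/*
 * The resulting key is what makes two views of one lock equal: for a
 * shared mapping the key is (object, offset), so two processes that
 * mmap the same object get matching keys for the same lock word; for
 * a private mapping it is (vmspace, address), which is unique to one
 * process.  umtx_key_match() compares exactly these pairs via the
 * "both" view of the union.
 */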

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock(struct thread *td, struct umtx *umtx, uintptr_t id, int timo)
{
	struct umtx_q *uq;
	intptr_t owner;
	intptr_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuptr((intptr_t *)&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuptr((intptr_t *)&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuptr((intptr_t *)&umtx->u_owner, owner,
		    owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock(struct thread *td, struct umtx *umtx, uintptr_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock(td, umtx, id, 0);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
	}
	/*
	 * This lets userland back off the critical region if needed.
	 */
	if (error == EINTR)
		error = ERESTART;
	return (error);
}
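
/*
 * The timeout loop above converts the caller's relative timeout into
 * an absolute deadline (ts) and then re-arms the sleep with whatever
 * remains each time _do_lock() returns ETIMEDOUT, because tvtohz()
 * takes a relative interval in ticks.  Only when the uptime clock
 * passes the deadline is ETIMEDOUT returned to the caller; the same
 * pattern is repeated in the other do_lock_*() wrappers below.
 */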

/*
 * Unlock a umtx object.
 */
static int
do_unlock(struct thread *td, struct umtx *umtx, uintptr_t id)
{
	struct umtx_key key;
	intptr_t owner;
	intptr_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 *
	 * XXX Need a {fu,su}ptr; this is not correct on arches where
	 * sizeof(intptr_t) != sizeof(long).
	 */
	owner = fuword(&umtx->u_owner);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuptr((intptr_t *)&umtx->u_owner, owner,
			UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuptr((intptr_t *)&umtx->u_owner, owner,
			count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Fetch and compare the value; sleep on the address if the value is
 * unchanged.
 */
static int
do_wait(struct thread *td, struct umtx *umtx, uintptr_t id, struct timespec *timeout)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	uintptr_t tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	tmp = fuword(&umtx->u_owner);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "ucond", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
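
/*
 * do_wait() and kern_umtx_wake() below form a futex-style
 * compare-and-sleep / wake pair: the thread is queued first, the
 * value is then re-read, and the thread only sleeps if the value
 * still matches, so a wake-up between the userland check and the
 * sleep cannot be lost.  A hypothetical userland consumer (names are
 * illustrative, not a real API):
 *
 *	while (atomic_load_acq_long(&w->w_owner) == WAITING_VALUE)
 *		_umtx_op(w, UMTX_OP_WAIT, WAITING_VALUE, NULL, NULL);
 */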

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
	struct timespec *timeout, int try)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_normal(td, m, flags, 0, try);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_normal(td, m, flags, tvtohz(&tv), try);
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
	}
	/*
	 * This lets userland back off the critical region if needed.
	 */
	if (error == EINTR)
		error = ERESTART;
	return (error);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(&m->m_owner);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(void)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | M_WAITOK);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the pi mutex's blocked list after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&sched_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}
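
/*
 * pi_blocked is kept sorted by UPRI() in ascending numeric order, so
 * the head is always the highest-priority waiter (a smaller value
 * means a stronger priority).  For example, if waiters with priorities
 * 130 and 140 are queued and the 140 waiter is boosted to 120, the
 * code above unlinks it and reinserts it at the head.
 * TAILQ_FIRST(&pi->pi_blocked) is therefore the thread whose priority
 * the owner must inherit.
 */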

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		sched_lend_user_prio(td, pri);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
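
/*
 * The loop walks the whole blocking chain.  For example, if T1 blocks
 * on mutex M1 owned by T2 while T2 is itself blocked on M2 owned by
 * T3, then T1's priority is lent to T2, the loop advances to M2 via
 * T2's uq_pi_blocked, and the same priority is lent to T3.  The walk
 * stops at a running owner (uq_pi_blocked == NULL makes
 * umtx_pi_adjust_thread() return 0) or at an owner that is already at
 * least that strong.
 */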

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&sched_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&sched_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&sched_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * Userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&sched_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
	}
	mtx_unlock_spin(&sched_lock);
	return (0);
}

/*
 * Adjust a thread's position in the queue of the PI mutex it is
 * blocked on; this may start a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;

	mtx_assert(&sched_lock, MA_OWNED);
	MPASS(TD_ON_UPILOCK(td));

	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI mutexes;
		 * non-contended PI mutexes are locked in userland.
		 * Process-shared PI mutexes should always be initialized
		 * and registered by the kernel, and locking should always
		 * be done by the kernel to avoid security problems.
		 * For a process-private PI mutex, we can find the owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&sched_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&sched_lock);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	td->td_flags |= TDF_UPIBLOCKED;
	mtx_unlock_spin(&sched_lock);
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&sched_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&sched_lock);
	uq->uq_pi_blocked = NULL;
	td->td_flags &= ~TDF_UPIBLOCKED;
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference count to a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count for a PI mutex; if the counter drops
 * to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;
	int free = 0;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&sched_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&sched_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		free = 1;
	}
	if (free)
		umtx_pi_free(pi);
}
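
/*
 * Reference counting keeps a umtx_pi alive while any thread is looking
 * at it: _do_lock_pi() takes a reference (under the chain lock) before
 * it releases the chain lock to touch userland memory, and drops it
 * once it is done with the pi.  The pi stays in the hash as long as at
 * least one locker references it; the last umtx_pi_unref() unhashes
 * and frees it.
 */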

/*
 * Find a PI mutex in hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		pi = NULL;
		umtxq_lock(&uq->uq_key);
		pi = umtx_pi_lookup(&uq->uq_key);
		if (pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc();
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL)
				umtx_pi_free(new_pi);
			else {
				umtx_pi_insert(new_pi);
				pi = new_pi;
			}
		}

		umtx_pi_ref(pi);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Care must be exercised when dealing with the umtx
		 * structure.  It can fault on any access.
		 */

		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			umtxq_lock(&uq->uq_key);
			umtx_pi_unref(pi);
			umtxq_unlock(&uq->uq_key);
			pi = NULL;
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtx_pi_unref(pi);
		umtxq_unlock(&uq->uq_key);
		pi = NULL;
	}

	if (pi != NULL) {
		umtxq_lock(&uq->uq_key);
		umtx_pi_unref(pi);
		umtxq_unlock(&uq->uq_key);
	}

	umtx_key_release(&uq->uq_key);
	return (error);
}

static int
do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
	struct timespec *timeout, int try)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_pi(td, m, flags, 0, try);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_pi(td, m, flags, tvtohz(&tv), try);
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
	}
	/*
	 * This lets userland back off the critical region if needed.
	 */
	if (error == EINTR)
		error = ERESTART;
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(&m->m_owner);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* Userland messed up the mutex. */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		mtx_lock_spin(&sched_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		sched_unlend_user_prio(curthread, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
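
/*
 * Unlocking a contested PI mutex is a three-step hand-off: ownership
 * of the pi is cleared and removed from the unlocker's contested list,
 * the unlocker's lent priority is recomputed from the PI mutexes it
 * still holds, and finally the highest-priority waiter (the head of
 * pi_blocked) is woken to retry the acquisition.  The woken thread
 * then claims ownership via umtx_pi_claim() once its compare-and-swap
 * on the UMUTEX_CONTESTED state succeeds.
 */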

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (suser(td) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&sched_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&sched_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
		}
		mtx_unlock_spin(&sched_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
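
/*
 * Ceiling arithmetic: the user-supplied ceiling u in m_ceilings[0]
 * follows the POSIX convention (0..RTP_PRIO_MAX, larger u means
 * stronger), while kernel priorities are the opposite (numerically
 * smaller is stronger).  The code above therefore maps u to the
 * kernel priority PRI_MIN_REALTIME + (RTP_PRIO_MAX - u): u == 0 gives
 * the weakest real-time priority and u == RTP_PRIO_MAX gives
 * PRI_MIN_REALTIME itself.  A locker already running with a priority
 * stronger than the ceiling gets EINVAL, as POSIX requires for
 * priority-protected mutexes.
 */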

/*
 * Lock a PP mutex.
 */
static int
do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
	struct timespec *timeout, int try)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_pp(td, m, flags, 0, try);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_pp(td, m, flags, tvtohz(&tv), try);
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
	}
	/*
	 * This lets userland back off the critical region if needed.
	 */
	if (error == EINTR)
		error = ERESTART;
	return (error);
}

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(&m->m_owner);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock the mutex.  This is necessary because the thread
	 * priority has to be adjusted for such a mutex.
	 */
	error = suword32(&m->m_owner, UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtx_key_release(&key);
	return (error);
}

static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			suword32(&m->m_ceilings[0], ceiling);
			suword32(&m->m_owner, UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}

/*
 * Lock a userland POSIX mutex.
 */
static int
do_lock_umutex(struct thread *td, struct umutex *m, struct timespec *ts,
	int try)
{
	uint32_t flags;
	int ret;

	flags = fuword32(&m->m_flags);
	if (flags == -1)
		return (EFAULT);

	if ((flags & UMUTEX_PRIO_INHERIT) != 0)
		ret = do_lock_pi(td, m, flags, ts, try);
	else if ((flags & UMUTEX_PRIO_PROTECT) != 0)
		ret = do_lock_pp(td, m, flags, ts, try);
	else
		ret = do_lock_normal(td, m, flags, ts, try);

	return (ret);
}

/*
 * Unlock a userland POSIX mutex.
 */
static int
do_unlock_umutex(struct thread *td, struct umutex *m)
{
	uint32_t flags;
	int ret;

	flags = fuword32(&m->m_flags);
	if (flags == -1)
		return (EFAULT);

	if ((flags & UMUTEX_PRIO_INHERIT) != 0)
		ret = do_unlock_pi(td, m, flags);
	else if ((flags & UMUTEX_PRIO_PROTECT) != 0)
		ret = do_unlock_pp(td, m, flags);
	else
		ret = do_unlock_normal(td, m, flags);

	return (ret);
}

int
_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
    /* struct umtx *umtx */
{
	return _do_lock(td, uap->umtx, td->td_tid, 0);
}

int
_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
    /* struct umtx *umtx */
{
	return do_unlock(td, uap->umtx, td->td_tid);
}

int
_umtx_op(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec timeout;
	struct timespec *ts;
	int error;

	switch (uap->op) {
	case UMTX_OP_MUTEX_LOCK:
		/* Allow a null timespec (wait forever). */
		if (uap->uaddr2 == NULL)
			ts = NULL;
		else {
			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
			if (error != 0)
				break;
			if (timeout.tv_nsec >= 1000000000 ||
			    timeout.tv_nsec < 0) {
				error = EINVAL;
				break;
			}
			ts = &timeout;
		}
		error = do_lock_umutex(td, uap->obj, ts, 0);
		break;
	case UMTX_OP_MUTEX_UNLOCK:
		error = do_unlock_umutex(td, uap->obj);
		break;
	case UMTX_OP_MUTEX_TRYLOCK:
		error = do_lock_umutex(td, uap->obj, NULL, 1);
		break;
	case UMTX_OP_SET_CEILING:
		error = do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
		break;
	case UMTX_OP_LOCK:
		/* Allow a null timespec (wait forever). */
		if (uap->uaddr2 == NULL)
			ts = NULL;
		else {
			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
			if (error != 0)
				break;
			if (timeout.tv_nsec >= 1000000000 ||
			    timeout.tv_nsec < 0) {
				error = EINVAL;
				break;
			}
			ts = &timeout;
		}
		error = do_lock(td, uap->obj, uap->val, ts);
		break;
	case UMTX_OP_UNLOCK:
		error = do_unlock(td, uap->obj, uap->val);
		break;
	case UMTX_OP_WAIT:
		/* Allow a null timespec (wait forever). */
		if (uap->uaddr2 == NULL)
			ts = NULL;
		else {
			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
			if (error != 0)
				break;
			if (timeout.tv_nsec >= 1000000000 ||
			    timeout.tv_nsec < 0) {
				error = EINVAL;
				break;
			}
			ts = &timeout;
		}
		error = do_wait(td, uap->obj, uap->val, ts);
		break;
	case UMTX_OP_WAKE:
		error = kern_umtx_wake(td, uap->obj, uap->val);
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}
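
/*
 * _umtx_op() is the single syscall entry point multiplexing all umtx
 * operations.  A hypothetical userland call (argument names are
 * illustrative; see _umtx_op(2) for the real interface):
 *
 *	struct timespec to = { .tv_sec = 1, .tv_nsec = 0 };
 *	error = _umtx_op(&mtx, UMTX_OP_MUTEX_LOCK, 0, NULL, &to);
 *
 * A NULL uaddr2 means "wait forever"; tv_nsec is validated here so a
 * bogus timespec fails with EINVAL before any queueing happens.
 */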

void
umtx_thread_init(struct thread *td)
{
	td->td_umtxq = umtxq_alloc();
	td->td_umtxq->uq_thread = td;
}

void
umtx_thread_fini(struct thread *td)
{
	umtxq_free(td->td_umtxq);
}

/*
 * Called when a new thread is created, e.g. by fork().
 */
void
umtx_thread_alloc(struct thread *td)
{
	struct umtx_q *uq;

	uq = td->td_umtxq;
	uq->uq_inherited_pri = PRI_MAX;

	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
}

/*
 * exec() hook.
 */
static void
umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused)
{
	umtx_thread_cleanup(curthread);
}

/*
 * thread_exit() hook.
 */
void
umtx_thread_exit(struct thread *td)
{
	umtx_thread_cleanup(td);
}

/*
 * Clean up umtx data.
 */
static void
umtx_thread_cleanup(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	if ((uq = td->td_umtxq) == NULL)
		return;

	mtx_lock_spin(&sched_lock);
	uq->uq_inherited_pri = PRI_MAX;
	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
	}
	td->td_flags &= ~TDF_UBORROWING;
	mtx_unlock_spin(&sched_lock);
}