kern_umtx.c revision 161678
1139804Simp/*-
2139013Sdavidxu * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3112904Sjeff * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4112904Sjeff * All rights reserved.
5112904Sjeff *
6112904Sjeff * Redistribution and use in source and binary forms, with or without
7112904Sjeff * modification, are permitted provided that the following conditions
8112904Sjeff * are met:
9112904Sjeff * 1. Redistributions of source code must retain the above copyright
10112904Sjeff *    notice unmodified, this list of conditions, and the following
11112904Sjeff *    disclaimer.
12112904Sjeff * 2. Redistributions in binary form must reproduce the above copyright
13112904Sjeff *    notice, this list of conditions and the following disclaimer in the
14112904Sjeff *    documentation and/or other materials provided with the distribution.
15112904Sjeff *
16112904Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17112904Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18112904Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19112904Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20112904Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21112904Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22112904Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23112904Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24112904Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25112904Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26112904Sjeff */
27112904Sjeff
28116182Sobrien#include <sys/cdefs.h>
29116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 161678 2006-08-28 04:24:51Z davidxu $");
30116182Sobrien
31112904Sjeff#include <sys/param.h>
32112904Sjeff#include <sys/kernel.h>
33131431Smarcel#include <sys/limits.h>
34112904Sjeff#include <sys/lock.h>
35115765Sjeff#include <sys/malloc.h>
36112904Sjeff#include <sys/mutex.h>
37112904Sjeff#include <sys/proc.h>
38161678Sdavidxu#include <sys/sched.h>
39161678Sdavidxu#include <sys/sysctl.h>
40112904Sjeff#include <sys/sysent.h>
41112904Sjeff#include <sys/systm.h>
42112904Sjeff#include <sys/sysproto.h>
43139013Sdavidxu#include <sys/eventhandler.h>
44112904Sjeff#include <sys/umtx.h>
45112904Sjeff
46139013Sdavidxu#include <vm/vm.h>
47139013Sdavidxu#include <vm/vm_param.h>
48139013Sdavidxu#include <vm/pmap.h>
49139013Sdavidxu#include <vm/vm_map.h>
50139013Sdavidxu#include <vm/vm_object.h>
51139013Sdavidxu
52161678Sdavidxu#define TYPE_SIMPLE_LOCK	0
53161678Sdavidxu#define TYPE_SIMPLE_WAIT	1
54161678Sdavidxu#define TYPE_NORMAL_UMUTEX	2
55161678Sdavidxu#define TYPE_PI_UMUTEX		3
56161678Sdavidxu#define TYPE_PP_UMUTEX		4
57161678Sdavidxu#define TYPE_CV			5
58139013Sdavidxu
/*
 * Key to represent a unique userland synchronous object.
 * Built by umtx_key_get().  Process-shared objects are named by
 * (VM object, offset) so every mapping of the object yields the
 * same key; process-private objects are named by (vmspace,
 * address).  The 'both' view aliases whichever pair is active and
 * is what umtxq_hash()/umtx_key_match() operate on.
 */
struct umtx_key {
	int	hash;			/* Chain index, set by umtxq_hash(). */
	int	type;			/* TYPE_* class of the object. */
	int	shared;			/* Non-zero: keyed by (object, offset). */
	union {
		struct {
			vm_object_t	object;	/* Backing VM object (referenced). */
			uintptr_t	offset;	/* Object-relative token. */
		} shared;
		struct {
			struct vmspace	*vs;	/* Owning address space. */
			uintptr_t	addr;	/* Userland address. */
		} private;
		struct {
			void		*a;	/* Aliases object / vs. */
			uintptr_t	b;	/* Aliases offset / addr. */
		} both;
	} info;
};
79139013Sdavidxu
/*
 * Priority inheritance mutex info, reference counted.  Describes
 * one userland PI mutex together with its owner and the queue of
 * threads blocked on it.
 */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link umtx held by a thread (uq_pi_contested). */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in the chain's uc_pi_list hash bucket. */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List of umtx_q waiters blocked on this mutex. */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identifies the userland lock object. */
	struct umtx_key		pi_key;
};
100161678Sdavidxu
/*
 * A userland synchronous object user: per-thread state, allocated
 * by umtxq_alloc() and linked onto a chain queue while waiting.
 */
struct umtx_q {
	/* Linked list entry for the chain's wait queue. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Key of the object this thread waits on. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001	/* Thread is on a chain wait queue. */

	/* The owning (waiting) thread. */
	struct thread		*uq_thread;

	/*
	 * Blocked on PI mutex. read can use chain lock
	 * or sched_lock, write must have both chain lock and
	 * sched_lock being held.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* Entry on a umtx_pi's pi_blocked list. */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes contended for (NOTE(review): holds umtx_pi entries, not threads). */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};
132115765Sjeff
133161678SdavidxuTAILQ_HEAD(umtxq_head, umtx_q);
134161678Sdavidxu
/*
 * Userland lock object's wait-queue chain.  Objects are hashed
 * onto one of UMTX_CHAINS of these by umtxq_hash().
 */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleeping waiters (umtx_q entries). */
	struct umtxq_head	uc_queue;

	/* Busy flag, serializes operations that may sleep (umtxq_busy()). */
	char			uc_busy;

	/* Number of threads sleeping for the busy flag to clear. */
	int			uc_waiters;

	/* All umtx_pi records hashed to this chain. */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};
152115765Sjeff
153161678Sdavidxu#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
154161678Sdavidxu
155161678Sdavidxu/*
156161678Sdavidxu * Don't propagate time-sharing priority, there is a security reason,
157161678Sdavidxu * a user can simply introduce PI-mutex, let thread A lock the mutex,
158161678Sdavidxu * and let another thread B block on the mutex, because B is
159161678Sdavidxu * sleeping, its priority will be boosted, this causes A's priority to
160161678Sdavidxu * be boosted via priority propagating too and will never be lowered even
161161678Sdavidxu * if it is using 100%CPU, this is unfair to other processes.
162161678Sdavidxu */
163161678Sdavidxu
164161678Sdavidxu#define UPRI(td)	(((td)->td_ksegrp->kg_user_pri >= PRI_MIN_TIMESHARE &&\
165161678Sdavidxu			  (td)->td_ksegrp->kg_user_pri <= PRI_MAX_TIMESHARE) ?\
166161678Sdavidxu			 PRI_MAX_TIMESHARE : (td)->td_ksegrp->kg_user_pri)
167161678Sdavidxu
168138224Sdavidxu#define	GOLDEN_RATIO_PRIME	2654404609U
169138224Sdavidxu#define	UMTX_CHAINS		128
170138224Sdavidxu#define	UMTX_SHIFTS		(__WORD_BIT - 7)
171115765Sjeff
172161678Sdavidxu#define THREAD_SHARE		0
173161678Sdavidxu#define PROCESS_SHARE		1
174161678Sdavidxu#define AUTO_SHARE		2
175161678Sdavidxu
176161678Sdavidxu#define	GET_SHARE(flags)	\
177161678Sdavidxu    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
178161678Sdavidxu
179161678Sdavidxustatic uma_zone_t		umtx_pi_zone;
180161678Sdavidxustatic struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
181138224Sdavidxustatic MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
182161678Sdavidxustatic int			umtx_pi_allocated;
183115310Sjeff
184161678SdavidxuSYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
185161678SdavidxuSYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
186161678Sdavidxu    &umtx_pi_allocated, 0, "Allocated umtx_pi");
187161678Sdavidxu
188161678Sdavidxustatic void umtxq_sysinit(void *);
189161678Sdavidxustatic void umtxq_hash(struct umtx_key *key);
190161678Sdavidxustatic struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
191139013Sdavidxustatic void umtxq_lock(struct umtx_key *key);
192139013Sdavidxustatic void umtxq_unlock(struct umtx_key *key);
193139257Sdavidxustatic void umtxq_busy(struct umtx_key *key);
194139257Sdavidxustatic void umtxq_unbusy(struct umtx_key *key);
195139013Sdavidxustatic void umtxq_insert(struct umtx_q *uq);
196139013Sdavidxustatic void umtxq_remove(struct umtx_q *uq);
197161678Sdavidxustatic int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
198139257Sdavidxustatic int umtxq_count(struct umtx_key *key);
199139257Sdavidxustatic int umtxq_signal(struct umtx_key *key, int nr_wakeup);
200139013Sdavidxustatic int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
201161678Sdavidxustatic int umtx_key_get(void *addr, int type, int share,
202139013Sdavidxu	struct umtx_key *key);
203139013Sdavidxustatic void umtx_key_release(struct umtx_key *key);
204161678Sdavidxustatic struct umtx_pi *umtx_pi_alloc(void);
205161678Sdavidxustatic void umtx_pi_free(struct umtx_pi *pi);
206161678Sdavidxustatic int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
207161678Sdavidxustatic void umtx_thread_cleanup(struct thread *td);
208161678Sdavidxustatic void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
209161678Sdavidxu	struct image_params *imgp __unused);
210161678SdavidxuSYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
211115310Sjeff
/*
 * One-time subsystem initialization: create the UMA zone for
 * umtx_pi records, initialize every chain's lock and queues, and
 * register the exec event handler so umtx state is reset on
 * image activation.
 */
static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		/* MTX_DUPOK: presumably two chain locks may be held at once — confirm. */
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}
230161678Sdavidxu
231143149Sdavidxustruct umtx_q *
232143149Sdavidxuumtxq_alloc(void)
233143149Sdavidxu{
234161678Sdavidxu	struct umtx_q *uq;
235161678Sdavidxu
236161678Sdavidxu	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
237161678Sdavidxu	TAILQ_INIT(&uq->uq_pi_contested);
238161678Sdavidxu	uq->uq_inherited_pri = PRI_MAX;
239161678Sdavidxu	return (uq);
240143149Sdavidxu}
241143149Sdavidxu
/*
 * Free a umtx_q previously returned by umtxq_alloc().
 */
void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}
247143149Sdavidxu
248161678Sdavidxustatic inline void
249139013Sdavidxuumtxq_hash(struct umtx_key *key)
250138224Sdavidxu{
251161678Sdavidxu	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
252161678Sdavidxu	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
253138224Sdavidxu}
254138224Sdavidxu
255139013Sdavidxustatic inline int
256139013Sdavidxuumtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
257139013Sdavidxu{
258139013Sdavidxu	return (k1->type == k2->type &&
259161678Sdavidxu		k1->info.both.a == k2->info.both.a &&
260161678Sdavidxu	        k1->info.both.b == k2->info.both.b);
261139013Sdavidxu}
262139013Sdavidxu
/*
 * Map a key to its wait-queue chain using the precomputed hash.
 */
static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}
268139013Sdavidxu
/*
 * Set chain to busy state when following operation
 * may be blocked (kernel mutex can not be used).
 * Sleeps until any current busy owner clears the flag; the chain
 * lock must be held on entry and is atomically dropped/retaken by
 * msleep() while waiting.
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	while (uc->uc_busy != 0) {
		uc->uc_waiters++;
		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
		uc->uc_waiters--;
	}
	uc->uc_busy = 1;
}
287139257Sdavidxu
/*
 * Unbusy a chain and wake one thread sleeping in umtxq_busy().
 * The chain lock must be held; the chain must currently be busy.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}
303139257Sdavidxu
304161678Sdavidxu/*
305161678Sdavidxu * Lock a chain.
306161678Sdavidxu */
307139257Sdavidxustatic inline void
308139013Sdavidxuumtxq_lock(struct umtx_key *key)
309138224Sdavidxu{
310161678Sdavidxu	struct umtxq_chain *uc;
311161678Sdavidxu
312161678Sdavidxu	uc = umtxq_getchain(key);
313161678Sdavidxu	mtx_lock(&uc->uc_lock);
314138224Sdavidxu}
315138224Sdavidxu
316161678Sdavidxu/*
317161678Sdavidxu * Unlock a chain.
318161678Sdavidxu */
319138225Sdavidxustatic inline void
320139013Sdavidxuumtxq_unlock(struct umtx_key *key)
321138224Sdavidxu{
322161678Sdavidxu	struct umtxq_chain *uc;
323161678Sdavidxu
324161678Sdavidxu	uc = umtxq_getchain(key);
325161678Sdavidxu	mtx_unlock(&uc->uc_lock);
326138224Sdavidxu}
327138224Sdavidxu
/*
 * Insert a thread onto the umtx queue of its key's chain and mark
 * it queued.  The chain lock must be held.
 */
static inline void
umtxq_insert(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}
341139013Sdavidxu
/*
 * Remove thread from the umtx queue.  Idempotent: a thread already
 * dequeued by a waker (UQF_UMTXQ clear) is left alone.  The chain
 * lock must be held.
 */
static inline void
umtxq_remove(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}
357139013Sdavidxu
358161678Sdavidxu/*
359161678Sdavidxu * Check if there are multiple waiters
360161678Sdavidxu */
361139013Sdavidxustatic int
362139013Sdavidxuumtxq_count(struct umtx_key *key)
363139013Sdavidxu{
364161678Sdavidxu	struct umtxq_chain *uc;
365115765Sjeff	struct umtx_q *uq;
366161678Sdavidxu	int count = 0;
367115765Sjeff
368161678Sdavidxu	uc = umtxq_getchain(key);
369161678Sdavidxu	UMTXQ_LOCKED_ASSERT(uc);
370161678Sdavidxu	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
371139013Sdavidxu		if (umtx_key_match(&uq->uq_key, key)) {
372139013Sdavidxu			if (++count > 1)
373139013Sdavidxu				break;
374139013Sdavidxu		}
375115765Sjeff	}
376139013Sdavidxu	return (count);
377115765Sjeff}
378115765Sjeff
/*
 * Check if there are multiple PI waiters and returns first
 * waiter.  Counting stops at two, so the result is at most 2;
 * *first is set to the first matching waiter, or NULL when there
 * is none.  The chain lock must be held.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}
402161678Sdavidxu
/*
 * Wake up threads waiting on an userland object.
 * Wakes at most n_wake waiters matching the key, dequeuing each
 * before wakeup() so a woken thread never observes itself still
 * queued.  Returns the number of threads woken.  The chain lock
 * must be held.
 */
static int
umtxq_signal(struct umtx_key *key, int n_wake)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove(uq);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}
426138224Sdavidxu
/*
 * Wake up a specific thread, dequeuing it first.  The chain lock
 * must be held.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}
440161678Sdavidxu
/*
 * Put thread into sleep state, before sleeping, check if
 * thread was removed from umtx queue.
 * Returns 0 immediately if the thread was already dequeued
 * (woken); otherwise sleeps interruptibly (PCATCH) with timo in
 * ticks (0 = no timeout).  msleep()'s EWOULDBLOCK is normalized
 * to ETIMEDOUT for callers.  The chain lock must be held.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}
460138224Sdavidxu
/*
 * Convert userspace address into unique logical address.
 * THREAD_SHARE keys are (vmspace, address).  PROCESS_SHARE keys —
 * and AUTO_SHARE keys whose map entry is inherited shared — are
 * (VM object, offset token), with a reference taken on the object
 * that umtx_key_release() drops.  Returns 0 on success or EFAULT
 * when the address is not mapped writable.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else if (share == PROCESS_SHARE || share == AUTO_SHARE) {
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return EFAULT;
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			/*
			 * Object-relative token, identical for every mapping
			 * of the same object page.  NOTE(review): sign of the
			 * addr delta is inverted vs. a true object offset;
			 * uniqueness still holds — confirm intent.
			 */
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			/* Private mapping: fall back to a per-vmspace key. */
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
505139013Sdavidxu
506161678Sdavidxu/*
507161678Sdavidxu * Release key.
508161678Sdavidxu */
509139013Sdavidxustatic inline void
510139013Sdavidxuumtx_key_release(struct umtx_key *key)
511139013Sdavidxu{
512161678Sdavidxu	if (key->shared)
513139013Sdavidxu		vm_object_deallocate(key->info.shared.object);
514139013Sdavidxu}
515139013Sdavidxu
/*
 * Lock a umtx object.
 * Loops trying to CAS the userland owner word; when the lock is
 * held by another thread, queues on the wait chain, sets the
 * contested bit and sleeps until woken or the timeout (timo, in
 * ticks; 0 = forever) expires.  Returns 0 on acquisition, EFAULT
 * for a bad address, or the sleep error (EINTR/ERESTART/
 * ETIMEDOUT) surfaced on the retry after waking.
 */
static int
_do_lock(struct thread *td, struct umtx *umtx, uintptr_t id, int timo)
{
	struct umtx_q *uq;
	intptr_t owner;
	intptr_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuptr((intptr_t *)&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuptr((intptr_t *)&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		/* Queue before publishing the contested bit. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuptr((intptr_t *)&umtx->u_owner, owner,
		    owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	/* NOTREACHED: the loop only exits via return. */
	return (0);
}
613112904Sjeff
/*
 * Lock a umtx object.
 * Wrapper around _do_lock() implementing an absolute deadline:
 * the relative timeout is converted to an uptime-clock deadline
 * and the lock is retried with the remaining time until acquired
 * or the deadline passes.  With no timeout, blocks indefinitely.
 */
static int
do_lock(struct thread *td, struct umtx *umtx, uintptr_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock(td, umtx, id, 0);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			/* Retry with the time remaining to the deadline. */
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
	}
	/*
	 * This lets userland back off critical region if needed.
	 */
	if (error == EINTR)
		error = ERESTART;
	return (error);
}
652139013Sdavidxu
/*
 * Unlock a umtx object.
 * The caller must own the lock (id matches the owner field modulo
 * the contested bit), else EPERM.  An uncontested lock is released
 * with a single CAS; a contested one is marked UMTX_UNOWNED or
 * UMTX_CONTESTED depending on remaining waiter count, and one
 * waiter is woken.
 */
static int
do_unlock(struct thread *td, struct umtx *umtx, uintptr_t id)
{
	struct umtx_key key;
	intptr_t owner;
	intptr_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 *
	 * XXX Need a {fu,su}ptr this is not correct on arch where
	 * sizeof(intptr_t) != sizeof(long).
	 */
	owner = fuword(&umtx->u_owner);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuptr((intptr_t *)&umtx->u_owner, owner,
			UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* CAS lost a race (word became contested); fall through. */
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	/* Busy the chain so waiter count stays stable across the CAS. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuptr((intptr_t *)&umtx->u_owner, owner,
			count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
716139013Sdavidxu
/*
 * Fetch and compare value, sleep on the address if value is not changed.
 * The thread is queued before the userland word is re-read, so a
 * concurrent wake between the check and the sleep cannot be lost.
 * timeout is relative.  ERESTART is mapped to EINTR because the
 * compared value may have changed, making a transparent syscall
 * restart incorrect.
 */
static int
do_wait(struct thread *td, struct umtx *umtx, uintptr_t id, struct timespec *timeout)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	uintptr_t tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	tmp = fuword(&umtx->u_owner);
	if (tmp != id) {
		/* Value already changed: dequeue and succeed at once. */
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "ucond", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		/* Absolute deadline on the uptime clock. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			/* Drop the chain lock while reading the clock. */
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
778139013Sdavidxu
/*
 * Wake up threads sleeping on the specified address.
 * Wakes at most n_wake waiters.  Returns 0 on success or the
 * umtx_key_get() error (EFAULT) on a bad address.
 * NOTE(review): the count of threads actually woken ('ret' from
 * umtxq_signal()) is discarded; consider reporting it via
 * td->td_retval[0] — confirm against the userland ABI.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
797139013Sdavidxu
/*
 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * 'm' is a userland pointer; every access may fault, which is why all
 * reads/writes of m_owner go through fuword/casuword-style primitives.
 * 'timo' is the sleep timeout in ticks (0 = sleep forever), 'try'
 * non-zero requests trylock semantics (EBUSY instead of sleeping).
 * Returns 0 on acquisition, or EFAULT/EDEADLK/EBUSY, a umtx_key_get()
 * error, or the error from an interrupted/timed-out sleep.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			/* Keep the contested bit set for remaining waiters. */
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/* Error-checking mutex: relocking by the owner deadlocks. */
		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		/*
		 * Enqueue before touching userland memory again so a
		 * concurrent unlock cannot miss us; busy the chain while
		 * inserting.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
902161678Sdavidxu
903161678Sdavidxu/*
904161678Sdavidxu * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
905161678Sdavidxu */
906161678Sdavidxustatic int
907161678Sdavidxudo_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
908161678Sdavidxu	struct timespec *timeout, int try)
909161678Sdavidxu{
910161678Sdavidxu	struct timespec ts, ts2, ts3;
911161678Sdavidxu	struct timeval tv;
912161678Sdavidxu	int error;
913161678Sdavidxu
914161678Sdavidxu	if (timeout == NULL) {
915161678Sdavidxu		error = _do_lock_normal(td, m, flags, 0, try);
916161678Sdavidxu	} else {
917161678Sdavidxu		getnanouptime(&ts);
918161678Sdavidxu		timespecadd(&ts, timeout);
919161678Sdavidxu		TIMESPEC_TO_TIMEVAL(&tv, timeout);
920161678Sdavidxu		for (;;) {
921161678Sdavidxu			error = _do_lock_normal(td, m, flags, tvtohz(&tv), try);
922161678Sdavidxu			if (error != ETIMEDOUT)
923161678Sdavidxu				break;
924161678Sdavidxu			getnanouptime(&ts2);
925161678Sdavidxu			if (timespeccmp(&ts2, &ts, >=)) {
926161678Sdavidxu				error = ETIMEDOUT;
927161678Sdavidxu				break;
928161678Sdavidxu			}
929161678Sdavidxu			ts3 = ts;
930161678Sdavidxu			timespecsub(&ts3, &ts2);
931161678Sdavidxu			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
932161678Sdavidxu		}
933161678Sdavidxu	}
934161678Sdavidxu	/*
935161678Sdavidxu	 * This lets userland back off critical region if needed.
936161678Sdavidxu	 */
937161678Sdavidxu	if (error == EINTR)
938161678Sdavidxu		error = ERESTART;
939161678Sdavidxu	return (error);
940161678Sdavidxu}
941161678Sdavidxu
/*
 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * Verifies that the calling thread owns 'm', then clears ownership.
 * If the mutex is contested, one waiter is woken and the userland
 * word is rewritten to UNOWNED or CONTESTED depending on how many
 * waiters remain.  Returns 0, EFAULT (bad user address), EPERM (not
 * owner), EINVAL (owner word changed underneath us), or a
 * umtx_key_get() error.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(&m->m_owner);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		/* Uncontested: a simple CAS back to UNOWNED suffices. */
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* CAS lost a race (bit got set); fall through to slow path. */
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	/*
	 * Busy the chain so the waiter count stays stable while we
	 * update the userland word.
	 */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
1001161678Sdavidxu
1002161678Sdavidxustatic inline struct umtx_pi *
1003161678Sdavidxuumtx_pi_alloc(void)
1004161678Sdavidxu{
1005161678Sdavidxu	struct umtx_pi *pi;
1006161678Sdavidxu
1007161678Sdavidxu	pi = uma_zalloc(umtx_pi_zone, M_ZERO | M_WAITOK);
1008161678Sdavidxu	TAILQ_INIT(&pi->pi_blocked);
1009161678Sdavidxu	atomic_add_int(&umtx_pi_allocated, 1);
1010161678Sdavidxu	return (pi);
1011161678Sdavidxu}
1012161678Sdavidxu
1013161678Sdavidxustatic inline void
1014161678Sdavidxuumtx_pi_free(struct umtx_pi *pi)
1015161678Sdavidxu{
1016161678Sdavidxu	uma_zfree(umtx_pi_zone, pi);
1017161678Sdavidxu	atomic_add_int(&umtx_pi_allocated, -1);
1018161678Sdavidxu}
1019161678Sdavidxu
/*
 * Adjust the thread's position on a pi_state after its priority has been
 * changed.
 *
 * The pi_blocked queue is kept ordered by UPRI() ascending (highest
 * priority waiter first).  Returns 0 when pi is NULL (nothing to do),
 * 1 otherwise.  Caller must hold sched_lock.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&sched_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			/* First waiter with a numerically larger (lower) prio. */
			if (UPRI(td1) > UPRI(td))
				break;
		}

		/* Re-insert before that waiter, or at the tail if none. */
		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}
1064161678Sdavidxu
/*
 * Propagate priority when a thread is blocked on POSIX
 * PI mutex.
 *
 * Walks the chain: the PI mutex 'td' blocks on -> that mutex's owner
 * -> the mutex that owner blocks on -> ... lending 'td's priority to
 * each owner whose priority is numerically higher (i.e. weaker).
 * Caller must hold sched_lock.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		/* Owner already runs at least this strongly; stop here. */
		if (UPRI(td) <= pri)
			return;

		sched_lend_user_prio(td, pri);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
1106161678Sdavidxu
/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by signal or resumed by others.
 *
 * For each owner along the blocking chain, recompute the strongest
 * priority among the top waiters of all PI mutexes that owner still
 * holds, and unlend down to that (or to the owner's own inherited
 * priority, whichever is stronger).  Caller must hold sched_lock.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		/* Scan every contested PI mutex this owner holds. */
		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			/* Queue head is the strongest waiter (sorted). */
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		/* Continue up the chain with the mutex the owner waits on. */
		pi = uq_owner->uq_pi_blocked;
	}
}
1138161678Sdavidxu
1139161678Sdavidxu/*
1140161678Sdavidxu * Insert a PI mutex into owned list.
1141161678Sdavidxu */
1142161678Sdavidxustatic void
1143161678Sdavidxuumtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1144161678Sdavidxu{
1145161678Sdavidxu	struct umtx_q *uq_owner;
1146161678Sdavidxu
1147161678Sdavidxu	uq_owner = owner->td_umtxq;
1148161678Sdavidxu	mtx_assert(&sched_lock, MA_OWNED);
1149161678Sdavidxu	if (pi->pi_owner != NULL)
1150161678Sdavidxu		panic("pi_ower != NULL");
1151161678Sdavidxu	pi->pi_owner = owner;
1152161678Sdavidxu	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1153161678Sdavidxu}
1154161678Sdavidxu
/*
 * Claim ownership of a PI mutex.
 *
 * Called after the thread won the userland CAS for a contested PI
 * mutex.  Registers 'owner' as holder and, if there are waiters,
 * lends the strongest waiter's priority to the new owner.  Returns 0
 * on success, or EPERM if userland corrupted the mutex state so that
 * it already appears owned by someone else.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&sched_lock);
	if (pi->pi_owner == owner) {
		/* Already recorded; nothing to do. */
		mtx_unlock_spin(&sched_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed the mutex, sigh.
		 */
		mtx_unlock_spin(&sched_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		/* Head of the sorted queue is the strongest waiter. */
		pri = UPRI(uq->uq_thread);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
	}
	mtx_unlock_spin(&sched_lock);
	return (0);
}
1189161678Sdavidxu
/*
 * Adjust a thread's order position in its blocked PI mutex,
 * this may result new priority propagating process.
 *
 * External hook called by the scheduler when the priority of a
 * thread blocked on a PI mutex changes.  'oldpri' is the priority
 * before the change.  Caller must hold sched_lock.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;

	mtx_assert(&sched_lock, MA_OWNED);
	MPASS(TD_ON_UPILOCK(td));

	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}
1222161599Sdavidxu
/*
 * Sleep on a PI mutex.
 *
 * 'owner' is the tid read from the userland mutex word (contested bit
 * stripped); used to locate the owning thread so the mutex can be
 * assigned an owner for priority inheritance.  Enqueues the thread on
 * both the hash-chain sleep queue and the mutex's priority-sorted
 * pi_blocked list, propagates priority, sleeps, then undoes all of it.
 * Called and returns with the umtxq chain lock held.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Current, We only support process private PI-mutex,
		 * non-contended PI-mutexes are locked in userland.
		 * Process shared PI-mutex should always be initialized
		 * by kernel and be registered in kernel, locking should
		 * always be done by kernel to avoid security problems.
		 * For process private PI-mutex, we can find owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&sched_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			/* NOTE(review): uq1 store appears unused here. */
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&sched_lock);
	}

	/* Find our slot in the priority-sorted (UPRI ascending) queue. */
	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	/* Mark ourselves blocked on this PI mutex for the scheduler. */
	uq->uq_pi_blocked = pi;
	td->td_flags |= TDF_UPIBLOCKED;
	mtx_unlock_spin(&sched_lock);
	umtxq_unlock(&uq->uq_key);

	/* Lend our priority up the ownership chain. */
	mtx_lock_spin(&sched_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			/* Still queued: timed out / signalled, dequeue self. */
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	/* Leave the PI blocked list and withdraw the lent priority. */
	mtx_lock_spin(&sched_lock);
	uq->uq_pi_blocked = NULL;
	td->td_flags &= ~TDF_UPIBLOCKED;
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}
1307161678Sdavidxu
1308161678Sdavidxu/*
1309161678Sdavidxu * Add reference count for a PI mutex.
1310161678Sdavidxu */
1311161678Sdavidxustatic void
1312161678Sdavidxuumtx_pi_ref(struct umtx_pi *pi)
1313161678Sdavidxu{
1314161678Sdavidxu	struct umtxq_chain *uc;
1315161678Sdavidxu
1316161678Sdavidxu	uc = umtxq_getchain(&pi->pi_key);
1317161678Sdavidxu	UMTXQ_LOCKED_ASSERT(uc);
1318161678Sdavidxu	pi->pi_refcount++;
1319161678Sdavidxu}
1320161678Sdavidxu
1321161678Sdavidxu/*
1322161678Sdavidxu * Decrease reference count for a PI mutex, if the counter
1323161678Sdavidxu * is decreased to zero, its memory space is freed.
1324161678Sdavidxu */
1325161678Sdavidxustatic void
1326161678Sdavidxuumtx_pi_unref(struct umtx_pi *pi)
1327161678Sdavidxu{
1328161678Sdavidxu	struct umtxq_chain *uc;
1329161678Sdavidxu	int free = 0;
1330161678Sdavidxu
1331161678Sdavidxu	uc = umtxq_getchain(&pi->pi_key);
1332161678Sdavidxu	UMTXQ_LOCKED_ASSERT(uc);
1333161678Sdavidxu	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1334161678Sdavidxu	if (--pi->pi_refcount == 0) {
1335161678Sdavidxu		mtx_lock_spin(&sched_lock);
1336161678Sdavidxu		if (pi->pi_owner != NULL) {
1337161678Sdavidxu			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1338161678Sdavidxu				pi, pi_link);
1339161678Sdavidxu			pi->pi_owner = NULL;
1340161678Sdavidxu		}
1341161678Sdavidxu		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1342161678Sdavidxu			("blocked queue not empty"));
1343161678Sdavidxu		mtx_unlock_spin(&sched_lock);
1344161678Sdavidxu		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1345161678Sdavidxu		free = 1;
1346161678Sdavidxu	}
1347161678Sdavidxu	if (free)
1348161678Sdavidxu		umtx_pi_free(pi);
1349161678Sdavidxu}
1350161678Sdavidxu
1351161678Sdavidxu/*
1352161678Sdavidxu * Find a PI mutex in hash table.
1353161678Sdavidxu */
1354161678Sdavidxustatic struct umtx_pi *
1355161678Sdavidxuumtx_pi_lookup(struct umtx_key *key)
1356161678Sdavidxu{
1357161678Sdavidxu	struct umtxq_chain *uc;
1358161678Sdavidxu	struct umtx_pi *pi;
1359161678Sdavidxu
1360161678Sdavidxu	uc = umtxq_getchain(key);
1361161678Sdavidxu	UMTXQ_LOCKED_ASSERT(uc);
1362161678Sdavidxu
1363161678Sdavidxu	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1364161678Sdavidxu		if (umtx_key_match(&pi->pi_key, key)) {
1365161678Sdavidxu			return (pi);
1366161678Sdavidxu		}
1367161678Sdavidxu	}
1368161678Sdavidxu	return (NULL);
1369161678Sdavidxu}
1370161678Sdavidxu
1371161678Sdavidxu/*
1372161678Sdavidxu * Insert a PI mutex into hash table.
1373161678Sdavidxu */
1374161678Sdavidxustatic inline void
1375161678Sdavidxuumtx_pi_insert(struct umtx_pi *pi)
1376161678Sdavidxu{
1377161678Sdavidxu	struct umtxq_chain *uc;
1378161678Sdavidxu
1379161678Sdavidxu	uc = umtxq_getchain(&pi->pi_key);
1380161678Sdavidxu	UMTXQ_LOCKED_ASSERT(uc);
1381161678Sdavidxu	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1382161678Sdavidxu}
1383161678Sdavidxu
/*
 * Lock a PI mutex.
 *
 * Same CAS retry structure as _do_lock_normal(), plus management of
 * the kernel-side umtx_pi object: look up (or allocate and insert)
 * the PI descriptor for this key, hold a reference across each loop
 * iteration, claim ownership on acquisition, and sleep via
 * umtxq_sleep_pi() so priority inheritance takes effect.  'timo' is
 * in ticks (0 = forever); 'try' non-zero means trylock.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		pi = NULL;
		umtxq_lock(&uq->uq_key);
		pi = umtx_pi_lookup(&uq->uq_key);
		if (pi == NULL) {
			/*
			 * Allocate without the chain lock (may sleep),
			 * then re-check for a racing insert.
			 */
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc();
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL)
				umtx_pi_free(new_pi);
			else {
				umtx_pi_insert(new_pi);
				pi = new_pi;
			}
		}

		/* Hold a reference for the duration of this iteration. */
		umtx_pi_ref(pi);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Care must be exercised when dealing with umtx structure.  It
		 * can fault on any access.
		 */

		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				/* Record kernel-side ownership for PI. */
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			umtxq_lock(&uq->uq_key);
			umtx_pi_unref(pi);
			umtxq_unlock(&uq->uq_key);
			pi = NULL;
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtx_pi_unref(pi);
		umtxq_unlock(&uq->uq_key);
		pi = NULL;
	}

	/* Drop the iteration's reference if we broke out still holding it. */
	if (pi != NULL) {
		umtxq_lock(&uq->uq_key);
		umtx_pi_unref(pi);
		umtxq_unlock(&uq->uq_key);
	}

	umtx_key_release(&uq->uq_key);
	return (error);
}
1534161678Sdavidxu
1535161678Sdavidxustatic int
1536161678Sdavidxudo_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1537161678Sdavidxu	struct timespec *timeout, int try)
1538161678Sdavidxu{
1539161678Sdavidxu	struct timespec ts, ts2, ts3;
1540161678Sdavidxu	struct timeval tv;
1541161678Sdavidxu	int error;
1542161678Sdavidxu
1543161678Sdavidxu	if (timeout == NULL) {
1544161678Sdavidxu		error = _do_lock_pi(td, m, flags, 0, try);
1545161678Sdavidxu	} else {
1546161678Sdavidxu		getnanouptime(&ts);
1547161678Sdavidxu		timespecadd(&ts, timeout);
1548161678Sdavidxu		TIMESPEC_TO_TIMEVAL(&tv, timeout);
1549161678Sdavidxu		for (;;) {
1550161678Sdavidxu			error = _do_lock_pi(td, m, flags, tvtohz(&tv), try);
1551161678Sdavidxu			if (error != ETIMEDOUT)
1552161678Sdavidxu				break;
1553161678Sdavidxu			getnanouptime(&ts2);
1554161678Sdavidxu			if (timespeccmp(&ts2, &ts, >=)) {
1555161678Sdavidxu				error = ETIMEDOUT;
1556161678Sdavidxu				break;
1557161678Sdavidxu			}
1558161678Sdavidxu			ts3 = ts;
1559161678Sdavidxu			timespecsub(&ts3, &ts2);
1560161678Sdavidxu			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
1561161678Sdavidxu		}
1562161678Sdavidxu	}
1563161678Sdavidxu	/*
1564161678Sdavidxu	 * This lets userland back off critical region if needed.
1565161678Sdavidxu	 */
1566161678Sdavidxu	if (error == EINTR)
1567161678Sdavidxu		error = ERESTART;
1568161678Sdavidxu	return (error);
1569161678Sdavidxu}
1570161678Sdavidxu
/*
 * Unlock a PI mutex.
 *
 * Verifies ownership, then hands the kernel-side PI state over: the
 * mutex is removed from the unlocker's contested list, the unlocker's
 * lent priority is recomputed from its remaining contested mutexes,
 * the userland word is rewritten (UNOWNED or CONTESTED depending on
 * remaining waiter count) and the highest-priority waiter is woken.
 * Returns 0, EFAULT, EPERM, EINVAL, or a umtx_key_get() error.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(&m->m_owner);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		/* Uncontested fast path: plain CAS back to UNOWNED. */
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* userland messed the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		/* Drop ownership and withdraw this mutex's priority lending. */
		mtx_lock_spin(&sched_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		/* Recompute lent priority from remaining contested mutexes. */
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		sched_unlend_user_prio(curthread, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
1659161678Sdavidxu
1660161678Sdavidxu/*
1661161678Sdavidxu * Lock a PP mutex.
 *
 * Acquire a priority-protect umutex.  While attempting the lock the
 * thread's inherited priority is raised to the mutex's ceiling; on
 * failure the previous inherited priority is restored.  'timo' is a
 * hz-based sleep bound (0 = sleep forever); nonzero 'try' makes the
 * attempt non-blocking (EBUSY instead of sleeping).
1662161678Sdavidxu */
1663161678Sdavidxustatic int
1664161678Sdavidxu_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1665161678Sdavidxu	int try)
1666161678Sdavidxu{
1667161678Sdavidxu	struct umtx_q *uq, *uq2;
1668161678Sdavidxu	struct umtx_pi *pi;
1669161678Sdavidxu	uint32_t ceiling;
1670161678Sdavidxu	uint32_t owner, id;
1671161678Sdavidxu	int error, pri, old_inherited_pri, su;
1672161678Sdavidxu
1673161678Sdavidxu	id = td->td_tid;
1674161678Sdavidxu	uq = td->td_umtxq;
1675161678Sdavidxu	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1676161678Sdavidxu	    &uq->uq_key)) != 0)
1677161678Sdavidxu		return (error);
	/* Only privileged threads get the ceiling priority boost below. */
1678161678Sdavidxu	su = (suser(td) == 0);
1679161678Sdavidxu	for (;;) {
1680161678Sdavidxu		old_inherited_pri = uq->uq_inherited_pri;
1681161678Sdavidxu		umtxq_lock(&uq->uq_key);
1682161678Sdavidxu		umtxq_busy(&uq->uq_key);
1683161678Sdavidxu		umtxq_unlock(&uq->uq_key);
1684161678Sdavidxu
		/*
		 * Map the userland ceiling onto the kernel realtime range;
		 * an out-of-range value wraps the unsigned subtraction and
		 * is caught by the comparison below.
		 */
1685161678Sdavidxu		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1686161678Sdavidxu		if (ceiling > RTP_PRIO_MAX) {
1687161678Sdavidxu			error = EINVAL;
1688161678Sdavidxu			goto out;
1689161678Sdavidxu		}
1690161678Sdavidxu
1691161678Sdavidxu		mtx_lock_spin(&sched_lock);
		/* The ceiling must not be below our current priority. */
1692161678Sdavidxu		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1693161678Sdavidxu			mtx_unlock_spin(&sched_lock);
1694161678Sdavidxu			error = EINVAL;
1695161678Sdavidxu			goto out;
1696161678Sdavidxu		}
1697161678Sdavidxu		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1698161678Sdavidxu			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1699161678Sdavidxu			if (uq->uq_inherited_pri < UPRI(td))
1700161678Sdavidxu				sched_lend_user_prio(td, uq->uq_inherited_pri);
1701161678Sdavidxu		}
1702161678Sdavidxu		mtx_unlock_spin(&sched_lock);
1703161678Sdavidxu
		/*
		 * A PP mutex is always locked through the kernel, so a free
		 * mutex reads UMUTEX_CONTESTED (see do_unlock_pp()).
		 */
1704161678Sdavidxu		owner = casuword32(&m->m_owner,
1705161678Sdavidxu		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1706161678Sdavidxu
1707161678Sdavidxu		if (owner == UMUTEX_CONTESTED) {
1708161678Sdavidxu			error = 0;
1709161678Sdavidxu			break;
1710161678Sdavidxu		}
1711161678Sdavidxu
1712161678Sdavidxu		/* The address was invalid. */
1713161678Sdavidxu		if (owner == -1) {
1714161678Sdavidxu			error = EFAULT;
1715161678Sdavidxu			break;
1716161678Sdavidxu		}
1717161678Sdavidxu
		/* Error-checking mutexes report recursive locking. */
1718161678Sdavidxu		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1719161678Sdavidxu		    (owner & ~UMUTEX_CONTESTED) == id) {
1720161678Sdavidxu			error = EDEADLK;
1721161678Sdavidxu			break;
1722161678Sdavidxu		}
1723161678Sdavidxu
1724161678Sdavidxu		if (try != 0) {
1725161678Sdavidxu			error = EBUSY;
1726161678Sdavidxu			break;
1727161678Sdavidxu		}
1728161678Sdavidxu
1729161678Sdavidxu		/*
1730161678Sdavidxu		 * If we caught a signal, we have retried and now
1731161678Sdavidxu		 * exit immediately.
1732161678Sdavidxu		 */
1733161678Sdavidxu		if (error != 0)
1734161678Sdavidxu			break;
1735161678Sdavidxu
1736161678Sdavidxu		/*
1737161678Sdavidxu		 * We set the contested bit, sleep. Otherwise the lock changed
1738161678Sdavidxu		 * and we need to retry or we lost a race to the thread
1739161678Sdavidxu		 * unlocking the umtx.
1740161678Sdavidxu		 */
1741161678Sdavidxu		umtxq_lock(&uq->uq_key);
1742161678Sdavidxu		umtxq_insert(uq);
1743161678Sdavidxu		umtxq_unbusy(&uq->uq_key);
1744161678Sdavidxu		error = umtxq_sleep(uq, "umtxpp", timo);
1745161678Sdavidxu		umtxq_remove(uq);
1746161678Sdavidxu		umtxq_unlock(&uq->uq_key);
1747161678Sdavidxu
		/*
		 * The boost did not win us the lock this round; fall back
		 * to the priority lent by the mutexes we still hold.
		 * NOTE(review): this recalculation is duplicated in the
		 * error path below and in do_unlock_pp() — candidate for a
		 * shared helper.
		 */
1748161678Sdavidxu		mtx_lock_spin(&sched_lock);
1749161678Sdavidxu		uq->uq_inherited_pri = old_inherited_pri;
1750161678Sdavidxu		pri = PRI_MAX;
1751161678Sdavidxu		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1752161678Sdavidxu			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1753161678Sdavidxu			if (uq2 != NULL) {
1754161678Sdavidxu				if (pri > UPRI(uq2->uq_thread))
1755161678Sdavidxu					pri = UPRI(uq2->uq_thread);
1756161678Sdavidxu			}
1757161678Sdavidxu		}
1758161678Sdavidxu		if (pri > uq->uq_inherited_pri)
1759161678Sdavidxu			pri = uq->uq_inherited_pri;
1760161678Sdavidxu		sched_unlend_user_prio(td, pri);
1761161678Sdavidxu		mtx_unlock_spin(&sched_lock);
1762161678Sdavidxu	}
1763161678Sdavidxu
1764161678Sdavidxu	if (error != 0) {
		/* Acquisition failed: undo any priority boost taken above. */
1765161678Sdavidxu		mtx_lock_spin(&sched_lock);
1766161678Sdavidxu		uq->uq_inherited_pri = old_inherited_pri;
1767161678Sdavidxu		pri = PRI_MAX;
1768161678Sdavidxu		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1769161678Sdavidxu			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1770161678Sdavidxu			if (uq2 != NULL) {
1771161678Sdavidxu				if (pri > UPRI(uq2->uq_thread))
1772161678Sdavidxu					pri = UPRI(uq2->uq_thread);
1773161678Sdavidxu			}
1774161678Sdavidxu		}
1775161678Sdavidxu		if (pri > uq->uq_inherited_pri)
1776161678Sdavidxu			pri = uq->uq_inherited_pri;
1777161678Sdavidxu		sched_unlend_user_prio(td, pri);
1778161678Sdavidxu		mtx_unlock_spin(&sched_lock);
1779161678Sdavidxu	}
1780161678Sdavidxu
1781161678Sdavidxuout:
1782161678Sdavidxu	umtxq_lock(&uq->uq_key);
1783161678Sdavidxu	umtxq_unbusy(&uq->uq_key);
1784161678Sdavidxu	umtxq_unlock(&uq->uq_key);
1785161678Sdavidxu	umtx_key_release(&uq->uq_key);
1786161678Sdavidxu	return (error);
1787161678Sdavidxu}
1788161678Sdavidxu
1789161678Sdavidxu/*
1790161678Sdavidxu * Lock a PP mutex.
1791161678Sdavidxu */
1792161678Sdavidxustatic int
1793161678Sdavidxudo_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
1794161678Sdavidxu	struct timespec *timeout, int try)
1795161678Sdavidxu{
1796161678Sdavidxu	struct timespec ts, ts2, ts3;
1797161678Sdavidxu	struct timeval tv;
1798161678Sdavidxu	int error;
1799161678Sdavidxu
1800161678Sdavidxu	if (timeout == NULL) {
1801161678Sdavidxu		error = _do_lock_pp(td, m, flags, 0, try);
1802161678Sdavidxu	} else {
1803161678Sdavidxu		getnanouptime(&ts);
1804161678Sdavidxu		timespecadd(&ts, timeout);
1805161678Sdavidxu		TIMESPEC_TO_TIMEVAL(&tv, timeout);
1806161678Sdavidxu		for (;;) {
1807161678Sdavidxu			error = _do_lock_pp(td, m, flags, tvtohz(&tv), try);
1808161678Sdavidxu			if (error != ETIMEDOUT)
1809161678Sdavidxu				break;
1810161678Sdavidxu			getnanouptime(&ts2);
1811161678Sdavidxu			if (timespeccmp(&ts2, &ts, >=)) {
1812161678Sdavidxu				error = ETIMEDOUT;
1813161678Sdavidxu				break;
1814161678Sdavidxu			}
1815161678Sdavidxu			ts3 = ts;
1816161678Sdavidxu			timespecsub(&ts3, &ts2);
1817161678Sdavidxu			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
1818161678Sdavidxu		}
1819161678Sdavidxu	}
1820161678Sdavidxu	/*
1821161678Sdavidxu	 * This lets userland back off critical region if needed.
1822161678Sdavidxu	 */
1823161678Sdavidxu	if (error == EINTR)
1824161678Sdavidxu		error = ERESTART;
1825161678Sdavidxu	return (error);
1826161678Sdavidxu}
1827161678Sdavidxu
1828161678Sdavidxu/*
1829161678Sdavidxu * Unlock a PP mutex.
 *
 * Releases a priority-protect umutex: validates ownership, computes the
 * priority to fall back to from the value stored in m_ceilings[1],
 * stores UMUTEX_CONTESTED as the unlocked state, wakes one waiter and
 * recomputes the thread's lent priority.
1830161678Sdavidxu */
1831161678Sdavidxustatic int
1832161678Sdavidxudo_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1833161678Sdavidxu{
1834161678Sdavidxu	struct umtx_key key;
1835161678Sdavidxu	struct umtx_q *uq, *uq2;
1836161678Sdavidxu	struct umtx_pi *pi;
1837161678Sdavidxu	uint32_t owner, id;
1838161678Sdavidxu	uint32_t rceiling;
1839161678Sdavidxu	int error, pri, new_inherited_pri;
1840161678Sdavidxu
1841161678Sdavidxu	id = td->td_tid;
1842161678Sdavidxu	uq = td->td_umtxq;
1843161678Sdavidxu
1844161678Sdavidxu	/*
1845161678Sdavidxu	 * Make sure we own this mtx.
1846161678Sdavidxu	 */
1847161678Sdavidxu	owner = fuword32(&m->m_owner);
1848161678Sdavidxu	if (owner == -1)
1849161678Sdavidxu		return (EFAULT);
1850161678Sdavidxu
1851161678Sdavidxu	if ((owner & ~UMUTEX_CONTESTED) != id)
1852161678Sdavidxu		return (EPERM);
1853161678Sdavidxu
	/* m_ceilings[1] presumably holds the ceiling saved at lock time
	   by userland — TODO confirm against libthr. */
1854161678Sdavidxu	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
1855161678Sdavidxu	if (error != 0)
1856161678Sdavidxu		return (error);
1857161678Sdavidxu
1858161678Sdavidxu	if (rceiling == -1)
		/* No saved ceiling: no inherited priority from this mutex. */
1859161678Sdavidxu		new_inherited_pri = PRI_MAX;
1860161678Sdavidxu	else {
1861161678Sdavidxu		rceiling = RTP_PRIO_MAX - rceiling;
1862161678Sdavidxu		if (rceiling > RTP_PRIO_MAX)
1863161678Sdavidxu			return (EINVAL);
1864161678Sdavidxu		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
1865161678Sdavidxu	}
1866161678Sdavidxu
1867161678Sdavidxu	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1868161678Sdavidxu	    &key)) != 0)
1869161678Sdavidxu		return (error);
1870161678Sdavidxu	umtxq_lock(&key);
1871161678Sdavidxu	umtxq_busy(&key);
1872161678Sdavidxu	umtxq_unlock(&key);
1873161678Sdavidxu	/*
1874161678Sdavidxu	 * For priority protected mutex, always set unlocked state
1875161678Sdavidxu	 * to UMUTEX_CONTESTED, so that userland always enters kernel
1876161678Sdavidxu	 * to lock the mutex, it is necessary because thread priority
1877161678Sdavidxu	 * has to be adjusted for such mutex.
1878161678Sdavidxu	 */
1879161678Sdavidxu	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
1880161678Sdavidxu
1881161678Sdavidxu	umtxq_lock(&key);
1882161678Sdavidxu	if (error == 0)
1883161678Sdavidxu		umtxq_signal(&key, 1);
1884161678Sdavidxu	umtxq_unbusy(&key);
1885161678Sdavidxu	umtxq_unlock(&key);
1886161678Sdavidxu
	/* suword32() returns -1 on fault. */
1887161678Sdavidxu	if (error == -1)
1888161678Sdavidxu		error = EFAULT;
1889161678Sdavidxu	else {
		/* Recompute the priority still lent by mutexes we hold. */
1890161678Sdavidxu		mtx_lock_spin(&sched_lock);
1891161678Sdavidxu		uq->uq_inherited_pri = new_inherited_pri;
1892161678Sdavidxu		pri = PRI_MAX;
1893161678Sdavidxu		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1894161678Sdavidxu			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1895161678Sdavidxu			if (uq2 != NULL) {
1896161678Sdavidxu				if (pri > UPRI(uq2->uq_thread))
1897161678Sdavidxu					pri = UPRI(uq2->uq_thread);
1898161678Sdavidxu			}
1899161678Sdavidxu		}
1900161678Sdavidxu		if (pri > uq->uq_inherited_pri)
1901161678Sdavidxu			pri = uq->uq_inherited_pri;
1902161678Sdavidxu		sched_unlend_user_prio(td, pri);
1903161678Sdavidxu		mtx_unlock_spin(&sched_lock);
1904161678Sdavidxu	}
1905161678Sdavidxu	umtx_key_release(&key);
1906161678Sdavidxu	return (error);
1907161678Sdavidxu}
1908161678Sdavidxu
/*
 * Change the priority ceiling (m_ceilings[0]) of a PP mutex and return
 * the previous value through 'old_ceiling' (if non-NULL).  The mutex is
 * briefly acquired — sleeping if necessary — so the update cannot race
 * with a lock holder.
 */
1909161678Sdavidxustatic int
1910161678Sdavidxudo_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
1911161678Sdavidxu	uint32_t *old_ceiling)
1912161678Sdavidxu{
1913161678Sdavidxu	struct umtx_q *uq;
1914161678Sdavidxu	uint32_t save_ceiling;
1915161678Sdavidxu	uint32_t owner, id;
1916161678Sdavidxu	uint32_t flags;
1917161678Sdavidxu	int error;
1918161678Sdavidxu
1919161678Sdavidxu	flags = fuword32(&m->m_flags);
	/* Ceilings only exist on priority-protect mutexes. */
1920161678Sdavidxu	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
1921161678Sdavidxu		return (EINVAL);
1922161678Sdavidxu	if (ceiling > RTP_PRIO_MAX)
1923161678Sdavidxu		return (EINVAL);
1924161678Sdavidxu	id = td->td_tid;
1925161678Sdavidxu	uq = td->td_umtxq;
1926161678Sdavidxu	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1927161678Sdavidxu	   &uq->uq_key)) != 0)
1928161678Sdavidxu		return (error);
	/* error == 0 here; it is re-tested after each sleep below. */
1929161678Sdavidxu	for (;;) {
1930161678Sdavidxu		umtxq_lock(&uq->uq_key);
1931161678Sdavidxu		umtxq_busy(&uq->uq_key);
1932161678Sdavidxu		umtxq_unlock(&uq->uq_key);
1933161678Sdavidxu
1934161678Sdavidxu		save_ceiling = fuword32(&m->m_ceilings[0]);
1935161678Sdavidxu
		/* Try to take the (always kernel-mediated) lock ourselves. */
1936161678Sdavidxu		owner = casuword32(&m->m_owner,
1937161678Sdavidxu		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1938161678Sdavidxu
1939161678Sdavidxu		if (owner == UMUTEX_CONTESTED) {
			/*
			 * Got it: store the new ceiling, then release.
			 * NOTE(review): these suword32() results are
			 * ignored; a fault here would go unreported.
			 */
1940161678Sdavidxu			suword32(&m->m_ceilings[0], ceiling);
1941161678Sdavidxu			suword32(&m->m_owner, UMUTEX_CONTESTED);
1942161678Sdavidxu			error = 0;
1943161678Sdavidxu			break;
1944161678Sdavidxu		}
1945161678Sdavidxu
1946161678Sdavidxu		/* The address was invalid. */
1947161678Sdavidxu		if (owner == -1) {
1948161678Sdavidxu			error = EFAULT;
1949161678Sdavidxu			break;
1950161678Sdavidxu		}
1951161678Sdavidxu
		/* We already own it: update the ceiling in place. */
1952161678Sdavidxu		if ((owner & ~UMUTEX_CONTESTED) == id) {
1953161678Sdavidxu			suword32(&m->m_ceilings[0], ceiling);
1954161678Sdavidxu			error = 0;
1955161678Sdavidxu			break;
1956161678Sdavidxu		}
1957161678Sdavidxu
1958161678Sdavidxu		/*
1959161678Sdavidxu		 * If we caught a signal, we have retried and now
1960161678Sdavidxu		 * exit immediately.
1961161678Sdavidxu		 */
1962161678Sdavidxu		if (error != 0)
1963161678Sdavidxu			break;
1964161678Sdavidxu
1965161678Sdavidxu		/*
1966161678Sdavidxu		 * We set the contested bit, sleep. Otherwise the lock changed
1967161678Sdavidxu		 * and we need to retry or we lost a race to the thread
1968161678Sdavidxu		 * unlocking the umtx.
1969161678Sdavidxu		 */
1970161678Sdavidxu		umtxq_lock(&uq->uq_key);
1971161678Sdavidxu		umtxq_insert(uq);
1972161678Sdavidxu		umtxq_unbusy(&uq->uq_key);
1973161678Sdavidxu		error = umtxq_sleep(uq, "umtxpp", 0);
1974161678Sdavidxu		umtxq_remove(uq);
1975161678Sdavidxu		umtxq_unlock(&uq->uq_key);
1976161678Sdavidxu	}
1977161678Sdavidxu	umtxq_lock(&uq->uq_key);
1978161678Sdavidxu	if (error == 0)
		/* Ceiling changed; wake all waiters to re-evaluate. */
1979161678Sdavidxu		umtxq_signal(&uq->uq_key, INT_MAX);
1980161678Sdavidxu	umtxq_unbusy(&uq->uq_key);
1981161678Sdavidxu	umtxq_unlock(&uq->uq_key);
1982161678Sdavidxu	umtx_key_release(&uq->uq_key);
1983161678Sdavidxu	if (error == 0 && old_ceiling != NULL)
1984161678Sdavidxu		suword32(old_ceiling, save_ceiling);
1985161678Sdavidxu	return (error);
1986161678Sdavidxu}
1987161678Sdavidxu
1988161678Sdavidxu/*
1989161678Sdavidxu * Lock a userland POSIX mutex.
1990161678Sdavidxu */
1991161678Sdavidxustatic int
1992161678Sdavidxudo_lock_umutex(struct thread *td, struct umutex *m, struct timespec *ts,
1993161678Sdavidxu	int try)
1994161678Sdavidxu{
1995161678Sdavidxu	uint32_t flags;
1996161678Sdavidxu	int ret;
1997161678Sdavidxu
1998161678Sdavidxu	flags = fuword32(&m->m_flags);
1999161678Sdavidxu	if (flags == -1)
2000161678Sdavidxu		return (EFAULT);
2001161678Sdavidxu
2002161678Sdavidxu	if ((flags & UMUTEX_PRIO_INHERIT) != 0)
2003161678Sdavidxu		ret = do_lock_pi(td, m, flags, ts, try);
2004161678Sdavidxu	else if ((flags & UMUTEX_PRIO_PROTECT) != 0)
2005161678Sdavidxu		ret = do_lock_pp(td, m, flags, ts, try);
2006161678Sdavidxu	else
2007161678Sdavidxu		ret = do_lock_normal(td, m, flags, ts, try);
2008161678Sdavidxu
2009161678Sdavidxu	return (ret);
2010161678Sdavidxu}
2011161678Sdavidxu
2012161678Sdavidxu/*
2013161678Sdavidxu * Unlock a userland POSIX mutex.
2014161678Sdavidxu */
2015161678Sdavidxustatic int
2016161678Sdavidxudo_unlock_umutex(struct thread *td, struct umutex *m)
2017161678Sdavidxu{
2018161678Sdavidxu	uint32_t flags;
2019161678Sdavidxu	int ret;
2020161678Sdavidxu
2021161678Sdavidxu	flags = fuword32(&m->m_flags);
2022161678Sdavidxu	if (flags == -1)
2023161678Sdavidxu		return (EFAULT);
2024161678Sdavidxu
2025161678Sdavidxu	if ((flags & UMUTEX_PRIO_INHERIT) != 0)
2026161678Sdavidxu		ret = do_unlock_pi(td, m, flags);
2027161678Sdavidxu	else if ((flags & UMUTEX_PRIO_PROTECT) != 0)
2028161678Sdavidxu		ret = do_unlock_pp(td, m, flags);
2029161678Sdavidxu	else
2030161678Sdavidxu		ret = do_unlock_normal(td, m, flags);
2031161678Sdavidxu
2032161678Sdavidxu	return (ret);
2033161678Sdavidxu}
2034161678Sdavidxu
2035139013Sdavidxuint
2036139013Sdavidxu_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2037139013Sdavidxu    /* struct umtx *umtx */
2038139013Sdavidxu{
2039139013Sdavidxu	return _do_lock(td, uap->umtx, td->td_tid, 0);
2040139013Sdavidxu}
2041139013Sdavidxu
2042139013Sdavidxuint
2043139013Sdavidxu_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2044139013Sdavidxu    /* struct umtx *umtx */
2045139013Sdavidxu{
2046139013Sdavidxu	return do_unlock(td, uap->umtx, td->td_tid);
2047139013Sdavidxu}
2048139013Sdavidxu
2049139013Sdavidxuint
2050139013Sdavidxu_umtx_op(struct thread *td, struct _umtx_op_args *uap)
2051139013Sdavidxu{
2052140245Sdavidxu	struct timespec timeout;
2053139013Sdavidxu	struct timespec *ts;
2054139013Sdavidxu	int error;
2055139013Sdavidxu
2056139013Sdavidxu	switch(uap->op) {
2057161678Sdavidxu	case UMTX_OP_MUTEX_LOCK:
2058161678Sdavidxu		/* Allow a null timespec (wait forever). */
2059161678Sdavidxu		if (uap->uaddr2 == NULL)
2060161678Sdavidxu			ts = NULL;
2061161678Sdavidxu		else {
2062161678Sdavidxu			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2063161678Sdavidxu			if (error != 0)
2064161678Sdavidxu				break;
2065161678Sdavidxu			if (timeout.tv_nsec >= 1000000000 ||
2066161678Sdavidxu			    timeout.tv_nsec < 0) {
2067161678Sdavidxu				error = EINVAL;
2068161678Sdavidxu				break;
2069161678Sdavidxu			}
2070161678Sdavidxu			ts = &timeout;
2071161678Sdavidxu		}
2072161678Sdavidxu		error = do_lock_umutex(td, uap->obj, ts, 0);
2073161678Sdavidxu		break;
2074161678Sdavidxu	case UMTX_OP_MUTEX_UNLOCK:
2075161678Sdavidxu		error = do_unlock_umutex(td, uap->obj);
2076161678Sdavidxu		break;
2077161678Sdavidxu	case UMTX_OP_MUTEX_TRYLOCK:
2078161678Sdavidxu		error = do_lock_umutex(td, uap->obj, NULL, 1);
2079161678Sdavidxu		break;
2080161678Sdavidxu	case UMTX_OP_SET_CEILING:
2081161678Sdavidxu		error = do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2082161678Sdavidxu		break;
2083139013Sdavidxu	case UMTX_OP_LOCK:
2084139013Sdavidxu		/* Allow a null timespec (wait forever). */
2085139292Sdavidxu		if (uap->uaddr2 == NULL)
2086139013Sdavidxu			ts = NULL;
2087139013Sdavidxu		else {
2088140245Sdavidxu			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2089139013Sdavidxu			if (error != 0)
2090140102Sdavidxu				break;
2091140245Sdavidxu			if (timeout.tv_nsec >= 1000000000 ||
2092140245Sdavidxu			    timeout.tv_nsec < 0) {
2093140102Sdavidxu				error = EINVAL;
2094140102Sdavidxu				break;
2095140102Sdavidxu			}
2096140245Sdavidxu			ts = &timeout;
2097139013Sdavidxu		}
2098161678Sdavidxu		error = do_lock(td, uap->obj, uap->val, ts);
2099140102Sdavidxu		break;
2100139013Sdavidxu	case UMTX_OP_UNLOCK:
2101161678Sdavidxu		error = do_unlock(td, uap->obj, uap->val);
2102140102Sdavidxu		break;
2103139427Sdavidxu	case UMTX_OP_WAIT:
2104139013Sdavidxu		/* Allow a null timespec (wait forever). */
2105139292Sdavidxu		if (uap->uaddr2 == NULL)
2106139013Sdavidxu			ts = NULL;
2107139013Sdavidxu		else {
2108140245Sdavidxu			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2109139013Sdavidxu			if (error != 0)
2110140102Sdavidxu				break;
2111140245Sdavidxu			if (timeout.tv_nsec >= 1000000000 ||
2112140245Sdavidxu			    timeout.tv_nsec < 0) {
2113140102Sdavidxu				error = EINVAL;
2114140102Sdavidxu				break;
2115140102Sdavidxu			}
2116140245Sdavidxu			ts = &timeout;
2117139013Sdavidxu		}
2118161678Sdavidxu		error = do_wait(td, uap->obj, uap->val, ts);
2119140102Sdavidxu		break;
2120139013Sdavidxu	case UMTX_OP_WAKE:
2121161678Sdavidxu		error = kern_umtx_wake(td, uap->obj, uap->val);
2122140102Sdavidxu		break;
2123139013Sdavidxu	default:
2124140102Sdavidxu		error = EINVAL;
2125140102Sdavidxu		break;
2126139013Sdavidxu	}
2127140421Sdavidxu	return (error);
2128139013Sdavidxu}
2129161678Sdavidxu
2130161678Sdavidxuvoid
2131161678Sdavidxuumtx_thread_init(struct thread *td)
2132161678Sdavidxu{
2133161678Sdavidxu	td->td_umtxq = umtxq_alloc();
2134161678Sdavidxu	td->td_umtxq->uq_thread = td;
2135161678Sdavidxu}
2136161678Sdavidxu
2137161678Sdavidxuvoid
2138161678Sdavidxuumtx_thread_fini(struct thread *td)
2139161678Sdavidxu{
	/* Release the per-thread umtx queue from umtx_thread_init(). */
2140161678Sdavidxu	umtxq_free(td->td_umtxq);
2141161678Sdavidxu}
2142161678Sdavidxu
2143161678Sdavidxu/*
2144161678Sdavidxu * It will be called when new thread is created, e.g fork().
 * Resets the inherited priority and asserts the (possibly recycled)
 * umtx_q carries no stale blocking or priority-inheritance state.
2145161678Sdavidxu */
2146161678Sdavidxuvoid
2147161678Sdavidxuumtx_thread_alloc(struct thread *td)
2148161678Sdavidxu{
2149161678Sdavidxu	struct umtx_q *uq;
2150161678Sdavidxu
2151161678Sdavidxu	uq = td->td_umtxq;
	/* A fresh thread starts with no priority lent to it. */
2152161678Sdavidxu	uq->uq_inherited_pri = PRI_MAX;
2153161678Sdavidxu
2154161678Sdavidxu	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
2155161678Sdavidxu	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
2156161678Sdavidxu	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
2157161678Sdavidxu	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
2158161678Sdavidxu}
2159161678Sdavidxu
2160161678Sdavidxu/*
2161161678Sdavidxu * exec() hook.
 *
 * Userland mutexes do not survive exec(), so discard any umtx PI state
 * still attributed to the current thread.
2162161678Sdavidxu */
2163161678Sdavidxustatic void
2164161678Sdavidxuumtx_exec_hook(void *arg __unused, struct proc *p __unused,
2165161678Sdavidxu	struct image_params *imgp __unused)
2166161678Sdavidxu{
2167161678Sdavidxu	umtx_thread_cleanup(curthread);
2168161678Sdavidxu}
2169161678Sdavidxu
2170161678Sdavidxu/*
2171161678Sdavidxu * thread_exit() hook.
 *
 * Strip the exiting thread of any umtx priority-inheritance state.
2172161678Sdavidxu */
2173161678Sdavidxuvoid
2174161678Sdavidxuumtx_thread_exit(struct thread *td)
2175161678Sdavidxu{
2176161678Sdavidxu	umtx_thread_cleanup(td);
2177161678Sdavidxu}
2178161678Sdavidxu
2179161678Sdavidxu/*
2180161678Sdavidxu * clean up umtx data.
 *
 * Disowns every PI mutex still attributed to 'td' and resets the
 * thread's inherited priority.  Called from the exit and exec hooks
 * above.
2181161678Sdavidxu */
2182161678Sdavidxustatic void
2183161678Sdavidxuumtx_thread_cleanup(struct thread *td)
2184161678Sdavidxu{
2185161678Sdavidxu	struct umtx_q *uq;
2186161678Sdavidxu	struct umtx_pi *pi;
2187161678Sdavidxu
2188161678Sdavidxu	if ((uq = td->td_umtxq) == NULL)
2189161678Sdavidxu		return;
2190161678Sdavidxu
2191161678Sdavidxu	mtx_lock_spin(&sched_lock);
2192161678Sdavidxu	uq->uq_inherited_pri = PRI_MAX;
	/* Orphan any PI mutexes still recorded as owned by this thread. */
2193161678Sdavidxu	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
2194161678Sdavidxu		pi->pi_owner = NULL;
2195161678Sdavidxu		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
2196161678Sdavidxu	}
	/* This thread no longer borrows priority from any waiter. */
2197161678Sdavidxu	td->td_flags &= ~TDF_UBORROWING;
2198161678Sdavidxu	mtx_unlock_spin(&sched_lock);
2199161678Sdavidxu}
2200