linux_futex.c revision 293896
1/*	$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */
2
3/*-
4 * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. All advertising materials mentioning features or use of this software
15 *    must display the following acknowledgement:
16 *	This product includes software developed by Emmanuel Dreyfus
17 * 4. The name of the author may not be used to endorse or promote
18 *    products derived from this software without specific prior written
19 *    permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: releng/9.3/sys/compat/linux/linux_futex.c 293896 2016-01-14 09:11:26Z glebius $");
36#if 0
37__KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
38#endif
39
40#include "opt_compat.h"
41#include "opt_kdtrace.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/imgact.h>
46#include <sys/kernel.h>
47#include <sys/ktr.h>
48#include <sys/lock.h>
49#include <sys/malloc.h>
50#include <sys/mutex.h>
51#include <sys/priv.h>
52#include <sys/proc.h>
53#include <sys/queue.h>
54#include <sys/sched.h>
55#include <sys/sdt.h>
56#include <sys/sx.h>
57#include <sys/umtx.h>
58
59#ifdef COMPAT_LINUX32
60#include <machine/../linux32/linux.h>
61#include <machine/../linux32/linux32_proto.h>
62#else
63#include <machine/../linux/linux.h>
64#include <machine/../linux/linux_proto.h>
65#endif
66#include <compat/linux/linux_dtrace.h>
67#include <compat/linux/linux_emul.h>
68#include <compat/linux/linux_futex.h>
69#include <compat/linux/linux_util.h>
70
71/* DTrace init */
72LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
73
74/* Linuxulator-global DTrace probes */
75LIN_SDT_PROBE_DECLARE(locks, emul_lock, locked);
76LIN_SDT_PROBE_DECLARE(locks, emul_lock, unlock);
77
78/**
79 * Futex part for the special DTrace module "locks".
80 */
81LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *");
82LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *");
83
84/**
85 * Per futex probes.
86 */
87LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *");
88LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *");
89
90/**
91 * DTrace probes in this module.
92 */
93LIN_SDT_PROBE_DEFINE2(futex, futex_put, entry, "struct futex *",
94    "struct waiting_proc *");
95LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t",
96    "int");
97LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t",
98    "int");
99LIN_SDT_PROBE_DEFINE0(futex, futex_put, return);
100LIN_SDT_PROBE_DEFINE3(futex, futex_get0, entry, "uint32_t *", "struct futex **",
101    "uint32_t");
102LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int");
103LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t",
104    "int");
105LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *");
106LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int");
107LIN_SDT_PROBE_DEFINE1(futex, futex_get0, return, "int");
108LIN_SDT_PROBE_DEFINE3(futex, futex_get, entry, "uint32_t *",
109    "struct waiting_proc **", "struct futex **");
110LIN_SDT_PROBE_DEFINE0(futex, futex_get, error);
111LIN_SDT_PROBE_DEFINE1(futex, futex_get, return, "int");
112LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, entry, "struct futex *",
113    "struct waiting_proc **", "int");
114LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *",
115    "struct waiting_proc *", "uint32_t *", "uint32_t");
116LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *",
117    "struct waiting_proc *");
118LIN_SDT_PROBE_DEFINE1(futex, futex_sleep, return, "int");
119LIN_SDT_PROBE_DEFINE3(futex, futex_wake, entry, "struct futex *", "int",
120    "uint32_t");
121LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t",
122    "struct waiting_proc *", "uint32_t");
123LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *");
124LIN_SDT_PROBE_DEFINE1(futex, futex_wake, return, "int");
125LIN_SDT_PROBE_DEFINE4(futex, futex_requeue, entry, "struct futex *", "int",
126    "struct futex *", "int");
127LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *");
128LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *",
129    "struct waiting_proc *", "uint32_t");
130LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, return, "int");
131LIN_SDT_PROBE_DEFINE4(futex, futex_wait, entry, "struct futex *",
132    "struct waiting_proc **", "int", "uint32_t");
133LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int");
134LIN_SDT_PROBE_DEFINE1(futex, futex_wait, return, "int");
135LIN_SDT_PROBE_DEFINE3(futex, futex_atomic_op, entry, "struct thread *",
136    "int", "uint32_t");
137LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int",
138    "int");
139LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check);
140LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int");
141LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int");
142LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, return, "int");
143LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, entry, "struct thread *",
144    "struct linux_sys_futex_args *");
145LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_clockswitch);
146LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, itimerfix_error, "int");
147LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, copyin_error, "int");
148LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, invalid_cmp_requeue_use);
149LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wait, "uint32_t *",
150    "uint32_t", "uint32_t");
151LIN_SDT_PROBE_DEFINE4(futex, linux_sys_futex, debug_wait_value_neq,
152    "uint32_t *", "uint32_t", "int", "uint32_t");
153LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wake, "uint32_t *",
154    "uint32_t", "uint32_t");
155LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_cmp_requeue, "uint32_t *",
156    "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *");
157LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq,
158    "uint32_t", "int");
159LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_wake_op, "uint32_t *",
160    "int", "uint32_t", "uint32_t *", "uint32_t");
161LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unhandled_efault);
162LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_lock_pi);
163LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_unlock_pi);
164LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_trylock_pi);
165LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, deprecated_requeue);
166LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi);
167LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi);
168LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, unknown_operation, "int");
169LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, return, "int");
170LIN_SDT_PROBE_DEFINE2(futex, linux_set_robust_list, entry, "struct thread *",
171    "struct linux_set_robust_list_args *");
172LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error);
173LIN_SDT_PROBE_DEFINE1(futex, linux_set_robust_list, return, "int");
174LIN_SDT_PROBE_DEFINE2(futex, linux_get_robust_list, entry, "struct thread *",
175    "struct linux_get_robust_list_args *");
176LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int");
177LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, return, "int");
178LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry, "struct proc *",
179    "uint32_t *", "int");
180LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int");
181LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, return, "int");
182LIN_SDT_PROBE_DEFINE3(futex, fetch_robust_entry, entry,
183    "struct linux_robust_list **", "struct linux_robust_list **", "int *");
184LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int");
185LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, return, "int");
186LIN_SDT_PROBE_DEFINE1(futex, release_futexes, entry, "struct proc *");
187LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int");
188LIN_SDT_PROBE_DEFINE0(futex, release_futexes, return);
189
190static MALLOC_DEFINE(M_FUTEX, "futex", "Linux futexes");
191static MALLOC_DEFINE(M_FUTEX_WP, "futex wp", "Linux futexes wp");
192
193struct futex;
194
195struct waiting_proc {
196	uint32_t	wp_flags;
197	struct futex	*wp_futex;
198	TAILQ_ENTRY(waiting_proc) wp_list;
199};
200
201struct futex {
202	struct sx	f_lck;
203	uint32_t	*f_uaddr;	/* user-supplied value, for debug */
204	struct umtx_key	f_key;
205	uint32_t	f_refcount;
206	uint32_t	f_bitset;
207	LIST_ENTRY(futex) f_list;
208	TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc;
209};
210
211struct futex_list futex_list;
212
213#define FUTEX_LOCK(f)		sx_xlock(&(f)->f_lck)
214#define FUTEX_UNLOCK(f)		sx_xunlock(&(f)->f_lck)
215#define FUTEX_INIT(f)		do { \
216				    sx_init_flags(&(f)->f_lck, "ftlk", \
217					SX_DUPOK); \
218				    LIN_SDT_PROBE1(futex, futex, create, \
219					&(f)->f_lck); \
220				} while (0)
221#define FUTEX_DESTROY(f)	do { \
222				    LIN_SDT_PROBE1(futex, futex, destroy, \
223					&(f)->f_lck); \
224				    sx_destroy(&(f)->f_lck); \
225				} while (0)
226#define FUTEX_ASSERT_LOCKED(f)	sx_assert(&(f)->f_lck, SA_XLOCKED)
227
228struct mtx futex_mtx;			/* protects the futex list */
229#define FUTEXES_LOCK		do { \
230				    mtx_lock(&futex_mtx); \
231				    LIN_SDT_PROBE1(locks, futex_mtx, \
232					locked, &futex_mtx); \
233				} while (0)
234#define FUTEXES_UNLOCK		do { \
235				    LIN_SDT_PROBE1(locks, futex_mtx, \
236					unlock, &futex_mtx); \
237				    mtx_unlock(&futex_mtx); \
238				} while (0)
239
240/* flags for futex_get() */
241#define FUTEX_CREATE_WP		0x1	/* create waiting_proc */
242#define FUTEX_DONTCREATE	0x2	/* don't create futex if not exists */
243#define FUTEX_DONTEXISTS	0x4	/* return EINVAL if futex exists */
244#define	FUTEX_SHARED		0x8	/* shared futex */
245
246/* wp_flags */
247#define FUTEX_WP_REQUEUED	0x1	/* wp requeued - wp moved from wp_list
248					 * of futex where thread sleep to wp_list
249					 * of another futex.
250					 */
251#define FUTEX_WP_REMOVED	0x2	/* wp is woken up and removed from futex
252					 * wp_list to prevent double wakeup.
253					 */
254
255/* support.s */
256int futex_xchgl(int oparg, uint32_t *uaddr, int *oldval);
257int futex_addl(int oparg, uint32_t *uaddr, int *oldval);
258int futex_orl(int oparg, uint32_t *uaddr, int *oldval);
259int futex_andl(int oparg, uint32_t *uaddr, int *oldval);
260int futex_xorl(int oparg, uint32_t *uaddr, int *oldval);
261
262static void
263futex_put(struct futex *f, struct waiting_proc *wp)
264{
265	LIN_SDT_PROBE2(futex, futex_put, entry, f, wp);
266
267	FUTEX_ASSERT_LOCKED(f);
268	if (wp != NULL) {
269		if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0)
270			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
271		free(wp, M_FUTEX_WP);
272	}
273
274	FUTEXES_LOCK;
275	if (--f->f_refcount == 0) {
276		LIST_REMOVE(f, f_list);
277		FUTEXES_UNLOCK;
278		FUTEX_UNLOCK(f);
279
280		LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr,
281		    f->f_refcount, f->f_key.shared);
282		LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d "
283		    "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared);
284		umtx_key_release(&f->f_key);
285		FUTEX_DESTROY(f);
286		free(f, M_FUTEX);
287
288		LIN_SDT_PROBE0(futex, futex_put, return);
289		return;
290	}
291
292	LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount,
293	    f->f_key.shared);
294	LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d",
295	    f->f_uaddr, f->f_refcount, f->f_key.shared);
296	FUTEXES_UNLOCK;
297	FUTEX_UNLOCK(f);
298
299	LIN_SDT_PROBE0(futex, futex_put, return);
300}
301
302static int
303futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags)
304{
305	struct futex *f, *tmpf;
306	struct umtx_key key;
307	int error;
308
309	LIN_SDT_PROBE3(futex, futex_get0, entry, uaddr, newf, flags);
310
311	*newf = tmpf = NULL;
312
313	error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ?
314	    AUTO_SHARE : THREAD_SHARE, &key);
315	if (error) {
316		LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error);
317		LIN_SDT_PROBE1(futex, futex_get0, return, error);
318		return (error);
319	}
320retry:
321	FUTEXES_LOCK;
322	LIST_FOREACH(f, &futex_list, f_list) {
323		if (umtx_key_match(&f->f_key, &key)) {
324			if (tmpf != NULL) {
325				FUTEX_UNLOCK(tmpf);
326				FUTEX_DESTROY(tmpf);
327				free(tmpf, M_FUTEX);
328			}
329			if (flags & FUTEX_DONTEXISTS) {
330				FUTEXES_UNLOCK;
331				umtx_key_release(&key);
332
333				LIN_SDT_PROBE1(futex, futex_get0, return,
334				    EINVAL);
335				return (EINVAL);
336			}
337
338			/*
339			 * Increment refcount of the found futex to
340			 * prevent it from deallocation before FUTEX_LOCK()
341			 */
342			++f->f_refcount;
343			FUTEXES_UNLOCK;
344			umtx_key_release(&key);
345
346			FUTEX_LOCK(f);
347			*newf = f;
348			LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr,
349			    f->f_refcount, f->f_key.shared);
350			LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d",
351			    uaddr, f->f_refcount, f->f_key.shared);
352
353			LIN_SDT_PROBE1(futex, futex_get0, return, 0);
354			return (0);
355		}
356	}
357
358	if (flags & FUTEX_DONTCREATE) {
359		FUTEXES_UNLOCK;
360		umtx_key_release(&key);
361		LIN_SDT_PROBE1(futex, futex_get0, null, uaddr);
362		LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr);
363
364		LIN_SDT_PROBE1(futex, futex_get0, return, 0);
365		return (0);
366	}
367
368	if (tmpf == NULL) {
369		FUTEXES_UNLOCK;
370		tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO);
371		tmpf->f_uaddr = uaddr;
372		tmpf->f_key = key;
373		tmpf->f_refcount = 1;
374		tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY;
375		FUTEX_INIT(tmpf);
376		TAILQ_INIT(&tmpf->f_waiting_proc);
377
378		/*
379		 * Lock the new futex before an insert into the futex_list
380		 * to prevent futex usage by other.
381		 */
382		FUTEX_LOCK(tmpf);
383		goto retry;
384	}
385
386	LIST_INSERT_HEAD(&futex_list, tmpf, f_list);
387	FUTEXES_UNLOCK;
388
389	LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount,
390	    tmpf->f_key.shared);
391	LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new",
392	    uaddr, tmpf->f_refcount, tmpf->f_key.shared);
393	*newf = tmpf;
394
395	LIN_SDT_PROBE1(futex, futex_get0, return, 0);
396	return (0);
397}
398
399static int
400futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f,
401    uint32_t flags)
402{
403	int error;
404
405	LIN_SDT_PROBE3(futex, futex_get, entry, uaddr, wp, f);
406
407	if (flags & FUTEX_CREATE_WP) {
408		*wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK);
409		(*wp)->wp_flags = 0;
410	}
411	error = futex_get0(uaddr, f, flags);
412	if (error) {
413		LIN_SDT_PROBE0(futex, futex_get, error);
414
415		if (flags & FUTEX_CREATE_WP)
416			free(*wp, M_FUTEX_WP);
417
418		LIN_SDT_PROBE1(futex, futex_get, return, error);
419		return (error);
420	}
421	if (flags & FUTEX_CREATE_WP) {
422		TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list);
423		(*wp)->wp_futex = *f;
424	}
425
426	LIN_SDT_PROBE1(futex, futex_get, return, error);
427	return (error);
428}
429
430static int
431futex_sleep(struct futex *f, struct waiting_proc *wp, int timeout)
432{
433	int error;
434
435	FUTEX_ASSERT_LOCKED(f);
436	LIN_SDT_PROBE3(futex, futex_sleep, entry, f, wp, timeout);
437	LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %d ref %d",
438	    f->f_uaddr, wp, timeout, f->f_refcount);
439	error = sx_sleep(wp, &f->f_lck, PCATCH, "futex", timeout);
440	if (wp->wp_flags & FUTEX_WP_REQUEUED) {
441		KASSERT(f != wp->wp_futex, ("futex != wp_futex"));
442
443		if (error) {
444			LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error,
445			    f->f_uaddr, wp, wp->wp_futex->f_uaddr,
446			    wp->wp_futex->f_refcount);
447		}
448
449		LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp"
450		    " %p requeued uaddr %p ref %d",
451		    error, f->f_uaddr, wp, wp->wp_futex->f_uaddr,
452		    wp->wp_futex->f_refcount);
453		futex_put(f, NULL);
454		f = wp->wp_futex;
455		FUTEX_LOCK(f);
456	} else {
457		if (error) {
458			LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error,
459			    f->f_uaddr, wp);
460		}
461		LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p",
462		    error, f->f_uaddr, wp);
463	}
464
465	futex_put(f, wp);
466
467	LIN_SDT_PROBE1(futex, futex_sleep, return, error);
468	return (error);
469}
470
471static int
472futex_wake(struct futex *f, int n, uint32_t bitset)
473{
474	struct waiting_proc *wp, *wpt;
475	int count = 0;
476
477	LIN_SDT_PROBE3(futex, futex_wake, entry, f, n, bitset);
478
479	if (bitset == 0) {
480		LIN_SDT_PROBE1(futex, futex_wake, return, EINVAL);
481		return (EINVAL);
482	}
483
484	FUTEX_ASSERT_LOCKED(f);
485	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
486		LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, wp,
487		    f->f_refcount);
488		LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d",
489		    f->f_uaddr, wp, f->f_refcount);
490		/*
491		 * Unless we find a matching bit in
492		 * the bitset, continue searching.
493		 */
494		if (!(wp->wp_futex->f_bitset & bitset))
495			continue;
496
497		wp->wp_flags |= FUTEX_WP_REMOVED;
498		TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
499		LIN_SDT_PROBE1(futex, futex_wake, wakeup, wp);
500		wakeup_one(wp);
501		if (++count == n)
502			break;
503	}
504
505	LIN_SDT_PROBE1(futex, futex_wake, return, count);
506	return (count);
507}
508
509static int
510futex_requeue(struct futex *f, int n, struct futex *f2, int n2)
511{
512	struct waiting_proc *wp, *wpt;
513	int count = 0;
514
515	LIN_SDT_PROBE4(futex, futex_requeue, entry, f, n, f2, n2);
516
517	FUTEX_ASSERT_LOCKED(f);
518	FUTEX_ASSERT_LOCKED(f2);
519
520	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
521		if (++count <= n) {
522			LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p",
523			    f->f_uaddr, wp);
524			wp->wp_flags |= FUTEX_WP_REMOVED;
525			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
526			LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp);
527			wakeup_one(wp);
528		} else {
529			LIN_SDT_PROBE3(futex, futex_requeue, requeue,
530			    f->f_uaddr, wp, f2->f_uaddr);
531			LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p",
532			    f->f_uaddr, wp, f2->f_uaddr);
533			wp->wp_flags |= FUTEX_WP_REQUEUED;
534			/* Move wp to wp_list of f2 futex */
535			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
536			TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list);
537
538			/*
539			 * Thread which sleeps on wp after waking should
540			 * acquire f2 lock, so increment refcount of f2 to
541			 * prevent it from premature deallocation.
542			 */
543			wp->wp_futex = f2;
544			FUTEXES_LOCK;
545			++f2->f_refcount;
546			FUTEXES_UNLOCK;
547			if (count - n >= n2)
548				break;
549		}
550	}
551
552	LIN_SDT_PROBE1(futex, futex_requeue, return, count);
553	return (count);
554}
555
556static int
557futex_wait(struct futex *f, struct waiting_proc *wp, int timeout_hz,
558    uint32_t bitset)
559{
560	int error;
561
562	LIN_SDT_PROBE4(futex, futex_wait, entry, f, wp, timeout_hz, bitset);
563
564	if (bitset == 0) {
565		LIN_SDT_PROBE1(futex, futex_wait, return, EINVAL);
566		return (EINVAL);
567	}
568
569	f->f_bitset = bitset;
570	error = futex_sleep(f, wp, timeout_hz);
571	if (error)
572		LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error);
573	if (error == EWOULDBLOCK)
574		error = ETIMEDOUT;
575
576	LIN_SDT_PROBE1(futex, futex_wait, return, error);
577	return (error);
578}
579
580static int
581futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
582{
583	int op = (encoded_op >> 28) & 7;
584	int cmp = (encoded_op >> 24) & 15;
585	int oparg = (encoded_op << 8) >> 20;
586	int cmparg = (encoded_op << 20) >> 20;
587	int oldval = 0, ret;
588
589	LIN_SDT_PROBE3(futex, futex_atomic_op, entry, td, encoded_op, uaddr);
590
591	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
592		oparg = 1 << oparg;
593
594	LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg,
595	    cmparg);
596
597	/* XXX: Linux verifies access here and returns EFAULT */
598	LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check);
599
600	switch (op) {
601	case FUTEX_OP_SET:
602		ret = futex_xchgl(oparg, uaddr, &oldval);
603		break;
604	case FUTEX_OP_ADD:
605		ret = futex_addl(oparg, uaddr, &oldval);
606		break;
607	case FUTEX_OP_OR:
608		ret = futex_orl(oparg, uaddr, &oldval);
609		break;
610	case FUTEX_OP_ANDN:
611		ret = futex_andl(~oparg, uaddr, &oldval);
612		break;
613	case FUTEX_OP_XOR:
614		ret = futex_xorl(oparg, uaddr, &oldval);
615		break;
616	default:
617		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op);
618		ret = -ENOSYS;
619		break;
620	}
621
622	if (ret) {
623		LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
624		return (ret);
625	}
626
627	switch (cmp) {
628	case FUTEX_OP_CMP_EQ:
629		ret = (oldval == cmparg);
630		break;
631	case FUTEX_OP_CMP_NE:
632		ret = (oldval != cmparg);
633		break;
634	case FUTEX_OP_CMP_LT:
635		ret = (oldval < cmparg);
636		break;
637	case FUTEX_OP_CMP_GE:
638		ret = (oldval >= cmparg);
639		break;
640	case FUTEX_OP_CMP_LE:
641		ret = (oldval <= cmparg);
642		break;
643	case FUTEX_OP_CMP_GT:
644		ret = (oldval > cmparg);
645		break;
646	default:
647		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp);
648		ret = -ENOSYS;
649	}
650
651	LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
652	return (ret);
653}
654
655int
656linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
657{
658	int clockrt, nrwake, op_ret, ret;
659	struct linux_emuldata *em;
660	struct waiting_proc *wp;
661	struct futex *f, *f2;
662	struct l_timespec timeout;
663	struct timeval utv, ctv;
664	int timeout_hz;
665	int error;
666	uint32_t flags, val;
667
668	LIN_SDT_PROBE2(futex, linux_sys_futex, entry, td, args);
669
670	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
671		flags = 0;
672		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
673	} else
674		flags = FUTEX_SHARED;
675
676	/*
677	 * Currently support for switching between CLOCK_MONOTONIC and
678	 * CLOCK_REALTIME is not present. However Linux forbids the use of
679	 * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and
680	 * FUTEX_WAIT_REQUEUE_PI.
681	 */
682	clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
683	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
684	if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET &&
685		args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) {
686		LIN_SDT_PROBE0(futex, linux_sys_futex,
687		    unimplemented_clockswitch);
688		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
689		return (ENOSYS);
690	}
691
692	error = 0;
693	f = f2 = NULL;
694
695	switch (args->op) {
696	case LINUX_FUTEX_WAIT:
697		args->val3 = FUTEX_BITSET_MATCH_ANY;
698		/* FALLTHROUGH */
699
700	case LINUX_FUTEX_WAIT_BITSET:
701		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wait, args->uaddr,
702		    args->val, args->val3);
703		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
704		    args->uaddr, args->val, args->val3);
705
706		error = futex_get(args->uaddr, &wp, &f,
707		    flags | FUTEX_CREATE_WP);
708		if (error) {
709			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
710			return (error);
711		}
712
713		error = copyin(args->uaddr, &val, sizeof(val));
714		if (error) {
715			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
716			    error);
717			LINUX_CTR1(sys_futex, "WAIT copyin failed %d",
718			    error);
719			futex_put(f, wp);
720
721			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
722			return (error);
723		}
724		if (val != args->val) {
725			LIN_SDT_PROBE4(futex, linux_sys_futex,
726			    debug_wait_value_neq, args->uaddr, args->val, val,
727			    args->val3);
728			LINUX_CTR3(sys_futex,
729			    "WAIT uaddr %p val 0x%x != uval 0x%x",
730			    args->uaddr, args->val, val);
731			futex_put(f, wp);
732
733			LIN_SDT_PROBE1(futex, linux_sys_futex, return,
734			    EWOULDBLOCK);
735			return (EWOULDBLOCK);
736		}
737
738		if (args->timeout != NULL) {
739			error = copyin(args->timeout, &timeout, sizeof(timeout));
740			if (error) {
741				LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
742				    error);
743				LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
744				futex_put(f, wp);
745				return (error);
746			}
747			TIMESPEC_TO_TIMEVAL(&utv, &timeout);
748			error = itimerfix(&utv);
749			if (error) {
750				LIN_SDT_PROBE1(futex, linux_sys_futex, itimerfix_error,
751				    error);
752				LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
753				futex_put(f, wp);
754				return (error);
755			}
756			if (clockrt) {
757				microtime(&ctv);
758				timevalsub(&utv, &ctv);
759			} else if (args->op == LINUX_FUTEX_WAIT_BITSET) {
760				microuptime(&ctv);
761				timevalsub(&utv, &ctv);
762			}
763			if (utv.tv_sec < 0)
764				timevalclear(&utv);
765			timeout_hz = tvtohz(&utv);
766		} else
767			timeout_hz = 0;
768
769		error = futex_wait(f, wp, timeout_hz, args->val3);
770		break;
771
772	case LINUX_FUTEX_WAKE:
773		args->val3 = FUTEX_BITSET_MATCH_ANY;
774		/* FALLTHROUGH */
775
776	case LINUX_FUTEX_WAKE_BITSET:
777		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wake, args->uaddr,
778		    args->val, args->val3);
779		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
780		    args->uaddr, args->val, args->val3);
781
782		error = futex_get(args->uaddr, NULL, &f,
783		    flags | FUTEX_DONTCREATE);
784		if (error) {
785			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
786			return (error);
787		}
788
789		if (f == NULL) {
790			td->td_retval[0] = 0;
791
792			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
793			return (error);
794		}
795		td->td_retval[0] = futex_wake(f, args->val, args->val3);
796		futex_put(f, NULL);
797		break;
798
799	case LINUX_FUTEX_CMP_REQUEUE:
800		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_cmp_requeue,
801		    args->uaddr, args->val, args->val3, args->uaddr2,
802		    args->timeout);
803		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
804		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
805		    args->uaddr, args->val, args->val3, args->uaddr2,
806		    args->timeout);
807
808		/*
809		 * Linux allows this, we would not, it is an incorrect
810		 * usage of declared ABI, so return EINVAL.
811		 */
812		if (args->uaddr == args->uaddr2) {
813			LIN_SDT_PROBE0(futex, linux_sys_futex,
814			    invalid_cmp_requeue_use);
815			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
816			return (EINVAL);
817		}
818
819		error = futex_get(args->uaddr, NULL, &f, flags);
820		if (error) {
821			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
822			return (error);
823		}
824
825		/*
826		 * To avoid deadlocks return EINVAL if second futex
827		 * exists at this time.
828		 *
829		 * Glibc fall back to FUTEX_WAKE in case of any error
830		 * returned by FUTEX_CMP_REQUEUE.
831		 */
832		error = futex_get(args->uaddr2, NULL, &f2,
833		    flags | FUTEX_DONTEXISTS);
834		if (error) {
835			futex_put(f, NULL);
836
837			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
838			return (error);
839		}
840		error = copyin(args->uaddr, &val, sizeof(val));
841		if (error) {
842			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
843			    error);
844			LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d",
845			    error);
846			futex_put(f2, NULL);
847			futex_put(f, NULL);
848
849			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
850			return (error);
851		}
852		if (val != args->val3) {
853			LIN_SDT_PROBE2(futex, linux_sys_futex,
854			    debug_cmp_requeue_value_neq, args->val, val);
855			LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x",
856			    args->val, val);
857			futex_put(f2, NULL);
858			futex_put(f, NULL);
859
860			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EAGAIN);
861			return (EAGAIN);
862		}
863
864		nrwake = (int)(unsigned long)args->timeout;
865		td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake);
866		futex_put(f2, NULL);
867		futex_put(f, NULL);
868		break;
869
870	case LINUX_FUTEX_WAKE_OP:
871		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_wake_op,
872		    args->uaddr, args->op, args->val, args->uaddr2, args->val3);
873		LINUX_CTR5(sys_futex, "WAKE_OP "
874		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
875		    args->uaddr, args->val, args->uaddr2, args->val3,
876		    args->timeout);
877
878		error = futex_get(args->uaddr, NULL, &f, flags);
879		if (error) {
880			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
881			return (error);
882		}
883
884		if (args->uaddr != args->uaddr2)
885			error = futex_get(args->uaddr2, NULL, &f2, flags);
886		if (error) {
887			futex_put(f, NULL);
888
889			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
890			return (error);
891		}
892
893		/*
894		 * This function returns positive number as results and
895		 * negative as errors
896		 */
897		op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
898
899		LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x",
900		    args->uaddr, op_ret);
901
902		if (op_ret < 0) {
903			/* XXX: We don't handle the EFAULT yet. */
904			if (op_ret != -EFAULT) {
905				if (f2 != NULL)
906					futex_put(f2, NULL);
907				futex_put(f, NULL);
908
909				LIN_SDT_PROBE1(futex, linux_sys_futex, return,
910				    -op_ret);
911				return (-op_ret);
912			} else {
913				LIN_SDT_PROBE0(futex, linux_sys_futex,
914				    unhandled_efault);
915			}
916			if (f2 != NULL)
917				futex_put(f2, NULL);
918			futex_put(f, NULL);
919
920			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EFAULT);
921			return (EFAULT);
922		}
923
924		ret = futex_wake(f, args->val, args->val3);
925
926		if (op_ret > 0) {
927			op_ret = 0;
928			nrwake = (int)(unsigned long)args->timeout;
929
930			if (f2 != NULL)
931				op_ret += futex_wake(f2, nrwake, args->val3);
932			else
933				op_ret += futex_wake(f, nrwake, args->val3);
934			ret += op_ret;
935
936		}
937		if (f2 != NULL)
938			futex_put(f2, NULL);
939		futex_put(f, NULL);
940		td->td_retval[0] = ret;
941		break;
942
943	case LINUX_FUTEX_LOCK_PI:
944		/* not yet implemented */
945		linux_msg(td,
946			  "linux_sys_futex: "
947			  "op LINUX_FUTEX_LOCK_PI not implemented\n");
948		LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_lock_pi);
949		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
950		return (ENOSYS);
951
952	case LINUX_FUTEX_UNLOCK_PI:
953		/* not yet implemented */
954		linux_msg(td,
955			  "linux_sys_futex: "
956			  "op LINUX_FUTEX_UNLOCK_PI not implemented\n");
957		LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_unlock_pi);
958		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
959		return (ENOSYS);
960
961	case LINUX_FUTEX_TRYLOCK_PI:
962		/* not yet implemented */
963		linux_msg(td,
964			  "linux_sys_futex: "
965			  "op LINUX_FUTEX_TRYLOCK_PI not implemented\n");
966		LIN_SDT_PROBE0(futex, linux_sys_futex,
967		    unimplemented_trylock_pi);
968		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
969		return (ENOSYS);
970
971	case LINUX_FUTEX_REQUEUE:
972
973		/*
974		 * Glibc does not use this operation since version 2.3.3,
975		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
976		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
977		 * FUTEX_REQUEUE returned EINVAL.
978		 */
979		em = em_find(td->td_proc, EMUL_DONTLOCK);
980		if ((em->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
981			linux_msg(td,
982				  "linux_sys_futex: "
983				  "unsupported futex_requeue op\n");
984			em->flags |= LINUX_XDEPR_REQUEUEOP;
985			LIN_SDT_PROBE0(futex, linux_sys_futex,
986			    deprecated_requeue);
987		}
988
989		LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
990		return (EINVAL);
991
992	case LINUX_FUTEX_WAIT_REQUEUE_PI:
993		/* not yet implemented */
994		linux_msg(td,
995			  "linux_sys_futex: "
996			  "op FUTEX_WAIT_REQUEUE_PI not implemented\n");
997		LIN_SDT_PROBE0(futex, linux_sys_futex,
998		    unimplemented_wait_requeue_pi);
999		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
1000		return (ENOSYS);
1001
1002	case LINUX_FUTEX_CMP_REQUEUE_PI:
1003		/* not yet implemented */
1004		linux_msg(td,
1005			    "linux_sys_futex: "
1006			    "op LINUX_FUTEX_CMP_REQUEUE_PI not implemented\n");
1007		LIN_SDT_PROBE0(futex, linux_sys_futex,
1008		    unimplemented_cmp_requeue_pi);
1009		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
1010		return (ENOSYS);
1011
1012	default:
1013		linux_msg(td,
1014			  "linux_sys_futex: unknown op %d\n", args->op);
1015		LIN_SDT_PROBE1(futex, linux_sys_futex, unknown_operation,
1016		    args->op);
1017		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
1018		return (ENOSYS);
1019	}
1020
1021	LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
1022	return (error);
1023}
1024
1025int
1026linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
1027{
1028	struct linux_emuldata *em;
1029
1030	LIN_SDT_PROBE2(futex, linux_set_robust_list, entry, td, args);
1031
1032	if (args->len != sizeof(struct linux_robust_list_head)) {
1033		LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error);
1034		LIN_SDT_PROBE1(futex, linux_set_robust_list, return, EINVAL);
1035		return (EINVAL);
1036	}
1037
1038	em = em_find(td->td_proc, EMUL_DOLOCK);
1039	em->robust_futexes = args->head;
1040	EMUL_UNLOCK(&emul_lock);
1041
1042	LIN_SDT_PROBE1(futex, linux_set_robust_list, return, 0);
1043	return (0);
1044}
1045
1046int
1047linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
1048{
1049	struct linux_emuldata *em;
1050	struct linux_robust_list_head *head;
1051	l_size_t len = sizeof(struct linux_robust_list_head);
1052	int error = 0;
1053
1054	LIN_SDT_PROBE2(futex, linux_get_robust_list, entry, td, args);
1055
1056	if (!args->pid) {
1057		em = em_find(td->td_proc, EMUL_DONTLOCK);
1058		head = em->robust_futexes;
1059	} else {
1060		struct proc *p;
1061
1062		p = pfind(args->pid);
1063		if (p == NULL) {
1064			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
1065			    ESRCH);
1066			return (ESRCH);
1067		}
1068
1069		em = em_find(p, EMUL_DONTLOCK);
1070		/* XXX: ptrace? */
1071		if (priv_check(td, PRIV_CRED_SETUID) ||
1072		    priv_check(td, PRIV_CRED_SETEUID) ||
1073		    p_candebug(td, p)) {
1074			PROC_UNLOCK(p);
1075
1076			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
1077			    EPERM);
1078			return (EPERM);
1079		}
1080		head = em->robust_futexes;
1081
1082		PROC_UNLOCK(p);
1083	}
1084
1085	error = copyout(&len, args->len, sizeof(l_size_t));
1086	if (error) {
1087		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
1088		    error);
1089		LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EFAULT);
1090		return (EFAULT);
1091	}
1092
1093	error = copyout(&head, args->head, sizeof(head));
1094	if (error) {
1095		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
1096		    error);
1097	}
1098
1099	LIN_SDT_PROBE1(futex, linux_get_robust_list, return, error);
1100	return (error);
1101}
1102
1103static int
1104handle_futex_death(struct proc *p, uint32_t *uaddr, int pi)
1105{
1106	uint32_t uval, nval, mval;
1107	struct futex *f;
1108	int error;
1109
1110	LIN_SDT_PROBE3(futex, handle_futex_death, entry, p, uaddr, pi);
1111
1112retry:
1113	error = copyin(uaddr, &uval, 4);
1114	if (error) {
1115		LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error);
1116		LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT);
1117		return (EFAULT);
1118	}
1119	if ((uval & FUTEX_TID_MASK) == p->p_pid) {
1120		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
1121		nval = casuword32(uaddr, uval, mval);
1122
1123		if (nval == -1) {
1124			LIN_SDT_PROBE1(futex, handle_futex_death, return,
1125			    EFAULT);
1126			return (EFAULT);
1127		}
1128
1129		if (nval != uval)
1130			goto retry;
1131
1132		if (!pi && (uval & FUTEX_WAITERS)) {
1133			error = futex_get(uaddr, NULL, &f,
1134			    FUTEX_DONTCREATE | FUTEX_SHARED);
1135			if (error) {
1136				LIN_SDT_PROBE1(futex, handle_futex_death,
1137				    return, error);
1138				return (error);
1139			}
1140			if (f != NULL) {
1141				futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY);
1142				futex_put(f, NULL);
1143			}
1144		}
1145	}
1146
1147	LIN_SDT_PROBE1(futex, handle_futex_death, return, 0);
1148	return (0);
1149}
1150
1151static int
1152fetch_robust_entry(struct linux_robust_list **entry,
1153    struct linux_robust_list **head, int *pi)
1154{
1155	l_ulong uentry;
1156	int error;
1157
1158	LIN_SDT_PROBE3(futex, fetch_robust_entry, entry, entry, head, pi);
1159
1160	error = copyin((const void *)head, &uentry, sizeof(l_ulong));
1161	if (error) {
1162		LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error);
1163		LIN_SDT_PROBE1(futex, fetch_robust_entry, return, EFAULT);
1164		return (EFAULT);
1165	}
1166
1167	*entry = (void *)(uentry & ~1UL);
1168	*pi = uentry & 1;
1169
1170	LIN_SDT_PROBE1(futex, fetch_robust_entry, return, 0);
1171	return (0);
1172}
1173
1174/* This walks the list of robust futexes releasing them. */
1175void
1176release_futexes(struct proc *p)
1177{
1178	struct linux_robust_list_head *head = NULL;
1179	struct linux_robust_list *entry, *next_entry, *pending;
1180	unsigned int limit = 2048, pi, next_pi, pip;
1181	struct linux_emuldata *em;
1182	l_long futex_offset;
1183	int rc, error;
1184
1185	LIN_SDT_PROBE1(futex, release_futexes, entry, p);
1186
1187	em = em_find(p, EMUL_DONTLOCK);
1188	head = em->robust_futexes;
1189
1190	if (head == NULL) {
1191		LIN_SDT_PROBE0(futex, release_futexes, return);
1192		return;
1193	}
1194
1195	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) {
1196		LIN_SDT_PROBE0(futex, release_futexes, return);
1197		return;
1198	}
1199
1200	error = copyin(&head->futex_offset, &futex_offset,
1201	    sizeof(futex_offset));
1202	if (error) {
1203		LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error);
1204		LIN_SDT_PROBE0(futex, release_futexes, return);
1205		return;
1206	}
1207
1208	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) {
1209		LIN_SDT_PROBE0(futex, release_futexes, return);
1210		return;
1211	}
1212
1213	while (entry != &head->list) {
1214		rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi);
1215
1216		if (entry != pending)
1217			if (handle_futex_death(p,
1218			    (uint32_t *)((caddr_t)entry + futex_offset), pi)) {
1219				LIN_SDT_PROBE0(futex, release_futexes, return);
1220				return;
1221			}
1222		if (rc) {
1223			LIN_SDT_PROBE0(futex, release_futexes, return);
1224			return;
1225		}
1226
1227		entry = next_entry;
1228		pi = next_pi;
1229
1230		if (!--limit)
1231			break;
1232
1233		sched_relinquish(curthread);
1234	}
1235
1236	if (pending)
1237		handle_futex_death(p, (uint32_t *)((caddr_t)pending + futex_offset), pip);
1238
1239	LIN_SDT_PROBE0(futex, release_futexes, return);
1240}
1241