linux_futex.c revision 218117
1/*	$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */
2
3/*-
4 * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. All advertising materials mentioning features or use of this software
15 *    must display the following acknowledgement:
16 *	This product includes software developed by Emmanuel Dreyfus
17 * 4. The name of the author may not be used to endorse or promote
18 *    products derived from this software without specific prior written
19 *    permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: head/sys/compat/linux/linux_futex.c 218117 2011-01-31 05:59:05Z dchagin $");
36#if 0
37__KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
38#endif
39
40#include "opt_compat.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/imgact.h>
45#include <sys/kernel.h>
46#include <sys/ktr.h>
47#include <sys/lock.h>
48#include <sys/malloc.h>
49#include <sys/mutex.h>
50#include <sys/priv.h>
51#include <sys/proc.h>
52#include <sys/queue.h>
53#include <sys/sched.h>
54#include <sys/sx.h>
55
56#ifdef COMPAT_LINUX32
57#include <machine/../linux32/linux.h>
58#include <machine/../linux32/linux32_proto.h>
59#else
60#include <machine/../linux/linux.h>
61#include <machine/../linux/linux_proto.h>
62#endif
63#include <compat/linux/linux_futex.h>
64#include <compat/linux/linux_emul.h>
65#include <compat/linux/linux_util.h>
66
/*
 * malloc(9) types for the allocations made in this file: M_FUTEX is used
 * for struct futex, M_FUTEX_WP for struct waiting_proc.
 */
MALLOC_DEFINE(M_FUTEX, "futex", "Linux futexes");
MALLOC_DEFINE(M_FUTEX_WP, "futex wp", "Linux futexes wp");
69
70struct futex;
71
/*
 * One record per thread blocked on a futex.  The address of the record is
 * also the wait channel handed to sx_sleep() and wakeup_one().
 */
struct waiting_proc {
	uint32_t	wp_flags;	/* FUTEX_WP_* bits */
	struct futex	*wp_futex;	/* futex this record is queued on */
	TAILQ_ENTRY(waiting_proc) wp_list;	/* link in f_waiting_proc */
};
77
/*
 * Kernel-side state for one futex, looked up in futex_list by user address.
 */
struct futex {
	struct sx	f_lck;		/* see FUTEX_LOCK()/FUTEX_UNLOCK() */
	uint32_t	*f_uaddr;	/* user address; the lookup key */
	uint32_t	f_refcount;	/* inc/dec are done under futex_mtx */
	uint32_t	f_bitset;	/* bitset stored by futex_wait() */
	LIST_ENTRY(futex) f_list;	/* link in futex_list */
	TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; /* waiters */
};
86
/* List of all futexes currently known; walked by futex_get0(). */
struct futex_list futex_list;

/* Per-futex sx(9) lock, always taken in exclusive mode. */
#define FUTEX_LOCK(f)		sx_xlock(&(f)->f_lck)
#define FUTEX_UNLOCK(f)		sx_xunlock(&(f)->f_lck)
#define FUTEX_INIT(f)		sx_init_flags(&(f)->f_lck, "ftlk", 0)
#define FUTEX_DESTROY(f)	sx_destroy(&(f)->f_lck)
#define FUTEX_ASSERT_LOCKED(f)	sx_assert(&(f)->f_lck, SA_XLOCKED)

struct mtx futex_mtx;			/* protects the futex list */
#define FUTEXES_LOCK		mtx_lock(&futex_mtx)
#define FUTEXES_UNLOCK		mtx_unlock(&futex_mtx)

/* flags for futex_get() */
#define FUTEX_CREATE_WP		0x1	/* create waiting_proc */
#define FUTEX_DONTCREATE	0x2	/* don't create futex if not exists */
#define FUTEX_DONTEXISTS	0x4	/* return EINVAL if futex exists */

/* wp_flags */
#define FUTEX_WP_REQUEUED	0x1	/* wp requeued - wp moved from wp_list
					 * of futex where thread sleep to wp_list
					 * of another futex.
					 */
#define FUTEX_WP_REMOVED	0x2	/* wp is woken up and removed from futex
					 * wp_list to prevent double wakeup.
					 */
112
/*
 * support.s
 *
 * Helpers used by futex_atomic_op(), which calls them as
 * op(oparg, uaddr, &oldval), treats a non-zero return as failure and
 * otherwise compares *oldval against the encoded comparison argument.
 * NOTE(review): presumably each one atomically applies the named operation
 * to the user word at uaddr and stores the previous value in *oldval --
 * confirm against the assembly in support.s.
 */
int futex_xchgl(int oparg, uint32_t *uaddr, int *oldval);
int futex_addl(int oparg, uint32_t *uaddr, int *oldval);
int futex_orl(int oparg, uint32_t *uaddr, int *oldval);
int futex_andl(int oparg, uint32_t *uaddr, int *oldval);
int futex_xorl(int oparg, uint32_t *uaddr, int *oldval);
119
120static void
121futex_put(struct futex *f, struct waiting_proc *wp)
122{
123
124	FUTEX_ASSERT_LOCKED(f);
125	if (wp != NULL) {
126		if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0)
127			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
128		free(wp, M_FUTEX_WP);
129	}
130
131	FUTEXES_LOCK;
132	if (--f->f_refcount == 0) {
133		LIST_REMOVE(f, f_list);
134		FUTEXES_UNLOCK;
135		FUTEX_UNLOCK(f);
136
137		LINUX_CTR2(sys_futex, "futex_put destroy uaddr %p ref %d",
138		    f->f_uaddr, f->f_refcount);
139		FUTEX_DESTROY(f);
140		free(f, M_FUTEX);
141		return;
142	}
143
144	LINUX_CTR2(sys_futex, "futex_put uaddr %p ref %d",
145	    f->f_uaddr, f->f_refcount);
146	FUTEXES_UNLOCK;
147	FUTEX_UNLOCK(f);
148}
149
150static int
151futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags)
152{
153	struct futex *f, *tmpf;
154
155	*newf = tmpf = NULL;
156
157retry:
158	FUTEXES_LOCK;
159	LIST_FOREACH(f, &futex_list, f_list) {
160		if (f->f_uaddr == uaddr) {
161			if (tmpf != NULL) {
162				FUTEX_UNLOCK(tmpf);
163				FUTEX_DESTROY(tmpf);
164				free(tmpf, M_FUTEX);
165			}
166			if (flags & FUTEX_DONTEXISTS) {
167				FUTEXES_UNLOCK;
168				return (EINVAL);
169			}
170
171			/*
172			 * Increment refcount of the found futex to
173			 * prevent it from deallocation before FUTEX_LOCK()
174			 */
175			++f->f_refcount;
176			FUTEXES_UNLOCK;
177
178			FUTEX_LOCK(f);
179			*newf = f;
180			LINUX_CTR2(sys_futex, "futex_get uaddr %p ref %d",
181			    uaddr, f->f_refcount);
182			return (0);
183		}
184	}
185
186	if (flags & FUTEX_DONTCREATE) {
187		FUTEXES_UNLOCK;
188		LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr);
189		return (0);
190	}
191
192	if (tmpf == NULL) {
193		FUTEXES_UNLOCK;
194		tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO);
195		tmpf->f_uaddr = uaddr;
196		tmpf->f_refcount = 1;
197		FUTEX_INIT(tmpf);
198		TAILQ_INIT(&tmpf->f_waiting_proc);
199
200		/*
201		 * Lock the new futex before an insert into the futex_list
202		 * to prevent futex usage by other.
203		 */
204		FUTEX_LOCK(tmpf);
205		goto retry;
206	}
207
208	LIST_INSERT_HEAD(&futex_list, tmpf, f_list);
209	FUTEXES_UNLOCK;
210
211	LINUX_CTR2(sys_futex, "futex_get uaddr %p ref %d new",
212	    uaddr, tmpf->f_refcount);
213	*newf = tmpf;
214	return (0);
215}
216
217static int
218futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f,
219    uint32_t flags)
220{
221	int error;
222
223	if (flags & FUTEX_CREATE_WP) {
224		*wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK);
225		(*wp)->wp_flags = 0;
226	}
227	error = futex_get0(uaddr, f, flags);
228	if (error) {
229		if (flags & FUTEX_CREATE_WP)
230			free(*wp, M_FUTEX_WP);
231		return (error);
232	}
233	if (flags & FUTEX_CREATE_WP) {
234		TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list);
235		(*wp)->wp_futex = *f;
236	}
237
238	return (error);
239}
240
241static int
242futex_sleep(struct futex *f, struct waiting_proc *wp, int timeout)
243{
244	int error;
245
246	FUTEX_ASSERT_LOCKED(f);
247	LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %d ref %d",
248	    f->f_uaddr, wp, timeout, f->f_refcount);
249	error = sx_sleep(wp, &f->f_lck, PCATCH, "futex", timeout);
250	if (wp->wp_flags & FUTEX_WP_REQUEUED) {
251		KASSERT(f != wp->wp_futex, ("futex != wp_futex"));
252		LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p w"
253		    " %p requeued uaddr %p ref %d",
254		    error, f->f_uaddr, wp, wp->wp_futex->f_uaddr,
255		    wp->wp_futex->f_refcount);
256		futex_put(f, NULL);
257		f = wp->wp_futex;
258		FUTEX_LOCK(f);
259	} else
260		LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p",
261		    error, f->f_uaddr, wp);
262
263	futex_put(f, wp);
264	return (error);
265}
266
267static int
268futex_wake(struct futex *f, int n, uint32_t bitset)
269{
270	struct waiting_proc *wp, *wpt;
271	int count = 0;
272
273	if (bitset == 0)
274		return (EINVAL);
275
276	FUTEX_ASSERT_LOCKED(f);
277	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
278		LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d",
279		    f->f_uaddr, wp, f->f_refcount);
280		/*
281		 * Unless we find a matching bit in
282		 * the bitset, continue searching.
283		 */
284		if (!(wp->wp_futex->f_bitset & bitset))
285			continue;
286
287		wp->wp_flags |= FUTEX_WP_REMOVED;
288		TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
289		wakeup_one(wp);
290		if (++count == n)
291			break;
292	}
293
294	return (count);
295}
296
297static int
298futex_requeue(struct futex *f, int n, struct futex *f2, int n2)
299{
300	struct waiting_proc *wp, *wpt;
301	int count = 0;
302
303	FUTEX_ASSERT_LOCKED(f);
304	FUTEX_ASSERT_LOCKED(f2);
305
306	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
307		if (++count <= n) {
308			LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p",
309			    f->f_uaddr, wp);
310			wp->wp_flags |= FUTEX_WP_REMOVED;
311			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
312			wakeup_one(wp);
313		} else {
314			LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p",
315			    f->f_uaddr, wp, f2->f_uaddr);
316			wp->wp_flags |= FUTEX_WP_REQUEUED;
317			/* Move wp to wp_list of f2 futex */
318			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
319			TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list);
320
321			/*
322			 * Thread which sleeps on wp after waking should
323			 * acquire f2 lock, so increment refcount of f2 to
324			 * prevent it from premature deallocation.
325			 */
326			wp->wp_futex = f2;
327			FUTEXES_LOCK;
328			++f2->f_refcount;
329			FUTEXES_UNLOCK;
330			if (count - n >= n2)
331				break;
332		}
333	}
334
335	return (count);
336}
337
338static int
339futex_wait(struct futex *f, struct waiting_proc *wp, struct l_timespec *ts,
340    uint32_t bitset)
341{
342	struct l_timespec timeout;
343	struct timeval tv;
344	int timeout_hz;
345	int error;
346
347	if (bitset == 0)
348		return (EINVAL);
349	f->f_bitset = bitset;
350
351	if (ts != NULL) {
352		error = copyin(ts, &timeout, sizeof(timeout));
353		if (error)
354			return (error);
355		TIMESPEC_TO_TIMEVAL(&tv, &timeout);
356		error = itimerfix(&tv);
357		if (error)
358			return (error);
359		timeout_hz = tvtohz(&tv);
360	} else
361		timeout_hz = 0;
362
363	error = futex_sleep(f, wp, timeout_hz);
364	if (error == EWOULDBLOCK)
365		error = ETIMEDOUT;
366
367	return (error);
368}
369
370static int
371futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
372{
373	int op = (encoded_op >> 28) & 7;
374	int cmp = (encoded_op >> 24) & 15;
375	int oparg = (encoded_op << 8) >> 20;
376	int cmparg = (encoded_op << 20) >> 20;
377	int oldval = 0, ret;
378
379	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
380		oparg = 1 << oparg;
381
382#ifdef DEBUG
383	if (ldebug(sys_futex))
384		printf("futex_atomic_op: op = %d, cmp = %d, oparg = %x, "
385		       "cmparg = %x, uaddr = %p\n",
386		       op, cmp, oparg, cmparg, uaddr);
387#endif
388	/* XXX: linux verifies access here and returns EFAULT */
389
390	switch (op) {
391	case FUTEX_OP_SET:
392		ret = futex_xchgl(oparg, uaddr, &oldval);
393		break;
394	case FUTEX_OP_ADD:
395		ret = futex_addl(oparg, uaddr, &oldval);
396		break;
397	case FUTEX_OP_OR:
398		ret = futex_orl(oparg, uaddr, &oldval);
399		break;
400	case FUTEX_OP_ANDN:
401		ret = futex_andl(~oparg, uaddr, &oldval);
402		break;
403	case FUTEX_OP_XOR:
404		ret = futex_xorl(oparg, uaddr, &oldval);
405		break;
406	default:
407		ret = -ENOSYS;
408		break;
409	}
410
411	if (ret)
412		return (ret);
413
414	switch (cmp) {
415	case FUTEX_OP_CMP_EQ:
416		return (oldval == cmparg);
417	case FUTEX_OP_CMP_NE:
418		return (oldval != cmparg);
419	case FUTEX_OP_CMP_LT:
420		return (oldval < cmparg);
421	case FUTEX_OP_CMP_GE:
422		return (oldval >= cmparg);
423	case FUTEX_OP_CMP_LE:
424		return (oldval <= cmparg);
425	case FUTEX_OP_CMP_GT:
426		return (oldval > cmparg);
427	default:
428		return (-ENOSYS);
429	}
430}
431
432int
433linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
434{
435	int clockrt, nrwake, op_ret, ret, val;
436	struct linux_emuldata *em;
437	struct waiting_proc *wp;
438	struct futex *f, *f2 = NULL;
439	int error = 0;
440
441	/*
442	 * Our implementation provides only privates futexes. Most of the apps
443	 * should use private futexes but don't claim so. Therefore we treat
444	 * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works
445	 * in most cases (ie. when futexes are not shared on file descriptor
446	 * or between different processes.).
447	 */
448	args->op = args->op & ~LINUX_FUTEX_PRIVATE_FLAG;
449
450	/*
451	 * Currently support for switching between CLOCK_MONOTONIC and
452	 * CLOCK_REALTIME is not present. However Linux forbids the use of
453	 * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and
454	 * FUTEX_WAIT_REQUEUE_PI.
455	 */
456	clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
457	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
458	if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET &&
459		args->op != LINUX_FUTEX_WAIT_REQUEUE_PI)
460		return (ENOSYS);
461
462	switch (args->op) {
463	case LINUX_FUTEX_WAIT:
464		args->val3 = FUTEX_BITSET_MATCH_ANY;
465		/* FALLTHROUGH */
466
467	case LINUX_FUTEX_WAIT_BITSET:
468
469		LINUX_CTR3(sys_futex, "WAIT uaddr %p val %d val3 %d",
470		    args->uaddr, args->val, args->val3);
471#ifdef DEBUG
472		if (ldebug(sys_futex))
473			printf(ARGS(sys_futex,
474			    "futex_wait uaddr %p val %d val3 %d"),
475			    args->uaddr, args->val, args->val3);
476#endif
477		error = futex_get(args->uaddr, &wp, &f, FUTEX_CREATE_WP);
478		if (error)
479			return (error);
480		error = copyin(args->uaddr, &val, sizeof(val));
481		if (error) {
482			LINUX_CTR1(sys_futex, "WAIT copyin failed %d",
483			    error);
484			futex_put(f, wp);
485			return (error);
486		}
487		if (val != args->val) {
488			LINUX_CTR4(sys_futex,
489			    "WAIT uaddr %p val %d != uval %d val3 %d",
490			    args->uaddr, args->val, val, args->val3);
491			futex_put(f, wp);
492			return (EWOULDBLOCK);
493		}
494
495		error = futex_wait(f, wp, args->timeout, args->val3);
496		break;
497
498	case LINUX_FUTEX_WAKE:
499		args->val3 = FUTEX_BITSET_MATCH_ANY;
500		/* FALLTHROUGH */
501
502	case LINUX_FUTEX_WAKE_BITSET:
503
504		LINUX_CTR3(sys_futex, "WAKE uaddr %p val % d val3 %d",
505		    args->uaddr, args->val, args->val3);
506
507		/*
508		 * XXX: Linux is able to cope with different addresses
509		 * corresponding to the same mapped memory in the sleeping
510		 * and waker process(es).
511		 */
512#ifdef DEBUG
513		if (ldebug(sys_futex))
514			printf(ARGS(sys_futex, "futex_wake uaddr %p val %d val3 %d"),
515			    args->uaddr, args->val, args->val3);
516#endif
517		error = futex_get(args->uaddr, NULL, &f, FUTEX_DONTCREATE);
518		if (error)
519			return (error);
520		if (f == NULL) {
521			td->td_retval[0] = 0;
522			return (error);
523		}
524		td->td_retval[0] = futex_wake(f, args->val, args->val3);
525		futex_put(f, NULL);
526		break;
527
528	case LINUX_FUTEX_CMP_REQUEUE:
529
530		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
531		    "val %d val3 %d uaddr2 %p val2 %d",
532		    args->uaddr, args->val, args->val3, args->uaddr2,
533		    (int)(unsigned long)args->timeout);
534
535#ifdef DEBUG
536		if (ldebug(sys_futex))
537			printf(ARGS(sys_futex, "futex_cmp_requeue uaddr %p "
538			    "val %d val3 %d uaddr2 %p val2 %d"),
539			    args->uaddr, args->val, args->val3, args->uaddr2,
540			    (int)(unsigned long)args->timeout);
541#endif
542
543		/*
544		 * Linux allows this, we would not, it is an incorrect
545		 * usage of declared ABI, so return EINVAL.
546		 */
547		if (args->uaddr == args->uaddr2)
548			return (EINVAL);
549		error = futex_get0(args->uaddr, &f, 0);
550		if (error)
551			return (error);
552
553		/*
554		 * To avoid deadlocks return EINVAL if second futex
555		 * exists at this time. Otherwise create the new futex
556		 * and ignore false positive LOR which thus happens.
557		 *
558		 * Glibc fall back to FUTEX_WAKE in case of any error
559		 * returned by FUTEX_CMP_REQUEUE.
560		 */
561		error = futex_get0(args->uaddr2, &f2, FUTEX_DONTEXISTS);
562		if (error) {
563			futex_put(f, NULL);
564			return (error);
565		}
566		error = copyin(args->uaddr, &val, sizeof(val));
567		if (error) {
568			LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d",
569			    error);
570			futex_put(f2, NULL);
571			futex_put(f, NULL);
572			return (error);
573		}
574		if (val != args->val3) {
575			LINUX_CTR2(sys_futex, "CMP_REQUEUE val %d != uval %d",
576			    args->val, val);
577			futex_put(f2, NULL);
578			futex_put(f, NULL);
579			return (EAGAIN);
580		}
581
582		nrwake = (int)(unsigned long)args->timeout;
583		td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake);
584		futex_put(f2, NULL);
585		futex_put(f, NULL);
586		break;
587
588	case LINUX_FUTEX_WAKE_OP:
589
590		LINUX_CTR5(sys_futex, "WAKE_OP "
591		    "uaddr %p op %d val %x uaddr2 %p val3 %x",
592		    args->uaddr, args->op, args->val,
593		    args->uaddr2, args->val3);
594
595#ifdef DEBUG
596		if (ldebug(sys_futex))
597			printf(ARGS(sys_futex, "futex_wake_op "
598			    "uaddr %p op %d val %x uaddr2 %p val3 %x"),
599			    args->uaddr, args->op, args->val,
600			    args->uaddr2, args->val3);
601#endif
602		error = futex_get0(args->uaddr, &f, 0);
603		if (error)
604			return (error);
605		if (args->uaddr != args->uaddr2)
606			error = futex_get0(args->uaddr2, &f2, 0);
607		if (error) {
608			futex_put(f, NULL);
609			return (error);
610		}
611
612		/*
613		 * This function returns positive number as results and
614		 * negative as errors
615		 */
616		op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
617
618		if (op_ret < 0) {
619			/* XXX: We don't handle the EFAULT yet. */
620			if (op_ret != -EFAULT) {
621				if (f2 != NULL)
622					futex_put(f2, NULL);
623				futex_put(f, NULL);
624				return (-op_ret);
625			}
626			if (f2 != NULL)
627				futex_put(f2, NULL);
628			futex_put(f, NULL);
629			return (EFAULT);
630		}
631
632		ret = futex_wake(f, args->val, args->val3);
633
634		if (op_ret > 0) {
635			op_ret = 0;
636			nrwake = (int)(unsigned long)args->timeout;
637
638			if (f2 != NULL)
639				op_ret += futex_wake(f2, nrwake, args->val3);
640			else
641				op_ret += futex_wake(f, nrwake, args->val3);
642			ret += op_ret;
643
644		}
645		if (f2 != NULL)
646			futex_put(f2, NULL);
647		futex_put(f, NULL);
648		td->td_retval[0] = ret;
649		break;
650
651	case LINUX_FUTEX_LOCK_PI:
652		/* not yet implemented */
653		linux_msg(td,
654			  "linux_sys_futex: "
655			  "op LINUX_FUTEX_LOCK_PI not implemented\n");
656		return (ENOSYS);
657
658	case LINUX_FUTEX_UNLOCK_PI:
659		/* not yet implemented */
660		linux_msg(td,
661			  "linux_sys_futex: "
662			  "op LINUX_FUTEX_UNLOCK_PI not implemented\n");
663		return (ENOSYS);
664
665	case LINUX_FUTEX_TRYLOCK_PI:
666		/* not yet implemented */
667		linux_msg(td,
668			  "linux_sys_futex: "
669			  "op LINUX_FUTEX_TRYLOCK_PI not implemented\n");
670		return (ENOSYS);
671
672	case LINUX_FUTEX_REQUEUE:
673
674		/*
675		 * Glibc does not use this operation since version 2.3.3,
676		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
677		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
678		 * FUTEX_REQUEUE returned EINVAL.
679		 */
680		em = em_find(td->td_proc, EMUL_DONTLOCK);
681		if (em->used_requeue == 0) {
682			linux_msg(td,
683				  "linux_sys_futex: "
684				  "unsupported futex_requeue op\n");
685			em->used_requeue = 1;
686		}
687		return (EINVAL);
688
689	case LINUX_FUTEX_WAIT_REQUEUE_PI:
690		/* not yet implemented */
691		linux_msg(td,
692			  "linux_sys_futex: "
693			  "op FUTEX_WAIT_REQUEUE_PI not implemented\n");
694		return (ENOSYS);
695
696	default:
697		linux_msg(td,
698			  "linux_sys_futex: unknown op %d\n", args->op);
699		return (ENOSYS);
700	}
701
702	return (error);
703}
704
705int
706linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
707{
708	struct linux_emuldata *em;
709
710#ifdef DEBUG
711	if (ldebug(set_robust_list))
712		printf(ARGS(set_robust_list, "head %p len %d"),
713		    args->head, args->len);
714#endif
715
716	if (args->len != sizeof(struct linux_robust_list_head))
717		return (EINVAL);
718
719	em = em_find(td->td_proc, EMUL_DOLOCK);
720	em->robust_futexes = args->head;
721	EMUL_UNLOCK(&emul_lock);
722
723	return (0);
724}
725
726int
727linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
728{
729	struct linux_emuldata *em;
730	struct linux_robust_list_head *head;
731	l_size_t len = sizeof(struct linux_robust_list_head);
732	int error = 0;
733
734#ifdef	DEBUG
735	if (ldebug(get_robust_list))
736		printf(ARGS(get_robust_list, ""));
737#endif
738
739	if (!args->pid) {
740		em = em_find(td->td_proc, EMUL_DONTLOCK);
741		head = em->robust_futexes;
742	} else {
743		struct proc *p;
744
745		p = pfind(args->pid);
746		if (p == NULL)
747			return (ESRCH);
748
749		em = em_find(p, EMUL_DONTLOCK);
750		/* XXX: ptrace? */
751		if (priv_check(td, PRIV_CRED_SETUID) ||
752		    priv_check(td, PRIV_CRED_SETEUID) ||
753		    p_candebug(td, p)) {
754			PROC_UNLOCK(p);
755			return (EPERM);
756		}
757		head = em->robust_futexes;
758
759		PROC_UNLOCK(p);
760	}
761
762	error = copyout(&len, args->len, sizeof(l_size_t));
763	if (error)
764		return (EFAULT);
765
766	error = copyout(head, args->head, sizeof(struct linux_robust_list_head));
767
768	return (error);
769}
770
771static int
772handle_futex_death(struct proc *p, uint32_t *uaddr, int pi)
773{
774	uint32_t uval, nval, mval;
775	struct futex *f;
776	int error;
777
778retry:
779	if (copyin(uaddr, &uval, 4))
780		return (EFAULT);
781	if ((uval & FUTEX_TID_MASK) == p->p_pid) {
782		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
783		nval = casuword32(uaddr, uval, mval);
784
785		if (nval == -1)
786			return (EFAULT);
787
788		if (nval != uval)
789			goto retry;
790
791		if (!pi && (uval & FUTEX_WAITERS)) {
792			error = futex_get(uaddr, NULL, &f,
793			    FUTEX_DONTCREATE);
794			if (error)
795				return (error);
796			if (f != NULL) {
797				futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY);
798				futex_put(f, NULL);
799			}
800		}
801	}
802
803	return (0);
804}
805
806static int
807fetch_robust_entry(struct linux_robust_list **entry,
808    struct linux_robust_list **head, int *pi)
809{
810	l_ulong uentry;
811
812	if (copyin((const void *)head, &uentry, sizeof(l_ulong)))
813		return (EFAULT);
814
815	*entry = (void *)(uentry & ~1UL);
816	*pi = uentry & 1;
817
818	return (0);
819}
820
821/* This walks the list of robust futexes releasing them. */
822void
823release_futexes(struct proc *p)
824{
825	struct linux_robust_list_head *head = NULL;
826	struct linux_robust_list *entry, *next_entry, *pending;
827	unsigned int limit = 2048, pi, next_pi, pip;
828	struct linux_emuldata *em;
829	l_long futex_offset;
830	int rc;
831
832	em = em_find(p, EMUL_DONTLOCK);
833	head = em->robust_futexes;
834
835	if (head == NULL)
836		return;
837
838	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
839		return;
840
841	if (copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset)))
842		return;
843
844	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
845		return;
846
847	while (entry != &head->list) {
848		rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi);
849
850		if (entry != pending)
851			if (handle_futex_death(p, (uint32_t *)entry + futex_offset, pi))
852				return;
853		if (rc)
854			return;
855
856		entry = next_entry;
857		pi = next_pi;
858
859		if (!--limit)
860			break;
861
862		sched_relinquish(curthread);
863	}
864
865	if (pending)
866		handle_futex_death(p, (uint32_t *)pending + futex_offset, pip);
867}
868