1// SPDX-License-Identifier: GPL-2.0-or-later
2
3#include <linux/syscalls.h>
4#include <linux/time_namespace.h>
5
6#include "futex.h"
7
8/*
9 * Support for robust futexes: the kernel cleans up held futexes at
10 * thread exit time.
11 *
12 * Implementation: user-space maintains a per-thread list of locks it
13 * is holding. Upon do_exit(), the kernel carefully walks this list,
14 * and marks all locks that are owned by this thread with the
15 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
16 * always manipulated with the lock held, so the list is private and
17 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
18 * field, to allow the kernel to clean up if the thread dies after
19 * acquiring the lock, but just before it could have added itself to
20 * the list. There can only be one such pending lock.
21 */
22
23/**
24 * sys_set_robust_list() - Set the robust-futex list head of a task
25 * @head:	pointer to the list-head
26 * @len:	length of the list-head, as userspace expects
27 */
28SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
29		size_t, len)
30{
31	/*
32	 * The kernel knows only one size for now:
33	 */
34	if (unlikely(len != sizeof(*head)))
35		return -EINVAL;
36
37	current->robust_list = head;
38
39	return 0;
40}
41
42/**
43 * sys_get_robust_list() - Get the robust-futex list head of a task
44 * @pid:	pid of the process [zero for current task]
45 * @head_ptr:	pointer to a list-head pointer, the kernel fills it in
46 * @len_ptr:	pointer to a length field, the kernel fills in the header size
47 */
48SYSCALL_DEFINE3(get_robust_list, int, pid,
49		struct robust_list_head __user * __user *, head_ptr,
50		size_t __user *, len_ptr)
51{
52	struct robust_list_head __user *head;
53	unsigned long ret;
54	struct task_struct *p;
55
56	rcu_read_lock();
57
58	ret = -ESRCH;
59	if (!pid)
60		p = current;
61	else {
62		p = find_task_by_vpid(pid);
63		if (!p)
64			goto err_unlock;
65	}
66
67	ret = -EPERM;
68	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
69		goto err_unlock;
70
71	head = p->robust_list;
72	rcu_read_unlock();
73
74	if (put_user(sizeof(*head), len_ptr))
75		return -EFAULT;
76	return put_user(head, head_ptr);
77
78err_unlock:
79	rcu_read_unlock();
80
81	return ret;
82}
83
84long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
85		u32 __user *uaddr2, u32 val2, u32 val3)
86{
87	unsigned int flags = futex_to_flags(op);
88	int cmd = op & FUTEX_CMD_MASK;
89
90	if (flags & FLAGS_CLOCKRT) {
91		if (cmd != FUTEX_WAIT_BITSET &&
92		    cmd != FUTEX_WAIT_REQUEUE_PI &&
93		    cmd != FUTEX_LOCK_PI2)
94			return -ENOSYS;
95	}
96
97	switch (cmd) {
98	case FUTEX_WAIT:
99		val3 = FUTEX_BITSET_MATCH_ANY;
100		fallthrough;
101	case FUTEX_WAIT_BITSET:
102		return futex_wait(uaddr, flags, val, timeout, val3);
103	case FUTEX_WAKE:
104		val3 = FUTEX_BITSET_MATCH_ANY;
105		fallthrough;
106	case FUTEX_WAKE_BITSET:
107		return futex_wake(uaddr, flags, val, val3);
108	case FUTEX_REQUEUE:
109		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
110	case FUTEX_CMP_REQUEUE:
111		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
112	case FUTEX_WAKE_OP:
113		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
114	case FUTEX_LOCK_PI:
115		flags |= FLAGS_CLOCKRT;
116		fallthrough;
117	case FUTEX_LOCK_PI2:
118		return futex_lock_pi(uaddr, flags, timeout, 0);
119	case FUTEX_UNLOCK_PI:
120		return futex_unlock_pi(uaddr, flags);
121	case FUTEX_TRYLOCK_PI:
122		return futex_lock_pi(uaddr, flags, NULL, 1);
123	case FUTEX_WAIT_REQUEUE_PI:
124		val3 = FUTEX_BITSET_MATCH_ANY;
125		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
126					     uaddr2);
127	case FUTEX_CMP_REQUEUE_PI:
128		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
129	}
130	return -ENOSYS;
131}
132
133static __always_inline bool futex_cmd_has_timeout(u32 cmd)
134{
135	switch (cmd) {
136	case FUTEX_WAIT:
137	case FUTEX_LOCK_PI:
138	case FUTEX_LOCK_PI2:
139	case FUTEX_WAIT_BITSET:
140	case FUTEX_WAIT_REQUEUE_PI:
141		return true;
142	}
143	return false;
144}
145
146static __always_inline int
147futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
148{
149	if (!timespec64_valid(ts))
150		return -EINVAL;
151
152	*t = timespec64_to_ktime(*ts);
153	if (cmd == FUTEX_WAIT)
154		*t = ktime_add_safe(ktime_get(), *t);
155	else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
156		*t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
157	return 0;
158}
159
160SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
161		const struct __kernel_timespec __user *, utime,
162		u32 __user *, uaddr2, u32, val3)
163{
164	int ret, cmd = op & FUTEX_CMD_MASK;
165	ktime_t t, *tp = NULL;
166	struct timespec64 ts;
167
168	if (utime && futex_cmd_has_timeout(cmd)) {
169		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
170			return -EFAULT;
171		if (get_timespec64(&ts, utime))
172			return -EFAULT;
173		ret = futex_init_timeout(cmd, op, &ts, &t);
174		if (ret)
175			return ret;
176		tp = &t;
177	}
178
179	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
180}
181
182/**
183 * futex_parse_waitv - Parse a waitv array from userspace
184 * @futexv:	Kernel side list of waiters to be filled
185 * @uwaitv:     Userspace list to be parsed
186 * @nr_futexes: Length of futexv
187 * @wake:	Wake to call when futex is woken
188 * @wake_data:	Data for the wake handler
189 *
190 * Return: Error code on failure, 0 on success
191 */
192int futex_parse_waitv(struct futex_vector *futexv,
193		      struct futex_waitv __user *uwaitv,
194		      unsigned int nr_futexes, futex_wake_fn *wake,
195		      void *wake_data)
196{
197	struct futex_waitv aux;
198	unsigned int i;
199
200	for (i = 0; i < nr_futexes; i++) {
201		unsigned int flags;
202
203		if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
204			return -EFAULT;
205
206		if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
207			return -EINVAL;
208
209		flags = futex2_to_flags(aux.flags);
210		if (!futex_flags_valid(flags))
211			return -EINVAL;
212
213		if (!futex_validate_input(flags, aux.val))
214			return -EINVAL;
215
216		futexv[i].w.flags = flags;
217		futexv[i].w.val = aux.val;
218		futexv[i].w.uaddr = aux.uaddr;
219		futexv[i].q = futex_q_init;
220		futexv[i].q.wake = wake;
221		futexv[i].q.wake_data = wake_data;
222	}
223
224	return 0;
225}
226
227static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
228				clockid_t clockid, struct hrtimer_sleeper *to)
229{
230	int flag_clkid = 0, flag_init = 0;
231	struct timespec64 ts;
232	ktime_t time;
233	int ret;
234
235	if (!timeout)
236		return 0;
237
238	if (clockid == CLOCK_REALTIME) {
239		flag_clkid = FLAGS_CLOCKRT;
240		flag_init = FUTEX_CLOCK_REALTIME;
241	}
242
243	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
244		return -EINVAL;
245
246	if (get_timespec64(&ts, timeout))
247		return -EFAULT;
248
249	/*
250	 * Since there's no opcode for futex_waitv, use
251	 * FUTEX_WAIT_BITSET that uses absolute timeout as well
252	 */
253	ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
254	if (ret)
255		return ret;
256
257	futex_setup_timer(&time, to, flag_clkid, 0);
258	return 0;
259}
260
261static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
262{
263	hrtimer_cancel(&to->timer);
264	destroy_hrtimer_on_stack(&to->timer);
265}
266
267/**
268 * sys_futex_waitv - Wait on a list of futexes
269 * @waiters:    List of futexes to wait on
270 * @nr_futexes: Length of futexv
271 * @flags:      Flag for timeout (monotonic/realtime)
272 * @timeout:	Optional absolute timeout.
273 * @clockid:	Clock to be used for the timeout, realtime or monotonic.
274 *
275 * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
276 * if a futex_wake() is performed at any uaddr. The syscall returns immediately
277 * if any waiter has *uaddr != val. *timeout is an optional timeout value for
278 * the operation. Each waiter has individual flags. The `flags` argument for
279 * the syscall should be used solely for specifying the timeout as realtime, if
280 * needed. Flags for private futexes, sizes, etc. should be used on the
281 * individual flags of each waiter.
282 *
283 * Returns the array index of one of the woken futexes. No further information
284 * is provided: any number of other futexes may also have been woken by the
285 * same event, and if more than one futex was woken, the retrned index may
286 * refer to any one of them. (It is not necessaryily the futex with the
287 * smallest index, nor the one most recently woken, nor...)
288 */
289
290SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
291		unsigned int, nr_futexes, unsigned int, flags,
292		struct __kernel_timespec __user *, timeout, clockid_t, clockid)
293{
294	struct hrtimer_sleeper to;
295	struct futex_vector *futexv;
296	int ret;
297
298	/* This syscall supports no flags for now */
299	if (flags)
300		return -EINVAL;
301
302	if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
303		return -EINVAL;
304
305	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
306		return ret;
307
308	futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
309	if (!futexv) {
310		ret = -ENOMEM;
311		goto destroy_timer;
312	}
313
314	ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
315				NULL);
316	if (!ret)
317		ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
318
319	kfree(futexv);
320
321destroy_timer:
322	if (timeout)
323		futex2_destroy_timeout(&to);
324	return ret;
325}
326
327/*
328 * sys_futex_wake - Wake a number of futexes
329 * @uaddr:	Address of the futex(es) to wake
330 * @mask:	bitmask
331 * @nr:		Number of the futexes to wake
332 * @flags:	FUTEX2 flags
333 *
334 * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
335 * futex2 family of calls.
336 */
337
338SYSCALL_DEFINE4(futex_wake,
339		void __user *, uaddr,
340		unsigned long, mask,
341		int, nr,
342		unsigned int, flags)
343{
344	if (flags & ~FUTEX2_VALID_MASK)
345		return -EINVAL;
346
347	flags = futex2_to_flags(flags);
348	if (!futex_flags_valid(flags))
349		return -EINVAL;
350
351	if (!futex_validate_input(flags, mask))
352		return -EINVAL;
353
354	return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
355}
356
357/*
358 * sys_futex_wait - Wait on a futex
359 * @uaddr:	Address of the futex to wait on
360 * @val:	Value of @uaddr
361 * @mask:	bitmask
362 * @flags:	FUTEX2 flags
363 * @timeout:	Optional absolute timeout
364 * @clockid:	Clock to be used for the timeout, realtime or monotonic
365 *
366 * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
367 * futex2 familiy of calls.
368 */
369
370SYSCALL_DEFINE6(futex_wait,
371		void __user *, uaddr,
372		unsigned long, val,
373		unsigned long, mask,
374		unsigned int, flags,
375		struct __kernel_timespec __user *, timeout,
376		clockid_t, clockid)
377{
378	struct hrtimer_sleeper to;
379	int ret;
380
381	if (flags & ~FUTEX2_VALID_MASK)
382		return -EINVAL;
383
384	flags = futex2_to_flags(flags);
385	if (!futex_flags_valid(flags))
386		return -EINVAL;
387
388	if (!futex_validate_input(flags, val) ||
389	    !futex_validate_input(flags, mask))
390		return -EINVAL;
391
392	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
393		return ret;
394
395	ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
396
397	if (timeout)
398		futex2_destroy_timeout(&to);
399
400	return ret;
401}
402
403/*
404 * sys_futex_requeue - Requeue a waiter from one futex to another
405 * @waiters:	array describing the source and destination futex
406 * @flags:	unused
407 * @nr_wake:	number of futexes to wake
408 * @nr_requeue:	number of futexes to requeue
409 *
410 * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
411 * futex2 family of calls.
412 */
413
414SYSCALL_DEFINE4(futex_requeue,
415		struct futex_waitv __user *, waiters,
416		unsigned int, flags,
417		int, nr_wake,
418		int, nr_requeue)
419{
420	struct futex_vector futexes[2];
421	u32 cmpval;
422	int ret;
423
424	if (flags)
425		return -EINVAL;
426
427	if (!waiters)
428		return -EINVAL;
429
430	ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
431	if (ret)
432		return ret;
433
434	cmpval = futexes[0].w.val;
435
436	return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
437			     u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
438			     nr_wake, nr_requeue, &cmpval, 0);
439}
440
441#ifdef CONFIG_COMPAT
442COMPAT_SYSCALL_DEFINE2(set_robust_list,
443		struct compat_robust_list_head __user *, head,
444		compat_size_t, len)
445{
446	if (unlikely(len != sizeof(*head)))
447		return -EINVAL;
448
449	current->compat_robust_list = head;
450
451	return 0;
452}
453
454COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
455			compat_uptr_t __user *, head_ptr,
456			compat_size_t __user *, len_ptr)
457{
458	struct compat_robust_list_head __user *head;
459	unsigned long ret;
460	struct task_struct *p;
461
462	rcu_read_lock();
463
464	ret = -ESRCH;
465	if (!pid)
466		p = current;
467	else {
468		p = find_task_by_vpid(pid);
469		if (!p)
470			goto err_unlock;
471	}
472
473	ret = -EPERM;
474	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
475		goto err_unlock;
476
477	head = p->compat_robust_list;
478	rcu_read_unlock();
479
480	if (put_user(sizeof(*head), len_ptr))
481		return -EFAULT;
482	return put_user(ptr_to_compat(head), head_ptr);
483
484err_unlock:
485	rcu_read_unlock();
486
487	return ret;
488}
489#endif /* CONFIG_COMPAT */
490
491#ifdef CONFIG_COMPAT_32BIT_TIME
492SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
493		const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
494		u32, val3)
495{
496	int ret, cmd = op & FUTEX_CMD_MASK;
497	ktime_t t, *tp = NULL;
498	struct timespec64 ts;
499
500	if (utime && futex_cmd_has_timeout(cmd)) {
501		if (get_old_timespec32(&ts, utime))
502			return -EFAULT;
503		ret = futex_init_timeout(cmd, op, &ts, &t);
504		if (ret)
505			return ret;
506		tp = &t;
507	}
508
509	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
510}
511#endif /* CONFIG_COMPAT_32BIT_TIME */
512
513