// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
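/*
 * Pre-fill current->pi_state_cache so that a later alloc_pi_state() can
 * hand out a pi_state without having to allocate while hb->lock is held.
 */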
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

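/*
 * Hand out the pi_state pre-allocated by refill_pi_state_cache(). Must not
 * be called without a prior successful refill.
 */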
static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

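/*
 * Unlink the pi_state from the old owner's ->pi_state_list and, when a new
 * owner is given, link it to the new owner's list and update
 * pi_state->owner. Serialized by pi_mutex.wait_lock and the owners'
 * ->pi_lock.
 */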
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain an arbitrary number
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list() sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->pi_mutex will fix up the owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

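/*
 * Handle the case where the alleged owner found via the user space TID is
 * exiting or already gone: return -EBUSY while the owner is still exiting,
 * -EFAULT when the user space value cannot be reread, -EAGAIN when it has
 * changed, and -ESRCH otherwise.
 */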
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *  ...				       attach();
	 *  tsk->futex_state =               } else {
	 *	FUTEX_STATE_DEAD;              if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

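/*
 * cmpxchg the user space value from @uval to @newval. Returns 0 on success,
 * -EAGAIN when the value changed underneath us, or the error code from the
 * cmpxchg.
 */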
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and the user space TID is 0. We are here because the
	 * waiters or the owner died bit is set, because we came via the
	 * requeue PI path, or because something else took the syscall slow
	 * path for whatever reason.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
			 struct futex_pi_state *pi_state,
			 struct rt_mutex_waiter *top_waiter)
{
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

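/*
 * Core of fixup_pi_state_owner(): make the user space value and the
 * pi_state owner consistent again. Called with q->lock_ptr and
 * pi_mutex.wait_lock held; both are dropped temporarily when a fault has
 * to be handled or a retry requires rescheduling.
 */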
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault-in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also implements the FUTEX_TRYLOCK_PI operation, with the corresponding
 * trylock semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	/*
	 * Must be done before we enqueue the waiter. This is unfortunately
	 * done under the hb lock, but that *should* work because it does
	 * nothing.
	 */
	rt_mutex_pre_schedule();

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing __rt_mutex_start_proxy_lock(), because then it
	 * will include hb->lock in the blocking chain, even though we'll not
	 * in fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * unwind the above, however we cannot lock hb->lock because rt_mutex
	 * already has a waiter enqueued and hb->lock can itself try and
	 * enqueue an rt_waiter through rtlock.
	 *
	 * Doing the cleanup without holding hb->lock can cause inconsistent
	 * state between hb and pi_state, but only in the direction of not
	 * seeing a waiter that is leaving.
	 *
	 * See futex_unlock_pi(), it deals with this inconsistency.
	 *
	 * There be dragons here, since we must deal with the inconsistency on
	 * the way out (here), it is impossible to detect/warn about the race
	 * the other way around (missing an incoming waiter).
	 *
	 * What could possibly go wrong...
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

	/*
	 * Now that the rt_waiter has been dequeued, it is safe to use
	 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
	 * the pi_state owner.
	 */
	spin_lock(q.lock_ptr);
	/*
	 * Waiter is unqueued.
	 */
	rt_mutex_post_schedule();
no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that.  If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	hb = futex_hash(&key);
	spin_lock(&hb->lock);
retry_hb:

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;
		struct rt_mutex_waiter *rt_waiter;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and thereby
		 * wake_futex_pi() must observe any new waiters.
		 *
		 * Since the cleanup: case in futex_lock_pi() removes the
		 * rt_waiter without holding hb->lock, it is possible for
		 * wake_futex_pi() to not find a waiter while the above does,
		 * in this case the waiter is on the way out and it can be
		 * ignored.
		 *
		 * In particular, this forces __rt_mutex_start_proxy_lock() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter.
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

		/*
		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
		 * waiters even though futex thinks there are, then the waiter
		 * is leaving. The entry needs to be removed from the list so a
		 * new futex_lock_pi() is not using this stale PI-state while
		 * the futex is available in user space again.
		 * There can be more than one task on its way out so it needs
		 * to retry.
		 */
		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
		if (!rt_waiter) {
			__futex_unqueue(top_waiter);
			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
			goto retry_hb;
		}

		get_pi_state(pi_state);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We preserve neither
	 * the WAITERS bit nor the OWNER_DIED one. We are the
	 * owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}
