// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

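/*
 * Hand out the pi_state cached in current->pi_state_cache. Callers are
 * expected to have refilled the cache via refill_pi_state_cache()
 * beforehand, hence the WARN_ON().
 */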
static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

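/*
 * Unlink @pi_state from the current owner's pi_state_list and, if
 * @new_owner is not NULL, link it to @new_owner's list and record it as
 * the new owner. Must be called with pi_state->pi_mutex.wait_lock held.
 */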
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

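/*
 * Take an additional reference on @pi_state. The caller must already hold
 * a reference, i.e. the refcount cannot be zero here.
 */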
void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain an arbitrary number
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

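/*
 * Handle the race against an exiting owner task: returns -EBUSY while the
 * alleged owner has not yet reached FUTEX_STATE_DEAD, -EFAULT when the
 * user space value cannot be read, -EAGAIN when it changed under us and
 * -ESRCH when the value is unchanged but the owner is really gone.
 */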
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *  ...				       attach();
	 *  tsk->futex_state =               } else {
	 *	FUTEX_STATE_DEAD;              if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

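/*
 * Allocate a pi_state from the per-task cache, initialize its pi_mutex in
 * proxy-locked state and make @p the owner. Both callers hold hb->lock
 * and @p->pi_lock, so the not yet published pi_state cannot be observed.
 */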
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

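/*
 * Atomically update the user space value from @uval to @newval. Returns 0
 * on success, -EAGAIN when the value changed under us, or a fault/error
 * code otherwise.
 */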
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if (unlikely((uval & FUTEX_TID_MASK) == vpid))
		return -EDEADLK;

	if (unlikely(should_fail_futex(true)))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and the user space TID is 0. We are here because the
	 * waiters bit or the owner died bit is set, because we were called
	 * from requeue_cmp_pi, or because something else took the syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourselves to
	 * the owner. If the owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
	struct rt_mutex_waiter *top_waiter;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!top_waiter)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */
		ret = -EAGAIN;
		goto out_unlock;
	}

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We clean up the owner
	 * died bit because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

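/*
 * Fix up pi_state->owner and the user space value after the rtmutex was
 * acquired or stolen. Called with q->lock_ptr and
 * pi_state->pi_mutex.wait_lock held; both locks may be dropped and
 * reacquired to handle faults, see handle_err below.
 */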
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory read-only for COW.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping pi_mutex->wait_lock requires revalidating the state.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault-in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

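/*
 * Wrapper around __fixup_pi_state_owner() which takes and drops
 * pi_state->pi_mutex.wait_lock. The caller must hold q->lock_ptr.
 */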
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to clean up
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock, check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the futex trylock_pi() operation, with the corresponding
 * semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy(), because then it will
	 * include hb->lock in the blocking chain, even though we'll not in
	 * fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	spin_lock(q.lock_ptr);
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * first acquire the hb->lock before removing the lock from the
	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
	 * lists consistent.
	 *
	 * In particular; it is important that futex_unlock_pi() can not
	 * observe this inconsistency.
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that.  If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	hb = futex_hash(&key);
	spin_lock(&hb->lock);

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		get_pi_state(pi_state);
		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and therefore
		 * wake_futex_pi() must observe a state consistent with what we
		 * observed.
		 *
		 * In particular; this forces __rt_mutex_start_proxy_lock() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter. Also see the WARN in wake_futex_pi().
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We preserve neither
	 * the WAITERS bit nor the OWNER_DIED one. We are the
	 * owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}