1b2441318SGreg Kroah-Hartman// SPDX-License-Identifier: GPL-2.0
2c4e05116SIngo Molnar/* kernel/rwsem.c: R/W semaphores, public implementation
3c4e05116SIngo Molnar *
4c4e05116SIngo Molnar * Written by David Howells (dhowells@redhat.com).
5c4e05116SIngo Molnar * Derived from asm-i386/semaphore.h
65dec94d4SWaiman Long *
75dec94d4SWaiman Long * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
85dec94d4SWaiman Long * and Michel Lespinasse <walken@google.com>
95dec94d4SWaiman Long *
105dec94d4SWaiman Long * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
115dec94d4SWaiman Long * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
125dec94d4SWaiman Long *
134f23dbc1SWaiman Long * Rwsem count bit fields re-definition and rwsem rearchitecture by
144f23dbc1SWaiman Long * Waiman Long <longman@redhat.com> and
154f23dbc1SWaiman Long * Peter Zijlstra <peterz@infradead.org>.
16c4e05116SIngo Molnar */
17c4e05116SIngo Molnar
18c4e05116SIngo Molnar#include <linux/types.h>
19c4e05116SIngo Molnar#include <linux/kernel.h>
20c7af77b5SLivio Soares#include <linux/sched.h>
215dec94d4SWaiman Long#include <linux/sched/rt.h>
225dec94d4SWaiman Long#include <linux/sched/task.h>
23b17b0153SIngo Molnar#include <linux/sched/debug.h>
245dec94d4SWaiman Long#include <linux/sched/wake_q.h>
255dec94d4SWaiman Long#include <linux/sched/signal.h>
267d43f1ceSWaiman Long#include <linux/sched/clock.h>
279984de1aSPaul Gortmaker#include <linux/export.h>
28c4e05116SIngo Molnar#include <linux/rwsem.h>
2960063497SArun Sharma#include <linux/atomic.h>
30c4e05116SIngo Molnar
3142254105SThomas Gleixner#ifndef CONFIG_PREEMPT_RT
325dec94d4SWaiman Long#include "lock_events.h"
335dec94d4SWaiman Long
345dec94d4SWaiman Long/*
35617f3ef9SWaiman Long * The least significant 2 bits of the owner value has the following
365dec94d4SWaiman Long * meanings when set.
3702f1082bSWaiman Long *  - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
38617f3ef9SWaiman Long *  - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
395dec94d4SWaiman Long *
40617f3ef9SWaiman Long * When the rwsem is reader-owned and a spinning writer has timed out,
41617f3ef9SWaiman Long * the nonspinnable bit will be set to disable optimistic spinning.
427d43f1ceSWaiman Long
435dec94d4SWaiman Long * When a writer acquires a rwsem, it puts its task_struct pointer
445dec94d4SWaiman Long * into the owner field. It is cleared after an unlock.
455dec94d4SWaiman Long *
465dec94d4SWaiman Long * When a reader acquires a rwsem, it will also puts its task_struct
477d43f1ceSWaiman Long * pointer into the owner field with the RWSEM_READER_OWNED bit set.
487d43f1ceSWaiman Long * On unlock, the owner field will largely be left untouched. So
497d43f1ceSWaiman Long * for a free or reader-owned rwsem, the owner value may contain
507d43f1ceSWaiman Long * information about the last reader that acquires the rwsem.
515dec94d4SWaiman Long *
525dec94d4SWaiman Long * That information may be helpful in debugging cases where the system
535dec94d4SWaiman Long * seems to hang on a reader owned rwsem especially if only one reader
545dec94d4SWaiman Long * is involved. Ideally we would like to track all the readers that own
555dec94d4SWaiman Long * a rwsem, but the overhead is simply too big.
565cfd92e1SWaiman Long *
57617f3ef9SWaiman Long * A fast path reader optimistic lock stealing is supported when the rwsem
58617f3ef9SWaiman Long * is previously owned by a writer and the following conditions are met:
59617f3ef9SWaiman Long *  - rwsem is not currently writer owned
60617f3ef9SWaiman Long *  - the handoff isn't set.
615dec94d4SWaiman Long */
625dec94d4SWaiman Long#define RWSEM_READER_OWNED	(1UL << 0)
63617f3ef9SWaiman Long#define RWSEM_NONSPINNABLE	(1UL << 1)
6402f1082bSWaiman Long#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
655dec94d4SWaiman Long
665dec94d4SWaiman Long#ifdef CONFIG_DEBUG_RWSEMS
675dec94d4SWaiman Long# define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
685dec94d4SWaiman Long	if (!debug_locks_silent &&				\
69fce45cd4SDavidlohr Bueso	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
705dec94d4SWaiman Long		#c, atomic_long_read(&(sem)->count),		\
71fce45cd4SDavidlohr Bueso		(unsigned long) sem->magic,			\
7294a9717bSWaiman Long		atomic_long_read(&(sem)->owner), (long)current,	\
735dec94d4SWaiman Long		list_empty(&(sem)->wait_list) ? "" : "not "))	\
745dec94d4SWaiman Long			debug_locks_off();			\
755dec94d4SWaiman Long	} while (0)
765dec94d4SWaiman Long#else
775dec94d4SWaiman Long# define DEBUG_RWSEMS_WARN_ON(c, sem)
785dec94d4SWaiman Long#endif
795dec94d4SWaiman Long
805dec94d4SWaiman Long/*
81a15ea1a3SWaiman Long * On 64-bit architectures, the bit definitions of the count are:
825dec94d4SWaiman Long *
83a15ea1a3SWaiman Long * Bit  0    - writer locked bit
84a15ea1a3SWaiman Long * Bit  1    - waiters present bit
85a15ea1a3SWaiman Long * Bit  2    - lock handoff bit
86a15ea1a3SWaiman Long * Bits 3-7  - reserved
87a15ea1a3SWaiman Long * Bits 8-62 - 55-bit reader count
88a15ea1a3SWaiman Long * Bit  63   - read fail bit
89a15ea1a3SWaiman Long *
90a15ea1a3SWaiman Long * On 32-bit architectures, the bit definitions of the count are:
91a15ea1a3SWaiman Long *
92a15ea1a3SWaiman Long * Bit  0    - writer locked bit
93a15ea1a3SWaiman Long * Bit  1    - waiters present bit
94a15ea1a3SWaiman Long * Bit  2    - lock handoff bit
95a15ea1a3SWaiman Long * Bits 3-7  - reserved
96a15ea1a3SWaiman Long * Bits 8-30 - 23-bit reader count
97a15ea1a3SWaiman Long * Bit  31   - read fail bit
98a15ea1a3SWaiman Long *
99a15ea1a3SWaiman Long * It is not likely that the most significant bit (read fail bit) will ever
100a15ea1a3SWaiman Long * be set. This guard bit is still checked anyway in the down_read() fastpath
101a15ea1a3SWaiman Long * just in case we need to use up more of the reader bits for other purpose
102a15ea1a3SWaiman Long * in the future.
1035dec94d4SWaiman Long *
1045dec94d4SWaiman Long * atomic_long_fetch_add() is used to obtain reader lock, whereas
1055dec94d4SWaiman Long * atomic_long_cmpxchg() will be used to obtain writer lock.
1064f23dbc1SWaiman Long *
1074f23dbc1SWaiman Long * There are three places where the lock handoff bit may be set or cleared.
108d257cc8cSWaiman Long * 1) rwsem_mark_wake() for readers		-- set, clear
109d257cc8cSWaiman Long * 2) rwsem_try_write_lock() for writers	-- set, clear
110d257cc8cSWaiman Long * 3) rwsem_del_waiter()			-- clear
1114f23dbc1SWaiman Long *
1124f23dbc1SWaiman Long * For all the above cases, wait_lock will be held. A writer must also
1134f23dbc1SWaiman Long * be the first one in the wait_list to be eligible for setting the handoff
1144f23dbc1SWaiman Long * bit. So concurrent setting/clearing of handoff bit is not possible.
1155dec94d4SWaiman Long */
1165dec94d4SWaiman Long#define RWSEM_WRITER_LOCKED	(1UL << 0)
1175dec94d4SWaiman Long#define RWSEM_FLAG_WAITERS	(1UL << 1)
1184f23dbc1SWaiman Long#define RWSEM_FLAG_HANDOFF	(1UL << 2)
119a15ea1a3SWaiman Long#define RWSEM_FLAG_READFAIL	(1UL << (BITS_PER_LONG - 1))
1204f23dbc1SWaiman Long
1215dec94d4SWaiman Long#define RWSEM_READER_SHIFT	8
1225dec94d4SWaiman Long#define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)
1235dec94d4SWaiman Long#define RWSEM_READER_MASK	(~(RWSEM_READER_BIAS - 1))
1245dec94d4SWaiman Long#define RWSEM_WRITER_MASK	RWSEM_WRITER_LOCKED
1255dec94d4SWaiman Long#define RWSEM_LOCK_MASK		(RWSEM_WRITER_MASK|RWSEM_READER_MASK)
1264f23dbc1SWaiman Long#define RWSEM_READ_FAILED_MASK	(RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
127a15ea1a3SWaiman Long				 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
1285dec94d4SWaiman Long
1295dec94d4SWaiman Long/*
1305dec94d4SWaiman Long * All writes to owner are protected by WRITE_ONCE() to make sure that
1315dec94d4SWaiman Long * store tearing can't happen as optimistic spinners may read and use
1325dec94d4SWaiman Long * the owner value concurrently without lock. Read from owner, however,
1335dec94d4SWaiman Long * may not need READ_ONCE() as long as the pointer value is only used
1345dec94d4SWaiman Long * for comparison and isn't being dereferenced.
1355dec94d4SWaiman Long */
1365dec94d4SWaiman Longstatic inline void rwsem_set_owner(struct rw_semaphore *sem)
1375dec94d4SWaiman Long{
13894a9717bSWaiman Long	atomic_long_set(&sem->owner, (long)current);
1395dec94d4SWaiman Long}
1405dec94d4SWaiman Long
1415dec94d4SWaiman Longstatic inline void rwsem_clear_owner(struct rw_semaphore *sem)
1425dec94d4SWaiman Long{
14394a9717bSWaiman Long	atomic_long_set(&sem->owner, 0);
14494a9717bSWaiman Long}
14594a9717bSWaiman Long
14694a9717bSWaiman Long/*
14794a9717bSWaiman Long * Test the flags in the owner field.
14894a9717bSWaiman Long */
14994a9717bSWaiman Longstatic inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
15094a9717bSWaiman Long{
15194a9717bSWaiman Long	return atomic_long_read(&sem->owner) & flags;
1525dec94d4SWaiman Long}
1535dec94d4SWaiman Long
1545dec94d4SWaiman Long/*
1555dec94d4SWaiman Long * The task_struct pointer of the last owning reader will be left in
1565dec94d4SWaiman Long * the owner field.
1575dec94d4SWaiman Long *
1585dec94d4SWaiman Long * Note that the owner value just indicates the task has owned the rwsem
1595dec94d4SWaiman Long * previously, it may not be the real owner or one of the real owners
1605dec94d4SWaiman Long * anymore when that field is examined, so take it with a grain of salt.
1615cfd92e1SWaiman Long *
1625cfd92e1SWaiman Long * The reader non-spinnable bit is preserved.
1635dec94d4SWaiman Long */
1645dec94d4SWaiman Longstatic inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
1655dec94d4SWaiman Long					    struct task_struct *owner)
1665dec94d4SWaiman Long{
1675cfd92e1SWaiman Long	unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
168617f3ef9SWaiman Long		(atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
1695dec94d4SWaiman Long
17094a9717bSWaiman Long	atomic_long_set(&sem->owner, val);
1715dec94d4SWaiman Long}
1725dec94d4SWaiman Long
1735dec94d4SWaiman Longstatic inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
1745dec94d4SWaiman Long{
1755dec94d4SWaiman Long	__rwsem_set_reader_owned(sem, current);
1765dec94d4SWaiman Long}
1775dec94d4SWaiman Long
1785dec94d4SWaiman Long/*
17994a9717bSWaiman Long * Return true if the rwsem is owned by a reader.
1805dec94d4SWaiman Long */
18194a9717bSWaiman Longstatic inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
1825dec94d4SWaiman Long{
18394a9717bSWaiman Long#ifdef CONFIG_DEBUG_RWSEMS
18494a9717bSWaiman Long	/*
18594a9717bSWaiman Long	 * Check the count to see if it is write-locked.
18694a9717bSWaiman Long	 */
18794a9717bSWaiman Long	long count = atomic_long_read(&sem->count);
18894a9717bSWaiman Long
18994a9717bSWaiman Long	if (count & RWSEM_WRITER_MASK)
19094a9717bSWaiman Long		return false;
19194a9717bSWaiman Long#endif
19294a9717bSWaiman Long	return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
1935dec94d4SWaiman Long}
1945dec94d4SWaiman Long
1955dec94d4SWaiman Long#ifdef CONFIG_DEBUG_RWSEMS
1965dec94d4SWaiman Long/*
1975dec94d4SWaiman Long * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
1985dec94d4SWaiman Long * is a task pointer in owner of a reader-owned rwsem, it will be the
1995dec94d4SWaiman Long * real owner or one of the real owners. The only exception is when the
2005dec94d4SWaiman Long * unlock is done by up_read_non_owner().
2015dec94d4SWaiman Long */
2025dec94d4SWaiman Longstatic inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
2035dec94d4SWaiman Long{
20494a9717bSWaiman Long	unsigned long val = atomic_long_read(&sem->owner);
20594a9717bSWaiman Long
20694a9717bSWaiman Long	while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
20794a9717bSWaiman Long		if (atomic_long_try_cmpxchg(&sem->owner, &val,
20894a9717bSWaiman Long					    val & RWSEM_OWNER_FLAGS_MASK))
20994a9717bSWaiman Long			return;
21094a9717bSWaiman Long	}
2115dec94d4SWaiman Long}
2125dec94d4SWaiman Long#else
2135dec94d4SWaiman Longstatic inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
214