1b2441318SGreg Kroah-Hartman// SPDX-License-Identifier: GPL-2.0
21da177e4SLinus Torvalds/*
31da177e4SLinus Torvalds * linux/ipc/sem.c
41da177e4SLinus Torvalds * Copyright (C) 1992 Krishna Balasubramanian
51da177e4SLinus Torvalds * Copyright (C) 1995 Eric Schenk, Bruno Haible
61da177e4SLinus Torvalds *
71da177e4SLinus Torvalds * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
81da177e4SLinus Torvalds *
91da177e4SLinus Torvalds * SMP-threaded, sysctl's added
10624dffcbSChristian Kujau * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
111da177e4SLinus Torvalds * Enforced range limit on SEM_UNDO
12046c6884SAlan Cox * (c) 2001 Red Hat Inc
131da177e4SLinus Torvalds * Lockless wakeup
141da177e4SLinus Torvalds * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
159ae949faSDavidlohr Bueso * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
16c5cf6359SManfred Spraul * Further wakeup optimizations, documentation
17c5cf6359SManfred Spraul * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
18073115d6SSteve Grubb *
19073115d6SSteve Grubb * support for audit of ipc object properties and permission changes
20073115d6SSteve Grubb * Dustin Kirkland <dustin.kirkland@us.ibm.com>
21e3893534SKirill Korotaev *
22e3893534SKirill Korotaev * namespaces support
23e3893534SKirill Korotaev * OpenVZ, SWsoft Inc.
24e3893534SKirill Korotaev * Pavel Emelianov <xemul@openvz.org>
25c5cf6359SManfred Spraul *
26c5cf6359SManfred Spraul * Implementation notes: (May 2010)
27c5cf6359SManfred Spraul * This file implements System V semaphores.
28c5cf6359SManfred Spraul *
29c5cf6359SManfred Spraul * User space visible behavior:
30c5cf6359SManfred Spraul * - FIFO ordering for semop() operations (just FIFO, not starvation
31c5cf6359SManfred Spraul *   protection)
32c5cf6359SManfred Spraul * - multiple semaphore operations that alter the same semaphore in
33c5cf6359SManfred Spraul *   one semop() are handled.
34c5cf6359SManfred Spraul * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
35c5cf6359SManfred Spraul *   SETALL calls.
36c5cf6359SManfred Spraul * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
37c5cf6359SManfred Spraul * - undo adjustments at process exit are limited to 0..SEMVMX.
38c5cf6359SManfred Spraul * - namespace are supported.
39b1989a3dSBhaskar Chowdhury * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
40c5cf6359SManfred Spraul *   to /proc/sys/kernel/sem.
41c5cf6359SManfred Spraul * - statistics about the usage are reported in /proc/sysvipc/sem.
42c5cf6359SManfred Spraul *
43c5cf6359SManfred Spraul * Internals:
44c5cf6359SManfred Spraul * - scalability:
45c5cf6359SManfred Spraul *   - all global variables are read-mostly.
46c5cf6359SManfred Spraul *   - semop() calls and semctl(RMID) are synchronized by RCU.
47c5cf6359SManfred Spraul *   - most operations do write operations (actually: spin_lock calls) to
48c5cf6359SManfred Spraul *     the per-semaphore array structure.
49c5cf6359SManfred Spraul *   Thus: Perfect SMP scaling between independent semaphore arrays.
50c5cf6359SManfred Spraul *         If multiple semaphores in one array are used, then cache line
51c5cf6359SManfred Spraul *         trashing on the semaphore array spinlock will limit the scaling.
522f2ed41dSManfred Spraul * - semncnt and semzcnt are calculated on demand in count_semcnt()
53c5cf6359SManfred Spraul * - the task that performs a successful semop() scans the list of all
54c5cf6359SManfred Spraul *   sleeping tasks and completes any pending operations that can be fulfilled.
55c5cf6359SManfred Spraul *   Semaphores are actively given to waiting tasks (necessary for FIFO).
56c5cf6359SManfred Spraul *   (see update_queue())
57c5cf6359SManfred Spraul * - To improve the scalability, the actual wake-up calls are performed after
589ae949faSDavidlohr Bueso *   dropping all locks. (see wake_up_sem_queue_prepare())
59c5cf6359SManfred Spraul * - All work is done by the waker, the woken up task does not have to do
60c5cf6359SManfred Spraul *   anything - not even acquiring a lock or dropping a refcount.
61c5cf6359SManfred Spraul * - A woken up task may not even touch the semaphore array anymore, it may
62c5cf6359SManfred Spraul *   have been destroyed already by a semctl(RMID).
63c5cf6359SManfred Spraul * - UNDO values are stored in an array (one per process and per
64c5cf6359SManfred Spraul *   semaphore array, lazily allocated). For backwards compatibility, multiple
65c5cf6359SManfred Spraul *   modes for the UNDO variables are supported (per process, per thread)
66c5cf6359SManfred Spraul *   (see copy_semundo, CLONE_SYSVSEM)
67c5cf6359SManfred Spraul * - There are two lists of the pending operations: a per-array list
68c5cf6359SManfred Spraul *   and per-semaphore list (stored in the array). This allows to achieve FIFO
69c5cf6359SManfred Spraul *   ordering without always scanning all pending operations.
70c5cf6359SManfred Spraul *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
711da177e4SLinus Torvalds */
721da177e4SLinus Torvalds
73b0d17578SArnd Bergmann#include <linux/compat.h>
741da177e4SLinus Torvalds#include <linux/slab.h>
751da177e4SLinus Torvalds#include <linux/spinlock.h>
761da177e4SLinus Torvalds#include <linux/init.h>
771da177e4SLinus Torvalds#include <linux/proc_fs.h>
781da177e4SLinus Torvalds#include <linux/time.h>
791da177e4SLinus Torvalds#include <linux/security.h>
801da177e4SLinus Torvalds#include <linux/syscalls.h>
811da177e4SLinus Torvalds#include <linux/audit.h>
82c59ede7bSRandy Dunlap#include <linux/capability.h>
8319b4946cSMike Waychison#include <linux/seq_file.h>
843e148c79SNadia Derbey#include <linux/rwsem.h>
85e3893534SKirill Korotaev#include <linux/nsproxy.h>
86ae5e1b22SPavel Emelyanov#include <linux/ipc_namespace.h>
8784f001e1SIngo Molnar#include <linux/sched/wake_q.h>
88ec67aaa4SDavidlohr Bueso#include <linux/nospec.h>
890eb71a9dSNeilBrown#include <linux/rhashtable.h>
905f921ae9SIngo Molnar
917153e402SPaul McQuade#include <linux/uaccess.h>
921da177e4SLinus Torvalds#include "util.h"
931da177e4SLinus Torvalds
941a5c1349SEric W. Biederman/* One semaphore structure for each semaphore in the system. */
951a5c1349SEric W. Biedermanstruct sem {
961a5c1349SEric W. Biederman	int	semval;		/* current value */
971a5c1349SEric W. Biederman	/*
981a5c1349SEric W. Biederman	 * PID of the process that last modified the semaphore. For
991a5c1349SEric W. Biederman	 * Linux, specifically these are:
1001a5c1349SEric W. Biederman	 *  - semop
1011a5c1349SEric W. Biederman	 *  - semctl, via SETVAL and SETALL.
1021a5c1349SEric W. Biederman	 *  - at task exit when performing undo adjustments (see exit_sem).
1031a5c1349SEric W. Biederman	 */
10451d6f263SEric W. Biederman	struct pid *sempid;
1051a5c1349SEric W. Biederman	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
1061a5c1349SEric W. Biederman	struct list_head pending_alter; /* pending single-sop operations */
1071a5c1349SEric W. Biederman					/* that alter the semaphore */
1081a5c1349SEric W. Biederman	struct list_head pending_const; /* pending single-sop operations */
1091a5c1349SEric W. Biederman					/* that do not alter the semaphore*/
1102a70b787SArnd Bergmann	time64_t	 sem_otime;	/* candidate for sem_otime */
1111a5c1349SEric W. Biederman} ____cacheline_aligned_in_smp;
1121a5c1349SEric W. Biederman
1131a5c1349SEric W. Biederman/* One sem_array data structure for each set of semaphores in the system. */
1141a5c1349SEric W. Biedermanstruct sem_array {
1151a5c1349SEric W. Biederman	struct kern_ipc_perm	sem_perm;	/* permissions .. see ipc.h */
1161a5c1349SEric W. Biederman	time64_t		sem_ctime;	/* create/last semctl() time */
1171a5c1349SEric W. Biederman	struct list_head	pending_alter;	/* pending operations */
1181a5c1349SEric W. Biederman						/* that alter the array */
1191a5c1349SEric W. Biederman	struct list_head	pending_const;	/* pending complex operations */
1201a5c1349SEric W. Biederman						/* that do not alter semvals */
1211a5c1349SEric W. Biederman	struct list_head	list_id;	/* undo requests on this array */
1221a5c1349SEric W. Biederman	int			sem_nsems;	/* no. of semaphores in array */
1231a5c1349SEric W. Biederman	int			complex_count;	/* pending complex operations */
1241a5c1349SEric W. Biederman	unsigned int		use_global_lock;/* >0: global lock required */
1251a5c1349SEric W. Biederman
1261a5c1349SEric W. Biederman	struct sem		sems[];
1271a5c1349SEric W. Biederman} __randomize_layout;
128e57940d7SManfred Spraul
129e57940d7SManfred Spraul/* One queue for each sleeping process in the system. */
130e57940d7SManfred Spraulstruct sem_queue {
131e57940d7SManfred Spraul	struct list_head	list;	 /* queue of pending operations */
132e57940d7SManfred Spraul	struct task_struct	*sleeper; /* this process */
133e57940d7SManfred Spraul	struct sem_undo		*undo;	 /* undo structure */
13451d6f263SEric W. Biederman	struct pid		*pid;	 /* process id of requesting process */
135e57940d7SManfred Spraul	int			status;	 /* completion status of operation */
136e57940d7SManfred Spraul	struct sembuf		*sops;	 /* array of pending operations */
137ed247b7cSManfred Spraul	struct sembuf		*blocking; /* the operation that blocked */
138e57940d7SManfred Spraul	int			nsops;	 /* number of operations */
1394ce33ec2SDavidlohr Bueso	bool			alter;	 /* does *sops alter the array? */
1404ce33ec2SDavidlohr Bueso	bool                    dupsop;	 /* sops on more than one sem_num */
141e57940d7SManfred Spraul};
142e57940d7SManfred Spraul
143e57940d7SManfred Spraul/* Each task has a list of undo requests. They are executed automatically
144e57940d7SManfred Spraul * when the process exits.
145e57940d7SManfred Spraul */
146e57940d7SManfred Spraulstruct sem_undo {
147e57940d7SManfred Spraul	struct list_head	list_proc;	/* per-process list: *
148e57940d7SManfred Spraul						 * all undos from one process
149e57940d7SManfred Spraul						 * rcu protected */
150e57940d7SManfred Spraul	struct rcu_head		rcu;		/* rcu struct for sem_undo */
151e57940d7SManfred Spraul	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
152e57940d7SManfred Spraul	struct list_head	list_id;	/* per semaphore array list:
153e57940d7SManfred Spraul						 * all undos for one array */
154e57940d7SManfred Spraul	int			semid;		/* semaphore set identifier */
155e57940d7SManfred Spraul	short			*semadj;	/* array of adjustments */
156e57940d7SManfred Spraul						/* one per semaphore */
157e57940d7SManfred Spraul};
158e57940d7SManfred Spraul
159e57940d7SManfred Spraul/* sem_undo_list controls shared access to the list of sem_undo structures
160e57940d7SManfred Spraul * that may be shared among all a CLONE_SYSVSEM task group.
161e57940d7SManfred Spraul */
162e57940d7SManfred Spraulstruct sem_undo_list {
163f74370b8SElena Reshetova	refcount_t		refcnt;
164e57940d7SManfred Spraul	spinlock_t		lock;
165e57940d7SManfred Spraul	struct list_head	list_proc;
166e57940d7SManfred Spraul};
167e57940d7SManfred Spraul
168e57940d7SManfred Spraul
169ed2ddbf8SPierre Peiffer#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])
170e3893534SKirill Korotaev
1717748dbfaSNadia Derbeystatic int newary(struct ipc_namespace *, struct ipc_params *);
17201b8b07aSPierre Peifferstatic void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
1731da177e4SLinus Torvalds#ifdef CONFIG_PROC_FS
17419b4946cSMike Waychisonstatic int sysvipc_sem_proc_show(struct seq_file *s, void *it);
1751da177e4SLinus Torvalds#endif
1761da177e4SLinus Torvalds
1771da177e4SLinus Torvalds#define SEMMSL_FAST	256 /* 512 bytes on stack */
1781da177e4SLinus Torvalds#define SEMOPM_FAST	64  /* ~ 372 bytes on stack */
1791da177e4SLinus Torvalds
1809de5ab8aSManfred Spraul/*
1819de5ab8aSManfred Spraul * Switching from the mode suitable for simple ops
1829de5ab8aSManfred Spraul * to the mode for complex ops is costly. Therefore:
1839de5ab8aSManfred Spraul * use some hysteresis
1849de5ab8aSManfred Spraul */
1859de5ab8aSManfred Spraul#define USE_GLOBAL_LOCK_HYSTERESIS	10
1869de5ab8aSManfred Spraul
1871da177e4SLinus Torvalds/*
188758a6ba3SManfred Spraul * Locking:
1895864a2fdSManfred Spraul * a) global sem_lock() for read/write
1901da177e4SLinus Torvalds *	sem_undo.id_next,
191758a6ba3SManfred Spraul *	sem_array.complex_count,
1925864a2fdSManfred Spraul *	sem_array.pending{_alter,_const},
1935864a2fdSManfred Spraul *	sem_array.sem_undo
19446c0a8caSPaul McQuade *
1955864a2fdSManfred Spraul * b) global or semaphore sem_lock() for read/write:
1961a233956SManfred Spraul *	sem_array.sems[i].pending_{const,alter}:
1975864a2fdSManfred Spraul *
1985864a2fdSManfred Spraul * c) special:
1995864a2fdSManfred Spraul *	sem_undo_list.list_proc:
2005864a2fdSManfred Spraul *	* undo_list->lock for write
2015864a2fdSManfred Spraul *	* rcu for read
2029de5ab8aSManfred Spraul *	use_global_lock:
2039de5ab8aSManfred Spraul *	* global sem_lock() for write
2049de5ab8aSManfred Spraul *	* either local or global sem_lock() for read.
2059de5ab8aSManfred Spraul *
2069de5ab8aSManfred Spraul * Memory ordering:
2079de5ab8aSManfred Spraul * Most ordering is enforced by using spin_lock() and spin_unlock().
2088116b54eSManfred Spraul *
2098116b54eSManfred Spraul * Exceptions:
2108116b54eSManfred Spraul * 1) use_global_lock: (SEM_BARRIER_1)
2119de5ab8aSManfred Spraul * Setting it from non-zero to 0 is a RELEASE, this is ensured by
2128116b54eSManfred Spraul * using smp_store_release(): Immediately after setting it to 0,
2138116b54eSManfred Spraul * a simple op can start.
2149de5ab8aSManfred Spraul * Testing if it is non-zero is an ACQUIRE, this is ensured by using
2159de5ab8aSManfred Spraul * smp_load_acquire().
2169de5ab8aSManfred Spraul * Setting it from 0 to non-zero must be ordered with regards to
2179de5ab8aSManfred Spraul * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
2189de5ab8aSManfred Spraul * is inside a spin_lock() and after a write from 0 to non-zero a
2199de5ab8aSManfred Spraul * spin_lock()+spin_unlock() is done.
22017d056e0SManfred Spraul * To prevent the compiler/cpu temporarily writing 0 to use_global_lock,
22117d056e0SManfred Spraul * READ_ONCE()/WRITE_ONCE() is used.
2228116b54eSManfred Spraul *
2238116b54eSManfred Spraul * 2) queue.status: (SEM_BARRIER_2)
2248116b54eSManfred Spraul * Initialization is done while holding sem_lock(), so no further barrier is
2258116b54eSManfred Spraul * required.
2268116b54eSManfred Spraul * Setting it to a result code is a RELEASE, this is ensured by both a
2278116b54eSManfred Spraul * smp_store_release() (for case a) and while holding sem_lock()
2288116b54eSManfred Spraul * (for case b).
229b1989a3dSBhaskar Chowdhury * The ACQUIRE when reading the result code without holding sem_lock() is
2308116b54eSManfred Spraul * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
2318116b54eSManfred Spraul * (case a above).
2328116b54eSManfred Spraul * Reading the result code while holding sem_lock() needs no further barriers,
2338116b54eSManfred Spraul * the locks inside sem_lock() enforce ordering (case b above)
2348116b54eSManfred Spraul *
2358116b54eSManfred Spraul * 3) current->state:
2368116b54eSManfred Spraul * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock().
2378116b54eSManfred Spraul * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may
2388116b54eSManfred Spraul * happen immediately after calling wake_q_add. As wake_q_add_safe() is called
2398116b54eSManfred Spraul * when holding sem_lock(), no further barriers are required.
2408116b54eSManfred Spraul *
2418116b54eSManfred Spraul * See also ipc/mqueue.c for more details on the covered races.
2421da177e4SLinus Torvalds */
2431da177e4SLinus Torvalds
244e3893534SKirill Korotaev#define sc_semmsl	sem_ctls[0]
245e3893534SKirill Korotaev#define sc_semmns	sem_ctls[1]
246e3893534SKirill Korotaev#define sc_semopm	sem_ctls[2]
247e3893534SKirill Korotaev#define sc_semmni	sem_ctls[3]
248e3893534SKirill Korotaev
249eae04d25SDavidlohr Buesovoid sem_init_ns(struct ipc_namespace *ns)
250e3893534SKirill Korotaev{
251e3893534SKirill Korotaev	ns->sc_semmsl = SEMMSL;
252e3893534SKirill Korotaev	ns->sc_semmns = SEMMNS;
253e3893534SKirill Korotaev	ns->sc_semopm = SEMOPM;
254e3893534SKirill Korotaev	ns->sc_semmni = SEMMNI;
255e3893534SKirill Korotaev	ns->used_sems = 0;
256eae04d25SDavidlohr Bueso	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
257e3893534SKirill Korotaev}
258e3893534SKirill Korotaev
259ae5e1b22SPavel Emelyanov#ifdef CONFIG_IPC_NS
260e3893534SKirill Korotaevvoid sem_exit_ns(struct ipc_namespace *ns)
261e3893534SKirill Korotaev{
26201b8b07aSPierre Peiffer	free_ipcs(ns, &sem_ids(ns), freeary);
2637d6feeb2SSerge E. Hallyn	idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
2640cfb6aeeSGuillaume Knispel	rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
265e3893534SKirill Korotaev}
266ae5e1b22SPavel Emelyanov#endif
2671da177e4SLinus Torvalds
268eae04d25SDavidlohr Buesovoid __init sem_init(void)
2691da177e4SLinus Torvalds{
270eae04d25SDavidlohr Bueso	sem_init_ns(&init_ipc_ns);
27119b4946cSMike Waychison	ipc_init_proc_interface("sysvipc/sem",
27219b4946cSMike Waychison				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
273e3893534SKirill Korotaev				IPC_SEM_IDS, sysvipc_sem_proc_show);
2741da177e4SLinus Torvalds}
2751da177e4SLinus Torvalds
276f269f40aSManfred Spraul/**
277f269f40aSManfred Spraul * unmerge_queues - unmerge queues, if possible.
278f269f40aSManfred Spraul * @sma: semaphore array
279f269f40aSManfred Spraul *
280f269f40aSManfred Spraul * The function unmerges the wait queues if complex_count is 0.
281f269f40aSManfred Spraul * It must be called prior to dropping the global semaphore array lock.
282f269f40aSManfred Spraul */
283f269f40aSManfred Spraulstatic void unmerge_queues(struct sem_array *sma)
284f269f40aSManfred Spraul{
285f269f40aSManfred Spraul	struct sem_queue *q, *tq;
286f269f40aSManfred Spraul
287f269f40aSManfred Spraul	/* complex operations still around? */
288f269f40aSManfred Spraul	if (sma->complex_count)
289f269f40aSManfred Spraul		return;
290f269f40aSManfred Spraul	/*
291f269f40aSManfred Spraul	 * We will switch back to simple mode.
292f269f40aSManfred Spraul	 * Move all pending operation back into the per-semaphore
293f269f40aSManfred Spraul	 * queues.
294f269f40aSManfred Spraul	 */
295f269f40aSManfred Spraul	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
296f269f40aSManfred Spraul		struct sem *curr;
2971a233956SManfred Spraul		curr = &sma->sems[q->sops[0].sem_num];
298f269f40aSManfred Spraul
299f269f40aSManfred Spraul		list_add_tail(&q->list, &curr->pending_alter);
300f269f40aSManfred Spraul	}
301f269f40aSManfred Spraul	INIT_LIST_HEAD(&sma->pending_alter);
302f269f40aSManfred Spraul}
303f269f40aSManfred Spraul
304f269f40aSManfred Spraul/**
3058001c858SDavidlohr Bueso * merge_queues - merge single semop queues into global queue
306f269f40aSManfred Spraul * @sma: semaphore array
307f269f40aSManfred Spraul *
308f269f40aSManfred Spraul * This function merges all per-semaphore queues into the global queue.
309f269f40aSManfred Spraul * It is necessary to achieve FIFO ordering for the pending single-sop
310f269f40aSManfred Spraul * operations when a multi-semop operation must sleep.
311f269f40aSManfred Spraul * Only the alter operations must be moved, the const operations can stay.
312f269f40aSManfred Spraul */
313f269f40aSManfred Spraulstatic void merge_queues(struct sem_array *sma)
314f269f40aSManfred Spraul{
315f269f40aSManfred Spraul	int i;
316f269f40aSManfred Spraul	for (i = 0; i < sma->sem_nsems; i++) {
3171a233956SManfred Spraul		struct sem *sem = &sma->sems[i];
318f269f40aSManfred Spraul
319f269f40aSManfred Spraul		list_splice_init(&sem->pending_alter, &sma->pending_alter);
320f269f40aSManfred Spraul	}
321f269f40aSManfred Spraul}
322f269f40aSManfred Spraul
32353dad6d3SDavidlohr Buesostatic void sem_rcu_free(struct rcu_head *head)
32453dad6d3SDavidlohr Bueso{
325dba4cdd3SManfred Spraul	struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu);
326dba4cdd3SManfred Spraul	struct sem_array *sma = container_of(p, struct sem_array, sem_perm);
32753dad6d3SDavidlohr Bueso
328aefad959SEric W. Biederman	security_sem_free(&sma->sem_perm);
329e2029dfeSKees Cook	kvfree(sma);
33053dad6d3SDavidlohr Bueso}
33153dad6d3SDavidlohr Bueso
3325e9d5275SManfred Spraul/*
3335864a2fdSManfred Spraul * Enter the mode suitable for non-simple operations:
3345e9d5275SManfred Spraul * Caller must own sem_perm.lock.
3355e9d5275SManfred Spraul */
3365864a2fdSManfred Spraulstatic void complexmode_enter(struct sem_array *sma)
3375e9d5275SManfred Spraul{
338