sem.c revision d9a605e4
/*
 * linux/ipc/sem.c
 * Copyright (C) 1992 Krishna Balasubramanian
 * Copyright (C) 1995 Eric Schenk, Bruno Haible
 *
 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 *
 * SMP-threaded, sysctl's added
 * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * Enforced range limit on SEM_UNDO
 * (c) 2001 Red Hat Inc
 * Lockless wakeup
 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
 * Further wakeup optimizations, documentation
 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Implementation notes: (May 2010)
 * This file implements System V semaphores.
 *
 * User space visible behavior:
 * - FIFO ordering for semop() operations (just FIFO, not starvation
 *   protection)
 * - multiple semaphore operations that alter the same semaphore in
 *   one semop() are handled.
 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
 *   SETALL calls.
 * - two Linux-specific semctl() commands: SEM_STAT, SEM_INFO.
 * - undo adjustments at process exit are limited to 0..SEMVMX.
 * - namespaces are supported.
 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
 *   to /proc/sys/kernel/sem.
 * - statistics about the usage are reported in /proc/sysvipc/sem.
 *
 * Internals:
 * - scalability:
 *   - all global variables are read-mostly.
 *   - semop() calls and semctl(RMID) are synchronized by RCU.
 *   - most operations perform write accesses (actually: spin_lock calls)
 *     only on the per-semaphore array structure.
 *   Thus: Perfect SMP scaling between independent semaphore arrays.
 *         If multiple semaphores in one array are used, then cache line
 *         thrashing on the semaphore array spinlock will limit the scaling.
 * - semncnt and semzcnt are calculated on demand in count_semncnt() and
 *   count_semzcnt()
 * - the task that performs a successful semop() scans the list of all
 *   sleeping tasks and completes any pending operations that can be fulfilled.
 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
 *   (see update_queue())
 * - To improve the scalability, the actual wake-up calls are performed after
 *   dropping all locks. (see wake_up_sem_queue_prepare(),
 *   wake_up_sem_queue_do())
 * - All work is done by the waker, the woken up task does not have to do
 *   anything - not even acquiring a lock or dropping a refcount.
 * - A woken up task may not even touch the semaphore array anymore, it may
 *   have been destroyed already by a semctl(RMID).
 * - The synchronization between wake-ups due to a timeout/signal and a
 *   wake-up due to a completed semaphore operation is achieved by using an
 *   intermediate state (IN_WAKEUP).
 * - UNDO values are stored in an array (one per process and per
 *   semaphore array, lazily allocated). For backwards compatibility, multiple
 *   modes for the UNDO variables are supported (per process, per thread)
 *   (see copy_semundo, CLONE_SYSVSEM)
 * - There are two lists of the pending operations: a per-array list
 *   and a per-semaphore list (stored in the array). This makes it possible
 *   to achieve FIFO ordering without always scanning all pending operations.
 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
 */

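/*
 * Illustrative userspace sketch of the semop() behavior documented above
 * (not part of the kernel; a minimal, hypothetical example):
 *
 *	#include <sys/ipc.h>
 *	#include <sys/sem.h>
 *
 *	int id = semget(IPC_PRIVATE, 2, IPC_CREAT | 0600);
 *	struct sembuf ops[2] = {
 *		{ .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO },
 *		{ .sem_num = 1, .sem_op = 1, .sem_flg = SEM_UNDO },
 *	};
 *	semop(id, ops, 2);
 *
 * Both operations are applied atomically: if semaphore 0 is 0, the task
 * sleeps in FIFO order behind earlier waiters, and the SEM_UNDO
 * adjustments are rolled back automatically at process exit.
 */
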
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/time.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>

#include <asm/uaccess.h>
#include "util.h"

/* One semaphore structure for each semaphore in the system. */
struct sem {
	int	semval;		/* current value */
	int	sempid;		/* pid of last operation */
	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
	struct list_head pending_alter; /* pending single-sop operations */
					/* that alter the semaphore */
	struct list_head pending_const; /* pending single-sop operations */
					/* that do not alter the semaphore */
	time_t	sem_otime;	/* candidate for sem_otime */
} ____cacheline_aligned_in_smp;

/* One queue for each sleeping process in the system. */
struct sem_queue {
	struct list_head	list;	 /* queue of pending operations */
	struct task_struct	*sleeper; /* this process */
	struct sem_undo		*undo;	 /* undo structure */
	int			pid;	 /* process id of requesting process */
	int			status;	 /* completion status of operation */
	struct sembuf		*sops;	 /* array of pending operations */
	int			nsops;	 /* number of operations */
	int			alter;	 /* does *sops alter the array? */
};

/* Each task has a list of undo requests. They are executed automatically
 * when the process exits.
 */
struct sem_undo {
	struct list_head	list_proc;	/* per-process list: *
						 * all undos from one process
						 * rcu protected */
	struct rcu_head		rcu;		/* rcu struct for sem_undo */
	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
	struct list_head	list_id;	/* per semaphore array list:
						 * all undos for one array */
	int			semid;		/* semaphore set identifier */
	short			*semadj;	/* array of adjustments */
						/* one per semaphore */
};

/* sem_undo_list controls shared access to the list of sem_undo structures
 * that may be shared among all tasks in a CLONE_SYSVSEM task group.
 */
struct sem_undo_list {
	atomic_t		refcnt;
	spinlock_t		lock;
	struct list_head	list_proc;
};


#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])

#define sem_checkid(sma, semid)	ipc_checkid(&sma->sem_perm, semid)

static int newary(struct ipc_namespace *, struct ipc_params *);
static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#endif

#define SEMMSL_FAST	256 /* 512 bytes on stack */
#define SEMOPM_FAST	64  /* ~ 372 bytes on stack */

/*
 * Locking:
 *	sem_undo.id_next,
 *	sem_array.complex_count,
 *	sem_array.pending{_alter,_const},
 *	sem_array.sem_undo: global sem_lock() for read/write
 *	sem_undo.proc_next: only "current" is allowed to read/write that field.
 *
 *	sem_array.sem_base[i].pending_{const,alter}:
 *		global or semaphore sem_lock() for read/write
 */

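/*
 * Illustrative caller pattern for the fastpath/slowpath locking described
 * above (a sketch only; the real callers are semtimedop() and the
 * semctl() helpers below):
 *
 *	rcu_read_lock();
 *	sma = sem_obtain_object_check(ns, semid);
 *	locknum = sem_lock(sma, sops, nsops);
 *	... critical section: one semaphore, or the whole array ...
 *	sem_unlock(sma, locknum);
 *	rcu_read_unlock();
 */
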
#define sc_semmsl	sem_ctls[0]
#define sc_semmns	sem_ctls[1]
#define sc_semopm	sem_ctls[2]
#define sc_semmni	sem_ctls[3]

void sem_init_ns(struct ipc_namespace *ns)
{
	ns->sc_semmsl = SEMMSL;
	ns->sc_semmns = SEMMNS;
	ns->sc_semopm = SEMOPM;
	ns->sc_semmni = SEMMNI;
	ns->used_sems = 0;
	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}

#ifdef CONFIG_IPC_NS
void sem_exit_ns(struct ipc_namespace *ns)
{
	free_ipcs(ns, &sem_ids(ns), freeary);
	idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
}
#endif

void __init sem_init(void)
{
	sem_init_ns(&init_ipc_ns);
	ipc_init_proc_interface("sysvipc/sem",
				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
				IPC_SEM_IDS, sysvipc_sem_proc_show);
}

/**
 * unmerge_queues - unmerge queues, if possible.
 * @sma: semaphore array
 *
 * The function unmerges the wait queues if complex_count is 0.
 * It must be called prior to dropping the global semaphore array lock.
 */
static void unmerge_queues(struct sem_array *sma)
{
	struct sem_queue *q, *tq;

	/* complex operations still around? */
	if (sma->complex_count)
		return;
	/*
	 * We will switch back to simple mode.
	 * Move all pending operations back into the per-semaphore
	 * queues.
	 */
	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
		struct sem *curr;
		curr = &sma->sem_base[q->sops[0].sem_num];

		list_add_tail(&q->list, &curr->pending_alter);
	}
	INIT_LIST_HEAD(&sma->pending_alter);
}

/**
 * merge_queues - Merge single semop queues into global queue
 * @sma: semaphore array
 *
 * This function merges all per-semaphore queues into the global queue.
 * It is necessary to achieve FIFO ordering for the pending single-sop
 * operations when a multi-semop operation must sleep.
 * Only the alter operations must be moved, the const operations can stay.
 */
static void merge_queues(struct sem_array *sma)
{
	int i;
	for (i = 0; i < sma->sem_nsems; i++) {
		struct sem *sem = sma->sem_base + i;

		list_splice_init(&sem->pending_alter, &sma->pending_alter);
	}
}

/*
 * If the request contains only one semaphore operation, and there are
 * no complex transactions pending, lock only the semaphore involved.
 * Otherwise, lock the entire semaphore array, since we either have
 * multiple semaphores in our own semops, or we need to look at
 * semaphores from other pending complex operations.
 *
 * Carefully guard against sma->complex_count changing between zero
 * and non-zero while we are spinning for the lock. The value of
 * sma->complex_count cannot change while we are holding the lock,
 * so sem_unlock should be fine.
 *
 * The global lock path checks that all the local locks have been released,
 * checking each local lock once. This means that the local lock paths
 * cannot start their critical sections while the global lock is held.
 */
static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
			      int nsops)
{
	int locknum;
 again:
	if (nsops == 1 && !sma->complex_count) {
		struct sem *sem = sma->sem_base + sops->sem_num;

		/* Lock just the semaphore we are interested in. */
		spin_lock(&sem->lock);

		/*
		 * If sma->complex_count was set while we were spinning,
		 * we may need to look at things we did not lock here.
		 */
		if (unlikely(sma->complex_count)) {
			spin_unlock(&sem->lock);
			goto lock_array;
		}

		/*
		 * Another process is holding the global lock on the
		 * sem_array; we cannot enter our critical section,
		 * but have to wait for the global lock to be released.
		 */
		if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
			spin_unlock(&sem->lock);
			spin_unlock_wait(&sma->sem_perm.lock);
			goto again;
		}

		locknum = sops->sem_num;
	} else {
		int i;
		/*
		 * Lock the semaphore array, and wait for all of the
		 * individual semaphore locks to go away.  The code
		 * above ensures no new single-lock holders will enter
		 * their critical section while the array lock is held.
		 */
 lock_array:
		ipc_lock_object(&sma->sem_perm);
		for (i = 0; i < sma->sem_nsems; i++) {
			struct sem *sem = sma->sem_base + i;
			spin_unlock_wait(&sem->lock);
		}
		locknum = -1;
	}
	return locknum;
}

static inline void sem_unlock(struct sem_array *sma, int locknum)
{
	if (locknum == -1) {
		unmerge_queues(sma);
		ipc_unlock_object(&sma->sem_perm);
	} else {
		struct sem *sem = sma->sem_base + locknum;
		spin_unlock(&sem->lock);
	}
}

/*
 * sem_lock_(check_) routines are called in the paths where the rwsem
 * is not held.
 *
 * The caller holds the RCU read lock.
 */
static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
			int id, struct sembuf *sops, int nsops, int *locknum)
{
	struct kern_ipc_perm *ipcp;
	struct sem_array *sma;

	ipcp = ipc_obtain_object(&sem_ids(ns), id);
	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	sma = container_of(ipcp, struct sem_array, sem_perm);
	*locknum = sem_lock(sma, sops, nsops);

	/* ipc_rmid() may have already freed the ID while sem_lock
	 * was spinning: verify that the structure is still valid
	 */
	if (!ipcp->deleted)
		return container_of(ipcp, struct sem_array, sem_perm);

	sem_unlock(sma, *locknum);
	return ERR_PTR(-EINVAL);
}

static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object(&sem_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct sem_array, sem_perm);
}

static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
							int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct sem_array, sem_perm);
}

static inline void sem_lock_and_putref(struct sem_array *sma)
{
	sem_lock(sma, NULL, -1);
	ipc_rcu_putref(sma);
}

static inline void sem_putref(struct sem_array *sma)
{
	ipc_rcu_putref(sma);
}

static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
{
	ipc_rmid(&sem_ids(ns), &s->sem_perm);
}

/*
 * Lockless wakeup algorithm:
 * Without the check/retry algorithm a lockless wakeup is possible:
 * - queue.status is initialized to -EINTR before blocking.
 * - wakeup is performed by
 *	* unlinking the queue entry from the pending list
 *	* setting queue.status to IN_WAKEUP
 *	  This is the notification for the blocked thread that a
 *	  result value is imminent.
 *	* call wake_up_process
 *	* set queue.status to the final value.
 * - the previously blocked thread checks queue.status:
 *	* if it's IN_WAKEUP, then it must wait until the value changes
 *	* if it's not -EINTR, then the operation was completed by
 *	  update_queue. semtimedop can return queue.status without
 *	  performing any operation on the sem array.
 *	* otherwise it must acquire the spinlock and check what's up.
 *
 * The two-stage algorithm is necessary to protect against the following
 * races:
 * - if queue.status is set after wake_up_process, then the woken up idle
 *   thread could race forward and try (and fail) to acquire sma->lock
 *   before update_queue had a chance to set queue.status
 * - if queue.status is written before wake_up_process and if the
 *   blocked process is woken up by a signal between writing
 *   queue.status and the wake_up_process, then the woken up
 *   process could return from semtimedop and die by calling
 *   sys_exit before wake_up_process is called. Then wake_up_process
 *   will oops, because the task structure is already invalid.
 *   (yes, this happened on s390 with sysv msg).
 *
 */
#define IN_WAKEUP	1
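
/*
 * Waiter-side counterpart of the protocol above, as a sketch (the real
 * code is get_queue_result() and the sleep loop in semtimedop()).
 * The waiter spins while the status is IN_WAKEUP, then trusts the
 * published value:
 *
 *	error = q->status;
 *	while (error == IN_WAKEUP) {
 *		cpu_relax();
 *		error = q->status;
 *	}
 *	if (error != -EINTR)
 *		return error;
 */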

/**
 * newary - Create a new semaphore set
 * @ns: namespace
 * @params: ptr to the structure that contains key, semflg and nsems
 *
 * Called with sem_ids.rwsem held (as a writer)
 */
static int newary(struct ipc_namespace *ns, struct ipc_params *params)
{
	int id;
	int retval;
	struct sem_array *sma;
	int size;
	key_t key = params->key;
	int nsems = params->u.nsems;
	int semflg = params->flg;
	int i;

	if (!nsems)
		return -EINVAL;
	if (ns->used_sems + nsems > ns->sc_semmns)
		return -ENOSPC;

	size = sizeof(*sma) + nsems * sizeof(struct sem);
	sma = ipc_rcu_alloc(size);
	if (!sma)
		return -ENOMEM;

	memset(sma, 0, size);

	sma->sem_perm.mode = (semflg & S_IRWXUGO);
	sma->sem_perm.key = key;

	sma->sem_perm.security = NULL;
	retval = security_sem_alloc(sma);
	if (retval) {
		ipc_rcu_putref(sma);
		return retval;
	}

	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
	if (id < 0) {
		security_sem_free(sma);
		ipc_rcu_putref(sma);
		return id;
	}
	ns->used_sems += nsems;

	sma->sem_base = (struct sem *) &sma[1];

	for (i = 0; i < nsems; i++) {
		INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
		INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
		spin_lock_init(&sma->sem_base[i].lock);
	}

	sma->complex_count = 0;
	INIT_LIST_HEAD(&sma->pending_alter);
	INIT_LIST_HEAD(&sma->pending_const);
	INIT_LIST_HEAD(&sma->list_id);
	sma->sem_nsems = nsems;
	sma->sem_ctime = get_seconds();
	sem_unlock(sma, -1);
	rcu_read_unlock();

	return sma->sem_perm.id;
}


/*
 * Called with sem_ids.rwsem and ipcp locked.
 */
static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
{
	struct sem_array *sma;

	sma = container_of(ipcp, struct sem_array, sem_perm);
	return security_sem_associate(sma, semflg);
}

/*
 * Called with sem_ids.rwsem and ipcp locked.
 */
static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
				struct ipc_params *params)
{
	struct sem_array *sma;

	sma = container_of(ipcp, struct sem_array, sem_perm);
	if (params->u.nsems > sma->sem_nsems)
		return -EINVAL;

	return 0;
}

SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
{
	struct ipc_namespace *ns;
	struct ipc_ops sem_ops;
	struct ipc_params sem_params;

	ns = current->nsproxy->ipc_ns;

	if (nsems < 0 || nsems > ns->sc_semmsl)
		return -EINVAL;

	sem_ops.getnew = newary;
	sem_ops.associate = sem_security;
	sem_ops.more_checks = sem_more_checks;

	sem_params.key = key;
	sem_params.flg = semflg;
	sem_params.u.nsems = nsems;

	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
}

/**
 * perform_atomic_semop - Perform (if possible) a semaphore operation
 * @sma: semaphore array
 * @sops: array with operations that should be checked
 * @nsops: number of operations
 * @un: undo array
 * @pid: pid that did the change
 *
 * Returns 0 if the operation was possible.
 * Returns 1 if the operation is impossible, the caller must sleep.
 * Negative values are error codes.
 */
static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
			     int nsops, struct sem_undo *un, int pid)
{
	int result, sem_op;
	struct sembuf *sop;
	struct sem *curr;

	for (sop = sops; sop < sops + nsops; sop++) {
		curr = sma->sem_base + sop->sem_num;
		sem_op = sop->sem_op;
		result = curr->semval;

		if (!sem_op && result)
			goto would_block;

		result += sem_op;
		if (result < 0)
			goto would_block;
		if (result > SEMVMX)
			goto out_of_range;
		if (sop->sem_flg & SEM_UNDO) {
			int undo = un->semadj[sop->sem_num] - sem_op;
			/* Exceeding the undo range is an error. */
			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
				goto out_of_range;
		}
		curr->semval = result;
	}

	sop--;
	while (sop >= sops) {
		sma->sem_base[sop->sem_num].sempid = pid;
		if (sop->sem_flg & SEM_UNDO)
			un->semadj[sop->sem_num] -= sop->sem_op;
		sop--;
	}

	return 0;

out_of_range:
	result = -ERANGE;
	goto undo;

would_block:
	if (sop->sem_flg & IPC_NOWAIT)
		result = -EAGAIN;
	else
		result = 1;

undo:
	sop--;
	while (sop >= sops) {
		sma->sem_base[sop->sem_num].semval -= sop->sem_op;
		sop--;
	}

	return result;
}
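
/*
 * Worked example (illustrative): with sem[0].semval == 1 and
 * sem[1].semval == 0, the operations { {0, -1}, {1, -1} } are applied
 * in order. sem[0] is decremented to 0, but sem[1] would go negative,
 * so we jump to would_block and the undo loop restores sem[0] to 1.
 * The caller then returns -EAGAIN (IPC_NOWAIT) or sleeps until both
 * operations can succeed atomically.
 */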

/**
 * wake_up_sem_queue_prepare - prepare wake-up
 * @pt: list head to collect the tasks that must be woken up
 * @q: queue entry that must be signaled
 * @error: Error value for the signal
 *
 * Prepare the wake-up of the queue entry q.
 */
static void wake_up_sem_queue_prepare(struct list_head *pt,
				struct sem_queue *q, int error)
{
	if (list_empty(pt)) {
		/*
		 * Hold preempt off so that we don't get preempted and have the
		 * wakee busy-wait until we're scheduled back on.
		 */
		preempt_disable();
	}
	q->status = IN_WAKEUP;
	q->pid = error;

	list_add_tail(&q->list, pt);
}

/**
 * wake_up_sem_queue_do - do the actual wake-up
 * @pt: list of tasks to be woken up
 *
 * Do the actual wake-up.
 * The function is called without any locks held, thus the semaphore array
 * could be destroyed already and the tasks can disappear as soon as the
 * status is set to the actual return code.
 */
static void wake_up_sem_queue_do(struct list_head *pt)
{
	struct sem_queue *q, *t;
	int did_something;

	did_something = !list_empty(pt);
	list_for_each_entry_safe(q, t, pt, list) {
		wake_up_process(q->sleeper);
		/* q can disappear immediately after writing q->status. */
		smp_wmb();
		q->status = q->pid;
	}
	if (did_something)
		preempt_enable();
}

static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
{
	list_del(&q->list);
	if (q->nsops > 1)
		sma->complex_count--;
}

/**
 * check_restart - check whether update_queue() must restart its scan
 * @sma: semaphore array
 * @q: the operation that just completed
 *
 * update_queue is O(N^2) when it restarts scanning the whole queue of
 * waiting operations. Therefore this function checks if the restart is
 * really necessary. It is called after a previously waiting operation
 * modified the array.
 * Note that wait-for-zero operations are handled without restart.
 */
static int check_restart(struct sem_array *sma, struct sem_queue *q)
{
	/* pending complex alter operations are too difficult to analyse */
	if (!list_empty(&sma->pending_alter))
		return 1;

	/* we were a sleeping complex operation. Too difficult */
	if (q->nsops > 1)
		return 1;

	/* It is impossible that someone waits for the new value:
	 * - complex operations always restart.
	 * - wait-for-zero are handled separately.
	 * - q is a previously sleeping simple operation that
	 *   altered the array. It must be a decrement, because
	 *   simple increments never sleep.
	 * - If there are older (higher priority) decrements
	 *   in the queue, then they have observed the original
	 *   semval value and couldn't proceed. The operation
	 *   decremented the value - thus they won't proceed either.
	 */
	return 0;
}

/**
 * wake_const_ops - wake up non-alter tasks
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @pt: list head for the tasks that must be woken up.
 *
 * wake_const_ops must be called after a semaphore in a semaphore array
 * was set to 0. If complex const operations are pending, wake_const_ops must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @pt. The return code
 * is stored in q->pid.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int wake_const_ops(struct sem_array *sma, int semnum,
				struct list_head *pt)
{
	struct sem_queue *q;
	struct list_head *walk;
	struct list_head *pending_list;
	int semop_completed = 0;

	if (semnum == -1)
		pending_list = &sma->pending_const;
	else
		pending_list = &sma->sem_base[semnum].pending_const;

	walk = pending_list->next;
	while (walk != pending_list) {
		int error;

		q = container_of(walk, struct sem_queue, list);
		walk = walk->next;

		error = perform_atomic_semop(sma, q->sops, q->nsops,
						 q->undo, q->pid);

		if (error <= 0) {
			/* operation completed, remove from queue & wakeup */

			unlink_queue(sma, q);

			wake_up_sem_queue_prepare(pt, q, error);
			if (error == 0)
				semop_completed = 1;
		}
	}
	return semop_completed;
}

/**
 * do_smart_wakeup_zero - wake up all wait-for-zero tasks
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @pt: list head of the tasks that must be woken up.
 *
 * do_smart_wakeup_zero() checks all required queues for wait-for-zero
 * operations, based on the actual changes that were performed on the
 * semaphore array.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
					int nsops, struct list_head *pt)
{
	int i;
	int semop_completed = 0;
	int got_zero = 0;

	/* first: the per-semaphore queues, if known */
	if (sops) {
		for (i = 0; i < nsops; i++) {
			int num = sops[i].sem_num;

			if (sma->sem_base[num].semval == 0) {
				got_zero = 1;
				semop_completed |= wake_const_ops(sma, num, pt);
			}
		}
	} else {
		/*
		 * No sops means the modified semaphores are not known.
		 * Assume all were changed.
		 */
		for (i = 0; i < sma->sem_nsems; i++) {
			if (sma->sem_base[i].semval == 0) {
				got_zero = 1;
				semop_completed |= wake_const_ops(sma, i, pt);
			}
		}
	}
	/*
	 * If one of the modified semaphores got 0,
	 * then check the global queue, too.
	 */
	if (got_zero)
		semop_completed |= wake_const_ops(sma, -1, pt);

	return semop_completed;
}


/**
 * update_queue - look for tasks that can be completed
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @pt: list head for the tasks that must be woken up.
 *
 * update_queue must be called after a semaphore in a semaphore array
 * was modified. If multiple semaphores were modified, update_queue must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @pt. The return code
 * is stored in q->pid.
 * The function internally checks if const operations can now succeed.
 *
 * The function returns 1 if at least one semop was completed successfully.
 */
static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
{
	struct sem_queue *q;
	struct list_head *walk;
	struct list_head *pending_list;
	int semop_completed = 0;

	if (semnum == -1)
		pending_list = &sma->pending_alter;
	else
		pending_list = &sma->sem_base[semnum].pending_alter;

again:
	walk = pending_list->next;
	while (walk != pending_list) {
		int error, restart;

		q = container_of(walk, struct sem_queue, list);
		walk = walk->next;

		/* If we are scanning the single-sop, per-semaphore list of
		 * one semaphore and that semaphore is 0, then it is not
		 * necessary to scan further: simple increments
		 * that affect only one entry succeed immediately and cannot
		 * be in the per-semaphore pending queue, and decrements
		 * cannot be successful if the value is already 0.
		 */
		if (semnum != -1 && sma->sem_base[semnum].semval == 0)
			break;

		error = perform_atomic_semop(sma, q->sops, q->nsops,
					 q->undo, q->pid);

		/* Does q->sleeper still need to sleep? */
		if (error > 0)
			continue;

		unlink_queue(sma, q);

		if (error) {
			restart = 0;
		} else {
			semop_completed = 1;
			do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
			restart = check_restart(sma, q);
		}

		wake_up_sem_queue_prepare(pt, q, error);
		if (restart)
			goto again;
	}
	return semop_completed;
}

/**
 * do_smart_update - optimized update_queue
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @otime: force setting otime
 * @pt: list head of the tasks that must be woken up.
 *
 * do_smart_update() does the required calls to update_queue() and
 * do_smart_wakeup_zero(), based on the actual changes that were performed
 * on the semaphore array.
 * Note that the function does not do the actual wake-up: the caller is
 * responsible for calling wake_up_sem_queue_do(@pt).
 * It is safe to perform this call after dropping all locks.
 */
static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
			int otime, struct list_head *pt)
{
	int i;

	otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);

	if (!list_empty(&sma->pending_alter)) {
		/* semaphore array uses the global queue - just process it. */
		otime |= update_queue(sma, -1, pt);
	} else {
		if (!sops) {
			/*
			 * No sops, thus the modified semaphores are not
			 * known. Check all.
			 */
			for (i = 0; i < sma->sem_nsems; i++)
				otime |= update_queue(sma, i, pt);
		} else {
			/*
			 * Check the semaphores that were increased:
			 * - No complex ops, thus all sleeping ops are
			 *   decrements.
			 * - if we decreased the value, then any sleeping
			 *   semaphore ops won't be able to run: if the
			 *   previous value was too small, then the new
			 *   value will be too small, too.
			 */
			for (i = 0; i < nsops; i++) {
				if (sops[i].sem_op > 0) {
					otime |= update_queue(sma,
							sops[i].sem_num, pt);
				}
			}
		}
	}
	if (otime) {
		if (sops == NULL) {
			sma->sem_base[0].sem_otime = get_seconds();
		} else {
			sma->sem_base[sops[0].sem_num].sem_otime =
								get_seconds();
		}
	}
}


/* The following counts are associated to each semaphore:
 *   semncnt        number of tasks waiting on semval being nonzero
 *   semzcnt        number of tasks waiting on semval being zero
 * This model assumes that a task waits on exactly one semaphore.
 * Since semaphore operations are to be performed atomically, tasks actually
 * wait on a whole sequence of semaphores simultaneously.
 * The counts we return here are a rough approximation, but still
 * warrant that semncnt+semzcnt>0 if the task is on the pending queue.
 */
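/*
 * Example (illustrative): a task sleeping on the two wait-for-zero
 * operations { {0, 0}, {1, 0} } sits on the global pending_const list
 * and is counted in both semzcnt(0) and semzcnt(1); a task sleeping on
 * a single { {0, -1} } sits on semaphore 0's pending_alter list and is
 * counted in semncnt(0).
 */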
static int count_semncnt(struct sem_array *sma, ushort semnum)
{
	int semncnt;
	struct sem_queue *q;

	semncnt = 0;
	list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) {
		struct sembuf *sops = q->sops;
		BUG_ON(sops->sem_num != semnum);
		if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT))
			semncnt++;
	}

	list_for_each_entry(q, &sma->pending_alter, list) {
		struct sembuf *sops = q->sops;
		int nsops = q->nsops;
		int i;
		for (i = 0; i < nsops; i++)
			if (sops[i].sem_num == semnum
			    && (sops[i].sem_op < 0)
			    && !(sops[i].sem_flg & IPC_NOWAIT))
				semncnt++;
	}
	return semncnt;
}

static int count_semzcnt(struct sem_array *sma, ushort semnum)
{
	int semzcnt;
	struct sem_queue *q;

	semzcnt = 0;
	list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) {
		struct sembuf *sops = q->sops;
		BUG_ON(sops->sem_num != semnum);
		if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT))
			semzcnt++;
	}

	list_for_each_entry(q, &sma->pending_const, list) {
		struct sembuf *sops = q->sops;
		int nsops = q->nsops;
		int i;
		for (i = 0; i < nsops; i++)
			if (sops[i].sem_num == semnum
			    && (sops[i].sem_op == 0)
			    && !(sops[i].sem_flg & IPC_NOWAIT))
				semzcnt++;
	}
	return semzcnt;
}

/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
 * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
 * remains locked on exit.
 */
static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{
	struct sem_undo *un, *tu;
	struct sem_queue *q, *tq;
	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
	struct list_head tasks;
	int i;

	/* Free the existing undo structures for this semaphore set.  */
	ipc_assert_locked_object(&sma->sem_perm);
	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
		list_del(&un->list_id);
		spin_lock(&un->ulp->lock);
		un->semid = -1;
		list_del_rcu(&un->list_proc);
		spin_unlock(&un->ulp->lock);
		kfree_rcu(un, rcu);
	}

	/* Wake up all pending processes and let them fail with EIDRM. */
	INIT_LIST_HEAD(&tasks);
	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
		unlink_queue(sma, q);
		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
	}

	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
		unlink_queue(sma, q);
		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
	}
	for (i = 0; i < sma->sem_nsems; i++) {
		struct sem *sem = sma->sem_base + i;
		list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
			unlink_queue(sma, q);
			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
		}
		list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
			unlink_queue(sma, q);
			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
		}
	}

	/* Remove the semaphore set from the IDR */
	sem_rmid(ns, sma);
	sem_unlock(sma, -1);
	rcu_read_unlock();

	wake_up_sem_queue_do(&tasks);
	ns->used_sems -= sma->sem_nsems;
	security_sem_free(sma);
	ipc_rcu_putref(sma);
}

static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
{
	switch (version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct semid_ds out;

		memset(&out, 0, sizeof(out));

		ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);

		out.sem_otime	= in->sem_otime;
		out.sem_ctime	= in->sem_ctime;
		out.sem_nsems	= in->sem_nsems;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}

static time_t get_semotime(struct sem_array *sma)
{
	int i;
	time_t res;

	res = sma->sem_base[0].sem_otime;
	for (i = 1; i < sma->sem_nsems; i++) {
		time_t to = sma->sem_base[i].sem_otime;

		if (to > res)
			res = to;
	}
	return res;
}

static int semctl_nolock(struct ipc_namespace *ns, int semid,
			 int cmd, int version, void __user *p)
{
	int err;
	struct sem_array *sma;

	switch (cmd) {
	case IPC_INFO:
	case SEM_INFO:
	{
		struct seminfo seminfo;
		int max_id;

		err = security_sem_semctl(NULL, cmd);
		if (err)
			return err;

		memset(&seminfo, 0, sizeof(seminfo));
		seminfo.semmni = ns->sc_semmni;
		seminfo.semmns = ns->sc_semmns;
		seminfo.semmsl = ns->sc_semmsl;
		seminfo.semopm = ns->sc_semopm;
		seminfo.semvmx = SEMVMX;
		seminfo.semmnu = SEMMNU;
		seminfo.semmap = SEMMAP;
		seminfo.semume = SEMUME;
		down_read(&sem_ids(ns).rwsem);
		if (cmd == SEM_INFO) {
			seminfo.semusz = sem_ids(ns).in_use;
			seminfo.semaem = ns->used_sems;
		} else {
			seminfo.semusz = SEMUSZ;
			seminfo.semaem = SEMAEM;
		}
		max_id = ipc_get_maxid(&sem_ids(ns));
		up_read(&sem_ids(ns).rwsem);
		if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
			return -EFAULT;
		return (max_id < 0) ? 0 : max_id;
	}
	case IPC_STAT:
	case SEM_STAT:
	{
		struct semid64_ds tbuf;
		int id = 0;

		memset(&tbuf, 0, sizeof(tbuf));

		rcu_read_lock();
		if (cmd == SEM_STAT) {
			sma = sem_obtain_object(ns, semid);
			if (IS_ERR(sma)) {
				err = PTR_ERR(sma);
				goto out_unlock;
			}
			id = sma->sem_perm.id;
		} else {
			sma = sem_obtain_object_check(ns, semid);
			if (IS_ERR(sma)) {
				err = PTR_ERR(sma);
				goto out_unlock;
			}
		}

		err = -EACCES;
		if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
			goto out_unlock;

		err = security_sem_semctl(sma, cmd);
		if (err)
			goto out_unlock;

		kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
		tbuf.sem_otime = get_semotime(sma);
		tbuf.sem_ctime = sma->sem_ctime;
		tbuf.sem_nsems = sma->sem_nsems;
		rcu_read_unlock();
		if (copy_semid_to_user(p, &tbuf, version))
			return -EFAULT;
		return id;
	}
	default:
		return -EINVAL;
	}
out_unlock:
	rcu_read_unlock();
	return err;
}

static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
		unsigned long arg)
{
	struct sem_undo *un;
	struct sem_array *sma;
	struct sem *curr;
	int err;
	struct list_head tasks;
	int val;
#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
	/* big-endian 64bit */
	val = arg >> 32;
#else
	/* 32bit or little-endian 64bit */
	val = arg;
#endif

	if (val > SEMVMX || val < 0)
		return -ERANGE;

	INIT_LIST_HEAD(&tasks);

	rcu_read_lock();
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return PTR_ERR(sma);
	}

	if (semnum < 0 || semnum >= sma->sem_nsems) {
		rcu_read_unlock();
		return -EINVAL;
	}

	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
		rcu_read_unlock();
		return -EACCES;
	}

	err = security_sem_semctl(sma, SETVAL);
	if (err) {
		rcu_read_unlock();
		return -EACCES;
	}

	sem_lock(sma, NULL, -1);

	curr = &sma->sem_base[semnum];

	ipc_assert_locked_object(&sma->sem_perm);
	list_for_each_entry(un, &sma->list_id, list_id)
		un->semadj[semnum] = 0;

	curr->semval = val;
	curr->sempid = task_tgid_vnr(current);
	sma->sem_ctime = get_seconds();
	/* maybe some queued-up processes were waiting for this */
	do_smart_update(sma, NULL, 0, 0, &tasks);
	sem_unlock(sma, -1);
	rcu_read_unlock();
	wake_up_sem_queue_do(&tasks);
	return 0;
}

static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
		int cmd, void __user *p)
{
	struct sem_array *sma;
	struct sem *curr;
	int err, nsems;
	ushort fast_sem_io[SEMMSL_FAST];
	ushort *sem_io = fast_sem_io;
	struct list_head tasks;

	INIT_LIST_HEAD(&tasks);

	rcu_read_lock();
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return PTR_ERR(sma);
	}

	nsems = sma->sem_nsems;

	err = -EACCES;
	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
		goto out_rcu_wakeup;

	err = security_sem_semctl(sma, cmd);
	if (err)
		goto out_rcu_wakeup;

	err = -EACCES;
	switch (cmd) {
	case GETALL:
	{
		ushort __user *array = p;
		int i;

		sem_lock(sma, NULL, -1);
		if (nsems > SEMMSL_FAST) {
			if (!ipc_rcu_getref(sma)) {
				sem_unlock(sma, -1);
				rcu_read_unlock();
				err = -EIDRM;
				goto out_free;
			}
			sem_unlock(sma, -1);
			rcu_read_unlock();
			sem_io = ipc_alloc(sizeof(ushort)*nsems);
			if (sem_io == NULL) {
				sem_putref(sma);
				return -ENOMEM;
			}

			rcu_read_lock();
			sem_lock_and_putref(sma);
			if (sma->sem_perm.deleted) {
				sem_unlock(sma, -1);
				rcu_read_unlock();
				err = -EIDRM;
				goto out_free;
			}
		}
		for (i = 0; i < sma->sem_nsems; i++)
			sem_io[i] = sma->sem_base[i].semval;
		sem_unlock(sma, -1);
		rcu_read_unlock();
		err = 0;
		if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
			err = -EFAULT;
		goto out_free;
	}
	case SETALL:
	{
		int i;
		struct sem_undo *un;

		if (!ipc_rcu_getref(sma)) {
			rcu_read_unlock();
			return -EIDRM;
		}
		rcu_read_unlock();

		if (nsems > SEMMSL_FAST) {
			sem_io = ipc_alloc(sizeof(ushort)*nsems);
			if (sem_io == NULL) {
				sem_putref(sma);
				return -ENOMEM;
			}
		}

		if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
			sem_putref(sma);
			err = -EFAULT;
			goto out_free;
		}

		for (i = 0; i < nsems; i++) {
			if (sem_io[i] > SEMVMX) {
				sem_putref(sma);
				err = -ERANGE;
				goto out_free;
			}
		}
		rcu_read_lock();
		sem_lock_and_putref(sma);
		if (sma->sem_perm.deleted) {
			sem_unlock(sma, -1);
			rcu_read_unlock();
			err = -EIDRM;
			goto out_free;
		}

		for (i = 0; i < nsems; i++)
			sma->sem_base[i].semval = sem_io[i];

		ipc_assert_locked_object(&sma->sem_perm);
		list_for_each_entry(un, &sma->list_id, list_id) {
			for (i = 0; i < nsems; i++)
				un->semadj[i] = 0;
		}
		sma->sem_ctime = get_seconds();
		/* maybe some queued-up processes were waiting for this */
		do_smart_update(sma, NULL, 0, 0, &tasks);
		err = 0;
		goto out_unlock;
	}
	/* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
	}
	err = -EINVAL;
	if (semnum < 0 || semnum >= nsems)
		goto out_rcu_wakeup;

	sem_lock(sma, NULL, -1);
	curr = &sma->sem_base[semnum];

	switch (cmd) {
	case GETVAL:
		err = curr->semval;
		goto out_unlock;
	case GETPID:
		err = curr->sempid;
		goto out_unlock;
	case GETNCNT:
		err = count_semncnt(sma, semnum);
		goto out_unlock;
	case GETZCNT:
		err = count_semzcnt(sma, semnum);
		goto out_unlock;
	}

out_unlock:
	sem_unlock(sma, -1);
out_rcu_wakeup:
	rcu_read_unlock();
	wake_up_sem_queue_do(&tasks);
out_free:
	if (sem_io != fast_sem_io)
		ipc_free(sem_io, sizeof(ushort)*nsems);
	return err;
}

static inline unsigned long
copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
{
	switch (version) {
	case IPC_64:
		if (copy_from_user(out, buf, sizeof(*out)))
			return -EFAULT;
		return 0;
	case IPC_OLD:
	    {
		struct semid_ds tbuf_old;

		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
			return -EFAULT;

		out->sem_perm.uid	= tbuf_old.sem_perm.uid;
		out->sem_perm.gid	= tbuf_old.sem_perm.gid;
		out->sem_perm.mode	= tbuf_old.sem_perm.mode;

		return 0;
	    }
	default:
		return -EINVAL;
	}
}

/*
 * This function handles some semctl commands which require the rwsem
 * to be held in write mode.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 */
static int semctl_down(struct ipc_namespace *ns, int semid,
		       int cmd, int version, void __user *p)
{
	struct sem_array *sma;
	int err;
	struct semid64_ds semid64;
	struct kern_ipc_perm *ipcp;

	if (cmd == IPC_SET) {
		if (copy_semid_from_user(&semid64, p, version))
			return -EFAULT;
	}

	down_write(&sem_ids(ns).rwsem);
	rcu_read_lock();

	ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
				      &semid64.sem_perm, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;
	}

	sma = container_of(ipcp, struct sem_array, sem_perm);

	err = security_sem_semctl(sma, cmd);
	if (err)
		goto out_unlock1;

	switch (cmd) {
	case IPC_RMID:
		sem_lock(sma, NULL, -1);
		/* freeary unlocks the ipc object and rcu */
		freeary(ns, ipcp);
		goto out_up;
	case IPC_SET:
		sem_lock(sma, NULL, -1);
		err = ipc_update_perm(&semid64.sem_perm, ipcp);
		if (err)
			goto out_unlock0;
		sma->sem_ctime = get_seconds();
		break;
	default:
		err = -EINVAL;
		goto out_unlock1;
	}

out_unlock0:
	sem_unlock(sma, -1);
out_unlock1:
	rcu_read_unlock();
out_up:
	up_write(&sem_ids(ns).rwsem);
	return err;
}

SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
{
	int version;
	struct ipc_namespace *ns;
	void __user *p = (void __user *)arg;

	if (semid < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);
	ns = current->nsproxy->ipc_ns;

	switch (cmd) {
	case IPC_INFO:
	case SEM_INFO:
	case IPC_STAT:
	case SEM_STAT:
		return semctl_nolock(ns, semid, cmd, version, p);
	case GETALL:
	case GETVAL:
	case GETPID:
	case GETNCNT:
	case GETZCNT:
	case SETALL:
		return semctl_main(ns, semid, semnum, cmd, p);
	case SETVAL:
		return semctl_setval(ns, semid, semnum, arg);
	case IPC_RMID:
	case IPC_SET:
		return semctl_down(ns, semid, cmd, version, p);
	default:
		return -EINVAL;
	}
}

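/*
 * Illustrative userspace view of the routing above (a sketch; union semun
 * must be defined by the caller, per semctl(2)):
 *
 *	union semun { int val; } arg;
 *
 *	arg.val = 1;
 *	semctl(id, 0, SETVAL, arg);	routed to semctl_setval()
 *	semctl(id, 0, GETVAL);		routed to semctl_main()
 *	semctl(id, 0, IPC_RMID);	routed to semctl_down()
 */
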
/* If the task doesn't already have an undo_list, then allocate one
 * here.  We guarantee there is only one thread using this undo list,
 * and current is THE ONE
 *
 * If this allocation and assignment succeeds, but later
 * portions of this code fail, there is no need to free the sem_undo_list.
 * Just let it stay associated with the task, and it'll be freed later
 * at exit time.
 *
 * This can block, so callers must hold no locks.
 */
static inline int get_undo_list(struct sem_undo_list **undo_listp)
{
	struct sem_undo_list *undo_list;

	undo_list = current->sysvsem.undo_list;
	if (!undo_list) {
		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
		if (undo_list == NULL)
			return -ENOMEM;
		spin_lock_init(&undo_list->lock);
		atomic_set(&undo_list->refcnt, 1);
		INIT_LIST_HEAD(&undo_list->list_proc);

		current->sysvsem.undo_list = undo_list;
	}
	*undo_listp = undo_list;
	return 0;
}

static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
{
	struct sem_undo *un;

	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
		if (un->semid == semid)
			return un;
	}
	return NULL;
}

static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
{
	struct sem_undo *un;

	assert_spin_locked(&ulp->lock);

	un = __lookup_undo(ulp, semid);
	if (un) {
		/* move the found entry to the front of the list */
		list_del_rcu(&un->list_proc);
		list_add_rcu(&un->list_proc, &ulp->list_proc);
	}
	return un;
}

/**
 * find_alloc_undo - Lookup (and if not present create) undo array
 * @ns: namespace
 * @semid: semaphore array id
 *
 * The function looks up (and if not present creates) the undo structure.
 * The size of the undo structure depends on the size of the semaphore
 * array, thus the alloc path is not that straightforward.
 * Lifetime-rules: sem_undo is rcu-protected, on success, the function
 * performs a rcu_read_lock().
 */
static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
{
	struct sem_array *sma;
	struct sem_undo_list *ulp;
	struct sem_undo *un, *new;
	int nsems, error;

	error = get_undo_list(&ulp);
	if (error)
		return ERR_PTR(error);

	rcu_read_lock();
	spin_lock(&ulp->lock);
	un = lookup_undo(ulp, semid);
	spin_unlock(&ulp->lock);
	if (likely(un != NULL))
		goto out;

	/* no undo structure around - allocate one. */
	/* step 1: figure out the size of the semaphore array */
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return ERR_CAST(sma);
	}

	nsems = sma->sem_nsems;
	if (!ipc_rcu_getref(sma)) {
		rcu_read_unlock();
		un = ERR_PTR(-EIDRM);
		goto out;
	}
	rcu_read_unlock();

	/* step 2: allocate new undo structure */
	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
	if (!new) {
		sem_putref(sma);
		return ERR_PTR(-ENOMEM);
	}

	/* step 3: Acquire the lock on semaphore array */
	rcu_read_lock();
	sem_lock_and_putref(sma);
	if (sma->sem_perm.deleted) {
		sem_unlock(sma, -1);
		rcu_read_unlock();
		kfree(new);
		un = ERR_PTR(-EIDRM);
		goto out;
	}
	spin_lock(&ulp->lock);

	/*
	 * step 4: check for races: did someone else allocate the undo struct?
	 */
	un = lookup_undo(ulp, semid);
	if (un) {
		kfree(new);
		goto success;
	}
	/* step 5: initialize & link new undo structure */
	new->semadj = (short *) &new[1];
	new->ulp = ulp;
	new->semid = semid;
	assert_spin_locked(&ulp->lock);
	list_add_rcu(&new->list_proc, &ulp->list_proc);
	ipc_assert_locked_object(&sma->sem_perm);
	list_add(&new->list_id, &sma->list_id);
	un = new;

success:
	spin_unlock(&ulp->lock);
	sem_unlock(sma, -1);
out:
	return un;
}


/**
 * get_queue_result - Retrieve the result code from sem_queue
 * @q: Pointer to queue structure
 *
 * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
 * q->status, then we must loop until the value is replaced with the final
 * value: This may happen if a task is woken up by an unrelated event (e.g.
 * signal) and in parallel the task is woken up by another task because it got
 * the requested semaphores.
 *
 * The function can be called with or without holding the semaphore spinlock.
 */
static int get_queue_result(struct sem_queue *q)
{
	int error;

	error = q->status;
	while (unlikely(error == IN_WAKEUP)) {
		cpu_relax();
		error = q->status;
	}

	return error;
}

SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
		unsigned, nsops, const struct timespec __user *, timeout)
{
	int error = -EINVAL;
	struct sem_array *sma;
	struct sembuf fast_sops[SEMOPM_FAST];
	struct sembuf *sops = fast_sops, *sop;
	struct sem_undo *un;
	int undos = 0, alter = 0, max, locknum;
	struct sem_queue queue;
	unsigned long jiffies_left = 0;
	struct ipc_namespace *ns;
	struct list_head tasks;

	ns = current->nsproxy->ipc_ns;

	if (nsops < 1 || semid < 0)
		return -EINVAL;
	if (nsops > ns->sc_semopm)
		return -E2BIG;
	if (nsops > SEMOPM_FAST) {
		sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL);
		if (sops == NULL)
			return -ENOMEM;
	}
	if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
		error = -EFAULT;
		goto out_free;
	}
	if (timeout) {
		struct timespec _timeout;
		if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
			error = -EFAULT;
			goto out_free;
		}
		if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 ||
			_timeout.tv_nsec >= 1000000000L) {
			error = -EINVAL;
			goto out_free;
		}
		jiffies_left = timespec_to_jiffies(&_timeout);
	}
	max = 0;
	for (sop = sops; sop < sops + nsops; sop++) {
		if (sop->sem_num >= max)
			max = sop->sem_num;
		if (sop->sem_flg & SEM_UNDO)
			undos = 1;
		if (sop->sem_op != 0)
			alter = 1;
	}

	INIT_LIST_HEAD(&tasks);

	if (undos) {
		/* On success, find_alloc_undo takes the rcu_read_lock */
		un = find_alloc_undo(ns, semid);
		if (IS_ERR(un)) {
			error = PTR_ERR(un);
			goto out_free;
		}
	} else {
		un = NULL;
		rcu_read_lock();
	}

	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		error = PTR_ERR(sma);
		goto out_free;
	}

	error = -EFBIG;
	if (max >= sma->sem_nsems)
		goto out_rcu_wakeup;

	error = -EACCES;
	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO))
		goto out_rcu_wakeup;

	error = security_sem_semop(sma, sops, nsops, alter);
	if (error)
		goto out_rcu_wakeup;

	/*
	 * semid identifiers are not unique - find_alloc_undo may have
	 * allocated an undo structure, it was invalidated by an RMID
	 * and now a new array has received the same id. Check and fail.
	 * This case can be detected by checking un->semid. The existence of
	 * "un" itself is guaranteed by rcu.
	 */
	error = -EIDRM;
	locknum = sem_lock(sma, sops, nsops);
	if (un && un->semid == -1)
		goto out_unlock_free;

	error = perform_atomic_semop(sma, sops, nsops, un,
					task_tgid_vnr(current));
	if (error <= 0) {
		if (alter && error == 0)
			do_smart_update(sma, sops, nsops, 1, &tasks);

		goto out_unlock_free;
	}

	/* We need to sleep on this operation, so we put the current
	 * task into the pending queue and go to sleep.
	 */

	queue.sops = sops;
	queue.nsops = nsops;
	queue.undo = un;
	queue.pid = task_tgid_vnr(current);
	queue.alter = alter;

	if (nsops == 1) {
		struct sem *curr;
		curr = &sma->sem_base[sops->sem_num];

		if (alter) {
			if (sma->complex_count) {
				list_add_tail(&queue.list,
						&sma->pending_alter);
			} else {
				list_add_tail(&queue.list,
						&curr->pending_alter);
			}
		} else {
			list_add_tail(&queue.list, &curr->pending_const);
		}
	} else {
		if (!sma->complex_count)
			merge_queues(sma);

		if (alter)
			list_add_tail(&queue.list, &sma->pending_alter);
		else
			list_add_tail(&queue.list, &sma->pending_const);

		sma->complex_count++;
	}

	queue.status = -EINTR;
	queue.sleeper = current;

sleep_again:
	current->state = TASK_INTERRUPTIBLE;
	sem_unlock(sma, locknum);
	rcu_read_unlock();

	if (timeout)
		jiffies_left = schedule_timeout(jiffies_left);
	else
		schedule();

	error = get_queue_result(&queue);

	if (error != -EINTR) {
		/* fast path: update_queue already obtained all requested
		 * resources.
		 * Perform a smp_mb(): User space could assume that semop()
		 * is a memory barrier: Without the mb(), the cpu could
		 * speculatively read in user space stale data that was
		 * overwritten by the previous owner of the semaphore.
		 */
		smp_mb();

		goto out_free;
	}

	rcu_read_lock();
	sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum);

	/*
	 * Wait until it's guaranteed that no wake_up_sem_queue_do() is ongoing.
	 */
	error = get_queue_result(&queue);

	/*
	 * Array removed? If yes, leave without sem_unlock().
	 */
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		goto out_free;
	}

	/*
	 * If queue.status != -EINTR we are woken up by another process.
	 * Leave without unlink_queue(), but with sem_unlock().
	 */
	if (error != -EINTR)
		goto out_unlock_free;

	/*
	 * If an interrupt occurred we have to clean up the queue.
	 */
	if (timeout && jiffies_left == 0)
		error = -EAGAIN;

	/*
	 * If the wakeup was spurious, just retry.
	 */
	if (error == -EINTR && !signal_pending(current))
		goto sleep_again;

	unlink_queue(sma, &queue);

out_unlock_free:
	sem_unlock(sma, locknum);
out_rcu_wakeup:
	rcu_read_unlock();
	wake_up_sem_queue_do(&tasks);
out_free:
	if (sops != fast_sops)
		kfree(sops);
	return error;
}

SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
		unsigned, nsops)
{
	return sys_semtimedop(semid, tsops, nsops, NULL);
}

/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
 * parent and child tasks.
 */
int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
{
	struct sem_undo_list *undo_list;
	int error;

	if (clone_flags & CLONE_SYSVSEM) {
		error = get_undo_list(&undo_list);
		if (error)
			return error;
		atomic_inc(&undo_list->refcnt);
		tsk->sysvsem.undo_list = undo_list;
	} else
		tsk->sysvsem.undo_list = NULL;

	return 0;
}

/*
 * add semadj values to semaphores, free undo structures.
 * undo structures are not freed when semaphore arrays are destroyed
 * so some of them may be out of date.
 * IMPLEMENTATION NOTE: There is some confusion over whether the
 * set of adjustments that needs to be done should be done in an atomic
 * manner or not. That is, if we are attempting to decrement the semval
 * should we queue up and wait until we can do so legally?
 * The original implementation attempted to do this (queue and wait).
 * The current implementation does not do so. The POSIX standard
 * and SVID should be consulted to determine what behavior is mandated.
 */
void exit_sem(struct task_struct *tsk)
{
	struct sem_undo_list *ulp;

	ulp = tsk->sysvsem.undo_list;
	if (!ulp)
		return;
	tsk->sysvsem.undo_list = NULL;

	if (!atomic_dec_and_test(&ulp->refcnt))
		return;

	for (;;) {
		struct sem_array *sma;
		struct sem_undo *un;
		struct list_head tasks;
		int semid, i;

		rcu_read_lock();
		un = list_entry_rcu(ulp->list_proc.next,
				    struct sem_undo, list_proc);
		if (&un->list_proc == &ulp->list_proc)
			semid = -1;
		else
			semid = un->semid;

		if (semid == -1) {
			rcu_read_unlock();
			break;
		}

		sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, un->semid);
		/* exit_sem raced with IPC_RMID, nothing to do */
		if (IS_ERR(sma)) {
			rcu_read_unlock();
			continue;
		}

		sem_lock(sma, NULL, -1);
		un = __lookup_undo(ulp, semid);
		if (un == NULL) {
			/* exit_sem raced with IPC_RMID+semget() that created
			 * exactly the same semid. Nothing to do.
			 */
			sem_unlock(sma, -1);
			rcu_read_unlock();
			continue;
		}

		/* remove un from the linked lists */
		ipc_assert_locked_object(&sma->sem_perm);
		list_del(&un->list_id);

		spin_lock(&ulp->lock);
		list_del_rcu(&un->list_proc);
		spin_unlock(&ulp->lock);

		/* perform adjustments registered in un */
		for (i = 0; i < sma->sem_nsems; i++) {
			struct sem *semaphore = &sma->sem_base[i];
			if (un->semadj[i]) {
				semaphore->semval += un->semadj[i];
				/*
				 * Range checks of the new semaphore value,
				 * not defined by SUS:
				 * - Some unices ignore the undo entirely
				 *   (e.g. HP UX 11i 11.22, Tru64 V5.1)
				 * - some cap the value (e.g. FreeBSD caps
				 *   at 0, but doesn't enforce SEMVMX)
				 *
				 * Linux caps the semaphore value, both at 0
				 * and at SEMVMX.
				 *
				 *	Manfred <manfred@colorfullife.com>
				 */
				if (semaphore->semval < 0)
					semaphore->semval = 0;
				if (semaphore->semval > SEMVMX)
					semaphore->semval = SEMVMX;
				semaphore->sempid = task_tgid_vnr(current);
			}
		}
		/* maybe some queued-up processes were waiting for this */
		INIT_LIST_HEAD(&tasks);
		do_smart_update(sma, NULL, 0, 1, &tasks);
		sem_unlock(sma, -1);
		rcu_read_unlock();
		wake_up_sem_queue_do(&tasks);

		kfree_rcu(un, rcu);
	}
	kfree(ulp);
}

#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
{
	struct user_namespace *user_ns = seq_user_ns(s);
	struct sem_array *sma = it;
	time_t sem_otime;

	sem_otime = get_semotime(sma);

	return seq_printf(s,
			  "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
			  sma->sem_perm.key,
			  sma->sem_perm.id,
			  sma->sem_perm.mode,
			  sma->sem_nsems,
			  from_kuid_munged(user_ns, sma->sem_perm.uid),
			  from_kgid_munged(user_ns, sma->sem_perm.gid),
			  from_kuid_munged(user_ns, sma->sem_perm.cuid),
			  from_kgid_munged(user_ns, sma->sem_perm.cgid),
			  sem_otime,
			  sma->sem_ctime);
}
#endif
