sem.c revision d8c63376
1/*
2 * linux/ipc/sem.c
3 * Copyright (C) 1992 Krishna Balasubramanian
4 * Copyright (C) 1995 Eric Schenk, Bruno Haible
5 *
6 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
7 *
8 * SMP-threaded, sysctl's added
9 * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
10 * Enforced range limit on SEM_UNDO
11 * (c) 2001 Red Hat Inc
12 * Lockless wakeup
13 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
14 * Further wakeup optimizations, documentation
15 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
16 *
17 * support for audit of ipc object properties and permission changes
18 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
19 *
20 * namespaces support
21 * OpenVZ, SWsoft Inc.
22 * Pavel Emelianov <xemul@openvz.org>
23 *
24 * Implementation notes: (May 2010)
25 * This file implements System V semaphores.
26 *
27 * User space visible behavior:
28 * - FIFO ordering for semop() operations (just FIFO, not starvation
29 *   protection)
30 * - multiple semaphore operations that alter the same semaphore in
31 *   one semop() are handled.
32 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
33 *   SETALL calls.
34 * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
35 * - undo adjustments at process exit are limited to 0..SEMVMX.
36 * - namespaces are supported.
37 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
38 *   to /proc/sys/kernel/sem.
39 * - statistics about the usage are reported in /proc/sysvipc/sem.
40 *
41 * Internals:
42 * - scalability:
43 *   - all global variables are read-mostly.
44 *   - semop() calls and semctl(RMID) are synchronized by RCU.
45 *   - most operations do write operations (actually: spin_lock calls) to
46 *     the per-semaphore array structure.
47 *   Thus: Perfect SMP scaling between independent semaphore arrays.
48 *         If multiple semaphores in one array are used, then cache line
49 *         thrashing on the semaphore array spinlock will limit the scaling.
50 * - semncnt and semzcnt are calculated on demand in count_semncnt() and
51 *   count_semzcnt()
52 * - the task that performs a successful semop() scans the list of all
53 *   sleeping tasks and completes any pending operations that can be fulfilled.
54 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
55 *   (see update_queue())
56 * - To improve the scalability, the actual wake-up calls are performed after
57 *   dropping all locks. (see wake_up_sem_queue_prepare(),
58 *   wake_up_sem_queue_do())
59 * - All work is done by the waker, the woken up task does not have to do
60 *   anything - not even acquiring a lock or dropping a refcount.
61 * - A woken up task may not even touch the semaphore array anymore, it may
62 *   have been destroyed already by a semctl(RMID).
63 * - The synchronizations between wake-ups due to a timeout/signal and a
64 *   wake-up due to a completed semaphore operation is achieved by using an
65 *   intermediate state (IN_WAKEUP).
66 * - UNDO values are stored in an array (one per process and per
67 *   semaphore array, lazily allocated). For backwards compatibility, multiple
68 *   modes for the UNDO variables are supported (per process, per thread)
69 *   (see copy_semundo, CLONE_SYSVSEM)
70 * - There are two lists of the pending operations: a per-array list
71 *   and per-semaphore list (stored in the array). This makes it possible to
72 *   achieve FIFO ordering without always scanning all pending operations.
73 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
74 */
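
/*
 * Illustrative user-space sketch of the behavior documented above. It is not
 * part of this file; error handling is omitted, the example() wrapper is only
 * for illustration and, as semctl(2) requires, the caller defines union semun
 * itself.
 *
 *	#include <sys/types.h>
 *	#include <sys/ipc.h>
 *	#include <sys/sem.h>
 *
 *	union semun { int val; struct semid_ds *buf; unsigned short *array; };
 *
 *	int example(void)
 *	{
 *		union semun arg = { .val = 1 };
 *		struct sembuf op = { .sem_num = 0, .sem_op = -1,
 *				     .sem_flg = SEM_UNDO };
 *		int id;
 *
 *		id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
 *		semctl(id, 0, SETVAL, arg);	// updates sem_ctime
 *		semop(id, &op, 1);		// "P": may sleep, FIFO wakeup,
 *						// undone at exit via SEM_UNDO
 *		op.sem_op = 1;
 *		semop(id, &op, 1);		// "V": oldest eligible waiter runs
 *		return semctl(id, 0, IPC_RMID);	// remaining waiters get EIDRM
 *	}
 */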
75
76#include <linux/slab.h>
77#include <linux/spinlock.h>
78#include <linux/init.h>
79#include <linux/proc_fs.h>
80#include <linux/time.h>
81#include <linux/security.h>
82#include <linux/syscalls.h>
83#include <linux/audit.h>
84#include <linux/capability.h>
85#include <linux/seq_file.h>
86#include <linux/rwsem.h>
87#include <linux/nsproxy.h>
88#include <linux/ipc_namespace.h>
89
90#include <asm/uaccess.h>
91#include "util.h"
92
93/* One semaphore structure for each semaphore in the system. */
94struct sem {
95	int	semval;		/* current value */
96	int	sempid;		/* pid of last operation */
97	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
98	struct list_head pending_alter; /* pending single-sop operations */
99					/* that alter the semaphore */
100	struct list_head pending_const; /* pending single-sop operations */
101					/* that do not alter the semaphore*/
102	time_t	sem_otime;	/* candidate for sem_otime */
103} ____cacheline_aligned_in_smp;
104
105/* One queue for each sleeping process in the system. */
106struct sem_queue {
107	struct list_head	list;	 /* queue of pending operations */
108	struct task_struct	*sleeper; /* this process */
109	struct sem_undo		*undo;	 /* undo structure */
110	int			pid;	 /* process id of requesting process */
111	int			status;	 /* completion status of operation */
112	struct sembuf		*sops;	 /* array of pending operations */
113	int			nsops;	 /* number of operations */
114	int			alter;	 /* does *sops alter the array? */
115};
116
117/* Each task has a list of undo requests. They are executed automatically
118 * when the process exits.
119 */
120struct sem_undo {
121	struct list_head	list_proc;	/* per-process list: *
122						 * all undos from one process
123						 * rcu protected */
124	struct rcu_head		rcu;		/* rcu struct for sem_undo */
125	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
126	struct list_head	list_id;	/* per semaphore array list:
127						 * all undos for one array */
128	int			semid;		/* semaphore set identifier */
129	short			*semadj;	/* array of adjustments */
130						/* one per semaphore */
131};
132
133/* sem_undo_list controls shared access to the list of sem_undo structures
134 * that may be shared among the tasks of a CLONE_SYSVSEM task group.
135 */
136struct sem_undo_list {
137	atomic_t		refcnt;
138	spinlock_t		lock;
139	struct list_head	list_proc;
140};
141
142
143#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])
144
145#define sem_checkid(sma, semid)	ipc_checkid(&sma->sem_perm, semid)
146
147static int newary(struct ipc_namespace *, struct ipc_params *);
148static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
149#ifdef CONFIG_PROC_FS
150static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
151#endif
152
153#define SEMMSL_FAST	256 /* 512 bytes on stack */
154#define SEMOPM_FAST	64  /* ~ 372 bytes on stack */
155
156/*
157 * Locking:
158 *	sem_undo.list_id,
159 *	sem_array.complex_count,
160 *	sem_array.pending{_alter,_const},
161 *	sem_array.list_id: global sem_lock() for read/write
162 *	sem_undo.list_proc: only "current" is allowed to read/write that field.
163 *
164 *	sem_array.sem_base[i].pending_{const,alter}:
165 *		global or semaphore sem_lock() for read/write
166 */
167
168#define sc_semmsl	sem_ctls[0]
169#define sc_semmns	sem_ctls[1]
170#define sc_semopm	sem_ctls[2]
171#define sc_semmni	sem_ctls[3]
172
173void sem_init_ns(struct ipc_namespace *ns)
174{
175	ns->sc_semmsl = SEMMSL;
176	ns->sc_semmns = SEMMNS;
177	ns->sc_semopm = SEMOPM;
178	ns->sc_semmni = SEMMNI;
179	ns->used_sems = 0;
180	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
181}
182
183#ifdef CONFIG_IPC_NS
184void sem_exit_ns(struct ipc_namespace *ns)
185{
186	free_ipcs(ns, &sem_ids(ns), freeary);
187	idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
188}
189#endif
190
191void __init sem_init (void)
192{
193	sem_init_ns(&init_ipc_ns);
194	ipc_init_proc_interface("sysvipc/sem",
195				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
196				IPC_SEM_IDS, sysvipc_sem_proc_show);
197}
198
199/**
200 * unmerge_queues - unmerge queues, if possible.
201 * @sma: semaphore array
202 *
203 * The function unmerges the wait queues if complex_count is 0.
204 * It must be called prior to dropping the global semaphore array lock.
205 */
206static void unmerge_queues(struct sem_array *sma)
207{
208	struct sem_queue *q, *tq;
209
210	/* complex operations still around? */
211	if (sma->complex_count)
212		return;
213	/*
214	 * We will switch back to simple mode.
215	 * Move all pending operation back into the per-semaphore
216	 * queues.
217	 */
218	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
219		struct sem *curr;
220		curr = &sma->sem_base[q->sops[0].sem_num];
221
222		list_add_tail(&q->list, &curr->pending_alter);
223	}
224	INIT_LIST_HEAD(&sma->pending_alter);
225}
226
227/**
228 * merge_queues - Merge single semop queues into global queue
229 * @sma: semaphore array
230 *
231 * This function merges all per-semaphore queues into the global queue.
232 * It is necessary to achieve FIFO ordering for the pending single-sop
233 * operations when a multi-semop operation must sleep.
234 * Only the alter operations must be moved, the const operations can stay.
235 */
236static void merge_queues(struct sem_array *sma)
237{
238	int i;
239	for (i = 0; i < sma->sem_nsems; i++) {
240		struct sem *sem = sma->sem_base + i;
241
242		list_splice_init(&sem->pending_alter, &sma->pending_alter);
243	}
244}
245
246static void sem_rcu_free(struct rcu_head *head)
247{
248	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
249	struct sem_array *sma = ipc_rcu_to_struct(p);
250
251	security_sem_free(sma);
252	ipc_rcu_free(head);
253}
254
255/*
256 * Wait until all currently ongoing simple ops have completed.
257 * Caller must own sem_perm.lock.
258 * New simple ops cannot start, because simple ops first check
259 * that a) sem_perm.lock is free and b) complex_count is 0.
261 */
262static void sem_wait_array(struct sem_array *sma)
263{
264	int i;
265	struct sem *sem;
266
267	if (sma->complex_count)  {
268		/* The thread that increased sma->complex_count waited on
269		 * all sem->lock locks. Thus we don't need to wait again.
270		 */
271		return;
272	}
273
274	for (i = 0; i < sma->sem_nsems; i++) {
275		sem = sma->sem_base + i;
276		spin_unlock_wait(&sem->lock);
277	}
278}
279
280/*
281 * If the request contains only one semaphore operation, and there are
282 * no complex transactions pending, lock only the semaphore involved.
283 * Otherwise, lock the entire semaphore array, since we either have
284 * multiple semaphores in our own semops, or we need to look at
285 * semaphores from other pending complex operations.
286 */
287static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
288			      int nsops)
289{
290	struct sem *sem;
291
292	if (nsops != 1) {
293		/* Complex operation - acquire a full lock */
294		ipc_lock_object(&sma->sem_perm);
295
296		/* And wait until all simple ops that are processed
297		 * right now have dropped their locks.
298		 */
299		sem_wait_array(sma);
300		return -1;
301	}
302
303	/*
304	 * Only one semaphore affected - try to optimize locking.
305	 * The rules are:
306	 * - optimized locking is possible if no complex operation
307	 *   is either enqueued or processed right now.
308	 * - The test for enqueued complex ops is simple:
309	 *      sma->complex_count != 0
310	 * - Testing for complex ops that are processed right now is
311	 *   a bit more difficult. Complex ops acquire the full lock
312 *   and first wait until the running simple ops have completed.
313	 *   (see above)
314	 *   Thus: If we own a simple lock and the global lock is free
315	 *	and complex_count is now 0, then it will stay 0 and
316	 *	thus just locking sem->lock is sufficient.
317	 */
318	sem = sma->sem_base + sops->sem_num;
319
320	if (sma->complex_count == 0) {
321		/*
322		 * It appears that no complex operation is around.
323		 * Acquire the per-semaphore lock.
324		 */
325		spin_lock(&sem->lock);
326
327		/* Then check that the global lock is free */
328		if (!spin_is_locked(&sma->sem_perm.lock)) {
329			/* spin_is_locked() is not a memory barrier */
330			smp_mb();
331
332			/* Now repeat the test of complex_count:
333			 * It can't change anymore until we drop sem->lock.
334			 * Thus: if it is now 0, then it will stay 0.
335			 */
336			if (sma->complex_count == 0) {
337				/* fast path successful! */
338				return sops->sem_num;
339			}
340		}
341		spin_unlock(&sem->lock);
342	}
343
344	/* slow path: acquire the full lock */
345	ipc_lock_object(&sma->sem_perm);
346
347	if (sma->complex_count == 0) {
348		/* False alarm:
349		 * There is no complex operation, thus we can switch
350		 * back to the fast path.
351		 */
352		spin_lock(&sem->lock);
353		ipc_unlock_object(&sma->sem_perm);
354		return sops->sem_num;
355	} else {
356		/* Not a false alarm, thus complete the sequence for a
357		 * full lock.
358		 */
359		sem_wait_array(sma);
360		return -1;
361	}
362}
363
364static inline void sem_unlock(struct sem_array *sma, int locknum)
365{
366	if (locknum == -1) {
367		unmerge_queues(sma);
368		ipc_unlock_object(&sma->sem_perm);
369	} else {
370		struct sem *sem = sma->sem_base + locknum;
371		spin_unlock(&sem->lock);
372	}
373}
374
375/*
376 * The sem_obtain_*() routines are called in the paths where the rwsem
377 * is not held.
378 *
379 * The caller holds the RCU read lock.
380 */
381static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
382			int id, struct sembuf *sops, int nsops, int *locknum)
383{
384	struct kern_ipc_perm *ipcp;
385	struct sem_array *sma;
386
387	ipcp = ipc_obtain_object(&sem_ids(ns), id);
388	if (IS_ERR(ipcp))
389		return ERR_CAST(ipcp);
390
391	sma = container_of(ipcp, struct sem_array, sem_perm);
392	*locknum = sem_lock(sma, sops, nsops);
393
394	/* ipc_rmid() may have already freed the ID while sem_lock
395	 * was spinning: verify that the structure is still valid
396	 */
397	if (!ipcp->deleted)
398		return container_of(ipcp, struct sem_array, sem_perm);
399
400	sem_unlock(sma, *locknum);
401	return ERR_PTR(-EINVAL);
402}
403
404static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
405{
406	struct kern_ipc_perm *ipcp = ipc_obtain_object(&sem_ids(ns), id);
407
408	if (IS_ERR(ipcp))
409		return ERR_CAST(ipcp);
410
411	return container_of(ipcp, struct sem_array, sem_perm);
412}
413
414static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
415							int id)
416{
417	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);
418
419	if (IS_ERR(ipcp))
420		return ERR_CAST(ipcp);
421
422	return container_of(ipcp, struct sem_array, sem_perm);
423}
424
425static inline void sem_lock_and_putref(struct sem_array *sma)
426{
427	sem_lock(sma, NULL, -1);
428	ipc_rcu_putref(sma, ipc_rcu_free);
429}
430
431static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
432{
433	ipc_rmid(&sem_ids(ns), &s->sem_perm);
434}
435
436/*
437 * Lockless wakeup algorithm:
438 * Without the check/retry algorithm a lockless wakeup is possible:
439 * - queue.status is initialized to -EINTR before blocking.
440 * - wakeup is performed by
441 *	* unlinking the queue entry from the pending list
442 *	* setting queue.status to IN_WAKEUP
443 *	  This is the notification for the blocked thread that a
444 *	  result value is imminent.
445 *	* call wake_up_process
446 *	* set queue.status to the final value.
447 * - the previously blocked thread checks queue.status:
448 *   	* if it's IN_WAKEUP, then it must wait until the value changes
449 *   	* if it's not -EINTR, then the operation was completed by
450 *   	  update_queue. semtimedop can return queue.status without
451 *   	  performing any operation on the sem array.
452 *   	* otherwise it must acquire the spinlock and check what's up.
453 *
454 * The two-stage algorithm is necessary to protect against the following
455 * races:
456 * - if queue.status is set after wake_up_process, then the woken up idle
457 *   thread could race forward and try (and fail) to acquire sma->lock
458 *   before update_queue had a chance to set queue.status
459 * - if queue.status is written before wake_up_process and if the
460 *   blocked process is woken up by a signal between writing
461 *   queue.status and the wake_up_process, then the woken up
462 *   process could return from semtimedop and die by calling
463 *   sys_exit before wake_up_process is called. Then wake_up_process
464 *   will oops, because the task structure is already invalid.
465 *   (yes, this happened on s390 with sysv msg).
466 *
467 */
468#define IN_WAKEUP	1
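
/*
 * Condensed sketch of the handshake described above; the real code lives in
 * wake_up_sem_queue_prepare(), wake_up_sem_queue_do() and get_queue_result()
 * further down in this file.
 *
 *	waker, while holding sem_lock():
 *		unlink_queue(sma, q);
 *		q->status = IN_WAKEUP;		// result value is imminent
 *		...all locks are dropped...
 *	waker, lockless:
 *		wake_up_process(q->sleeper);
 *		smp_wmb();
 *		q->status = error;		// final value; q may vanish now
 *
 *	sleeper, after returning from schedule():
 *		error = q->status;
 *		while (error == IN_WAKEUP) {	// spin until the final
 *			cpu_relax();		// value is visible
 *			error = q->status;
 *		}
 *
 * Only after the loop may the sleeper trust q->status; until then it must not
 * touch the semaphore array, which may already have been freed by semctl(RMID).
 */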
469
470/**
471 * newary - Create a new semaphore set
472 * @ns: namespace
473 * @params: ptr to the structure that contains key, semflg and nsems
474 *
475 * Called with sem_ids.rwsem held (as a writer)
476 */
477
478static int newary(struct ipc_namespace *ns, struct ipc_params *params)
479{
480	int id;
481	int retval;
482	struct sem_array *sma;
483	int size;
484	key_t key = params->key;
485	int nsems = params->u.nsems;
486	int semflg = params->flg;
487	int i;
488
489	if (!nsems)
490		return -EINVAL;
491	if (ns->used_sems + nsems > ns->sc_semmns)
492		return -ENOSPC;
493
494	size = sizeof (*sma) + nsems * sizeof (struct sem);
495	sma = ipc_rcu_alloc(size);
496	if (!sma) {
497		return -ENOMEM;
498	}
499	memset (sma, 0, size);
500
501	sma->sem_perm.mode = (semflg & S_IRWXUGO);
502	sma->sem_perm.key = key;
503
504	sma->sem_perm.security = NULL;
505	retval = security_sem_alloc(sma);
506	if (retval) {
507		ipc_rcu_putref(sma, ipc_rcu_free);
508		return retval;
509	}
510
511	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
512	if (id < 0) {
513		ipc_rcu_putref(sma, sem_rcu_free);
514		return id;
515	}
516	ns->used_sems += nsems;
517
518	sma->sem_base = (struct sem *) &sma[1];
519
520	for (i = 0; i < nsems; i++) {
521		INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
522		INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
523		spin_lock_init(&sma->sem_base[i].lock);
524	}
525
526	sma->complex_count = 0;
527	INIT_LIST_HEAD(&sma->pending_alter);
528	INIT_LIST_HEAD(&sma->pending_const);
529	INIT_LIST_HEAD(&sma->list_id);
530	sma->sem_nsems = nsems;
531	sma->sem_ctime = get_seconds();
532	sem_unlock(sma, -1);
533	rcu_read_unlock();
534
535	return sma->sem_perm.id;
536}
537
538
539/*
540 * Called with sem_ids.rwsem and ipcp locked.
541 */
542static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
543{
544	struct sem_array *sma;
545
546	sma = container_of(ipcp, struct sem_array, sem_perm);
547	return security_sem_associate(sma, semflg);
548}
549
550/*
551 * Called with sem_ids.rwsem and ipcp locked.
552 */
553static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
554				struct ipc_params *params)
555{
556	struct sem_array *sma;
557
558	sma = container_of(ipcp, struct sem_array, sem_perm);
559	if (params->u.nsems > sma->sem_nsems)
560		return -EINVAL;
561
562	return 0;
563}
564
565SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
566{
567	struct ipc_namespace *ns;
568	struct ipc_ops sem_ops;
569	struct ipc_params sem_params;
570
571	ns = current->nsproxy->ipc_ns;
572
573	if (nsems < 0 || nsems > ns->sc_semmsl)
574		return -EINVAL;
575
576	sem_ops.getnew = newary;
577	sem_ops.associate = sem_security;
578	sem_ops.more_checks = sem_more_checks;
579
580	sem_params.key = key;
581	sem_params.flg = semflg;
582	sem_params.u.nsems = nsems;
583
584	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
585}
586
587/** perform_atomic_semop - Perform (if possible) a semaphore operation
588 * @sma: semaphore array
589 * @sops: array with operations that should be checked
590 * @nsops: number of operations
591 * @un: undo array
592 * @pid: pid that did the change
593 *
594 * Returns 0 if the operation was possible.
595 * Returns 1 if the operation is impossible, the caller must sleep.
596 * Negative values are error codes.
597 */
598
599static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
600			     int nsops, struct sem_undo *un, int pid)
601{
602	int result, sem_op;
603	struct sembuf *sop;
604	struct sem * curr;
605
606	for (sop = sops; sop < sops + nsops; sop++) {
607		curr = sma->sem_base + sop->sem_num;
608		sem_op = sop->sem_op;
609		result = curr->semval;
610
611		if (!sem_op && result)
612			goto would_block;
613
614		result += sem_op;
615		if (result < 0)
616			goto would_block;
617		if (result > SEMVMX)
618			goto out_of_range;
619		if (sop->sem_flg & SEM_UNDO) {
620			int undo = un->semadj[sop->sem_num] - sem_op;
621			/*
622			 * Exceeding the undo range is an error.
623			 */
624			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
625				goto out_of_range;
626		}
627		curr->semval = result;
628	}
629
630	sop--;
631	while (sop >= sops) {
632		sma->sem_base[sop->sem_num].sempid = pid;
633		if (sop->sem_flg & SEM_UNDO)
634			un->semadj[sop->sem_num] -= sop->sem_op;
635		sop--;
636	}
637
638	return 0;
639
640out_of_range:
641	result = -ERANGE;
642	goto undo;
643
644would_block:
645	if (sop->sem_flg & IPC_NOWAIT)
646		result = -EAGAIN;
647	else
648		result = 1;
649
650undo:
651	sop--;
652	while (sop >= sops) {
653		sma->sem_base[sop->sem_num].semval -= sop->sem_op;
654		sop--;
655	}
656
657	return result;
658}
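
/*
 * Worked example for perform_atomic_semop(): assume semval[0] == 2,
 * semval[1] == 0 and one semop() call with three operations:
 *
 *	sops[0] = { .sem_num = 0, .sem_op = -1 };	// tentatively 2 -> 1
 *	sops[1] = { .sem_num = 1, .sem_op = +1 };	// tentatively 0 -> 1
 *	sops[2] = { .sem_num = 0, .sem_op = -2 };	// 1 - 2 would be -1
 *
 * The third operation would drive semval[0] negative, so the first two
 * changes are rolled back by the undo loop above and the function returns 1
 * (the caller must sleep), or -EAGAIN if sops[2].sem_flg contains IPC_NOWAIT.
 * Either all operations of one semop() call take effect, or none do.
 */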
659
660/** wake_up_sem_queue_prepare(pt, q, error): Prepare wake-up
661 * @q: queue entry that must be signaled
662 * @error: Error value for the signal
663 *
664 * Prepare the wake-up of the queue entry q.
665 */
666static void wake_up_sem_queue_prepare(struct list_head *pt,
667				struct sem_queue *q, int error)
668{
669	if (list_empty(pt)) {
670		/*
671		 * Hold preempt off so that we don't get preempted and have the
672		 * wakee busy-wait until we're scheduled back on.
673		 */
674		preempt_disable();
675	}
676	q->status = IN_WAKEUP;
677	q->pid = error;
678
679	list_add_tail(&q->list, pt);
680}
681
682/**
683 * wake_up_sem_queue_do(pt) - do the actual wake-up
684 * @pt: list of tasks to be woken up
685 *
686 * Do the actual wake-up.
687 * The function is called without any locks held, thus the semaphore array
688 * could be destroyed already and the tasks can disappear as soon as the
689 * status is set to the actual return code.
690 */
691static void wake_up_sem_queue_do(struct list_head *pt)
692{
693	struct sem_queue *q, *t;
694	int did_something;
695
696	did_something = !list_empty(pt);
697	list_for_each_entry_safe(q, t, pt, list) {
698		wake_up_process(q->sleeper);
699		/* q can disappear immediately after writing q->status. */
700		smp_wmb();
701		q->status = q->pid;
702	}
703	if (did_something)
704		preempt_enable();
705}
706
707static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
708{
709	list_del(&q->list);
710	if (q->nsops > 1)
711		sma->complex_count--;
712}
713
714/** check_restart(sma, q)
715 * @sma: semaphore array
716 * @q: the operation that just completed
717 *
718 * update_queue is O(N^2) when it restarts scanning the whole queue of
719 * waiting operations. Therefore this function checks if the restart is
720 * really necessary. It is called after a previously waiting operation
721 * modified the array.
722 * Note that wait-for-zero operations are handled without restart.
723 */
724static int check_restart(struct sem_array *sma, struct sem_queue *q)
725{
726	/* pending complex alter operations are too difficult to analyse */
727	if (!list_empty(&sma->pending_alter))
728		return 1;
729
730	/* we were a sleeping complex operation. Too difficult */
731	if (q->nsops > 1)
732		return 1;
733
734	/* It is impossible that someone waits for the new value:
735	 * - complex operations always restart.
736 * - wait-for-zero operations are handled separately.
737	 * - q is a previously sleeping simple operation that
738	 *   altered the array. It must be a decrement, because
739	 *   simple increments never sleep.
740	 * - If there are older (higher priority) decrements
741	 *   in the queue, then they have observed the original
742	 *   semval value and couldn't proceed. The operation
743 *   decremented the value - thus they won't proceed either.
744	 */
745	return 0;
746}
747
748/**
749 * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks
750 * @sma: semaphore array.
751 * @semnum: semaphore that was modified.
752 * @pt: list head for the tasks that must be woken up.
753 *
754 * wake_const_ops must be called after a semaphore in a semaphore array
755 * was set to 0. If complex const operations are pending, wake_const_ops must
756 * be called with semnum = -1, as well as with the number of each modified
757 * semaphore.
758 * The tasks that must be woken up are added to @pt. The return code
759 * is stored in q->pid.
760 * The function returns 1 if at least one operation was completed successfully.
761 */
762static int wake_const_ops(struct sem_array *sma, int semnum,
763				struct list_head *pt)
764{
765	struct sem_queue *q;
766	struct list_head *walk;
767	struct list_head *pending_list;
768	int semop_completed = 0;
769
770	if (semnum == -1)
771		pending_list = &sma->pending_const;
772	else
773		pending_list = &sma->sem_base[semnum].pending_const;
774
775	walk = pending_list->next;
776	while (walk != pending_list) {
777		int error;
778
779		q = container_of(walk, struct sem_queue, list);
780		walk = walk->next;
781
782		error = perform_atomic_semop(sma, q->sops, q->nsops,
783						 q->undo, q->pid);
784
785		if (error <= 0) {
786			/* operation completed, remove from queue & wakeup */
787
788			unlink_queue(sma, q);
789
790			wake_up_sem_queue_prepare(pt, q, error);
791			if (error == 0)
792				semop_completed = 1;
793		}
794	}
795	return semop_completed;
796}
797
798/**
799 * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks
800 * @sma: semaphore array
801 * @sops: operations that were performed
802 * @nsops: number of operations
803 * @pt: list head of the tasks that must be woken up.
804 *
805 * do_smart_wakeup_zero() checks all required queues for wait-for-zero
806 * operations, based on the actual changes that were performed on the
807 * semaphore array.
808 * The function returns 1 if at least one operation was completed successfully.
809 */
810static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
811					int nsops, struct list_head *pt)
812{
813	int i;
814	int semop_completed = 0;
815	int got_zero = 0;
816
817	/* first: the per-semaphore queues, if known */
818	if (sops) {
819		for (i = 0; i < nsops; i++) {
820			int num = sops[i].sem_num;
821
822			if (sma->sem_base[num].semval == 0) {
823				got_zero = 1;
824				semop_completed |= wake_const_ops(sma, num, pt);
825			}
826		}
827	} else {
828		/*
829		 * No sops means modified semaphores not known.
830		 * Assume all were changed.
831		 */
832		for (i = 0; i < sma->sem_nsems; i++) {
833			if (sma->sem_base[i].semval == 0) {
834				got_zero = 1;
835				semop_completed |= wake_const_ops(sma, i, pt);
836			}
837		}
838	}
839	/*
840	 * If one of the modified semaphores got 0,
841	 * then check the global queue, too.
842	 */
843	if (got_zero)
844		semop_completed |= wake_const_ops(sma, -1, pt);
845
846	return semop_completed;
847}
848
849
850/**
851 * update_queue(sma, semnum): Look for tasks that can be completed.
852 * @sma: semaphore array.
853 * @semnum: semaphore that was modified.
854 * @pt: list head for the tasks that must be woken up.
855 *
856 * update_queue must be called after a semaphore in a semaphore array
857 * was modified. If multiple semaphores were modified, update_queue must
858 * be called with semnum = -1, as well as with the number of each modified
859 * semaphore.
860 * The tasks that must be woken up are added to @pt. The return code
861 * is stored in q->pid.
862 * The function internally checks if const operations can now succeed.
863 *
864 * The function returns 1 if at least one semop was completed successfully.
865 */
866static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
867{
868	struct sem_queue *q;
869	struct list_head *walk;
870	struct list_head *pending_list;
871	int semop_completed = 0;
872
873	if (semnum == -1)
874		pending_list = &sma->pending_alter;
875	else
876		pending_list = &sma->sem_base[semnum].pending_alter;
877
878again:
879	walk = pending_list->next;
880	while (walk != pending_list) {
881		int error, restart;
882
883		q = container_of(walk, struct sem_queue, list);
884		walk = walk->next;
885
886		/* If we are scanning the single sop, per-semaphore list of
887		 * one semaphore and that semaphore is 0, then it is not
888		 * necessary to scan further: simple increments
889		 * that affect only one entry succeed immediately and cannot
890		 * be in the per-semaphore pending queue, and decrements
891		 * cannot be successful if the value is already 0.
892		 */
893		if (semnum != -1 && sma->sem_base[semnum].semval == 0)
894			break;
895
896		error = perform_atomic_semop(sma, q->sops, q->nsops,
897					 q->undo, q->pid);
898
899		/* Does q->sleeper still need to sleep? */
900		if (error > 0)
901			continue;
902
903		unlink_queue(sma, q);
904
905		if (error) {
906			restart = 0;
907		} else {
908			semop_completed = 1;
909			do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
910			restart = check_restart(sma, q);
911		}
912
913		wake_up_sem_queue_prepare(pt, q, error);
914		if (restart)
915			goto again;
916	}
917	return semop_completed;
918}
919
920/**
921 * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
922 * @sma: semaphore array
923 * @sops: operations that were performed
924 * @nsops: number of operations
925 * @otime: force setting otime
926 * @pt: list head of the tasks that must be woken up.
927 *
928 * do_smart_update() does the required calls to update_queue and wakeup_zero,
929 * based on the actual changes that were performed on the semaphore array.
930 * Note that the function does not do the actual wake-up: the caller is
931 * responsible for calling wake_up_sem_queue_do(@pt).
932 * It is safe to perform this call after dropping all locks.
933 */
934static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
935			int otime, struct list_head *pt)
936{
937	int i;
938
939	otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
940
941	if (!list_empty(&sma->pending_alter)) {
942		/* semaphore array uses the global queue - just process it. */
943		otime |= update_queue(sma, -1, pt);
944	} else {
945		if (!sops) {
946			/*
947			 * No sops, thus the modified semaphores are not
948			 * known. Check all.
949			 */
950			for (i = 0; i < sma->sem_nsems; i++)
951				otime |= update_queue(sma, i, pt);
952		} else {
953			/*
954			 * Check the semaphores that were increased:
955			 * - No complex ops, thus all sleeping ops are
956			 *   decrements.
957			 * - If we decreased the value, then any sleeping
958			 *   semaphore ops won't be able to run: If the
959			 *   previous value was too small, then the new
960			 *   value will be too small, too.
961			 */
962			for (i = 0; i < nsops; i++) {
963				if (sops[i].sem_op > 0) {
964					otime |= update_queue(sma,
965							sops[i].sem_num, pt);
966				}
967			}
968		}
969	}
970	if (otime) {
971		if (sops == NULL) {
972			sma->sem_base[0].sem_otime = get_seconds();
973		} else {
974			sma->sem_base[sops[0].sem_num].sem_otime =
975								get_seconds();
976		}
977	}
978}
979
980
981/* The following counts are associated to each semaphore:
982 *   semncnt        number of tasks waiting on semval being nonzero
983 *   semzcnt        number of tasks waiting on semval being zero
984 * This model assumes that a task waits on exactly one semaphore.
985 * Since semaphore operations are to be performed atomically, tasks actually
986 * wait on a whole sequence of semaphores simultaneously.
987 * The counts we return here are a rough approximation, but still
988 * warrant that semncnt+semzcnt>0 if the task is on the pending queue.
989 */
990static int count_semncnt (struct sem_array * sma, ushort semnum)
991{
992	int semncnt;
993	struct sem_queue * q;
994
995	semncnt = 0;
996	list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) {
997		struct sembuf * sops = q->sops;
998		BUG_ON(sops->sem_num != semnum);
999		if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT))
1000			semncnt++;
1001	}
1002
1003	list_for_each_entry(q, &sma->pending_alter, list) {
1004		struct sembuf * sops = q->sops;
1005		int nsops = q->nsops;
1006		int i;
1007		for (i = 0; i < nsops; i++)
1008			if (sops[i].sem_num == semnum
1009			    && (sops[i].sem_op < 0)
1010			    && !(sops[i].sem_flg & IPC_NOWAIT))
1011				semncnt++;
1012	}
1013	return semncnt;
1014}
1015
1016static int count_semzcnt (struct sem_array * sma, ushort semnum)
1017{
1018	int semzcnt;
1019	struct sem_queue * q;
1020
1021	semzcnt = 0;
1022	list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) {
1023		struct sembuf * sops = q->sops;
1024		BUG_ON(sops->sem_num != semnum);
1025		if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT))
1026			semzcnt++;
1027	}
1028
1029	list_for_each_entry(q, &sma->pending_const, list) {
1030		struct sembuf * sops = q->sops;
1031		int nsops = q->nsops;
1032		int i;
1033		for (i = 0; i < nsops; i++)
1034			if (sops[i].sem_num == semnum
1035			    && (sops[i].sem_op == 0)
1036			    && !(sops[i].sem_flg & IPC_NOWAIT))
1037				semzcnt++;
1038	}
1039	return semzcnt;
1040}
1041
1042/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
1043 * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
1044 * remains locked on exit.
1045 */
1046static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1047{
1048	struct sem_undo *un, *tu;
1049	struct sem_queue *q, *tq;
1050	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
1051	struct list_head tasks;
1052	int i;
1053
1054	/* Free the existing undo structures for this semaphore set.  */
1055	ipc_assert_locked_object(&sma->sem_perm);
1056	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
1057		list_del(&un->list_id);
1058		spin_lock(&un->ulp->lock);
1059		un->semid = -1;
1060		list_del_rcu(&un->list_proc);
1061		spin_unlock(&un->ulp->lock);
1062		kfree_rcu(un, rcu);
1063	}
1064
1065	/* Wake up all pending processes and let them fail with EIDRM. */
1066	INIT_LIST_HEAD(&tasks);
1067	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
1068		unlink_queue(sma, q);
1069		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
1070	}
1071
1072	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
1073		unlink_queue(sma, q);
1074		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
1075	}
1076	for (i = 0; i < sma->sem_nsems; i++) {
1077		struct sem *sem = sma->sem_base + i;
1078		list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
1079			unlink_queue(sma, q);
1080			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
1081		}
1082		list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
1083			unlink_queue(sma, q);
1084			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
1085		}
1086	}
1087
1088	/* Remove the semaphore set from the IDR */
1089	sem_rmid(ns, sma);
1090	sem_unlock(sma, -1);
1091	rcu_read_unlock();
1092
1093	wake_up_sem_queue_do(&tasks);
1094	ns->used_sems -= sma->sem_nsems;
1095	ipc_rcu_putref(sma, sem_rcu_free);
1096}
1097
1098static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
1099{
1100	switch(version) {
1101	case IPC_64:
1102		return copy_to_user(buf, in, sizeof(*in));
1103	case IPC_OLD:
1104	    {
1105		struct semid_ds out;
1106
1107		memset(&out, 0, sizeof(out));
1108
1109		ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);
1110
1111		out.sem_otime	= in->sem_otime;
1112		out.sem_ctime	= in->sem_ctime;
1113		out.sem_nsems	= in->sem_nsems;
1114
1115		return copy_to_user(buf, &out, sizeof(out));
1116	    }
1117	default:
1118		return -EINVAL;
1119	}
1120}
1121
1122static time_t get_semotime(struct sem_array *sma)
1123{
1124	int i;
1125	time_t res;
1126
1127	res = sma->sem_base[0].sem_otime;
1128	for (i = 1; i < sma->sem_nsems; i++) {
1129		time_t to = sma->sem_base[i].sem_otime;
1130
1131		if (to > res)
1132			res = to;
1133	}
1134	return res;
1135}
1136
1137static int semctl_nolock(struct ipc_namespace *ns, int semid,
1138			 int cmd, int version, void __user *p)
1139{
1140	int err;
1141	struct sem_array *sma;
1142
1143	switch(cmd) {
1144	case IPC_INFO:
1145	case SEM_INFO:
1146	{
1147		struct seminfo seminfo;
1148		int max_id;
1149
1150		err = security_sem_semctl(NULL, cmd);
1151		if (err)
1152			return err;
1153
1154		memset(&seminfo,0,sizeof(seminfo));
1155		seminfo.semmni = ns->sc_semmni;
1156		seminfo.semmns = ns->sc_semmns;
1157		seminfo.semmsl = ns->sc_semmsl;
1158		seminfo.semopm = ns->sc_semopm;
1159		seminfo.semvmx = SEMVMX;
1160		seminfo.semmnu = SEMMNU;
1161		seminfo.semmap = SEMMAP;
1162		seminfo.semume = SEMUME;
1163		down_read(&sem_ids(ns).rwsem);
1164		if (cmd == SEM_INFO) {
1165			seminfo.semusz = sem_ids(ns).in_use;
1166			seminfo.semaem = ns->used_sems;
1167		} else {
1168			seminfo.semusz = SEMUSZ;
1169			seminfo.semaem = SEMAEM;
1170		}
1171		max_id = ipc_get_maxid(&sem_ids(ns));
1172		up_read(&sem_ids(ns).rwsem);
1173		if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
1174			return -EFAULT;
1175		return (max_id < 0) ? 0: max_id;
1176	}
1177	case IPC_STAT:
1178	case SEM_STAT:
1179	{
1180		struct semid64_ds tbuf;
1181		int id = 0;
1182
1183		memset(&tbuf, 0, sizeof(tbuf));
1184
1185		rcu_read_lock();
1186		if (cmd == SEM_STAT) {
1187			sma = sem_obtain_object(ns, semid);
1188			if (IS_ERR(sma)) {
1189				err = PTR_ERR(sma);
1190				goto out_unlock;
1191			}
1192			id = sma->sem_perm.id;
1193		} else {
1194			sma = sem_obtain_object_check(ns, semid);
1195			if (IS_ERR(sma)) {
1196				err = PTR_ERR(sma);
1197				goto out_unlock;
1198			}
1199		}
1200
1201		err = -EACCES;
1202		if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
1203			goto out_unlock;
1204
1205		err = security_sem_semctl(sma, cmd);
1206		if (err)
1207			goto out_unlock;
1208
1209		kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
1210		tbuf.sem_otime = get_semotime(sma);
1211		tbuf.sem_ctime = sma->sem_ctime;
1212		tbuf.sem_nsems = sma->sem_nsems;
1213		rcu_read_unlock();
1214		if (copy_semid_to_user(p, &tbuf, version))
1215			return -EFAULT;
1216		return id;
1217	}
1218	default:
1219		return -EINVAL;
1220	}
1221out_unlock:
1222	rcu_read_unlock();
1223	return err;
1224}
1225
1226static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1227		unsigned long arg)
1228{
1229	struct sem_undo *un;
1230	struct sem_array *sma;
1231	struct sem* curr;
1232	int err;
1233	struct list_head tasks;
1234	int val;
1235#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
1236	/* big-endian 64bit */
1237	val = arg >> 32;
1238#else
1239	/* 32bit or little-endian 64bit */
1240	val = arg;
1241#endif
1242
1243	if (val > SEMVMX || val < 0)
1244		return -ERANGE;
1245
1246	INIT_LIST_HEAD(&tasks);
1247
1248	rcu_read_lock();
1249	sma = sem_obtain_object_check(ns, semid);
1250	if (IS_ERR(sma)) {
1251		rcu_read_unlock();
1252		return PTR_ERR(sma);
1253	}
1254
1255	if (semnum < 0 || semnum >= sma->sem_nsems) {
1256		rcu_read_unlock();
1257		return -EINVAL;
1258	}
1259
1260
1261	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
1262		rcu_read_unlock();
1263		return -EACCES;
1264	}
1265
1266	err = security_sem_semctl(sma, SETVAL);
1267	if (err) {
1268		rcu_read_unlock();
1269		return -EACCES;
1270	}
1271
1272	sem_lock(sma, NULL, -1);
1273
1274	curr = &sma->sem_base[semnum];
1275
1276	ipc_assert_locked_object(&sma->sem_perm);
1277	list_for_each_entry(un, &sma->list_id, list_id)
1278		un->semadj[semnum] = 0;
1279
1280	curr->semval = val;
1281	curr->sempid = task_tgid_vnr(current);
1282	sma->sem_ctime = get_seconds();
1283	/* maybe some queued-up processes were waiting for this */
1284	do_smart_update(sma, NULL, 0, 0, &tasks);
1285	sem_unlock(sma, -1);
1286	rcu_read_unlock();
1287	wake_up_sem_queue_do(&tasks);
1288	return 0;
1289}
1290
1291static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1292		int cmd, void __user *p)
1293{
1294	struct sem_array *sma;
1295	struct sem* curr;
1296	int err, nsems;
1297	ushort fast_sem_io[SEMMSL_FAST];
1298	ushort* sem_io = fast_sem_io;
1299	struct list_head tasks;
1300
1301	INIT_LIST_HEAD(&tasks);
1302
1303	rcu_read_lock();
1304	sma = sem_obtain_object_check(ns, semid);
1305	if (IS_ERR(sma)) {
1306		rcu_read_unlock();
1307		return PTR_ERR(sma);
1308	}
1309
1310	nsems = sma->sem_nsems;
1311
1312	err = -EACCES;
1313	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
1314		goto out_rcu_wakeup;
1315
1316	err = security_sem_semctl(sma, cmd);
1317	if (err)
1318		goto out_rcu_wakeup;
1319
1320	err = -EACCES;
1321	switch (cmd) {
1322	case GETALL:
1323	{
1324		ushort __user *array = p;
1325		int i;
1326
1327		sem_lock(sma, NULL, -1);
1328		if(nsems > SEMMSL_FAST) {
1329			if (!ipc_rcu_getref(sma)) {
1330				sem_unlock(sma, -1);
1331				rcu_read_unlock();
1332				err = -EIDRM;
1333				goto out_free;
1334			}
1335			sem_unlock(sma, -1);
1336			rcu_read_unlock();
1337			sem_io = ipc_alloc(sizeof(ushort)*nsems);
1338			if(sem_io == NULL) {
1339				ipc_rcu_putref(sma, ipc_rcu_free);
1340				return -ENOMEM;
1341			}
1342
1343			rcu_read_lock();
1344			sem_lock_and_putref(sma);
1345			if (sma->sem_perm.deleted) {
1346				sem_unlock(sma, -1);
1347				rcu_read_unlock();
1348				err = -EIDRM;
1349				goto out_free;
1350			}
1351		}
1352		for (i = 0; i < sma->sem_nsems; i++)
1353			sem_io[i] = sma->sem_base[i].semval;
1354		sem_unlock(sma, -1);
1355		rcu_read_unlock();
1356		err = 0;
1357		if(copy_to_user(array, sem_io, nsems*sizeof(ushort)))
1358			err = -EFAULT;
1359		goto out_free;
1360	}
1361	case SETALL:
1362	{
1363		int i;
1364		struct sem_undo *un;
1365
1366		if (!ipc_rcu_getref(sma)) {
1367			rcu_read_unlock();
1368			return -EIDRM;
1369		}
1370		rcu_read_unlock();
1371
1372		if(nsems > SEMMSL_FAST) {
1373			sem_io = ipc_alloc(sizeof(ushort)*nsems);
1374			if(sem_io == NULL) {
1375				ipc_rcu_putref(sma, ipc_rcu_free);
1376				return -ENOMEM;
1377			}
1378		}
1379
1380		if (copy_from_user (sem_io, p, nsems*sizeof(ushort))) {
1381			ipc_rcu_putref(sma, ipc_rcu_free);
1382			err = -EFAULT;
1383			goto out_free;
1384		}
1385
1386		for (i = 0; i < nsems; i++) {
1387			if (sem_io[i] > SEMVMX) {
1388				ipc_rcu_putref(sma, ipc_rcu_free);
1389				err = -ERANGE;
1390				goto out_free;
1391			}
1392		}
1393		rcu_read_lock();
1394		sem_lock_and_putref(sma);
1395		if (sma->sem_perm.deleted) {
1396			sem_unlock(sma, -1);
1397			rcu_read_unlock();
1398			err = -EIDRM;
1399			goto out_free;
1400		}
1401
1402		for (i = 0; i < nsems; i++)
1403			sma->sem_base[i].semval = sem_io[i];
1404
1405		ipc_assert_locked_object(&sma->sem_perm);
1406		list_for_each_entry(un, &sma->list_id, list_id) {
1407			for (i = 0; i < nsems; i++)
1408				un->semadj[i] = 0;
1409		}
1410		sma->sem_ctime = get_seconds();
1411		/* maybe some queued-up processes were waiting for this */
1412		do_smart_update(sma, NULL, 0, 0, &tasks);
1413		err = 0;
1414		goto out_unlock;
1415	}
1416	/* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
1417	}
1418	err = -EINVAL;
1419	if (semnum < 0 || semnum >= nsems)
1420		goto out_rcu_wakeup;
1421
1422	sem_lock(sma, NULL, -1);
1423	curr = &sma->sem_base[semnum];
1424
1425	switch (cmd) {
1426	case GETVAL:
1427		err = curr->semval;
1428		goto out_unlock;
1429	case GETPID:
1430		err = curr->sempid;
1431		goto out_unlock;
1432	case GETNCNT:
1433		err = count_semncnt(sma,semnum);
1434		goto out_unlock;
1435	case GETZCNT:
1436		err = count_semzcnt(sma,semnum);
1437		goto out_unlock;
1438	}
1439
1440out_unlock:
1441	sem_unlock(sma, -1);
1442out_rcu_wakeup:
1443	rcu_read_unlock();
1444	wake_up_sem_queue_do(&tasks);
1445out_free:
1446	if(sem_io != fast_sem_io)
1447		ipc_free(sem_io, sizeof(ushort)*nsems);
1448	return err;
1449}
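
/*
 * User-space sketch for the commands handled by semctl_main() above
 * (illustrative only; "id" is a set obtained with semget() and, as in
 * semctl(2), union semun is defined by the caller):
 *
 *	unsigned short vals[2] = { 1, 0 };
 *	union semun arg = { .array = vals };
 *
 *	semctl(id, 0, SETALL, arg);	// set all values, clear all undo
 *					// adjustments, update sem_ctime
 *	semctl(id, 0, GETALL, arg);	// copy current values into vals[]
 *	semctl(id, 0, GETVAL);		// returns semval of semaphore 0
 *	semctl(id, 1, GETZCNT);		// tasks waiting for semval[1] == 0
 */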
1450
1451static inline unsigned long
1452copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
1453{
1454	switch(version) {
1455	case IPC_64:
1456		if (copy_from_user(out, buf, sizeof(*out)))
1457			return -EFAULT;
1458		return 0;
1459	case IPC_OLD:
1460	    {
1461		struct semid_ds tbuf_old;
1462
1463		if(copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
1464			return -EFAULT;
1465
1466		out->sem_perm.uid	= tbuf_old.sem_perm.uid;
1467		out->sem_perm.gid	= tbuf_old.sem_perm.gid;
1468		out->sem_perm.mode	= tbuf_old.sem_perm.mode;
1469
1470		return 0;
1471	    }
1472	default:
1473		return -EINVAL;
1474	}
1475}
1476
1477/*
1478 * This function handles some semctl commands which require the rwsem
1479 * to be held in write mode.
1480 * NOTE: no locks must be held, the rwsem is taken inside this function.
1481 */
1482static int semctl_down(struct ipc_namespace *ns, int semid,
1483		       int cmd, int version, void __user *p)
1484{
1485	struct sem_array *sma;
1486	int err;
1487	struct semid64_ds semid64;
1488	struct kern_ipc_perm *ipcp;
1489
1490	if(cmd == IPC_SET) {
1491		if (copy_semid_from_user(&semid64, p, version))
1492			return -EFAULT;
1493	}
1494
1495	down_write(&sem_ids(ns).rwsem);
1496	rcu_read_lock();
1497
1498	ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
1499				      &semid64.sem_perm, 0);
1500	if (IS_ERR(ipcp)) {
1501		err = PTR_ERR(ipcp);
1502		goto out_unlock1;
1503	}
1504
1505	sma = container_of(ipcp, struct sem_array, sem_perm);
1506
1507	err = security_sem_semctl(sma, cmd);
1508	if (err)
1509		goto out_unlock1;
1510
1511	switch (cmd) {
1512	case IPC_RMID:
1513		sem_lock(sma, NULL, -1);
1514		/* freeary unlocks the ipc object and rcu */
1515		freeary(ns, ipcp);
1516		goto out_up;
1517	case IPC_SET:
1518		sem_lock(sma, NULL, -1);
1519		err = ipc_update_perm(&semid64.sem_perm, ipcp);
1520		if (err)
1521			goto out_unlock0;
1522		sma->sem_ctime = get_seconds();
1523		break;
1524	default:
1525		err = -EINVAL;
1526		goto out_unlock1;
1527	}
1528
1529out_unlock0:
1530	sem_unlock(sma, -1);
1531out_unlock1:
1532	rcu_read_unlock();
1533out_up:
1534	up_write(&sem_ids(ns).rwsem);
1535	return err;
1536}
1537
1538SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
1539{
1540	int version;
1541	struct ipc_namespace *ns;
1542	void __user *p = (void __user *)arg;
1543
1544	if (semid < 0)
1545		return -EINVAL;
1546
1547	version = ipc_parse_version(&cmd);
1548	ns = current->nsproxy->ipc_ns;
1549
1550	switch(cmd) {
1551	case IPC_INFO:
1552	case SEM_INFO:
1553	case IPC_STAT:
1554	case SEM_STAT:
1555		return semctl_nolock(ns, semid, cmd, version, p);
1556	case GETALL:
1557	case GETVAL:
1558	case GETPID:
1559	case GETNCNT:
1560	case GETZCNT:
1561	case SETALL:
1562		return semctl_main(ns, semid, semnum, cmd, p);
1563	case SETVAL:
1564		return semctl_setval(ns, semid, semnum, arg);
1565	case IPC_RMID:
1566	case IPC_SET:
1567		return semctl_down(ns, semid, cmd, version, p);
1568	default:
1569		return -EINVAL;
1570	}
1571}
1572
1573/* If the task doesn't already have an undo_list, then allocate one
1574 * here.  We guarantee there is only one thread using this undo list,
1575 * and current is THE ONE
1576 *
1577 * If this allocation and assignment succeeds, but later
1578 * portions of this code fail, there is no need to free the sem_undo_list.
1579 * Just let it stay associated with the task, and it'll be freed later
1580 * at exit time.
1581 *
1582 * This can block, so callers must hold no locks.
1583 */
1584static inline int get_undo_list(struct sem_undo_list **undo_listp)
1585{
1586	struct sem_undo_list *undo_list;
1587
1588	undo_list = current->sysvsem.undo_list;
1589	if (!undo_list) {
1590		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
1591		if (undo_list == NULL)
1592			return -ENOMEM;
1593		spin_lock_init(&undo_list->lock);
1594		atomic_set(&undo_list->refcnt, 1);
1595		INIT_LIST_HEAD(&undo_list->list_proc);
1596
1597		current->sysvsem.undo_list = undo_list;
1598	}
1599	*undo_listp = undo_list;
1600	return 0;
1601}
1602
1603static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
1604{
1605	struct sem_undo *un;
1606
1607	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
1608		if (un->semid == semid)
1609			return un;
1610	}
1611	return NULL;
1612}
1613
1614static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
1615{
1616	struct sem_undo *un;
1617
1618	assert_spin_locked(&ulp->lock);
1619
1620	un = __lookup_undo(ulp, semid);
1621	if (un) {
1622		list_del_rcu(&un->list_proc);
1623		list_add_rcu(&un->list_proc, &ulp->list_proc);
1624	}
1625	return un;
1626}
1627
1628/**
1629 * find_alloc_undo - Lookup (and if not present create) undo array
1630 * @ns: namespace
1631 * @semid: semaphore array id
1632 *
1633 * The function looks up (and if not present creates) the undo structure.
1634 * The size of the undo structure depends on the size of the semaphore
1635 * array, thus the alloc path is not that straightforward.
1636 * Lifetime-rules: sem_undo is rcu-protected, on success, the function
1637 * performs a rcu_read_lock().
1638 */
1639static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
1640{
1641	struct sem_array *sma;
1642	struct sem_undo_list *ulp;
1643	struct sem_undo *un, *new;
1644	int nsems, error;
1645
1646	error = get_undo_list(&ulp);
1647	if (error)
1648		return ERR_PTR(error);
1649
1650	rcu_read_lock();
1651	spin_lock(&ulp->lock);
1652	un = lookup_undo(ulp, semid);
1653	spin_unlock(&ulp->lock);
1654	if (likely(un!=NULL))
1655		goto out;
1656
1657	/* no undo structure around - allocate one. */
1658	/* step 1: figure out the size of the semaphore array */
1659	sma = sem_obtain_object_check(ns, semid);
1660	if (IS_ERR(sma)) {
1661		rcu_read_unlock();
1662		return ERR_CAST(sma);
1663	}
1664
1665	nsems = sma->sem_nsems;
1666	if (!ipc_rcu_getref(sma)) {
1667		rcu_read_unlock();
1668		un = ERR_PTR(-EIDRM);
1669		goto out;
1670	}
1671	rcu_read_unlock();
1672
1673	/* step 2: allocate new undo structure */
1674	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
1675	if (!new) {
1676		ipc_rcu_putref(sma, ipc_rcu_free);
1677		return ERR_PTR(-ENOMEM);
1678	}
1679
1680	/* step 3: Acquire the lock on semaphore array */
1681	rcu_read_lock();
1682	sem_lock_and_putref(sma);
1683	if (sma->sem_perm.deleted) {
1684		sem_unlock(sma, -1);
1685		rcu_read_unlock();
1686		kfree(new);
1687		un = ERR_PTR(-EIDRM);
1688		goto out;
1689	}
1690	spin_lock(&ulp->lock);
1691
1692	/*
1693	 * step 4: check for races: did someone else allocate the undo struct?
1694	 */
1695	un = lookup_undo(ulp, semid);
1696	if (un) {
1697		kfree(new);
1698		goto success;
1699	}
1700	/* step 5: initialize & link new undo structure */
1701	new->semadj = (short *) &new[1];
1702	new->ulp = ulp;
1703	new->semid = semid;
1704	assert_spin_locked(&ulp->lock);
1705	list_add_rcu(&new->list_proc, &ulp->list_proc);
1706	ipc_assert_locked_object(&sma->sem_perm);
1707	list_add(&new->list_id, &sma->list_id);
1708	un = new;
1709
1710success:
1711	spin_unlock(&ulp->lock);
1712	sem_unlock(sma, -1);
1713out:
1714	return un;
1715}
1716
1717
1718/**
1719 * get_queue_result - Retrieve the result code from sem_queue
1720 * @q: Pointer to queue structure
1721 *
1722 * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
1723 * q->status, then we must loop until the value is replaced with the final
1724 * value: This may happen if a task is woken up by an unrelated event (e.g.
1725 * signal) and in parallel the task is woken up by another task because it got
1726 * the requested semaphores.
1727 *
1728 * The function can be called with or without holding the semaphore spinlock.
1729 */
1730static int get_queue_result(struct sem_queue *q)
1731{
1732	int error;
1733
1734	error = q->status;
1735	while (unlikely(error == IN_WAKEUP)) {
1736		cpu_relax();
1737		error = q->status;
1738	}
1739
1740	return error;
1741}
1742
1743SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1744		unsigned, nsops, const struct timespec __user *, timeout)
1745{
1746	int error = -EINVAL;
1747	struct sem_array *sma;
1748	struct sembuf fast_sops[SEMOPM_FAST];
1749	struct sembuf* sops = fast_sops, *sop;
1750	struct sem_undo *un;
1751	int undos = 0, alter = 0, max, locknum;
1752	struct sem_queue queue;
1753	unsigned long jiffies_left = 0;
1754	struct ipc_namespace *ns;
1755	struct list_head tasks;
1756
1757	ns = current->nsproxy->ipc_ns;
1758
1759	if (nsops < 1 || semid < 0)
1760		return -EINVAL;
1761	if (nsops > ns->sc_semopm)
1762		return -E2BIG;
1763	if(nsops > SEMOPM_FAST) {
1764		sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
1765		if(sops==NULL)
1766			return -ENOMEM;
1767	}
1768	if (copy_from_user (sops, tsops, nsops * sizeof(*tsops))) {
1769		error=-EFAULT;
1770		goto out_free;
1771	}
1772	if (timeout) {
1773		struct timespec _timeout;
1774		if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
1775			error = -EFAULT;
1776			goto out_free;
1777		}
1778		if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 ||
1779			_timeout.tv_nsec >= 1000000000L) {
1780			error = -EINVAL;
1781			goto out_free;
1782		}
1783		jiffies_left = timespec_to_jiffies(&_timeout);
1784	}
1785	max = 0;
1786	for (sop = sops; sop < sops + nsops; sop++) {
1787		if (sop->sem_num >= max)
1788			max = sop->sem_num;
1789		if (sop->sem_flg & SEM_UNDO)
1790			undos = 1;
1791		if (sop->sem_op != 0)
1792			alter = 1;
1793	}
1794
1795	INIT_LIST_HEAD(&tasks);
1796
1797	if (undos) {
1798		/* On success, find_alloc_undo takes the rcu_read_lock */
1799		un = find_alloc_undo(ns, semid);
1800		if (IS_ERR(un)) {
1801			error = PTR_ERR(un);
1802			goto out_free;
1803		}
1804	} else {
1805		un = NULL;
1806		rcu_read_lock();
1807	}
1808
1809	sma = sem_obtain_object_check(ns, semid);
1810	if (IS_ERR(sma)) {
1811		rcu_read_unlock();
1812		error = PTR_ERR(sma);
1813		goto out_free;
1814	}
1815
1816	error = -EFBIG;
1817	if (max >= sma->sem_nsems)
1818		goto out_rcu_wakeup;
1819
1820	error = -EACCES;
1821	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO))
1822		goto out_rcu_wakeup;
1823
1824	error = security_sem_semop(sma, sops, nsops, alter);
1825	if (error)
1826		goto out_rcu_wakeup;
1827
1828	/*
1829	 * semid identifiers are not unique - find_alloc_undo may have
1830	 * allocated an undo structure, it was invalidated by an RMID
1831	 * and now a new array has received the same id. Check and fail.
1832	 * This case can be detected by checking un->semid. The existence of
1833	 * "un" itself is guaranteed by rcu.
1834	 */
1835	error = -EIDRM;
1836	locknum = sem_lock(sma, sops, nsops);
1837	if (un && un->semid == -1)
1838		goto out_unlock_free;
1839
1840	error = perform_atomic_semop(sma, sops, nsops, un,
1841					task_tgid_vnr(current));
1842	if (error <= 0) {
1843		if (alter && error == 0)
1844			do_smart_update(sma, sops, nsops, 1, &tasks);
1845
1846		goto out_unlock_free;
1847	}
1848
1849	/* We need to sleep on this operation, so we put the current
1850	 * task into the pending queue and go to sleep.
1851	 */
1852
1853	queue.sops = sops;
1854	queue.nsops = nsops;
1855	queue.undo = un;
1856	queue.pid = task_tgid_vnr(current);
1857	queue.alter = alter;
1858
1859	if (nsops == 1) {
1860		struct sem *curr;
1861		curr = &sma->sem_base[sops->sem_num];
1862
1863		if (alter) {
1864			if (sma->complex_count) {
1865				list_add_tail(&queue.list,
1866						&sma->pending_alter);
1867			} else {
1868
1869				list_add_tail(&queue.list,
1870						&curr->pending_alter);
1871			}
1872		} else {
1873			list_add_tail(&queue.list, &curr->pending_const);
1874		}
1875	} else {
1876		if (!sma->complex_count)
1877			merge_queues(sma);
1878
1879		if (alter)
1880			list_add_tail(&queue.list, &sma->pending_alter);
1881		else
1882			list_add_tail(&queue.list, &sma->pending_const);
1883
1884		sma->complex_count++;
1885	}
1886
1887	queue.status = -EINTR;
1888	queue.sleeper = current;
1889
1890sleep_again:
1891	current->state = TASK_INTERRUPTIBLE;
1892	sem_unlock(sma, locknum);
1893	rcu_read_unlock();
1894
1895	if (timeout)
1896		jiffies_left = schedule_timeout(jiffies_left);
1897	else
1898		schedule();
1899
1900	error = get_queue_result(&queue);
1901
1902	if (error != -EINTR) {
1903		/* fast path: update_queue already obtained all requested
1904		 * resources.
1905		 * Perform a smp_mb(): User space could assume that semop()
1906		 * is a memory barrier: Without the mb(), the cpu could
1907		 * speculatively read in user space stale data that was
1908		 * overwritten by the previous owner of the semaphore.
1909		 */
1910		smp_mb();
1911
1912		goto out_free;
1913	}
1914
1915	rcu_read_lock();
1916	sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum);
1917
1918	/*
1919	 * Wait until it's guaranteed that no wake_up_sem_queue_do() is ongoing.
1920	 */
1921	error = get_queue_result(&queue);
1922
1923	/*
1924	 * Array removed? If yes, leave without sem_unlock().
1925	 */
1926	if (IS_ERR(sma)) {
1927		rcu_read_unlock();
1928		goto out_free;
1929	}
1930
1931
1932	/*
1933	 * If queue.status != -EINTR, we were woken up by another process.
1934	 * Leave without unlink_queue(), but with sem_unlock().
1935	 */
1936
1937	if (error != -EINTR) {
1938		goto out_unlock_free;
1939	}
1940
1941	/*
1942	 * If an interrupt occurred we have to clean up the queue
1943	 */
1944	if (timeout && jiffies_left == 0)
1945		error = -EAGAIN;
1946
1947	/*
1948	 * If the wakeup was spurious, just retry
1949	 */
1950	if (error == -EINTR && !signal_pending(current))
1951		goto sleep_again;
1952
1953	unlink_queue(sma, &queue);
1954
1955out_unlock_free:
1956	sem_unlock(sma, locknum);
1957out_rcu_wakeup:
1958	rcu_read_unlock();
1959	wake_up_sem_queue_do(&tasks);
1960out_free:
1961	if(sops != fast_sops)
1962		kfree(sops);
1963	return error;
1964}
1965
1966SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
1967		unsigned, nsops)
1968{
1969	return sys_semtimedop(semid, tsops, nsops, NULL);
1970}
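
/*
 * User-space sketch of the timeout path above (illustrative only): a decrement
 * that cannot proceed before the timeout expires fails with EAGAIN, just like
 * an IPC_NOWAIT operation that cannot proceed immediately. "id" is a set from
 * semget(), handle_timeout() is a hypothetical helper, and depending on the
 * libc <time.h> and _GNU_SOURCE may also be needed.
 *
 *	#include <sys/sem.h>
 *	#include <errno.h>
 *
 *	struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	if (semtimedop(id, &op, 1, &ts) == -1 && errno == EAGAIN)
 *		handle_timeout();	// hypothetical helper: op timed out
 */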
1971
1972/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
1973 * parent and child tasks.
1974 */
1975
1976int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
1977{
1978	struct sem_undo_list *undo_list;
1979	int error;
1980
1981	if (clone_flags & CLONE_SYSVSEM) {
1982		error = get_undo_list(&undo_list);
1983		if (error)
1984			return error;
1985		atomic_inc(&undo_list->refcnt);
1986		tsk->sysvsem.undo_list = undo_list;
1987	} else
1988		tsk->sysvsem.undo_list = NULL;
1989
1990	return 0;
1991}
1992
1993/*
1994 * add semadj values to semaphores, free undo structures.
1995 * undo structures are not freed when semaphore arrays are destroyed
1996 * so some of them may be out of date.
1997 * IMPLEMENTATION NOTE: There is some confusion over whether the
1998 * set of adjustments that needs to be done should be done in an atomic
1999 * manner or not. That is, if we are attempting to decrement the semval
2000 * should we queue up and wait until we can do so legally?
2001 * The original implementation attempted to do this (queue and wait).
2002 * The current implementation does not do so. The POSIX standard
2003 * and SVID should be consulted to determine what behavior is mandated.
2004 */
2005void exit_sem(struct task_struct *tsk)
2006{
2007	struct sem_undo_list *ulp;
2008
2009	ulp = tsk->sysvsem.undo_list;
2010	if (!ulp)
2011		return;
2012	tsk->sysvsem.undo_list = NULL;
2013
2014	if (!atomic_dec_and_test(&ulp->refcnt))
2015		return;
2016
2017	for (;;) {
2018		struct sem_array *sma;
2019		struct sem_undo *un;
2020		struct list_head tasks;
2021		int semid, i;
2022
2023		rcu_read_lock();
2024		un = list_entry_rcu(ulp->list_proc.next,
2025				    struct sem_undo, list_proc);
2026		if (&un->list_proc == &ulp->list_proc)
2027			semid = -1;
2028		else
2029			semid = un->semid;
2030
2031		if (semid == -1) {
2032			rcu_read_unlock();
2033			break;
2034		}
2035
2036		sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, un->semid);
2037		/* exit_sem raced with IPC_RMID, nothing to do */
2038		if (IS_ERR(sma)) {
2039			rcu_read_unlock();
2040			continue;
2041		}
2042
2043		sem_lock(sma, NULL, -1);
2044		un = __lookup_undo(ulp, semid);
2045		if (un == NULL) {
2046			/* exit_sem raced with IPC_RMID+semget() that created
2047			 * exactly the same semid. Nothing to do.
2048			 */
2049			sem_unlock(sma, -1);
2050			rcu_read_unlock();
2051			continue;
2052		}
2053
2054		/* remove un from the linked lists */
2055		ipc_assert_locked_object(&sma->sem_perm);
2056		list_del(&un->list_id);
2057
2058		spin_lock(&ulp->lock);
2059		list_del_rcu(&un->list_proc);
2060		spin_unlock(&ulp->lock);
2061
2062		/* perform adjustments registered in un */
2063		for (i = 0; i < sma->sem_nsems; i++) {
2064			struct sem * semaphore = &sma->sem_base[i];
2065			if (un->semadj[i]) {
2066				semaphore->semval += un->semadj[i];
2067				/*
2068				 * Range checks of the new semaphore value,
2069				 * not defined by sus:
2070				 * - Some unices ignore the undo entirely
2071				 *   (e.g. HP UX 11i 11.22, Tru64 V5.1)
2072				 * - some cap the value (e.g. FreeBSD caps
2073				 *   at 0, but doesn't enforce SEMVMX)
2074				 *
2075				 * Linux caps the semaphore value, both at 0
2076				 * and at SEMVMX.
2077				 *
2078				 * 	Manfred <manfred@colorfullife.com>
2079				 */
2080				if (semaphore->semval < 0)
2081					semaphore->semval = 0;
2082				if (semaphore->semval > SEMVMX)
2083					semaphore->semval = SEMVMX;
2084				semaphore->sempid = task_tgid_vnr(current);
2085			}
2086		}
2087		/* maybe some queued-up processes were waiting for this */
2088		INIT_LIST_HEAD(&tasks);
2089		do_smart_update(sma, NULL, 0, 1, &tasks);
2090		sem_unlock(sma, -1);
2091		rcu_read_unlock();
2092		wake_up_sem_queue_do(&tasks);
2093
2094		kfree_rcu(un, rcu);
2095	}
2096	kfree(ulp);
2097}
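
/*
 * Worked example for the adjustment loop in exit_sem() above: a task performs
 * two SEM_UNDO decrements (sem_op == -1, twice), so un->semadj[i] becomes +2.
 * If the task then exits without releasing the semaphore:
 *
 *	while running:	semval 3 -> 1,	semadj 0 -> +2
 *	at exit:	semval = 1 + 2 = 3
 *
 * The result is clamped to 0..SEMVMX as noted above; e.g. a semadj of -5
 * applied to a semval of 2 leaves 0, not -3.
 */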
2098
2099#ifdef CONFIG_PROC_FS
2100static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
2101{
2102	struct user_namespace *user_ns = seq_user_ns(s);
2103	struct sem_array *sma = it;
2104	time_t sem_otime;
2105
2106	/*
2107	 * The proc interface isn't aware of sem_lock(), it calls
2108	 * ipc_lock_object() directly (in sysvipc_find_ipc).
2109	 * In order to stay compatible with sem_lock(), we must wait until
2110	 * all simple semop() calls have left their critical regions.
2111	 */
2112	sem_wait_array(sma);
2113
2114	sem_otime = get_semotime(sma);
2115
2116	return seq_printf(s,
2117			  "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
2118			  sma->sem_perm.key,
2119			  sma->sem_perm.id,
2120			  sma->sem_perm.mode,
2121			  sma->sem_nsems,
2122			  from_kuid_munged(user_ns, sma->sem_perm.uid),
2123			  from_kgid_munged(user_ns, sma->sem_perm.gid),
2124			  from_kuid_munged(user_ns, sma->sem_perm.cuid),
2125			  from_kgid_munged(user_ns, sma->sem_perm.cgid),
2126			  sem_otime,
2127			  sma->sem_ctime);
2128}
2129#endif
2130