1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27/*	  All Rights Reserved  	*/
28
29/*
30 * Inter-Process Communication Semaphore Facility.
31 *
32 * See os/ipc.c for a description of common IPC functionality.
33 *
34 * Resource controls
35 * -----------------
36 *
37 * Control:      zone.max-sem-ids (rc_zone_semmni)
38 * Description:  Maximum number of semaphore ids allowed a zone.
39 *
40 *   When semget() is used to allocate a semaphore set, one id is
41 *   allocated.  If the id allocation doesn't succeed, semget() fails
42 *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
43 *   the id is deallocated.
44 *
45 * Control:      project.max-sem-ids (rc_project_semmni)
46 * Description:  Maximum number of semaphore ids allowed a project.
47 *
48 *   When semget() is used to allocate a semaphore set, one id is
49 *   allocated.  If the id allocation doesn't succeed, semget() fails
50 *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
51 *   the id is deallocated.
52 *
53 * Control:      process.max-sem-nsems (rc_process_semmsl)
54 * Description:  Maximum number of semaphores allowed per semaphore set.
55 *
56 *   When semget() is used to allocate a semaphore set, the size of the
57 *   set is compared with this limit.  If the number of semaphores
58 *   exceeds the limit, semget() fails and errno is set to EINVAL.
59 *
60 * Control:      process.max-sem-ops (rc_process_semopm)
61 * Description:  Maximum number of semaphore operations allowed per
62 *               semop call.
63 *
64 *   When semget() successfully allocates a semaphore set, the minimum
65 *   enforced value of this limit is used to initialize the
66 *   "system-imposed maximum" number of operations a semop() call for
67 *   this set can perform.
68 *
69 * Undo structures
70 * ---------------
71 *
72 * Removing the undo structure tunables involved a serious redesign of
73 * how they were implemented.  There is now one undo structure for
74 * every process/semaphore array combination (lazily allocated, of
75 * course), and each is equal in size to the semaphore it corresponds
76 * to.  To avoid scalability and performance problems, the undo
77 * structures are stored in two places: a per-process AVL tree sorted
78 * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
79 * per-semaphore linked list (sem_undos, protected by the semaphore's
80 * ID lock).  The former is used by semop, where a lookup is performed
81 * once and cached if SEM_UNDO is specified for any of the operations,
82 * and at process exit where the undoable operations are rolled back.
83 * The latter is used when removing the semaphore, so the undo
84 * structures can be removed from the appropriate processes' trees.
85 *
86 * The undo structure itself contains pointers to the ksemid and proc
87 * to which it corresponds, a list node, an AVL node, and an array of
88 * adjust-on-exit (AOE) values.  When an undo structure is allocated it
89 * is immediately added to both the process's tree and the semaphore's
90 * list.  Lastly, the reference count on the semaphore is increased.
91 *
92 * Avoiding a lock ordering violation between p_lock and the ID lock,
93 * wont to occur when there is a race between a process exiting and the
94 * removal of a semaphore, mandates the delicate dance that exists
95 * between semexit and sem_rmid.
96 *
97 * sem_rmid, holding the ID lock, iterates through all undo structures
98 * and for each takes the appropriate process's p_lock and checks to
99 * see if p_semacct is NULL.  If it is, it skips that undo structure
100 * and continues to the next.  Otherwise, it removes the undo structure
101 * from both the AVL tree and the semaphore's list, and releases the
102 * hold that the undo structure had on the semaphore.
103 *
104 * The important other half of this is semexit, which will immediately
105 * take p_lock, obtain the AVL pointer, clear p_semacct, and drop
106 * p_lock.  From this point on it is semexit's responsibility to clean
107 * up all undo structures found in the tree -- a coexecuting sem_rmid
108 * will see the NULL p_semacct and skip that undo structure.  It walks
109 * the AVL tree (using avl_destroy_nodes) and for each undo structure
110 * takes the appropriate semaphore's ID lock (always legal since the
111 * undo structure has a hold on the semaphore), updates all semaphores
112 * with non-zero AOE values, and removes the structure from the
113 * semaphore's list.  It then drops the structure's reference on the
114 * semaphore, drops the ID lock, and frees the undo structure.
115 */
116
117#include <sys/types.h>
118#include <sys/t_lock.h>
119#include <sys/param.h>
120#include <sys/systm.h>
121#include <sys/sysmacros.h>
122#include <sys/cred.h>
123#include <sys/vmem.h>
124#include <sys/kmem.h>
125#include <sys/errno.h>
126#include <sys/time.h>
127#include <sys/ipc.h>
128#include <sys/ipc_impl.h>
129#include <sys/sem.h>
130#include <sys/sem_impl.h>
131#include <sys/user.h>
132#include <sys/proc.h>
133#include <sys/cpuvar.h>
134#include <sys/debug.h>
135#include <sys/var.h>
136#include <sys/cmn_err.h>
137#include <sys/modctl.h>
138#include <sys/syscall.h>
139#include <sys/avl.h>
140#include <sys/list.h>
141#include <sys/zone.h>
142
143#include <c2/audit.h>
144
/* Resource control handles used by this file; they are defined elsewhere. */
extern rctl_hndl_t rc_zone_semmni;
extern rctl_hndl_t rc_project_semmni;
extern rctl_hndl_t rc_process_semmsl;
extern rctl_hndl_t rc_process_semopm;
/* IPC service handle for semaphore ids; created by ipcs_create() in _init(). */
static ipc_service_t *sem_svc;
/* Zone key created in _init() with sem_remove_zone() as its callback. */
static zone_key_t sem_zone_key;
151
/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret seminfo_semmsl, seminfo_semopm and
 * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
 * mechanism for administrating the IPC Semaphore facility is through
 * the resource controls described at the top of this file.
 *
 * Nothing in this file reads any of these variables.
 */
int seminfo_semaem = 16384;	/* (obsolete) */
int seminfo_semmap = 10;	/* (obsolete) */
int seminfo_semmni = 10;	/* (obsolete; still read, see above) */
int seminfo_semmns = 60;	/* (obsolete) */
int seminfo_semmnu = 30;	/* (obsolete) */
int seminfo_semmsl = 25;	/* (obsolete; still read, see above) */
int seminfo_semopm = 10;	/* (obsolete; still read, see above) */
int seminfo_semume = 10;	/* (obsolete) */
int seminfo_semusz = 96;	/* (obsolete) */
int seminfo_semvmx = 32767;	/* (obsolete) */
169
#define	SEM_MAXUCOPS	4096	/* max # of unchecked ops per semop call */

/*
 * Size in bytes of the undo structure for a set of n semaphores:
 * struct sem_undo itself plus (n - 1) further int-sized adjust-on-exit
 * slots.  The argument is parenthesized so that an expression of any
 * operator precedence expands as intended.
 */
#define	SEM_UNDOSZ(n)	\
	(sizeof (struct sem_undo) + ((n) - 1) * sizeof (int))
172
/*
 * Forward declarations for functions that are referenced (by the
 * system call entry table and by _init) before they are defined.
 * Note that the parameter names given for semsys here (a0..a3) differ
 * from those used in its definition (a1..a4); only the types matter.
 */
static int semsys(int opcode, uintptr_t a0, uintptr_t a1,
    uintptr_t a2, uintptr_t a3);
static void sem_dtor(kipc_perm_t *);
static void sem_rmid(kipc_perm_t *);
static void sem_remove_zone(zoneid_t, void *);
178
/*
 * System call table entry for the semaphore facility; semsys is the
 * handler.  NOTE(review): the leading 5 is presumably the argument
 * count (semsys takes five parameters) -- confirm against struct sysent.
 */
static struct sysent ipcsem_sysent = {
	5,
	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
	semsys
};

/*
 * Module linkage information for the kernel.
 */
static struct modlsys modlsys = {
	&mod_syscallops, "System V semaphore facility", &ipcsem_sysent
};

/* Second linkage entry, sharing the same sysent, for _SYSCALL32_IMPL. */
#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
};
#endif

/* NULL-terminated list of the linkage entries above; used by _init/_info. */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
206
207
208int
209_init(void)
210{
211	int result;
212
213	sem_svc = ipcs_create("semids", rc_project_semmni, rc_zone_semmni,
214	    sizeof (ksemid_t), sem_dtor, sem_rmid, AT_IPC_SEM,
215	    offsetof(ipc_rqty_t, ipcq_semmni));
216	zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);
217
218	if ((result = mod_install(&modlinkage)) == 0)
219		return (0);
220
221	(void) zone_key_delete(sem_zone_key);
222	ipcs_destroy(sem_svc);
223
224	return (result);
225}
226
/*
 * _fini - module removal entry point.
 *
 * Always returns EBUSY; this function performs no teardown.
 */
int
_fini(void)
{
	return (EBUSY);
}
232
/*
 * _info - module information entry point.
 *
 * Passes this module's linkage structure and the caller's modinfop to
 * mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
238
239static void
240sem_dtor(kipc_perm_t *perm)
241{
242	ksemid_t *sp = (ksemid_t *)perm;
243
244	kmem_free(sp->sem_base,
245	    P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64));
246	list_destroy(&sp->sem_undos);
247}
248
249/*
250 * sem_undo_add - Create or update adjust on exit entry.
251 */
252static int
253sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
254{
255	int newval = undo->un_aoe[num] - val;
256
257	if (newval > USHRT_MAX || newval < -USHRT_MAX)
258		return (ERANGE);
259	undo->un_aoe[num] = newval;
260
261	return (0);
262}
263
264/*
265 * sem_undo_clear - clears all undo entries for specified semaphores
266 *
267 * Used when semaphores are reset by SETVAL or SETALL.
268 */
269static void
270sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
271{
272	struct sem_undo *undo;
273	int i;
274
275	ASSERT(low <= high);
276	ASSERT(high < sp->sem_nsems);
277
278	for (undo = list_head(&sp->sem_undos); undo;
279	    undo = list_next(&sp->sem_undos, undo))
280		for (i = low; i <= high; i++)
281			undo->un_aoe[i] = 0;
282}
283
284/*
285 * sem_rollback - roll back work done so far if unable to complete operation
286 */
287static void
288sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
289{
290	struct sem *semp;	/* semaphore ptr */
291
292	for (op += n - 1; n--; op--) {
293		if (op->sem_op == 0)
294			continue;
295		semp = &sp->sem_base[op->sem_num];
296		semp->semval -= op->sem_op;
297		if (op->sem_flg & SEM_UNDO) {
298			ASSERT(undo != NULL);
299			(void) sem_undo_add(-op->sem_op, op->sem_num, undo);
300		}
301	}
302}
303
304static void
305sem_rmid(kipc_perm_t *perm)
306{
307	ksemid_t *sp = (ksemid_t *)perm;
308	struct sem *semp;
309	struct sem_undo *undo;
310	size_t size = SEM_UNDOSZ(sp->sem_nsems);
311	int i;
312
313	/*LINTED*/
314	while (undo = list_head(&sp->sem_undos)) {
315		list_remove(&sp->sem_undos, undo);
316		mutex_enter(&undo->un_proc->p_lock);
317		if (undo->un_proc->p_semacct == NULL) {
318			mutex_exit(&undo->un_proc->p_lock);
319			continue;
320		}
321		avl_remove(undo->un_proc->p_semacct, undo);
322		mutex_exit(&undo->un_proc->p_lock);
323		kmem_free(undo, size);
324		ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
325	}
326
327	for (i = 0; i < sp->sem_nsems; i++) {
328		semp = &sp->sem_base[i];
329		semp->semval = semp->sempid = 0;
330		if (semp->semncnt) {
331			cv_broadcast(&semp->semncnt_cv);
332			semp->semncnt = 0;
333		}
334		if (semp->semzcnt) {
335			cv_broadcast(&semp->semzcnt_cv);
336			semp->semzcnt = 0;
337		}
338	}
339}
340
341/*
342 * semctl - Semctl system call.
343 */
344static int
345semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
346{
347	ksemid_t		*sp;	/* ptr to semaphore header */
348	struct sem		*p;	/* ptr to semaphore */
349	unsigned int		i;	/* loop control */
350	ushort_t		*vals, *vp;
351	size_t			vsize = 0;
352	int			error = 0;
353	int			retval = 0;
354	struct cred		*cr;
355	kmutex_t		*lock;
356	model_t			mdl = get_udatamodel();
357	STRUCT_DECL(semid_ds, sid);
358	struct semid_ds64	ds64;
359
360	STRUCT_INIT(sid, mdl);
361	cr = CRED();
362
363	/*
364	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
365	 */
366	switch (cmd) {
367	case IPC_SET:
368		if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
369			return (set_errno(EFAULT));
370		break;
371
372	case IPC_SET64:
373		if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
374			return (set_errno(EFAULT));
375		break;
376
377	case SETALL:
378		if ((lock = ipc_lookup(sem_svc, semid,
379		    (kipc_perm_t **)&sp)) == NULL)
380			return (set_errno(EINVAL));
381		vsize = sp->sem_nsems * sizeof (*vals);
382		mutex_exit(lock);
383
384		/* allocate space to hold all semaphore values */
385		vals = kmem_alloc(vsize, KM_SLEEP);
386
387		if (copyin((void *)arg, vals, vsize)) {
388			kmem_free(vals, vsize);
389			return (set_errno(EFAULT));
390		}
391		break;
392
393	case IPC_RMID:
394		if (error = ipc_rmid(sem_svc, semid, cr))
395			return (set_errno(error));
396		return (0);
397	}
398
399	if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
400		if (vsize != 0)
401			kmem_free(vals, vsize);
402		return (set_errno(EINVAL));
403	}
404	switch (cmd) {
405	/* Set ownership and permissions. */
406	case IPC_SET:
407
408		if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
409		    &STRUCT_BUF(sid)->sem_perm, mdl)) {
410			mutex_exit(lock);
411			return (set_errno(error));
412		}
413		sp->sem_ctime = gethrestime_sec();
414		mutex_exit(lock);
415		return (0);
416
417	/* Get semaphore data structure. */
418	case IPC_STAT:
419
420		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
421			mutex_exit(lock);
422			return (set_errno(error));
423		}
424
425		ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
426		STRUCT_FSETP(sid, sem_base, NULL);	/* kernel addr */
427		STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
428		STRUCT_FSET(sid, sem_otime, sp->sem_otime);
429		STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
430		STRUCT_FSET(sid, sem_binary, sp->sem_binary);
431		mutex_exit(lock);
432
433		if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
434			return (set_errno(EFAULT));
435		return (0);
436
437	case IPC_SET64:
438
439		if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
440		    &ds64.semx_perm)) {
441			mutex_exit(lock);
442			return (set_errno(error));
443		}
444		sp->sem_ctime = gethrestime_sec();
445		mutex_exit(lock);
446		return (0);
447
448	case IPC_STAT64:
449
450		ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
451		ds64.semx_nsems = sp->sem_nsems;
452		ds64.semx_otime = sp->sem_otime;
453		ds64.semx_ctime = sp->sem_ctime;
454
455		mutex_exit(lock);
456		if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
457			return (set_errno(EFAULT));
458
459		return (0);
460
461	/* Get # of processes sleeping for greater semval. */
462	case GETNCNT:
463		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
464			mutex_exit(lock);
465			return (set_errno(error));
466		}
467		if (semnum >= sp->sem_nsems) {
468			mutex_exit(lock);
469			return (set_errno(EINVAL));
470		}
471		retval = sp->sem_base[semnum].semncnt;
472		mutex_exit(lock);
473		return (retval);
474
475	/* Get pid of last process to operate on semaphore. */
476	case GETPID:
477		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
478			mutex_exit(lock);
479			return (set_errno(error));
480		}
481		if (semnum >= sp->sem_nsems) {
482			mutex_exit(lock);
483			return (set_errno(EINVAL));
484		}
485		retval = sp->sem_base[semnum].sempid;
486		mutex_exit(lock);
487		return (retval);
488
489	/* Get semval of one semaphore. */
490	case GETVAL:
491		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
492			mutex_exit(lock);
493			return (set_errno(error));
494		}
495		if (semnum >= sp->sem_nsems) {
496			mutex_exit(lock);
497			return (set_errno(EINVAL));
498		}
499		retval = sp->sem_base[semnum].semval;
500		mutex_exit(lock);
501		return (retval);
502
503	/* Get all semvals in set. */
504	case GETALL:
505		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
506			mutex_exit(lock);
507			return (set_errno(error));
508		}
509
510		/* allocate space to hold all semaphore values */
511		vsize = sp->sem_nsems * sizeof (*vals);
512		vals = vp = kmem_alloc(vsize, KM_SLEEP);
513
514		for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
515			bcopy(&p->semval, vp, sizeof (p->semval));
516
517		mutex_exit(lock);
518
519		if (copyout((void *)vals, (void *)arg, vsize)) {
520			kmem_free(vals, vsize);
521			return (set_errno(EFAULT));
522		}
523
524		kmem_free(vals, vsize);
525		return (0);
526
527	/* Get # of processes sleeping for semval to become zero. */
528	case GETZCNT:
529		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
530			mutex_exit(lock);
531			return (set_errno(error));
532		}
533		if (semnum >= sp->sem_nsems) {
534			mutex_exit(lock);
535			return (set_errno(EINVAL));
536		}
537		retval = sp->sem_base[semnum].semzcnt;
538		mutex_exit(lock);
539		return (retval);
540
541	/* Set semval of one semaphore. */
542	case SETVAL:
543		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
544			mutex_exit(lock);
545			return (set_errno(error));
546		}
547		if (semnum >= sp->sem_nsems) {
548			mutex_exit(lock);
549			return (set_errno(EINVAL));
550		}
551		if ((uint_t)arg > USHRT_MAX) {
552			mutex_exit(lock);
553			return (set_errno(ERANGE));
554		}
555		p = &sp->sem_base[semnum];
556		if ((p->semval = (ushort_t)arg) != 0) {
557			if (p->semncnt) {
558				cv_broadcast(&p->semncnt_cv);
559			}
560		} else if (p->semzcnt) {
561			cv_broadcast(&p->semzcnt_cv);
562		}
563		p->sempid = curproc->p_pid;
564		sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
565		mutex_exit(lock);
566		return (0);
567
568	/* Set semvals of all semaphores in set. */
569	case SETALL:
570		/* Check if semaphore set has been deleted and reallocated. */
571		if (sp->sem_nsems * sizeof (*vals) != vsize) {
572			error = set_errno(EINVAL);
573			goto seterr;
574		}
575		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
576			error = set_errno(error);
577			goto seterr;
578		}
579		sem_undo_clear(sp, 0, sp->sem_nsems - 1);
580		for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
581		    (p++)->sempid = curproc->p_pid) {
582			if ((p->semval = vals[i++]) != 0) {
583				if (p->semncnt) {
584					cv_broadcast(&p->semncnt_cv);
585				}
586			} else if (p->semzcnt) {
587				cv_broadcast(&p->semzcnt_cv);
588			}
589		}
590seterr:
591		mutex_exit(lock);
592		kmem_free(vals, vsize);
593		return (error);
594
595	default:
596		mutex_exit(lock);
597		return (set_errno(EINVAL));
598	}
599
600	/* NOTREACHED */
601}
602
603/*
604 * semexit - Called by exit() to clean up on process exit.
605 */
606void
607semexit(proc_t *pp)
608{
609	avl_tree_t	*tree;
610	struct sem_undo	*undo;
611	void		*cookie = NULL;
612
613	mutex_enter(&pp->p_lock);
614	tree = pp->p_semacct;
615	pp->p_semacct = NULL;
616	mutex_exit(&pp->p_lock);
617
618	while (undo = avl_destroy_nodes(tree, &cookie)) {
619		ksemid_t *sp = undo->un_sp;
620		size_t size = SEM_UNDOSZ(sp->sem_nsems);
621		int i;
622
623		(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
624		if (!IPC_FREE(&sp->sem_perm)) {
625			for (i = 0; i < sp->sem_nsems; i++) {
626				int adj = undo->un_aoe[i];
627				if (adj) {
628					struct sem *semp = &sp->sem_base[i];
629					int v = (int)semp->semval + adj;
630
631					if (v < 0 || v > USHRT_MAX)
632						continue;
633					semp->semval = (ushort_t)v;
634					if (v == 0 && semp->semzcnt)
635						cv_broadcast(&semp->semzcnt_cv);
636					if (adj > 0 && semp->semncnt)
637						cv_broadcast(&semp->semncnt_cv);
638				}
639			}
640			list_remove(&sp->sem_undos, undo);
641		}
642		ipc_rele(sem_svc, (kipc_perm_t *)sp);
643		kmem_free(undo, size);
644	}
645
646	avl_destroy(tree);
647	kmem_free(tree, sizeof (avl_tree_t));
648}
649
/*
 * Remove all semaphores associated with a given zone.  Called by
 * zone_shutdown when the zone is halted.
 *
 * This is the callback passed to zone_key_create() in _init().  The
 * work is delegated to ipc_remove_zone(); arg is unused.
 */
/*ARGSUSED1*/
static void
sem_remove_zone(zoneid_t zoneid, void *arg)
{
	ipc_remove_zone(sem_svc, zoneid);
}
660
/*
 * semget - Semget system call.
 *
 * Looks up the semaphore set for key, or creates one with nsems
 * semaphores, subject to semflg.  Returns the set's id on success;
 * on failure returns the result of set_errno() for the error.
 *
 * For an existing set, nsems must be between 0 and the set's size,
 * otherwise EINVAL.  For a new set, nsems must be positive and must
 * pass the process.max-sem-nsems resource control, otherwise EINVAL.
 */
static int
semget(key_t key, int nsems, int semflg)
{
	ksemid_t	*sp;
	kmutex_t	*lock;
	int		id, error;
	proc_t		*pp = curproc;

top:
	if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
		return (set_errno(error));

	if (!IPC_FREE(&sp->sem_perm)) {
		/*
		 * A semaphore with the requested key exists.
		 */
		if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
	} else {
		/*
		 * This is a new semaphore set.  Finish initialization.
		 *
		 * NOTE(review): pp->p_lock is released below without being
		 * taken in this function, so ipc_get() presumably returns
		 * with it held for a new entry -- confirm against os/ipc.c.
		 */
		if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
		    nsems, RCA_SAFE) & RCT_DENY)) {
			mutex_exit(lock);
			mutex_exit(&pp->p_lock);
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		mutex_exit(lock);
		mutex_exit(&pp->p_lock);

		/*
		 * We round the allocation up to coherency granularity
		 * so that multiple semaphore allocations won't result
		 * in the false sharing of their sem structures.
		 */
		sp->sem_base =
		    kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
		    KM_SLEEP);
		/* A one-semaphore set starts out as binary (see semop). */
		sp->sem_binary = (nsems == 1);
		sp->sem_nsems = (ushort_t)nsems;
		sp->sem_ctime = gethrestime_sec();
		sp->sem_otime = 0;
		list_create(&sp->sem_undos, sizeof (struct sem_undo),
		    offsetof(struct sem_undo, un_list));

		/* EAGAIN from ipc_commit_begin restarts the whole lookup. */
		if (error = ipc_commit_begin(sem_svc, key, semflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (set_errno(error));
		}
		sp->sem_maxops =
		    rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
		/*
		 * The set-size limit is tested a second time here, after
		 * the locks were dropped for the allocation above.
		 */
		if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
		    RCA_SAFE) & RCT_DENY) {
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		lock = ipc_commit_end(sem_svc, &sp->sem_perm);
	}

	if (AU_AUDITING())
		audit_ipcget(AT_IPC_SEM, (void *)sp);

	/* Read the id while the set is still locked. */
	id = sp->sem_perm.ipc_id;
	mutex_exit(lock);
	return (id);
}
736
737/*
738 * semids system call.
739 */
740static int
741semids(int *buf, uint_t nids, uint_t *pnids)
742{
743	int error;
744
745	if (error = ipc_ids(sem_svc, buf, nids, pnids))
746		return (set_errno(error));
747
748	return (0);
749}
750
751
752/*
753 * Helper function for semop - copies in the provided timespec and
754 * computes the absolute future time after which we must return.
755 */
756static int
757compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
758	timespec_t *timeout)
759{
760	model_t datamodel = get_udatamodel();
761
762	if (datamodel == DATAMODEL_NATIVE) {
763		if (copyin(timeout, ts, sizeof (timespec_t)))
764			return (EFAULT);
765	} else {
766		timespec32_t ts32;
767
768		if (copyin(timeout, &ts32, sizeof (timespec32_t)))
769			return (EFAULT);
770		TIMESPEC32_TO_TIMESPEC(ts, &ts32)
771	}
772
773	if (itimerspecfix(ts))
774		return (EINVAL);
775
776	/*
777	 * Convert the timespec value into absolute time.
778	 */
779	timespecadd(ts, now);
780	*tsp = ts;
781
782	return (0);
783}
784
785/*
786 * Undo structure comparator.  We sort based on ksemid_t pointer.
787 */
788static int
789sem_undo_compar(const void *x, const void *y)
790{
791	struct sem_undo *undo1 = (struct sem_undo *)x;
792	struct sem_undo *undo2 = (struct sem_undo *)y;
793
794	if (undo1->un_sp < undo2->un_sp)
795		return (-1);
796	if (undo1->un_sp > undo2->un_sp)
797		return (1);
798	return (0);
799}
800
/*
 * Helper function for semop - creates an undo structure and adds it to
 * the process's avl tree and the semaphore's list.
 *
 *   pp       - process the undo structure belongs to
 *   sp       - semaphore set the undo structure is for
 *   lock     - in: the held lock for sp, which is dropped here;
 *              out: the lock returned by ipc_lock() when sp is relocked
 *   template - undo structure with un_sp set, used as the search key
 *   un       - out: the undo structure to use
 *
 * Returns 0 with *un set, or EIDRM if the set was found to be free
 * after relocking.  In both cases the relocked lock is still held on
 * return.
 *
 * The set's lock is dropped around the KM_SLEEP allocations, so another
 * thread may have installed a tree or an undo structure for this
 * process and set in the meantime; both are rechecked under p_lock and
 * the redundant allocation is freed.
 */
static int
sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
    struct sem_undo *template, struct sem_undo **un)
{
	size_t size;
	struct sem_undo *undo;
	avl_tree_t *tree = NULL;
	avl_index_t where;

	mutex_exit(*lock);

	size = SEM_UNDOSZ(sp->sem_nsems);
	undo = kmem_zalloc(size, KM_SLEEP);
	undo->un_proc = pp;
	undo->un_sp = sp;

	/* Unlocked peek; the result is rechecked under p_lock below. */
	if (pp->p_semacct == NULL)
		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	*lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
	if (IPC_FREE(&sp->sem_perm)) {
		kmem_free(undo, size);
		if (tree)
			kmem_free(tree, sizeof (avl_tree_t));
		return (EIDRM);
	}

	mutex_enter(&pp->p_lock);
	if (tree) {
		if (pp->p_semacct == NULL) {
			avl_create(tree, sem_undo_compar,
			    sizeof (struct sem_undo),
			    offsetof(struct sem_undo, un_avl));
			pp->p_semacct = tree;
		} else {
			/* A tree was installed while we were allocating. */
			kmem_free(tree, sizeof (avl_tree_t));
		}
	}

	if (*un = avl_find(pp->p_semacct, template, &where)) {
		/* An undo structure already exists; use it. */
		mutex_exit(&pp->p_lock);
		kmem_free(undo, size);
	} else {
		*un = undo;
		avl_insert(pp->p_semacct, undo, where);
		mutex_exit(&pp->p_lock);
		list_insert_head(&sp->sem_undos, undo);
		/* The new undo structure holds a reference on the set. */
		ipc_hold(sem_svc, (kipc_perm_t *)sp);
	}


	return (0);
}
858
859/*
860 * semop - Semop system call.
861 */
862static int
863semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
864{
865	ksemid_t	*sp = NULL;
866	kmutex_t	*lock;
867	struct sembuf	*op;	/* ptr to operation */
868	int		i;	/* loop control */
869	struct sem	*semp;	/* ptr to semaphore */
870	int 		error = 0;
871	struct sembuf	*uops;	/* ptr to copy of user ops */
872	struct sembuf 	x_sem;	/* avoid kmem_alloc's */
873	timespec_t	now, ts, *tsp = NULL;
874	int		timecheck = 0;
875	int		cvres, needundo, mode;
876	struct sem_undo	*undo;
877	proc_t		*pp = curproc;
878	int		held = 0;
879
880	CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */
881
882	/*
883	 * To avoid the cost of copying in 'timeout' in the common
884	 * case, we could only grab the time here and defer the copyin
885	 * and associated computations until we are about to block.
886	 *
887	 * The down side to this is that we would then have to spin
888	 * some goto top nonsense to avoid the copyin behind the semid
889	 * lock.  As a common use of timed semaphores is as an explicit
890	 * blocking mechanism, this could incur a greater penalty.
891	 *
892	 * If we eventually decide that this would be a wise route to
893	 * take, the deferrable functionality is completely contained
894	 * in 'compute_timeout', and the interface is defined such that
895	 * we can legally not validate 'timeout' if it is unused.
896	 */
897	if (timeout != NULL) {
898		timecheck = timechanged;
899		gethrestime(&now);
900		if (error = compute_timeout(&tsp, &ts, &now, timeout))
901			return (set_errno(error));
902	}
903
904	/*
905	 * Allocate space to hold the vector of semaphore ops.  If
906	 * there is only 1 operation we use a preallocated buffer on
907	 * the stack for speed.
908	 *
909	 * Since we don't want to allow the user to allocate an
910	 * arbitrary amount of kernel memory, we need to check against
911	 * the number of operations allowed by the semaphore.  We only
912	 * bother doing this if the number of operations is larger than
913	 * SEM_MAXUCOPS.
914	 */
915	if (nsops == 1)
916		uops = &x_sem;
917	else if (nsops == 0)
918		return (0);
919	else if (nsops <= SEM_MAXUCOPS)
920		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
921
922	if (nsops > SEM_MAXUCOPS) {
923		if ((lock = ipc_lookup(sem_svc, semid,
924		    (kipc_perm_t **)&sp)) == NULL)
925			return (set_errno(EFAULT));
926
927		if (nsops > sp->sem_maxops) {
928			mutex_exit(lock);
929			return (set_errno(E2BIG));
930		}
931		held = 1;
932		ipc_hold(sem_svc, (kipc_perm_t *)sp);
933		mutex_exit(lock);
934
935		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
936		if (copyin(sops, uops, nsops * sizeof (*op))) {
937			error = EFAULT;
938			(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
939			goto semoperr;
940		}
941
942		lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
943		if (IPC_FREE(&sp->sem_perm)) {
944			error = EIDRM;
945			goto semoperr;
946		}
947	} else {
948		/*
949		 * This could be interleaved with the above code, but
950		 * keeping them separate improves readability.
951		 */
952		if (copyin(sops, uops, nsops * sizeof (*op))) {
953			error = EFAULT;
954			goto semoperr_unlocked;
955		}
956
957		if ((lock = ipc_lookup(sem_svc, semid,
958		    (kipc_perm_t **)&sp)) == NULL) {
959			error = EINVAL;
960			goto semoperr_unlocked;
961		}
962
963		if (nsops > sp->sem_maxops) {
964			error = E2BIG;
965			goto semoperr;
966		}
967	}
968
969	/*
970	 * Scan all operations.  Verify that sem #s are in range and
971	 * this process is allowed the requested operations.  If any
972	 * operations are marked SEM_UNDO, find (or allocate) the undo
973	 * structure for this process and semaphore.
974	 */
975	needundo = 0;
976	mode = 0;
977	for (i = 0, op = uops; i++ < nsops; op++) {
978		mode |= op->sem_op ? SEM_A : SEM_R;
979		if (op->sem_num >= sp->sem_nsems) {
980			error = EFBIG;
981			goto semoperr;
982		}
983		if ((op->sem_flg & SEM_UNDO) && op->sem_op)
984			needundo = 1;
985	}
986	if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
987		goto semoperr;
988
989	if (needundo) {
990		struct sem_undo template;
991
992		template.un_sp = sp;
993		mutex_enter(&pp->p_lock);
994		if (pp->p_semacct)
995			undo = avl_find(pp->p_semacct, &template, NULL);
996		else
997			undo = NULL;
998		mutex_exit(&pp->p_lock);
999		if (undo == NULL) {
1000			if (!held) {
1001				held = 1;
1002				ipc_hold(sem_svc, (kipc_perm_t *)sp);
1003			}
1004			if (error = sem_undo_alloc(pp, sp, &lock, &template,
1005			    &undo))
1006				goto semoperr;
1007
1008			/* sem_undo_alloc unlocks the semaphore */
1009			if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
1010				goto semoperr;
1011		}
1012	}
1013
1014check:
1015	/*
1016	 * Loop waiting for the operations to be satisfied atomically.
1017	 * Actually, do the operations and undo them if a wait is needed
1018	 * or an error is detected.
1019	 */
1020	for (i = 0; i < nsops; i++) {
1021		op = &uops[i];
1022		semp = &sp->sem_base[op->sem_num];
1023
1024		/*
1025		 * Raise the semaphore (i.e. sema_v)
1026		 */
1027		if (op->sem_op > 0) {
1028			if (op->sem_op + (int)semp->semval > USHRT_MAX ||
1029			    ((op->sem_flg & SEM_UNDO) &&
1030			    (error = sem_undo_add(op->sem_op, op->sem_num,
1031			    undo)))) {
1032				if (i)
1033					sem_rollback(sp, uops, i, undo);
1034				if (error == 0)
1035					error = ERANGE;
1036				goto semoperr;
1037			}
1038			semp->semval += op->sem_op;
1039			/*
1040			 * If we are only incrementing the semaphore value
1041			 * by one on a binary semaphore, we can cv_signal.
1042			 */
1043			if (semp->semncnt) {
1044				if (op->sem_op == 1 && sp->sem_binary)
1045					cv_signal(&semp->semncnt_cv);
1046				else
1047					cv_broadcast(&semp->semncnt_cv);
1048			}
1049			if (semp->semzcnt && !semp->semval)
1050				cv_broadcast(&semp->semzcnt_cv);
1051			continue;
1052		}
1053
1054		/*
1055		 * Lower the semaphore (i.e. sema_p)
1056		 */
1057		if (op->sem_op < 0) {
1058			if (semp->semval >= (unsigned)(-op->sem_op)) {
1059				if ((op->sem_flg & SEM_UNDO) &&
1060				    (error = sem_undo_add(op->sem_op,
1061				    op->sem_num, undo))) {
1062					if (i)
1063						sem_rollback(sp, uops, i, undo);
1064					goto semoperr;
1065				}
1066				semp->semval += op->sem_op;
1067				if (semp->semzcnt && !semp->semval)
1068					cv_broadcast(&semp->semzcnt_cv);
1069				continue;
1070			}
1071			if (i)
1072				sem_rollback(sp, uops, i, undo);
1073			if (op->sem_flg & IPC_NOWAIT) {
1074				error = EAGAIN;
1075				goto semoperr;
1076			}
1077
1078			/*
1079			 * Mark the semaphore set as not a binary type
1080			 * if we are decrementing the value by more than 1.
1081			 *
1082			 * V operations will resort to cv_broadcast
1083			 * for this set because there are too many weird
1084			 * cases that have to be caught.
1085			 */
1086			if (op->sem_op < -1)
1087				sp->sem_binary = 0;
1088			if (!held) {
1089				held = 1;
1090				ipc_hold(sem_svc, (kipc_perm_t *)sp);
1091			}
1092			semp->semncnt++;
1093			cvres = cv_waituntil_sig(&semp->semncnt_cv, lock,
1094			    tsp, timecheck);
1095			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1096
1097			if (!IPC_FREE(&sp->sem_perm)) {
1098				ASSERT(semp->semncnt != 0);
1099				semp->semncnt--;
1100				if (cvres > 0)	/* normal wakeup */
1101					goto check;
1102			}
1103
1104			/* EINTR or EAGAIN overrides EIDRM */
1105			if (cvres == 0)
1106				error = EINTR;
1107			else if (cvres < 0)
1108				error = EAGAIN;
1109			else
1110				error = EIDRM;
1111			goto semoperr;
1112		}
1113
1114		/*
1115		 * Wait for zero value
1116		 */
1117		if (semp->semval) {
1118			if (i)
1119				sem_rollback(sp, uops, i, undo);
1120			if (op->sem_flg & IPC_NOWAIT) {
1121				error = EAGAIN;
1122				goto semoperr;
1123			}
1124
1125			if (!held) {
1126				held = 1;
1127				ipc_hold(sem_svc, (kipc_perm_t *)sp);
1128			}
1129			semp->semzcnt++;
1130			cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock,
1131			    tsp, timecheck);
1132			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1133
1134			/*
1135			 * Don't touch semp if the semaphores have been removed.
1136			 */
1137			if (!IPC_FREE(&sp->sem_perm)) {
1138				ASSERT(semp->semzcnt != 0);
1139				semp->semzcnt--;
1140				if (cvres > 0)	/* normal wakeup */
1141					goto check;
1142			}
1143
1144			/* EINTR or EAGAIN overrides EIDRM */
1145			if (cvres == 0)
1146				error = EINTR;
1147			else if (cvres < 0)
1148				error = EAGAIN;
1149			else
1150				error = EIDRM;
1151			goto semoperr;
1152		}
1153	}
1154
1155	/* All operations succeeded.  Update sempid for accessed semaphores. */
1156	for (i = 0, op = uops; i++ < nsops;
1157	    sp->sem_base[(op++)->sem_num].sempid = pp->p_pid)
1158		;
1159	sp->sem_otime = gethrestime_sec();
1160	if (held)
1161		ipc_rele(sem_svc, (kipc_perm_t *)sp);
1162	else
1163		mutex_exit(lock);
1164
1165	/* Before leaving, deallocate the buffer that held the user semops */
1166	if (nsops != 1)
1167		kmem_free(uops, sizeof (*uops) * nsops);
1168	return (0);
1169
1170	/*
1171	 * Error return labels
1172	 */
1173semoperr:
1174	if (held)
1175		ipc_rele(sem_svc, (kipc_perm_t *)sp);
1176	else
1177		mutex_exit(lock);
1178
1179semoperr_unlocked:
1180
1181	/* Before leaving, deallocate the buffer that held the user semops */
1182	if (nsops != 1)
1183		kmem_free(uops, sizeof (*uops) * nsops);
1184	return (set_errno(error));
1185}
1186
1187/*
1188 * semsys - System entry point for semctl, semget, and semop system calls.
1189 */
1190static int
1191semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
1192{
1193	int error;
1194
1195	switch (opcode) {
1196	case SEMCTL:
1197		error = semctl((int)a1, (uint_t)a2, (int)a3, a4);
1198		break;
1199	case SEMGET:
1200		error = semget((key_t)a1, (int)a2, (int)a3);
1201		break;
1202	case SEMOP:
1203		error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0);
1204		break;
1205	case SEMIDS:
1206		error = semids((int *)a1, (uint_t)a2, (uint_t *)a3);
1207		break;
1208	case SEMTIMEDOP:
1209		error = semop((int)a1, (struct sembuf *)a2, (size_t)a3,
1210		    (timespec_t *)a4);
1211		break;
1212	default:
1213		error = set_errno(EINVAL);
1214		break;
1215	}
1216	return (error);
1217}
1218