/*	$NetBSD: sysv_shm.c,v 1.108 2008/05/11 18:48:00 rmind Exp $	*/

/*-
 * Copyright (c) 1999, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, and by Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1994 Adam Glass and Charles M. Hannum.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Adam Glass and Charles M.
 *	Hannum.
 * 4. The names of the authors may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysv_shm.c,v 1.108 2008/05/11 18:48:00 rmind Exp $");

#define SYSVSHM

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/shm.h>
#include <sys/mutex.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/mount.h>		/* XXX for <sys/syscallargs.h> */
#include <sys/syscallargs.h>
#include <sys/queue.h>
#include <sys/pool.h>
#include <sys/kauth.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>

int shm_nused;
struct	shmid_ds *shmsegs;

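/*
 * Single attachment in a process's shm map: the address at which the
 * segment is mapped and the shmid identifying it.
 */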
struct shmmap_entry {
	SLIST_ENTRY(shmmap_entry) next;
	vaddr_t va;
	int shmid;
};

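/*
 * Subsystem lock, per-segment wait condition variables, the pool of
 * shm map entries, the shm_use_phys tunable and allocation accounting,
 * and the state used to coordinate resizing of the segment array
 * (see shmrealloc()).
 */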
static kmutex_t		shm_lock;
static kcondvar_t *	shm_cv;
static struct pool	shmmap_entry_pool;
static int		shm_last_free, shm_use_phys;
static size_t		shm_committed;

static kcondvar_t	shm_realloc_cv;
static bool		shm_realloc_state;
static u_int		shm_realloc_disable;

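/*
 * Per-vmspace list of attached segments.  It is shared across fork()
 * via the reference count and copied to a private instance on first
 * modification by shmmap_getprivate().
 */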
struct shmmap_state {
	unsigned int nitems;
	unsigned int nrefs;
	SLIST_HEAD(, shmmap_entry) entries;
};

#ifdef SHMDEBUG
#define SHMPRINTF(a) printf a
#else
#define SHMPRINTF(a)
#endif

static int shmrealloc(int);

/*
 * Find the shared memory segment by the identifier.
 *  => must be called with shm_lock held;
 */
static struct shmid_ds *
shm_find_segment_by_shmid(int shmid)
{
	int segnum;
	struct shmid_ds *shmseg;

	KASSERT(mutex_owned(&shm_lock));

	segnum = IPCID_TO_IX(shmid);
	if (segnum < 0 || segnum >= shminfo.shmmni)
		return NULL;
	shmseg = &shmsegs[segnum];
	if ((shmseg->shm_perm.mode & SHMSEG_ALLOCATED) == 0)
		return NULL;
	if ((shmseg->shm_perm.mode &
	    (SHMSEG_REMOVED|SHMSEG_RMLINGER)) == SHMSEG_REMOVED)
		return NULL;
	if (shmseg->shm_perm._seq != IPCID_TO_SEQ(shmid))
		return NULL;

	return shmseg;
}

/*
 * Free memory segment.
 *  => must be called with shm_lock held;
 */
static void
shm_free_segment(int segnum)
{
	struct shmid_ds *shmseg;
	size_t size;
	bool wanted;

	KASSERT(mutex_owned(&shm_lock));

	shmseg = &shmsegs[segnum];
	SHMPRINTF(("shm freeing key 0x%lx seq 0x%x\n",
	    shmseg->shm_perm._key, shmseg->shm_perm._seq));

	size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
	wanted = (shmseg->shm_perm.mode & SHMSEG_WANTED);

	shmseg->_shm_internal = NULL;
	shm_committed -= btoc(size);
	shm_nused--;
	shmseg->shm_perm.mode = SHMSEG_FREE;
	shm_last_free = segnum;
	if (wanted == true)
		cv_broadcast(&shm_cv[segnum]);
}

/*
 * Delete entry from the shm map.
 *  => must be called with shm_lock held;
 */
static struct uvm_object *
shm_delete_mapping(struct shmmap_state *shmmap_s,
    struct shmmap_entry *shmmap_se)
{
	struct uvm_object *uobj = NULL;
	struct shmid_ds *shmseg;
	int segnum;

	KASSERT(mutex_owned(&shm_lock));

	segnum = IPCID_TO_IX(shmmap_se->shmid);
	shmseg = &shmsegs[segnum];
	SLIST_REMOVE(&shmmap_s->entries, shmmap_se, shmmap_entry, next);
	shmmap_s->nitems--;
	shmseg->shm_dtime = time_second;
	if ((--shmseg->shm_nattch <= 0) &&
	    (shmseg->shm_perm.mode & SHMSEG_REMOVED)) {
		uobj = shmseg->_shm_internal;
		shm_free_segment(segnum);
	}

	return uobj;
}

/*
 * Get a non-shared shm map for that vmspace.  Note that memory
 * allocation may be performed with the lock held.
 */
static struct shmmap_state *
shmmap_getprivate(struct proc *p)
{
	struct shmmap_state *oshmmap_s, *shmmap_s;
	struct shmmap_entry *oshmmap_se, *shmmap_se;

	KASSERT(mutex_owned(&shm_lock));

	/* 1. A shm map with refcnt = 1 is used only by us - return it */
	oshmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
	if (oshmmap_s && oshmmap_s->nrefs == 1)
		return oshmmap_s;

	/* 2. No shm map present - create a fresh one */
	shmmap_s = kmem_zalloc(sizeof(struct shmmap_state), KM_SLEEP);
	shmmap_s->nrefs = 1;
	SLIST_INIT(&shmmap_s->entries);
	p->p_vmspace->vm_shm = (void *)shmmap_s;

	if (oshmmap_s == NULL)
		return shmmap_s;

	SHMPRINTF(("shmmap_getprivate: vm %p split (%d entries), was used by %d\n",
	    p->p_vmspace, oshmmap_s->nitems, oshmmap_s->nrefs));

	/* 3. A shared shm map, copy to a fresh one and adjust refcounts */
	SLIST_FOREACH(oshmmap_se, &oshmmap_s->entries, next) {
		shmmap_se = pool_get(&shmmap_entry_pool, PR_WAITOK);
		shmmap_se->va = oshmmap_se->va;
		shmmap_se->shmid = oshmmap_se->shmid;
		SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next);
	}
	shmmap_s->nitems = oshmmap_s->nitems;
	oshmmap_s->nrefs--;

	return shmmap_s;
}

/*
 * Lock/unlock the memory.
 *  => must be called with shm_lock held;
 *  => called from one place, thus, inline;
 */
static inline int
shm_memlock(struct lwp *l, struct shmid_ds *shmseg, int shmid, int cmd)
{
	struct proc *p = l->l_proc;
	struct shmmap_entry *shmmap_se;
	struct shmmap_state *shmmap_s;
	size_t size;
	int error;

	KASSERT(mutex_owned(&shm_lock));
	shmmap_s = shmmap_getprivate(p);

	/* Find our shared memory address by shmid */
	SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next) {
		if (shmmap_se->shmid != shmid)
			continue;

		size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;

		if (cmd == SHM_LOCK &&
		    (shmseg->shm_perm.mode & SHMSEG_WIRED) == 0) {
			/* Wire the object and map, then tag it */
			error = uobj_wirepages(shmseg->_shm_internal, 0,
			    round_page(shmseg->shm_segsz));
			if (error)
				return EIO;
			error = uvm_map_pageable(&p->p_vmspace->vm_map,
			    shmmap_se->va, shmmap_se->va + size, false, 0);
			if (error) {
				uobj_unwirepages(shmseg->_shm_internal, 0,
				    round_page(shmseg->shm_segsz));
				if (error == EFAULT)
					error = ENOMEM;
				return error;
			}
			shmseg->shm_perm.mode |= SHMSEG_WIRED;

		} else if (cmd == SHM_UNLOCK &&
		    (shmseg->shm_perm.mode & SHMSEG_WIRED) != 0) {
			/* Unwire the object and map, then untag it */
			uobj_unwirepages(shmseg->_shm_internal, 0,
			    round_page(shmseg->shm_segsz));
			error = uvm_map_pageable(&p->p_vmspace->vm_map,
			    shmmap_se->va, shmmap_se->va + size, true, 0);
			if (error)
				return EIO;
			shmseg->shm_perm.mode &= ~SHMSEG_WIRED;
		}
	}

	return 0;
}

/*
 * Unmap shared memory.
 */
int
sys_shmdt(struct lwp *l, const struct sys_shmdt_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) shmaddr;
	} */
	struct proc *p = l->l_proc;
	struct shmmap_state *shmmap_s1, *shmmap_s;
	struct shmmap_entry *shmmap_se;
	struct uvm_object *uobj;
	struct shmid_ds *shmseg;
	size_t size;

	mutex_enter(&shm_lock);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(shm_realloc_state))
		cv_wait(&shm_realloc_cv, &shm_lock);

	shmmap_s1 = (struct shmmap_state *)p->p_vmspace->vm_shm;
	if (shmmap_s1 == NULL) {
		mutex_exit(&shm_lock);
		return EINVAL;
	}

	/* Find the map entry */
	SLIST_FOREACH(shmmap_se, &shmmap_s1->entries, next)
		if (shmmap_se->va == (vaddr_t)SCARG(uap, shmaddr))
			break;
	if (shmmap_se == NULL) {
		mutex_exit(&shm_lock);
		return EINVAL;
	}

	shmmap_s = shmmap_getprivate(p);
	if (shmmap_s != shmmap_s1) {
		/* Map has been copied, lookup entry in new map */
		SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next)
			if (shmmap_se->va == (vaddr_t)SCARG(uap, shmaddr))
				break;
		if (shmmap_se == NULL) {
			mutex_exit(&shm_lock);
			return EINVAL;
		}
	}

	SHMPRINTF(("shmdt: vm %p: remove %d @%lx\n",
	    p->p_vmspace, shmmap_se->shmid, shmmap_se->va));

	/* Delete the entry from shm map */
	uobj = shm_delete_mapping(shmmap_s, shmmap_se);
	shmseg = &shmsegs[IPCID_TO_IX(shmmap_se->shmid)];
	size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
	mutex_exit(&shm_lock);

	uvm_deallocate(&p->p_vmspace->vm_map, shmmap_se->va, size);
	if (uobj != NULL)
		uao_detach(uobj);
	pool_put(&shmmap_entry_pool, shmmap_se);

	return 0;
}

/*
 * Map shared memory.
 */
int
sys_shmat(struct lwp *l, const struct sys_shmat_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) shmid;
		syscallarg(const void *) shmaddr;
		syscallarg(int) shmflg;
	} */
	int error, flags = 0;
	struct proc *p = l->l_proc;
	kauth_cred_t cred = l->l_cred;
	struct shmid_ds *shmseg;
	struct shmmap_state *shmmap_s;
	struct shmmap_entry *shmmap_se;
	struct uvm_object *uobj;
	struct vmspace *vm;
	vaddr_t attach_va;
	vm_prot_t prot;
	vsize_t size;

	/* Allocate a new map entry and set it */
	shmmap_se = pool_get(&shmmap_entry_pool, PR_WAITOK);

	mutex_enter(&shm_lock);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(shm_realloc_state))
		cv_wait(&shm_realloc_cv, &shm_lock);

	shmseg = shm_find_segment_by_shmid(SCARG(uap, shmid));
	if (shmseg == NULL) {
		error = EINVAL;
		goto err;
	}
	error = ipcperm(cred, &shmseg->shm_perm,
	    (SCARG(uap, shmflg) & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
	if (error)
		goto err;

	vm = p->p_vmspace;
	shmmap_s = (struct shmmap_state *)vm->vm_shm;
	if (shmmap_s && shmmap_s->nitems >= shminfo.shmseg) {
		error = EMFILE;
		goto err;
	}

	size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
	prot = VM_PROT_READ;
	if ((SCARG(uap, shmflg) & SHM_RDONLY) == 0)
		prot |= VM_PROT_WRITE;
	if (SCARG(uap, shmaddr)) {
		flags |= UVM_FLAG_FIXED;
		if (SCARG(uap, shmflg) & SHM_RND)
			attach_va =
			    (vaddr_t)SCARG(uap, shmaddr) & ~(SHMLBA-1);
		else if (((vaddr_t)SCARG(uap, shmaddr) & (SHMLBA-1)) == 0)
			attach_va = (vaddr_t)SCARG(uap, shmaddr);
		else {
			error = EINVAL;
			goto err;
		}
	} else {
		/* This is just a hint to uvm_map() about where to put it. */
		attach_va = p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)vm->vm_daddr, size);
	}

	/*
	 * Create a map entry, add it to the list and increase the counters.
	 * Since the lock will be dropped before mapping, disable reallocation.
	 */
	shmmap_s = shmmap_getprivate(p);
	SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next);
	shmmap_s->nitems++;
	shmseg->shm_lpid = p->p_pid;
	shmseg->shm_nattch++;
	shm_realloc_disable++;
	mutex_exit(&shm_lock);

	/*
	 * Add a reference to the memory object, map it to the
	 * address space, and lock the memory, if needed.
	 */
	uobj = shmseg->_shm_internal;
	uao_reference(uobj);
	error = uvm_map(&vm->vm_map, &attach_va, size, uobj, 0, 0,
	    UVM_MAPFLAG(prot, prot, UVM_INH_SHARE, UVM_ADV_RANDOM, flags));
	if (error)
		goto err_detach;
	if (shm_use_phys || (shmseg->shm_perm.mode & SHMSEG_WIRED)) {
		error = uvm_map_pageable(&vm->vm_map, attach_va,
		    attach_va + size, false, 0);
		if (error) {
			if (error == EFAULT)
				error = ENOMEM;
			uvm_deallocate(&vm->vm_map, attach_va, size);
			goto err_detach;
		}
	}

	/* Set the new address, and update the time */
	mutex_enter(&shm_lock);
	shmmap_se->va = attach_va;
	shmmap_se->shmid = SCARG(uap, shmid);
	shmseg->shm_atime = time_second;
	shm_realloc_disable--;
	retval[0] = attach_va;
	SHMPRINTF(("shmat: vm %p: add %d @%lx\n",
	    p->p_vmspace, shmmap_se->shmid, attach_va));
err:
	cv_broadcast(&shm_realloc_cv);
	mutex_exit(&shm_lock);
	if (error && shmmap_se)
		pool_put(&shmmap_entry_pool, shmmap_se);
	return error;

err_detach:
	uao_detach(uobj);
	mutex_enter(&shm_lock);
	uobj = shm_delete_mapping(shmmap_s, shmmap_se);
	shm_realloc_disable--;
	cv_broadcast(&shm_realloc_cv);
	mutex_exit(&shm_lock);
	if (uobj != NULL)
		uao_detach(uobj);
	pool_put(&shmmap_entry_pool, shmmap_se);
	return error;
}

/*
 * Shared memory control operations.
 */
int
sys___shmctl13(struct lwp *l, const struct sys___shmctl13_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) shmid;
		syscallarg(int) cmd;
		syscallarg(struct shmid_ds *) buf;
	} */
	struct shmid_ds shmbuf;
	int cmd, error;

	cmd = SCARG(uap, cmd);
	if (cmd == IPC_SET) {
		error = copyin(SCARG(uap, buf), &shmbuf, sizeof(shmbuf));
		if (error)
			return error;
	}

	error = shmctl1(l, SCARG(uap, shmid), cmd,
	    (cmd == IPC_SET || cmd == IPC_STAT) ? &shmbuf : NULL);

	if (error == 0 && cmd == IPC_STAT)
		error = copyout(&shmbuf, SCARG(uap, buf), sizeof(shmbuf));

	return error;
}

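/*
 * Kernel-internal form of shmctl(2): performs the requested operation
 * on the segment under shm_lock; shmbuf is used for IPC_STAT/IPC_SET.
 */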
int
shmctl1(struct lwp *l, int shmid, int cmd, struct shmid_ds *shmbuf)
{
	struct uvm_object *uobj = NULL;
	kauth_cred_t cred = l->l_cred;
	struct shmid_ds *shmseg;
	int error = 0;

	mutex_enter(&shm_lock);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(shm_realloc_state))
		cv_wait(&shm_realloc_cv, &shm_lock);

	shmseg = shm_find_segment_by_shmid(shmid);
	if (shmseg == NULL) {
		mutex_exit(&shm_lock);
		return EINVAL;
	}

	switch (cmd) {
	case IPC_STAT:
		if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_R)) != 0)
			break;
		memcpy(shmbuf, shmseg, sizeof(struct shmid_ds));
		break;
	case IPC_SET:
		if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_M)) != 0)
			break;
		shmseg->shm_perm.uid = shmbuf->shm_perm.uid;
		shmseg->shm_perm.gid = shmbuf->shm_perm.gid;
		shmseg->shm_perm.mode =
		    (shmseg->shm_perm.mode & ~ACCESSPERMS) |
		    (shmbuf->shm_perm.mode & ACCESSPERMS);
		shmseg->shm_ctime = time_second;
		break;
	case IPC_RMID:
		if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_M)) != 0)
			break;
		shmseg->shm_perm._key = IPC_PRIVATE;
		shmseg->shm_perm.mode |= SHMSEG_REMOVED;
		if (shmseg->shm_nattch <= 0) {
			uobj = shmseg->_shm_internal;
			shm_free_segment(IPCID_TO_IX(shmid));
		}
		break;
	case SHM_LOCK:
	case SHM_UNLOCK:
		if ((error = kauth_authorize_generic(cred,
		    KAUTH_GENERIC_ISSUSER, NULL)) != 0)
			break;
		error = shm_memlock(l, shmseg, shmid, cmd);
		break;
	default:
		error = EINVAL;
	}

	mutex_exit(&shm_lock);
	if (uobj != NULL)
		uao_detach(uobj);
	return error;
}

/*
 * Try to take an already existing segment.
 *  => must be called with shm_lock held;
 *  => called from one place, thus, inline;
 */
static inline int
shmget_existing(struct lwp *l, const struct sys_shmget_args *uap, int mode,
    register_t *retval)
{
	struct shmid_ds *shmseg;
	kauth_cred_t cred = l->l_cred;
	int segnum, error;
again:
	KASSERT(mutex_owned(&shm_lock));

	/* Find segment by key */
	for (segnum = 0; segnum < shminfo.shmmni; segnum++)
		if ((shmsegs[segnum].shm_perm.mode & SHMSEG_ALLOCATED) &&
		    shmsegs[segnum].shm_perm._key == SCARG(uap, key))
			break;
	if (segnum == shminfo.shmmni) {
		/* Not found */
		return -1;
	}

	shmseg = &shmsegs[segnum];
	if (shmseg->shm_perm.mode & SHMSEG_REMOVED) {
		/*
		 * This segment is in the process of being allocated.  Wait
		 * until it's done, and look the key up again (in case the
		 * allocation failed or it was freed).
		 */
		shmseg->shm_perm.mode |= SHMSEG_WANTED;
		error = cv_wait_sig(&shm_cv[segnum], &shm_lock);
		if (error)
			return error;
		goto again;
	}

	/* Check the permissions, segment size and the appropriate flags */
	error = ipcperm(cred, &shmseg->shm_perm, mode);
	if (error)
		return error;
	if (SCARG(uap, size) && SCARG(uap, size) > shmseg->shm_segsz)
		return EINVAL;
	if ((SCARG(uap, shmflg) & (IPC_CREAT | IPC_EXCL)) ==
	    (IPC_CREAT | IPC_EXCL))
		return EEXIST;

	*retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
	return 0;
}

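/*
 * Get an existing shared memory segment by key, or create a new one.
 */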
int
sys_shmget(struct lwp *l, const struct sys_shmget_args *uap, register_t *retval)
{
	/* {
		syscallarg(key_t) key;
		syscallarg(size_t) size;
		syscallarg(int) shmflg;
	} */
	struct shmid_ds *shmseg;
	kauth_cred_t cred = l->l_cred;
	key_t key = SCARG(uap, key);
	size_t size;
	int error, mode, segnum;
	bool lockmem;

	mode = SCARG(uap, shmflg) & ACCESSPERMS;
	if (SCARG(uap, shmflg) & _SHM_RMLINGER)
		mode |= SHMSEG_RMLINGER;

	SHMPRINTF(("shmget: key 0x%lx size 0x%x shmflg 0x%x mode 0x%x\n",
	    SCARG(uap, key), SCARG(uap, size), SCARG(uap, shmflg), mode));

	mutex_enter(&shm_lock);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(shm_realloc_state))
		cv_wait(&shm_realloc_cv, &shm_lock);

	if (key != IPC_PRIVATE) {
		error = shmget_existing(l, uap, mode, retval);
		if (error != -1) {
			mutex_exit(&shm_lock);
			return error;
		}
		if ((SCARG(uap, shmflg) & IPC_CREAT) == 0) {
			mutex_exit(&shm_lock);
			return ENOENT;
		}
	}
	error = 0;

	/*
	 * Check for the limits.
	 */
	size = SCARG(uap, size);
	if (size < shminfo.shmmin || size > shminfo.shmmax) {
		mutex_exit(&shm_lock);
		return EINVAL;
	}
	if (shm_nused >= shminfo.shmmni) {
		mutex_exit(&shm_lock);
		return ENOSPC;
	}
	size = (size + PGOFSET) & ~PGOFSET;
	if (shm_committed + btoc(size) > shminfo.shmall) {
		mutex_exit(&shm_lock);
		return ENOMEM;
	}

	/* Find the first available segment */
	if (shm_last_free < 0) {
		for (segnum = 0; segnum < shminfo.shmmni; segnum++)
			if (shmsegs[segnum].shm_perm.mode & SHMSEG_FREE)
				break;
		KASSERT(segnum < shminfo.shmmni);
	} else {
		segnum = shm_last_free;
		shm_last_free = -1;
	}

	/*
	 * Initialize the segment.
	 * We will drop the lock while allocating the memory, so mark the
	 * segment as allocated but removed, so that no other thread can
	 * take it.  Also, disable reallocation while the lock is dropped.
	 */
	shmseg = &shmsegs[segnum];
	shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
	shm_committed += btoc(size);
	shm_nused++;
	lockmem = shm_use_phys;
	shm_realloc_disable++;
	mutex_exit(&shm_lock);

	/* Allocate the memory object and lock it if needed */
	shmseg->_shm_internal = uao_create(size, 0);
	if (lockmem) {
		/* Wire the pages and tag it */
		error = uobj_wirepages(shmseg->_shm_internal, 0,
		    round_page(shmseg->shm_segsz));
		if (error) {
			uao_detach(shmseg->_shm_internal);
			mutex_enter(&shm_lock);
			shm_free_segment(segnum);
			shm_realloc_disable--;
			mutex_exit(&shm_lock);
			return error;
		}
	}

	/*
	 * Note: while the segment is marked this way, there is no need to
	 * hold the lock while initializing it (except for shm_perm.mode).
	 */
	shmseg->shm_perm._key = SCARG(uap, key);
	shmseg->shm_perm._seq = (shmseg->shm_perm._seq + 1) & 0x7fff;
	*retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);

	shmseg->shm_perm.cuid = shmseg->shm_perm.uid = kauth_cred_geteuid(cred);
	shmseg->shm_perm.cgid = shmseg->shm_perm.gid = kauth_cred_getegid(cred);
	shmseg->shm_segsz = SCARG(uap, size);
	shmseg->shm_cpid = l->l_proc->p_pid;
	shmseg->shm_lpid = shmseg->shm_nattch = 0;
	shmseg->shm_atime = shmseg->shm_dtime = 0;
	shmseg->shm_ctime = time_second;

	/*
	 * The segment is initialized.
	 * Take the lock, mark it as allocated, and notify waiters (if any).
	 * Also, re-enable reallocation.
	 */
	mutex_enter(&shm_lock);
	shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) |
	    (mode & (ACCESSPERMS | SHMSEG_RMLINGER)) |
	    SHMSEG_ALLOCATED | (lockmem ? SHMSEG_WIRED : 0);
	if (shmseg->shm_perm.mode & SHMSEG_WANTED) {
		shmseg->shm_perm.mode &= ~SHMSEG_WANTED;
		cv_broadcast(&shm_cv[segnum]);
	}
	shm_realloc_disable--;
	cv_broadcast(&shm_realloc_cv);
	mutex_exit(&shm_lock);

	return error;
}

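/*
 * Illustrative userland usage of the interface implemented above (a
 * sketch, not part of the kernel): create a private segment, attach
 * it, and detach and mark it for removal when done.
 *
 *	int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
 *	void *p = shmat(id, NULL, 0);
 *	... use p ...
 *	shmdt(p);
 *	shmctl(id, IPC_RMID, NULL);
 */

/*
 * Fork handler: the child shares the parent's shm map, so bump the
 * map's reference count and the attach count of each mapped segment.
 */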
void
shmfork(struct vmspace *vm1, struct vmspace *vm2)
{
	struct shmmap_state *shmmap_s;
	struct shmmap_entry *shmmap_se;

	SHMPRINTF(("shmfork %p->%p\n", vm1, vm2));
	mutex_enter(&shm_lock);
	vm2->vm_shm = vm1->vm_shm;
	if (vm1->vm_shm) {
		shmmap_s = (struct shmmap_state *)vm1->vm_shm;
		SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next)
			shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch++;
		shmmap_s->nrefs++;
	}
	mutex_exit(&shm_lock);
}

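/*
 * Release the vmspace's shm map when the address space is torn down:
 * drop a reference, or if it was the last one, detach and unmap every
 * attached segment.
 */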
void
shmexit(struct vmspace *vm)
{
	struct shmmap_state *shmmap_s;
	struct shmmap_entry *shmmap_se;
	struct uvm_object **uobj;
	size_t *size;
	u_int i, n;

	SLIST_HEAD(, shmmap_entry) tmp_entries;

	mutex_enter(&shm_lock);
	shmmap_s = (struct shmmap_state *)vm->vm_shm;
	if (shmmap_s == NULL) {
		mutex_exit(&shm_lock);
		return;
	}

	vm->vm_shm = NULL;

	if (--shmmap_s->nrefs > 0) {
		SHMPRINTF(("shmexit: vm %p drop ref (%d entries), refs = %d\n",
		    vm, shmmap_s->nitems, shmmap_s->nrefs));
		SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next)
			shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch--;
		mutex_exit(&shm_lock);
		return;
	}

	KASSERT(shmmap_s->nrefs == 0);
	n = shmmap_s->nitems;
	SHMPRINTF(("shmexit: vm %p cleanup (%d entries)\n", vm, n));
	mutex_exit(&shm_lock);
	if (n == 0) {
		kmem_free(shmmap_s, sizeof(struct shmmap_state));
		return;
	}

	/* Allocate the arrays */
	SLIST_INIT(&tmp_entries);
	uobj = kmem_zalloc(n * sizeof(void *), KM_SLEEP);
	size = kmem_zalloc(n * sizeof(size_t), KM_SLEEP);

	/* Delete the entries from the shm map */
	i = 0;
	mutex_enter(&shm_lock);
	while (!SLIST_EMPTY(&shmmap_s->entries)) {
		struct shmid_ds *shmseg;

		shmmap_se = SLIST_FIRST(&shmmap_s->entries);
		shmseg = &shmsegs[IPCID_TO_IX(shmmap_se->shmid)];
		size[i] = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
		uobj[i] = shm_delete_mapping(shmmap_s, shmmap_se);
		SLIST_INSERT_HEAD(&tmp_entries, shmmap_se, next);
		i++;
	}
	mutex_exit(&shm_lock);

	/* Unmap all segments, free the entries */
	i = 0;
	while (!SLIST_EMPTY(&tmp_entries)) {
		KASSERT(i < n);
		shmmap_se = SLIST_FIRST(&tmp_entries);
		SLIST_REMOVE(&tmp_entries, shmmap_se, shmmap_entry, next);
		uvm_deallocate(&vm->vm_map, shmmap_se->va, size[i]);
		if (uobj[i] != NULL)
			uao_detach(uobj[i]);
		pool_put(&shmmap_entry_pool, shmmap_se);
		i++;
	}

	kmem_free(uobj, n * sizeof(void *));
	kmem_free(size, n * sizeof(size_t));
	kmem_free(shmmap_s, sizeof(struct shmmap_state));
}

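/*
 * Resize the segment array to newshmni entries (kern.ipc.shmmni).
 * Returns EBUSY if the segments currently in use would not fit.
 */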
static int
shmrealloc(int newshmni)
{
	vaddr_t v;
	struct shmid_ds *oldshmsegs, *newshmsegs;
	kcondvar_t *newshm_cv;
	size_t sz;
	int i, lsegid;

	if (newshmni < 1)
		return EINVAL;

	/* Allocate new memory area */
	sz = ALIGN(newshmni * sizeof(struct shmid_ds)) +
	    ALIGN(newshmni * sizeof(kcondvar_t));
	v = uvm_km_alloc(kernel_map, round_page(sz), 0,
	    UVM_KMF_WIRED|UVM_KMF_ZERO);
	if (v == 0)
		return ENOMEM;

	mutex_enter(&shm_lock);
	while (shm_realloc_state || shm_realloc_disable)
		cv_wait(&shm_realloc_cv, &shm_lock);

	/*
	 * Find the index of the last used segment.  Fail if we are trying
	 * to reallocate to fewer segments than are currently in use.
	 */
	lsegid = 0;
	for (i = 0; i < shminfo.shmmni; i++)
		if ((shmsegs[i].shm_perm.mode & SHMSEG_FREE) == 0)
			lsegid = i;
	if (lsegid >= newshmni) {
		mutex_exit(&shm_lock);
		uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED);
		return EBUSY;
	}
	shm_realloc_state = true;

	newshmsegs = (void *)v;
	newshm_cv = (void *)(ALIGN(newshmsegs) +
	    newshmni * sizeof(struct shmid_ds));

	/* Copy all memory to the new area */
	for (i = 0; i < shm_nused; i++)
		(void)memcpy(&newshmsegs[i], &shmsegs[i],
		    sizeof(newshmsegs[0]));

	/* Mark all new segments as free, if there are any */
	for (; i < newshmni; i++) {
		cv_init(&newshm_cv[i], "shmwait");
		newshmsegs[i].shm_perm.mode = SHMSEG_FREE;
		newshmsegs[i].shm_perm._seq = 0;
	}

	oldshmsegs = shmsegs;
	sz = ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)) +
	    ALIGN(shminfo.shmmni * sizeof(kcondvar_t));

	shminfo.shmmni = newshmni;
	shmsegs = newshmsegs;
	shm_cv = newshm_cv;

	/* Reallocation completed - notify all waiters, if any */
	shm_realloc_state = false;
	cv_broadcast(&shm_realloc_cv);
	mutex_exit(&shm_lock);

	uvm_km_free(kernel_map, (vaddr_t)oldshmsegs, sz, UVM_KMF_WIRED);
	return 0;
}

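/*
 * Initialize the SysV shared memory subsystem: locks, the map entry
 * pool, and the wired array of segment descriptors.
 */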
void
shminit(void)
{
	vaddr_t v;
	size_t sz;
	int i;

	mutex_init(&shm_lock, MUTEX_DEFAULT, IPL_NONE);
	pool_init(&shmmap_entry_pool, sizeof(struct shmmap_entry), 0, 0, 0,
	    "shmmp", &pool_allocator_nointr, IPL_NONE);
	cv_init(&shm_realloc_cv, "shmrealc");

	/* Allocate the wired memory for our structures */
	sz = ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)) +
	    ALIGN(shminfo.shmmni * sizeof(kcondvar_t));
	v = uvm_km_alloc(kernel_map, round_page(sz), 0,
	    UVM_KMF_WIRED|UVM_KMF_ZERO);
	if (v == 0)
		panic("sysv_shm: cannot allocate memory");
	shmsegs = (void *)v;
	shm_cv = (void *)(ALIGN(shmsegs) +
	    shminfo.shmmni * sizeof(struct shmid_ds));

	shminfo.shmmax *= PAGE_SIZE;

	for (i = 0; i < shminfo.shmmni; i++) {
		cv_init(&shm_cv[i], "shmwait");
		shmsegs[i].shm_perm.mode = SHMSEG_FREE;
		shmsegs[i].shm_perm._seq = 0;
	}
	shm_last_free = 0;
	shm_nused = 0;
	shm_committed = 0;
	shm_realloc_disable = 0;
	shm_realloc_state = false;
}

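/*
 * sysctl helper for kern.ipc.shmmni: resize the segment array via
 * shmrealloc().
 */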
static int
sysctl_ipc_shmmni(SYSCTLFN_ARGS)
{
	int newsize, error;
	struct sysctlnode node;
	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = shminfo.shmmni;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	sysctl_unlock();
	error = shmrealloc(newsize);
	sysctl_relock();
	return error;
}

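/*
 * sysctl helper for kern.ipc.shmmaxpgs: update shmall and recompute
 * shmmax accordingly.
 */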
static int
sysctl_ipc_shmmaxpgs(SYSCTLFN_ARGS)
{
	int newsize, error;
	struct sysctlnode node;
	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = shminfo.shmall;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (newsize < 1)
		return EINVAL;

	shminfo.shmall = newsize;
	shminfo.shmmax = shminfo.shmall * PAGE_SIZE;

	return 0;
}

SYSCTL_SETUP(sysctl_ipc_shm_setup, "sysctl kern.ipc subtree setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "ipc",
		SYSCTL_DESCR("SysV IPC options"),
		NULL, 0, NULL, 0,
		CTL_KERN, KERN_SYSVIPC, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READONLY,
		CTLTYPE_INT, "shmmax",
		SYSCTL_DESCR("Max shared memory segment size in bytes"),
		NULL, 0, &shminfo.shmmax, 0,
		CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMAX, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "shmmni",
		SYSCTL_DESCR("Max number of shared memory identifiers"),
		sysctl_ipc_shmmni, 0, &shminfo.shmmni, 0,
		CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMNI, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "shmseg",
		SYSCTL_DESCR("Max shared memory segments per process"),
		NULL, 0, &shminfo.shmseg, 0,
		CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMSEG, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "shmmaxpgs",
		SYSCTL_DESCR("Max amount of shared memory in pages"),
		sysctl_ipc_shmmaxpgs, 0, &shminfo.shmall, 0,
		CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMAXPGS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "shm_use_phys",
		SYSCTL_DESCR("Enable/disable locking of shared memory in "
		    "physical memory"), NULL, 0, &shm_use_phys, 0,
		CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMUSEPHYS, CTL_EOL);
}
1067