1/*-
2 * SPDX-License-Identifier: BSD-4-Clause AND BSD-2-Clause
3 *
4 * Copyright (c) 1994 Adam Glass and Charles Hannum.  All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. All advertising materials mentioning features or use of this software
15 *    must display the following acknowledgement:
16 *	This product includes software developed by Adam Glass and Charles
17 *	Hannum.
18 * 4. The names of the authors may not be used to endorse or promote products
19 *    derived from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 *
32 * $NetBSD: sysv_shm.c,v 1.39 1997/10/07 10:02:03 drochner Exp $
33 */
34/*-
35 * Copyright (c) 2003-2005 McAfee, Inc.
36 * Copyright (c) 2016-2017 Robert N. M. Watson
37 * All rights reserved.
38 *
39 * This software was developed for the FreeBSD Project in part by McAfee
40 * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
41 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
42 * program.
43 *
44 * Portions of this software were developed by BAE Systems, the University of
45 * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
46 * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
47 * Computing (TC) research program.
48 *
49 * Redistribution and use in source and binary forms, with or without
50 * modification, are permitted provided that the following conditions
51 * are met:
52 * 1. Redistributions of source code must retain the above copyright
53 *    notice, this list of conditions and the following disclaimer.
54 * 2. Redistributions in binary form must reproduce the above copyright
55 *    notice, this list of conditions and the following disclaimer in the
56 *    documentation and/or other materials provided with the distribution.
57 *
58 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71#include <sys/cdefs.h>
72#include "opt_sysvipc.h"
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/abi_compat.h>
77#include <sys/kernel.h>
78#include <sys/limits.h>
79#include <sys/lock.h>
80#include <sys/sysctl.h>
81#include <sys/shm.h>
82#include <sys/proc.h>
83#include <sys/malloc.h>
84#include <sys/mman.h>
85#include <sys/module.h>
86#include <sys/mutex.h>
87#include <sys/racct.h>
88#include <sys/resourcevar.h>
89#include <sys/rwlock.h>
90#include <sys/stat.h>
91#include <sys/syscall.h>
92#include <sys/syscallsubr.h>
93#include <sys/sysent.h>
94#include <sys/sysproto.h>
95#include <sys/jail.h>
96
97#include <security/audit/audit.h>
98#include <security/mac/mac_framework.h>
99
100#include <vm/vm.h>
101#include <vm/vm_param.h>
102#include <vm/pmap.h>
103#include <vm/vm_object.h>
104#include <vm/vm_map.h>
105#include <vm/vm_page.h>
106#include <vm/vm_pager.h>
107
FEATURE(sysv_shm, "System V shared memory segments support");

static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");

/*
 * Global segment bookkeeping used throughout this file:
 *
 *	shm_last_free	index of a shmsegs[] slot that was just released, or
 *			-1 once that hint has been consumed by an allocation
 *	shm_nused	number of segments currently allocated
 *	shmalloced	number of entries in the shmsegs[] array
 *	shm_committed	total size of all allocated segments, in the units
 *			produced by btoc()
 *	shmsegs		the segment table itself
 *	shm_prison_slot	OSD slot obtained from osd_jail_register()
 */
static int shm_last_free, shm_nused, shmalloced;
vm_size_t shm_committed;
static struct shmid_kernel *shmsegs;
static unsigned shm_prison_slot;
116
/*
 * One entry of a vmspace's attachment table (vm_shm).  Each table holds
 * shminfo.shmseg entries.
 */
struct shmmap_state {
	vm_offset_t va;		/* address the segment is mapped at */
	int shmid;		/* attached segment id, or -1 if slot unused */
};
121
122static void shm_deallocate_segment(struct shmid_kernel *);
123static int shm_find_segment_by_key(struct prison *, key_t);
124static struct shmid_kernel *shm_find_segment(struct prison *, int, bool);
125static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *);
126static int shmget_allocate_segment(struct thread *td, key_t key, size_t size,
127    int mode);
128static int shmget_existing(struct thread *td, size_t size, int shmflg,
129    int mode, int segnum);
130static void shmrealloc(void);
131static int shminit(void);
132static int sysvshm_modload(struct module *, int, void *);
133static int shmunload(void);
134#ifndef SYSVSHM
135static void shmexit_myhook(struct vmspace *vm);
136static void shmfork_myhook(struct proc *p1, struct proc *p2);
137#endif
138static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS);
139static void shm_remove(struct shmid_kernel *, int);
140static struct prison *shm_find_prison(struct ucred *);
141static int shm_prison_cansee(struct prison *, struct shmid_kernel *);
142static int shm_prison_check(void *, void *);
143static int shm_prison_set(void *, void *);
144static int shm_prison_get(void *, void *);
145static int shm_prison_remove(void *, void *);
146static void shm_prison_cleanup(struct prison *);
147
/*
 * Tuneable values.
 *
 * Compile-time defaults for the fields of shminfo.  Each one may be
 * overridden by defining the macro before this point (e.g. from the
 * kernel configuration).  SHMMAX and SHMALL are both derived from
 * SHMMAXPGS unless overridden individually.
 */
#ifndef SHMMAXPGS
#define	SHMMAXPGS	131072ul /* Note: sysv shared memory is swap backed. */
#endif
#ifndef SHMMAX
#define	SHMMAX	(SHMMAXPGS*PAGE_SIZE)
#endif
#ifndef SHMMIN
#define	SHMMIN	1
#endif
#ifndef SHMMNI
#define	SHMMNI	192
#endif
#ifndef SHMSEG
#define	SHMSEG	128
#endif
#ifndef SHMALL
#define	SHMALL	(SHMMAXPGS)
#endif
169
/*
 * Live limits, initialised from the compile-time defaults above and
 * exported through the kern.ipc.shm* sysctls declared below.
 */
struct	shminfo shminfo = {
	.shmmax = SHMMAX,	/* largest segment size accepted by shmget */
	.shmmin = SHMMIN,	/* smallest segment size accepted by shmget */
	.shmmni = SHMMNI,	/* number of segment identifiers */
	.shmseg = SHMSEG,	/* attachment table entries per vmspace */
	.shmall = SHMALL	/* cap compared against shm_committed */
};
177
/*
 * shm_use_phys selects OBJT_PHYS instead of OBJT_SWAP backing objects for
 * new segments; shm_allow_removed lets shm_find_segment() return segments
 * already marked SHMSEG_REMOVED.
 */
static int shm_use_phys;
static int shm_allow_removed = 1;

/*
 * kern.ipc.* sysctl nodes.  shmmni and shmseg are declared CTLFLAG_RDTUN,
 * the remaining limits CTLFLAG_RWTUN (see sysctl(9) for the flag
 * semantics).  kern.ipc.shmsegs is produced by sysctl_shmsegs().
 */
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RWTUN, &shminfo.shmmax, 0,
    "Maximum shared memory segment size");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RWTUN, &shminfo.shmmin, 0,
    "Minimum shared memory segment size");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
    "Number of shared memory identifiers");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
    "Number of segments per process");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RWTUN, &shminfo.shmall, 0,
    "Maximum number of pages available for shared memory");
SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RWTUN,
    &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core");
SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RWTUN,
    &shm_allow_removed, 0,
    "Enable/Disable attachment to attached segments marked for removal");
SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_shmsegs, "",
    "Array of struct shmid_kernel for each potential shared memory segment");
199
/*
 * Single sx lock serialising access to the segment table and counters.
 * It is always taken exclusively; the *_locked() helpers assert that.
 */
static struct sx sysvshmsx;
#define	SYSVSHM_LOCK()		sx_xlock(&sysvshmsx)
#define	SYSVSHM_UNLOCK()	sx_xunlock(&sysvshmsx)
#define	SYSVSHM_ASSERT_LOCKED()	sx_assert(&sysvshmsx, SA_XLOCKED)
204
205static int
206shm_find_segment_by_key(struct prison *pr, key_t key)
207{
208	int i;
209
210	for (i = 0; i < shmalloced; i++)
211		if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) &&
212		    shmsegs[i].cred != NULL &&
213		    shmsegs[i].cred->cr_prison == pr &&
214		    shmsegs[i].u.shm_perm.key == key)
215			return (i);
216	return (-1);
217}
218
219/*
220 * Finds segment either by shmid if is_shmid is true, or by segnum if
221 * is_shmid is false.
222 */
223static struct shmid_kernel *
224shm_find_segment(struct prison *rpr, int arg, bool is_shmid)
225{
226	struct shmid_kernel *shmseg;
227	int segnum;
228
229	segnum = is_shmid ? IPCID_TO_IX(arg) : arg;
230	if (segnum < 0 || segnum >= shmalloced)
231		return (NULL);
232	shmseg = &shmsegs[segnum];
233	if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
234	    (!shm_allow_removed &&
235	    (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) ||
236	    (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg)) ||
237	    shm_prison_cansee(rpr, shmseg) != 0)
238		return (NULL);
239	return (shmseg);
240}
241
242static void
243shm_deallocate_segment(struct shmid_kernel *shmseg)
244{
245	vm_size_t size;
246
247	SYSVSHM_ASSERT_LOCKED();
248
249	vm_object_deallocate(shmseg->object);
250	shmseg->object = NULL;
251	size = round_page(shmseg->u.shm_segsz);
252	shm_committed -= btoc(size);
253	shm_nused--;
254	shmseg->u.shm_perm.mode = SHMSEG_FREE;
255#ifdef MAC
256	mac_sysvshm_cleanup(shmseg);
257#endif
258	racct_sub_cred(shmseg->cred, RACCT_NSHM, 1);
259	racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size);
260	crfree(shmseg->cred);
261	shmseg->cred = NULL;
262}
263
264static int
265shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s)
266{
267	struct shmid_kernel *shmseg;
268	int segnum, result;
269	vm_size_t size;
270
271	SYSVSHM_ASSERT_LOCKED();
272	segnum = IPCID_TO_IX(shmmap_s->shmid);
273	KASSERT(segnum >= 0 && segnum < shmalloced,
274	    ("segnum %d shmalloced %d", segnum, shmalloced));
275
276	shmseg = &shmsegs[segnum];
277	size = round_page(shmseg->u.shm_segsz);
278	result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size);
279	if (result != KERN_SUCCESS)
280		return (EINVAL);
281	shmmap_s->shmid = -1;
282	shmseg->u.shm_dtime = time_second;
283	if (--shmseg->u.shm_nattch == 0 &&
284	    (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) {
285		shm_deallocate_segment(shmseg);
286		shm_last_free = segnum;
287	}
288	return (0);
289}
290
291static void
292shm_remove(struct shmid_kernel *shmseg, int segnum)
293{
294
295	shmseg->u.shm_perm.key = IPC_PRIVATE;
296	shmseg->u.shm_perm.mode |= SHMSEG_REMOVED;
297	if (shmseg->u.shm_nattch == 0) {
298		shm_deallocate_segment(shmseg);
299		shm_last_free = segnum;
300	}
301}
302
303static struct prison *
304shm_find_prison(struct ucred *cred)
305{
306	struct prison *pr, *rpr;
307
308	pr = cred->cr_prison;
309	prison_lock(pr);
310	rpr = osd_jail_get(pr, shm_prison_slot);
311	prison_unlock(pr);
312	return rpr;
313}
314
315static int
316shm_prison_cansee(struct prison *rpr, struct shmid_kernel *shmseg)
317{
318
319	if (shmseg->cred == NULL ||
320	    !(rpr == shmseg->cred->cr_prison ||
321	      prison_ischild(rpr, shmseg->cred->cr_prison)))
322		return (EINVAL);
323	return (0);
324}
325
326static int
327kern_shmdt_locked(struct thread *td, const void *shmaddr)
328{
329	struct proc *p = td->td_proc;
330	struct shmmap_state *shmmap_s;
331#ifdef MAC
332	int error;
333#endif
334	int i;
335
336	SYSVSHM_ASSERT_LOCKED();
337	if (shm_find_prison(td->td_ucred) == NULL)
338		return (ENOSYS);
339	shmmap_s = p->p_vmspace->vm_shm;
340 	if (shmmap_s == NULL)
341		return (EINVAL);
342	AUDIT_ARG_SVIPC_ID(shmmap_s->shmid);
343	for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
344		if (shmmap_s->shmid != -1 &&
345		    shmmap_s->va == (vm_offset_t)shmaddr) {
346			break;
347		}
348	}
349	if (i == shminfo.shmseg)
350		return (EINVAL);
351#ifdef MAC
352	error = mac_sysvshm_check_shmdt(td->td_ucred,
353	    &shmsegs[IPCID_TO_IX(shmmap_s->shmid)]);
354	if (error != 0)
355		return (error);
356#endif
357	return (shm_delete_mapping(p->p_vmspace, shmmap_s));
358}
359
#ifndef _SYS_SYSPROTO_H_
struct shmdt_args {
	const void *shmaddr;
};
#endif
/*
 * shmdt(2) system call entry: take the sysvshm lock and detach the
 * segment mapped at uap->shmaddr.
 */
int
sys_shmdt(struct thread *td, struct shmdt_args *uap)
{
	int rv;

	SYSVSHM_LOCK();
	rv = kern_shmdt_locked(td, uap->shmaddr);
	SYSVSHM_UNLOCK();

	return (rv);
}
375
376static int
377kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr,
378    int shmflg)
379{
380	struct prison *rpr;
381	struct proc *p = td->td_proc;
382	struct shmid_kernel *shmseg;
383	struct shmmap_state *shmmap_s;
384	vm_offset_t attach_va;
385	vm_prot_t prot;
386	vm_size_t size;
387	int cow, error, find_space, i, rv;
388
389	AUDIT_ARG_SVIPC_ID(shmid);
390	AUDIT_ARG_VALUE(shmflg);
391
392	SYSVSHM_ASSERT_LOCKED();
393	rpr = shm_find_prison(td->td_ucred);
394	if (rpr == NULL)
395		return (ENOSYS);
396	shmmap_s = p->p_vmspace->vm_shm;
397	if (shmmap_s == NULL) {
398		shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state),
399		    M_SHM, M_WAITOK);
400		for (i = 0; i < shminfo.shmseg; i++)
401			shmmap_s[i].shmid = -1;
402		KASSERT(p->p_vmspace->vm_shm == NULL, ("raced"));
403		p->p_vmspace->vm_shm = shmmap_s;
404	}
405	shmseg = shm_find_segment(rpr, shmid, true);
406	if (shmseg == NULL)
407		return (EINVAL);
408	error = ipcperm(td, &shmseg->u.shm_perm,
409	    (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
410	if (error != 0)
411		return (error);
412#ifdef MAC
413	error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg);
414	if (error != 0)
415		return (error);
416#endif
417	for (i = 0; i < shminfo.shmseg; i++) {
418		if (shmmap_s->shmid == -1)
419			break;
420		shmmap_s++;
421	}
422	if (i >= shminfo.shmseg)
423		return (EMFILE);
424	size = round_page(shmseg->u.shm_segsz);
425	prot = VM_PROT_READ;
426	cow = MAP_INHERIT_SHARE | MAP_PREFAULT_PARTIAL;
427	if ((shmflg & SHM_RDONLY) == 0)
428		prot |= VM_PROT_WRITE;
429	if (shmaddr != NULL) {
430		if ((shmflg & SHM_RND) != 0)
431			attach_va = rounddown2((vm_offset_t)shmaddr, SHMLBA);
432		else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0)
433			attach_va = (vm_offset_t)shmaddr;
434		else
435			return (EINVAL);
436		if ((shmflg & SHM_REMAP) != 0)
437			cow |= MAP_REMAP;
438		find_space = VMFS_NO_SPACE;
439	} else {
440		/*
441		 * This is just a hint to vm_map_find() about where to
442		 * put it.
443		 */
444		attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
445		    lim_max(td, RLIMIT_DATA));
446		find_space = VMFS_OPTIMAL_SPACE;
447	}
448
449	vm_object_reference(shmseg->object);
450	rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object, 0, &attach_va,
451	    size, 0, find_space, prot, prot, cow);
452	if (rv != KERN_SUCCESS) {
453		vm_object_deallocate(shmseg->object);
454		return (ENOMEM);
455	}
456
457	shmmap_s->va = attach_va;
458	shmmap_s->shmid = shmid;
459	shmseg->u.shm_lpid = p->p_pid;
460	shmseg->u.shm_atime = time_second;
461	shmseg->u.shm_nattch++;
462	td->td_retval[0] = attach_va;
463	return (error);
464}
465
/*
 * Locking wrapper around kern_shmat_locked(): attach segment shmid at
 * shmaddr according to shmflg and return its error code.
 */
int
kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg)
{
	int rv;

	SYSVSHM_LOCK();
	rv = kern_shmat_locked(td, shmid, shmaddr, shmflg);
	SYSVSHM_UNLOCK();

	return (rv);
}
476
#ifndef _SYS_SYSPROTO_H_
struct shmat_args {
	int shmid;
	const void *shmaddr;
	int shmflg;
};
#endif
/*
 * shmat(2) system call entry: unpack the arguments and hand them to
 * kern_shmat().
 */
int
sys_shmat(struct thread *td, struct shmat_args *uap)
{
	int error;

	error = kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg);
	return (error);
}
490
491static int
492kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf,
493    size_t *bufsz)
494{
495	struct prison *rpr;
496	struct shmid_kernel *shmseg;
497	struct shmid_ds *shmidp;
498	struct shm_info shm_info;
499	int error;
500
501	SYSVSHM_ASSERT_LOCKED();
502
503	rpr = shm_find_prison(td->td_ucred);
504	if (rpr == NULL)
505		return (ENOSYS);
506
507	AUDIT_ARG_SVIPC_ID(shmid);
508	AUDIT_ARG_SVIPC_CMD(cmd);
509
510	switch (cmd) {
511	/*
512	 * It is possible that kern_shmctl is being called from the Linux ABI
513	 * layer, in which case, we will need to implement IPC_INFO.  It should
514	 * be noted that other shmctl calls will be funneled through here for
515	 * Linix binaries as well.
516	 *
517	 * NB: The Linux ABI layer will convert this data to structure(s) more
518	 * consistent with the Linux ABI.
519	 */
520	case IPC_INFO:
521		memcpy(buf, &shminfo, sizeof(shminfo));
522		if (bufsz)
523			*bufsz = sizeof(shminfo);
524		td->td_retval[0] = shmalloced;
525		return (0);
526	case SHM_INFO: {
527		shm_info.used_ids = shm_nused;
528		shm_info.shm_rss = 0;	/*XXX where to get from ? */
529		shm_info.shm_tot = 0;	/*XXX where to get from ? */
530		shm_info.shm_swp = 0;	/*XXX where to get from ? */
531		shm_info.swap_attempts = 0;	/*XXX where to get from ? */
532		shm_info.swap_successes = 0;	/*XXX where to get from ? */
533		memcpy(buf, &shm_info, sizeof(shm_info));
534		if (bufsz != NULL)
535			*bufsz = sizeof(shm_info);
536		td->td_retval[0] = shmalloced;
537		return (0);
538	}
539	}
540	shmseg = shm_find_segment(rpr, shmid, cmd != SHM_STAT);
541	if (shmseg == NULL)
542		return (EINVAL);
543#ifdef MAC
544	error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd);
545	if (error != 0)
546		return (error);
547#endif
548	switch (cmd) {
549	case SHM_STAT:
550	case IPC_STAT:
551		shmidp = (struct shmid_ds *)buf;
552		error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
553		if (error != 0)
554			return (error);
555		memcpy(shmidp, &shmseg->u, sizeof(struct shmid_ds));
556		if (td->td_ucred->cr_prison != shmseg->cred->cr_prison)
557			shmidp->shm_perm.key = IPC_PRIVATE;
558		if (bufsz != NULL)
559			*bufsz = sizeof(struct shmid_ds);
560		if (cmd == SHM_STAT) {
561			td->td_retval[0] = IXSEQ_TO_IPCID(shmid,
562			    shmseg->u.shm_perm);
563		}
564		break;
565	case IPC_SET:
566		shmidp = (struct shmid_ds *)buf;
567		AUDIT_ARG_SVIPC_PERM(&shmidp->shm_perm);
568		error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
569		if (error != 0)
570			return (error);
571		shmseg->u.shm_perm.uid = shmidp->shm_perm.uid;
572		shmseg->u.shm_perm.gid = shmidp->shm_perm.gid;
573		shmseg->u.shm_perm.mode =
574		    (shmseg->u.shm_perm.mode & ~ACCESSPERMS) |
575		    (shmidp->shm_perm.mode & ACCESSPERMS);
576		shmseg->u.shm_ctime = time_second;
577		break;
578	case IPC_RMID:
579		error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
580		if (error != 0)
581			return (error);
582		shm_remove(shmseg, IPCID_TO_IX(shmid));
583		break;
584#if 0
585	case SHM_LOCK:
586	case SHM_UNLOCK:
587#endif
588	default:
589		error = EINVAL;
590		break;
591	}
592	return (error);
593}
594
595int
596kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz)
597{
598	int error;
599
600	SYSVSHM_LOCK();
601	error = kern_shmctl_locked(td, shmid, cmd, buf, bufsz);
602	SYSVSHM_UNLOCK();
603	return (error);
604}
605
606#ifndef _SYS_SYSPROTO_H_
607struct shmctl_args {
608	int shmid;
609	int cmd;
610	struct shmid_ds *buf;
611};
612#endif
613int
614sys_shmctl(struct thread *td, struct shmctl_args *uap)
615{
616	int error;
617	struct shmid_ds buf;
618	size_t bufsz;
619
620	/*
621	 * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support
622	 * Linux binaries.  If we see the call come through the FreeBSD ABI,
623	 * return an error back to the user since we do not to support this.
624	 */
625	if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
626	    uap->cmd == SHM_STAT)
627		return (EINVAL);
628
629	/* IPC_SET needs to copyin the buffer before calling kern_shmctl */
630	if (uap->cmd == IPC_SET) {
631		if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds))))
632			goto done;
633	}
634
635	error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
636	if (error)
637		goto done;
638
639	/* Cases in which we need to copyout */
640	switch (uap->cmd) {
641	case IPC_STAT:
642		error = copyout(&buf, uap->buf, bufsz);
643		break;
644	}
645
646done:
647	if (error) {
648		/* Invalidate the return value */
649		td->td_retval[0] = -1;
650	}
651	return (error);
652}
653
654static int
655shmget_existing(struct thread *td, size_t size, int shmflg, int mode,
656    int segnum)
657{
658	struct shmid_kernel *shmseg;
659#ifdef MAC
660	int error;
661#endif
662
663	SYSVSHM_ASSERT_LOCKED();
664	KASSERT(segnum >= 0 && segnum < shmalloced,
665	    ("segnum %d shmalloced %d", segnum, shmalloced));
666	shmseg = &shmsegs[segnum];
667	if ((shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
668		return (EEXIST);
669#ifdef MAC
670	error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, shmflg);
671	if (error != 0)
672		return (error);
673#endif
674	if (size != 0 && size > shmseg->u.shm_segsz)
675		return (EINVAL);
676	td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
677	return (0);
678}
679
680static int
681shmget_allocate_segment(struct thread *td, key_t key, size_t size, int mode)
682{
683	struct ucred *cred = td->td_ucred;
684	struct shmid_kernel *shmseg;
685	vm_object_t shm_object;
686	int i, segnum;
687
688	SYSVSHM_ASSERT_LOCKED();
689
690	if (size < shminfo.shmmin || size > shminfo.shmmax)
691		return (EINVAL);
692	if (shm_nused >= shminfo.shmmni) /* Any shmids left? */
693		return (ENOSPC);
694	size = round_page(size);
695	if (shm_committed + btoc(size) > shminfo.shmall)
696		return (ENOMEM);
697	if (shm_last_free < 0) {
698		shmrealloc();	/* Maybe expand the shmsegs[] array. */
699		for (i = 0; i < shmalloced; i++)
700			if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE)
701				break;
702		if (i == shmalloced)
703			return (ENOSPC);
704		segnum = i;
705	} else  {
706		segnum = shm_last_free;
707		shm_last_free = -1;
708	}
709	KASSERT(segnum >= 0 && segnum < shmalloced,
710	    ("segnum %d shmalloced %d", segnum, shmalloced));
711	shmseg = &shmsegs[segnum];
712#ifdef RACCT
713	if (racct_enable) {
714		PROC_LOCK(td->td_proc);
715		if (racct_add(td->td_proc, RACCT_NSHM, 1)) {
716			PROC_UNLOCK(td->td_proc);
717			return (ENOSPC);
718		}
719		if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) {
720			racct_sub(td->td_proc, RACCT_NSHM, 1);
721			PROC_UNLOCK(td->td_proc);
722			return (ENOMEM);
723		}
724		PROC_UNLOCK(td->td_proc);
725	}
726#endif
727
728	/*
729	 * We make sure that we have allocated a pager before we need
730	 * to.
731	 */
732	shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP,
733	    0, size, VM_PROT_DEFAULT, 0, cred);
734	if (shm_object == NULL) {
735#ifdef RACCT
736		if (racct_enable) {
737			PROC_LOCK(td->td_proc);
738			racct_sub(td->td_proc, RACCT_NSHM, 1);
739			racct_sub(td->td_proc, RACCT_SHMSIZE, size);
740			PROC_UNLOCK(td->td_proc);
741		}
742#endif
743		return (ENOMEM);
744	}
745
746	shmseg->object = shm_object;
747	shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid;
748	shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid;
749	shmseg->u.shm_perm.mode = (mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
750	shmseg->u.shm_perm.key = key;
751	shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff;
752	shmseg->cred = crhold(cred);
753	shmseg->u.shm_segsz = size;
754	shmseg->u.shm_cpid = td->td_proc->p_pid;
755	shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0;
756	shmseg->u.shm_atime = shmseg->u.shm_dtime = 0;
757#ifdef MAC
758	mac_sysvshm_create(cred, shmseg);
759#endif
760	shmseg->u.shm_ctime = time_second;
761	shm_committed += btoc(size);
762	shm_nused++;
763	td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
764
765	return (0);
766}
767
768#ifndef _SYS_SYSPROTO_H_
769struct shmget_args {
770	key_t key;
771	size_t size;
772	int shmflg;
773};
774#endif
775int
776sys_shmget(struct thread *td, struct shmget_args *uap)
777{
778	int segnum, mode;
779	int error;
780
781	if (shm_find_prison(td->td_ucred) == NULL)
782		return (ENOSYS);
783	mode = uap->shmflg & ACCESSPERMS;
784	SYSVSHM_LOCK();
785	if (uap->key == IPC_PRIVATE) {
786		error = shmget_allocate_segment(td, uap->key, uap->size, mode);
787	} else {
788		segnum = shm_find_segment_by_key(td->td_ucred->cr_prison,
789		    uap->key);
790		if (segnum >= 0)
791			error = shmget_existing(td, uap->size, uap->shmflg,
792			    mode, segnum);
793		else if ((uap->shmflg & IPC_CREAT) == 0)
794			error = ENOENT;
795		else
796			error = shmget_allocate_segment(td, uap->key,
797			    uap->size, mode);
798	}
799	SYSVSHM_UNLOCK();
800	return (error);
801}
802
803#ifdef SYSVSHM
804void
805shmfork(struct proc *p1, struct proc *p2)
806#else
807static void
808shmfork_myhook(struct proc *p1, struct proc *p2)
809#endif
810{
811	struct shmmap_state *shmmap_s;
812	size_t size;
813	int i;
814
815	SYSVSHM_LOCK();
816	size = shminfo.shmseg * sizeof(struct shmmap_state);
817	shmmap_s = malloc(size, M_SHM, M_WAITOK);
818	bcopy(p1->p_vmspace->vm_shm, shmmap_s, size);
819	p2->p_vmspace->vm_shm = shmmap_s;
820	for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
821		if (shmmap_s->shmid != -1) {
822			KASSERT(IPCID_TO_IX(shmmap_s->shmid) >= 0 &&
823			    IPCID_TO_IX(shmmap_s->shmid) < shmalloced,
824			    ("segnum %d shmalloced %d",
825			    IPCID_TO_IX(shmmap_s->shmid), shmalloced));
826			shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++;
827		}
828	}
829	SYSVSHM_UNLOCK();
830}
831
832#ifdef SYSVSHM
833void
834shmexit(struct vmspace *vm)
835#else
836static void
837shmexit_myhook(struct vmspace *vm)
838#endif
839{
840	struct shmmap_state *base, *shm;
841	int i;
842
843	base = vm->vm_shm;
844	if (base != NULL) {
845		vm->vm_shm = NULL;
846		SYSVSHM_LOCK();
847		for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) {
848			if (shm->shmid != -1)
849				shm_delete_mapping(vm, shm);
850		}
851		SYSVSHM_UNLOCK();
852		free(base, M_SHM);
853	}
854}
855
856static void
857shmrealloc(void)
858{
859	struct shmid_kernel *newsegs;
860	int i;
861
862	SYSVSHM_ASSERT_LOCKED();
863
864	if (shmalloced >= shminfo.shmmni)
865		return;
866
867	newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM,
868	    M_WAITOK | M_ZERO);
869	for (i = 0; i < shmalloced; i++)
870		bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0]));
871	for (; i < shminfo.shmmni; i++) {
872		newsegs[i].u.shm_perm.mode = SHMSEG_FREE;
873		newsegs[i].u.shm_perm.seq = 0;
874#ifdef MAC
875		mac_sysvshm_init(&newsegs[i]);
876#endif
877	}
878	free(shmsegs, M_SHM);
879	shmsegs = newsegs;
880	shmalloced = shminfo.shmmni;
881}
882
/*
 * Native-ABI system calls provided by this module.  The table is passed to
 * syscall_helper_register() in shminit() and to
 * syscall_helper_unregister() in shmunload().  The compat entries are
 * only present when the corresponding kernel options are configured.
 */
static struct syscall_helper_data shm_syscalls[] = {
	SYSCALL_INIT_HELPER(shmat),
	SYSCALL_INIT_HELPER(shmctl),
	SYSCALL_INIT_HELPER(shmdt),
	SYSCALL_INIT_HELPER(shmget),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
	SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl),
#endif
#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
	SYSCALL_INIT_HELPER(shmsys),
#endif
	SYSCALL_INIT_LAST
};
897
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ipc.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>

/*
 * 32-bit compatibility system calls, registered with
 * syscall32_helper_register() in shminit() and removed again in
 * shmunload().  Only built when COMPAT_FREEBSD32 is configured.
 */
static struct syscall_helper_data shm32_syscalls[] = {
	SYSCALL32_INIT_HELPER_COMPAT(shmat),
	SYSCALL32_INIT_HELPER_COMPAT(shmdt),
	SYSCALL32_INIT_HELPER_COMPAT(shmget),
	SYSCALL32_INIT_HELPER(freebsd32_shmsys),
	SYSCALL32_INIT_HELPER(freebsd32_shmctl),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
	SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl),
#endif
	SYSCALL_INIT_LAST
};
#endif
919
920static int
921shminit(void)
922{
923	struct prison *pr;
924	void **rsv;
925	int i, error;
926	osd_method_t methods[PR_MAXMETHOD] = {
927	    [PR_METHOD_CHECK] =		shm_prison_check,
928	    [PR_METHOD_SET] =		shm_prison_set,
929	    [PR_METHOD_GET] =		shm_prison_get,
930	    [PR_METHOD_REMOVE] =	shm_prison_remove,
931	};
932
933#ifndef BURN_BRIDGES
934	if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0)
935		printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n");
936#endif
937	if (shminfo.shmmax == SHMMAX) {
938		/* Initialize shmmax dealing with possible overflow. */
939		for (i = PAGE_SIZE; i != 0; i--) {
940			shminfo.shmmax = shminfo.shmall * i;
941			if ((shminfo.shmmax / shminfo.shmall) == (u_long)i)
942				break;
943		}
944	}
945	shmalloced = shminfo.shmmni;
946	shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM,
947	    M_WAITOK|M_ZERO);
948	for (i = 0; i < shmalloced; i++) {
949		shmsegs[i].u.shm_perm.mode = SHMSEG_FREE;
950		shmsegs[i].u.shm_perm.seq = 0;
951#ifdef MAC
952		mac_sysvshm_init(&shmsegs[i]);
953#endif
954	}
955	shm_last_free = 0;
956	shm_nused = 0;
957	shm_committed = 0;
958	sx_init(&sysvshmsx, "sysvshmsx");
959#ifndef SYSVSHM
960	shmexit_hook = &shmexit_myhook;
961	shmfork_hook = &shmfork_myhook;
962#endif
963
964	/* Set current prisons according to their allow.sysvipc. */
965	shm_prison_slot = osd_jail_register(NULL, methods);
966	rsv = osd_reserve(shm_prison_slot);
967	prison_lock(&prison0);
968	(void)osd_jail_set_reserved(&prison0, shm_prison_slot, rsv, &prison0);
969	prison_unlock(&prison0);
970	rsv = NULL;
971	sx_slock(&allprison_lock);
972	TAILQ_FOREACH(pr, &allprison, pr_list) {
973		if (rsv == NULL)
974			rsv = osd_reserve(shm_prison_slot);
975		prison_lock(pr);
976		if (pr->pr_allow & PR_ALLOW_SYSVIPC) {
977			(void)osd_jail_set_reserved(pr, shm_prison_slot, rsv,
978			    &prison0);
979			rsv = NULL;
980		}
981		prison_unlock(pr);
982	}
983	if (rsv != NULL)
984		osd_free_reserved(rsv);
985	sx_sunlock(&allprison_lock);
986
987	error = syscall_helper_register(shm_syscalls, SY_THR_STATIC_KLD);
988	if (error != 0)
989		return (error);
990#ifdef COMPAT_FREEBSD32
991	error = syscall32_helper_register(shm32_syscalls, SY_THR_STATIC_KLD);
992	if (error != 0)
993		return (error);
994#endif
995	return (0);
996}
997
998static int
999shmunload(void)
1000{
1001	int i;
1002
1003	if (shm_nused > 0)
1004		return (EBUSY);
1005
1006#ifdef COMPAT_FREEBSD32
1007	syscall32_helper_unregister(shm32_syscalls);
1008#endif
1009	syscall_helper_unregister(shm_syscalls);
1010	if (shm_prison_slot != 0)
1011		osd_jail_deregister(shm_prison_slot);
1012
1013	for (i = 0; i < shmalloced; i++) {
1014#ifdef MAC
1015		mac_sysvshm_destroy(&shmsegs[i]);
1016#endif
1017		/*
1018		 * Objects might be still mapped into the processes
1019		 * address spaces.  Actual free would happen on the
1020		 * last mapping destruction.
1021		 */
1022		if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE)
1023			vm_object_deallocate(shmsegs[i].object);
1024	}
1025	free(shmsegs, M_SHM);
1026#ifndef SYSVSHM
1027	shmexit_hook = NULL;
1028	shmfork_hook = NULL;
1029#endif
1030	sx_destroy(&sysvshmsx);
1031	return (0);
1032}
1033
1034static int
1035sysctl_shmsegs(SYSCTL_HANDLER_ARGS)
1036{
1037	struct shmid_kernel tshmseg;
1038#ifdef COMPAT_FREEBSD32
1039	struct shmid_kernel32 tshmseg32;
1040#endif
1041	struct prison *pr, *rpr;
1042	void *outaddr;
1043	size_t outsize;
1044	int error, i;
1045
1046	SYSVSHM_LOCK();
1047	pr = req->td->td_ucred->cr_prison;
1048	rpr = shm_find_prison(req->td->td_ucred);
1049	error = 0;
1050	for (i = 0; i < shmalloced; i++) {
1051		if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
1052		    rpr == NULL || shm_prison_cansee(rpr, &shmsegs[i]) != 0) {
1053			bzero(&tshmseg, sizeof(tshmseg));
1054			tshmseg.u.shm_perm.mode = SHMSEG_FREE;
1055		} else {
1056			tshmseg = shmsegs[i];
1057			if (tshmseg.cred->cr_prison != pr)
1058				tshmseg.u.shm_perm.key = IPC_PRIVATE;
1059		}
1060#ifdef COMPAT_FREEBSD32
1061		if (SV_CURPROC_FLAG(SV_ILP32)) {
1062			bzero(&tshmseg32, sizeof(tshmseg32));
1063			freebsd32_ipcperm_out(&tshmseg.u.shm_perm,
1064			    &tshmseg32.u.shm_perm);
1065			CP(tshmseg, tshmseg32, u.shm_segsz);
1066			CP(tshmseg, tshmseg32, u.shm_lpid);
1067			CP(tshmseg, tshmseg32, u.shm_cpid);
1068			CP(tshmseg, tshmseg32, u.shm_nattch);
1069			CP(tshmseg, tshmseg32, u.shm_atime);
1070			CP(tshmseg, tshmseg32, u.shm_dtime);
1071			CP(tshmseg, tshmseg32, u.shm_ctime);
1072			/* Don't copy object, label, or cred */
1073			outaddr = &tshmseg32;
1074			outsize = sizeof(tshmseg32);
1075		} else
1076#endif
1077		{
1078			tshmseg.object = NULL;
1079			tshmseg.label = NULL;
1080			tshmseg.cred = NULL;
1081			outaddr = &tshmseg;
1082			outsize = sizeof(tshmseg);
1083		}
1084		error = SYSCTL_OUT(req, outaddr, outsize);
1085		if (error != 0)
1086			break;
1087	}
1088	SYSVSHM_UNLOCK();
1089	return (error);
1090}
1091
1092int
1093kern_get_shmsegs(struct thread *td, struct shmid_kernel **res, size_t *sz)
1094{
1095	struct shmid_kernel *pshmseg;
1096	struct prison *pr, *rpr;
1097	int i;
1098
1099	SYSVSHM_LOCK();
1100	*sz = shmalloced;
1101	if (res == NULL)
1102		goto out;
1103
1104	pr = td->td_ucred->cr_prison;
1105	rpr = shm_find_prison(td->td_ucred);
1106	*res = malloc(sizeof(struct shmid_kernel) * shmalloced, M_TEMP,
1107	    M_WAITOK);
1108	for (i = 0; i < shmalloced; i++) {
1109		pshmseg = &(*res)[i];
1110		if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
1111		    rpr == NULL || shm_prison_cansee(rpr, &shmsegs[i]) != 0) {
1112			bzero(pshmseg, sizeof(*pshmseg));
1113			pshmseg->u.shm_perm.mode = SHMSEG_FREE;
1114		} else {
1115			*pshmseg = shmsegs[i];
1116			if (pshmseg->cred->cr_prison != pr)
1117				pshmseg->u.shm_perm.key = IPC_PRIVATE;
1118		}
1119		pshmseg->object = NULL;
1120		pshmseg->label = NULL;
1121		pshmseg->cred = NULL;
1122	}
1123out:
1124	SYSVSHM_UNLOCK();
1125	return (0);
1126}
1127
1128static int
1129shm_prison_check(void *obj, void *data)
1130{
1131	struct prison *pr = obj;
1132	struct prison *prpr;
1133	struct vfsoptlist *opts = data;
1134	int error, jsys;
1135
1136	/*
1137	 * sysvshm is a jailsys integer.
1138	 * It must be "disable" if the parent jail is disabled.
1139	 */
1140	error = vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys));
1141	if (error != ENOENT) {
1142		if (error != 0)
1143			return (error);
1144		switch (jsys) {
1145		case JAIL_SYS_DISABLE:
1146			break;
1147		case JAIL_SYS_NEW:
1148		case JAIL_SYS_INHERIT:
1149			prison_lock(pr->pr_parent);
1150			prpr = osd_jail_get(pr->pr_parent, shm_prison_slot);
1151			prison_unlock(pr->pr_parent);
1152			if (prpr == NULL)
1153				return (EPERM);
1154			break;
1155		default:
1156			return (EINVAL);
1157		}
1158	}
1159
1160	return (0);
1161}
1162
1163static int
1164shm_prison_set(void *obj, void *data)
1165{
1166	struct prison *pr = obj;
1167	struct prison *tpr, *orpr, *nrpr, *trpr;
1168	struct vfsoptlist *opts = data;
1169	void *rsv;
1170	int jsys, descend;
1171
1172	/*
1173	 * sysvshm controls which jail is the root of the associated segments
1174	 * (this jail or same as the parent), or if the feature is available
1175	 * at all.
1176	 */
1177	if (vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)) == ENOENT)
1178		jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0)
1179		    ? JAIL_SYS_INHERIT
1180		    : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0)
1181		    ? JAIL_SYS_DISABLE
1182		    : -1;
1183	if (jsys == JAIL_SYS_DISABLE) {
1184		prison_lock(pr);
1185		orpr = osd_jail_get(pr, shm_prison_slot);
1186		if (orpr != NULL)
1187			osd_jail_del(pr, shm_prison_slot);
1188		prison_unlock(pr);
1189		if (orpr != NULL) {
1190			if (orpr == pr)
1191				shm_prison_cleanup(pr);
1192			/* Disable all child jails as well. */
1193			FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1194				prison_lock(tpr);
1195				trpr = osd_jail_get(tpr, shm_prison_slot);
1196				if (trpr != NULL) {
1197					osd_jail_del(tpr, shm_prison_slot);
1198					prison_unlock(tpr);
1199					if (trpr == tpr)
1200						shm_prison_cleanup(tpr);
1201				} else {
1202					prison_unlock(tpr);
1203					descend = 0;
1204				}
1205			}
1206		}
1207	} else if (jsys != -1) {
1208		if (jsys == JAIL_SYS_NEW)
1209			nrpr = pr;
1210		else {
1211			prison_lock(pr->pr_parent);
1212			nrpr = osd_jail_get(pr->pr_parent, shm_prison_slot);
1213			prison_unlock(pr->pr_parent);
1214		}
1215		rsv = osd_reserve(shm_prison_slot);
1216		prison_lock(pr);
1217		orpr = osd_jail_get(pr, shm_prison_slot);
1218		if (orpr != nrpr)
1219			(void)osd_jail_set_reserved(pr, shm_prison_slot, rsv,
1220			    nrpr);
1221		else
1222			osd_free_reserved(rsv);
1223		prison_unlock(pr);
1224		if (orpr != nrpr) {
1225			if (orpr == pr)
1226				shm_prison_cleanup(pr);
1227			if (orpr != NULL) {
1228				/* Change child jails matching the old root, */
1229				FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1230					prison_lock(tpr);
1231					trpr = osd_jail_get(tpr,
1232					    shm_prison_slot);
1233					if (trpr == orpr) {
1234						(void)osd_jail_set(tpr,
1235						    shm_prison_slot, nrpr);
1236						prison_unlock(tpr);
1237						if (trpr == tpr)
1238							shm_prison_cleanup(tpr);
1239					} else {
1240						prison_unlock(tpr);
1241						descend = 0;
1242					}
1243				}
1244			}
1245		}
1246	}
1247
1248	return (0);
1249}
1250
1251static int
1252shm_prison_get(void *obj, void *data)
1253{
1254	struct prison *pr = obj;
1255	struct prison *rpr;
1256	struct vfsoptlist *opts = data;
1257	int error, jsys;
1258
1259	/* Set sysvshm based on the jail's root prison. */
1260	prison_lock(pr);
1261	rpr = osd_jail_get(pr, shm_prison_slot);
1262	prison_unlock(pr);
1263	jsys = rpr == NULL ? JAIL_SYS_DISABLE
1264	    : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
1265	error = vfs_setopt(opts, "sysvshm", &jsys, sizeof(jsys));
1266	if (error == ENOENT)
1267		error = 0;
1268	return (error);
1269}
1270
1271static int
1272shm_prison_remove(void *obj, void *data __unused)
1273{
1274	struct prison *pr = obj;
1275	struct prison *rpr;
1276
1277	SYSVSHM_LOCK();
1278	prison_lock(pr);
1279	rpr = osd_jail_get(pr, shm_prison_slot);
1280	prison_unlock(pr);
1281	if (rpr == pr)
1282		shm_prison_cleanup(pr);
1283	SYSVSHM_UNLOCK();
1284	return (0);
1285}
1286
1287static void
1288shm_prison_cleanup(struct prison *pr)
1289{
1290	struct shmid_kernel *shmseg;
1291	int i;
1292
1293	/* Remove any segments that belong to this jail. */
1294	for (i = 0; i < shmalloced; i++) {
1295		shmseg = &shmsegs[i];
1296		if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) &&
1297		    shmseg->cred != NULL && shmseg->cred->cr_prison == pr) {
1298			shm_remove(shmseg, i);
1299		}
1300	}
1301}
1302
1303SYSCTL_JAIL_PARAM_SYS_NODE(sysvshm, CTLFLAG_RW, "SYSV shared memory");
1304
1305#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
1306struct oshmid_ds {
1307	struct	ipc_perm_old shm_perm;	/* operation perms */
1308	int	shm_segsz;		/* size of segment (bytes) */
1309	u_short	shm_cpid;		/* pid, creator */
1310	u_short	shm_lpid;		/* pid, last operation */
1311	short	shm_nattch;		/* no. of current attaches */
1312	time_t	shm_atime;		/* last attach time */
1313	time_t	shm_dtime;		/* last detach time */
1314	time_t	shm_ctime;		/* last change time */
1315	void	*shm_handle;		/* internal handle for shm segment */
1316};
1317
/* Argument block for oshmctl(). */
struct oshmctl_args {
	int shmid;			/* segment identifier */
	int cmd;			/* requested command */
	struct oshmid_ds *ubuf;		/* user buffer for IPC_STAT output */
};
1323
1324static int
1325oshmctl(struct thread *td, struct oshmctl_args *uap)
1326{
1327#ifdef COMPAT_43
1328	int error = 0;
1329	struct prison *rpr;
1330	struct shmid_kernel *shmseg;
1331	struct oshmid_ds outbuf;
1332
1333	rpr = shm_find_prison(td->td_ucred);
1334	if (rpr == NULL)
1335		return (ENOSYS);
1336	if (uap->cmd != IPC_STAT) {
1337		return (freebsd7_shmctl(td,
1338		    (struct freebsd7_shmctl_args *)uap));
1339	}
1340	SYSVSHM_LOCK();
1341	shmseg = shm_find_segment(rpr, uap->shmid, true);
1342	if (shmseg == NULL) {
1343		SYSVSHM_UNLOCK();
1344		return (EINVAL);
1345	}
1346	error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
1347	if (error != 0) {
1348		SYSVSHM_UNLOCK();
1349		return (error);
1350	}
1351#ifdef MAC
1352	error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd);
1353	if (error != 0) {
1354		SYSVSHM_UNLOCK();
1355		return (error);
1356	}
1357#endif
1358	ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm);
1359	outbuf.shm_segsz = shmseg->u.shm_segsz;
1360	outbuf.shm_cpid = shmseg->u.shm_cpid;
1361	outbuf.shm_lpid = shmseg->u.shm_lpid;
1362	outbuf.shm_nattch = shmseg->u.shm_nattch;
1363	outbuf.shm_atime = shmseg->u.shm_atime;
1364	outbuf.shm_dtime = shmseg->u.shm_dtime;
1365	outbuf.shm_ctime = shmseg->u.shm_ctime;
1366	outbuf.shm_handle = shmseg->object;
1367	SYSVSHM_UNLOCK();
1368	return (copyout(&outbuf, uap->ubuf, sizeof(outbuf)));
1369#else
1370	return (EINVAL);
1371#endif
1372}
1373
1374/* XXX casting to (sy_call_t *) is bogus, as usual. */
1375static sy_call_t *shmcalls[] = {
1376	(sy_call_t *)sys_shmat, (sy_call_t *)oshmctl,
1377	(sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget,
1378	(sy_call_t *)freebsd7_shmctl
1379};
1380
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments of shmsys(): a selector followed by the selected call's own
 * arguments.  XXX actually varargs.
 */
struct shmsys_args {
	int	which;		/* index into shmcalls[] */
	int	a2;		/* first argument of the selected call */
	int	a3;
	int	a4;
};
#endif
1390int
1391sys_shmsys(struct thread *td, struct shmsys_args *uap)
1392{
1393
1394	AUDIT_ARG_SVIPC_WHICH(uap->which);
1395	if (uap->which < 0 || uap->which >= nitems(shmcalls))
1396		return (EINVAL);
1397	return ((*shmcalls[uap->which])(td, &uap->a2));
1398}
1399
1400#endif	/* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */
1401
1402#ifdef COMPAT_FREEBSD32
1403
1404int
1405freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap)
1406{
1407
1408#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
1409    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
1410	AUDIT_ARG_SVIPC_WHICH(uap->which);
1411	switch (uap->which) {
1412	case 0:	{	/* shmat */
1413		struct shmat_args ap;
1414
1415		ap.shmid = uap->a2;
1416		ap.shmaddr = PTRIN(uap->a3);
1417		ap.shmflg = uap->a4;
1418		return (sysent[SYS_shmat].sy_call(td, &ap));
1419	}
1420	case 2: {	/* shmdt */
1421		struct shmdt_args ap;
1422
1423		ap.shmaddr = PTRIN(uap->a2);
1424		return (sysent[SYS_shmdt].sy_call(td, &ap));
1425	}
1426	case 3: {	/* shmget */
1427		struct shmget_args ap;
1428
1429		ap.key = uap->a2;
1430		ap.size = uap->a3;
1431		ap.shmflg = uap->a4;
1432		return (sysent[SYS_shmget].sy_call(td, &ap));
1433	}
1434	case 4: {	/* shmctl */
1435		struct freebsd7_freebsd32_shmctl_args ap;
1436
1437		ap.shmid = uap->a2;
1438		ap.cmd = uap->a3;
1439		ap.buf = PTRIN(uap->a4);
1440		return (freebsd7_freebsd32_shmctl(td, &ap));
1441	}
1442	case 1:		/* oshmctl */
1443	default:
1444		return (EINVAL);
1445	}
1446#else
1447	return (nosys(td, NULL));
1448#endif
1449}
1450
1451#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
1452    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
1453int
1454freebsd7_freebsd32_shmctl(struct thread *td,
1455    struct freebsd7_freebsd32_shmctl_args *uap)
1456{
1457	int error;
1458	union {
1459		struct shmid_ds shmid_ds;
1460		struct shm_info shm_info;
1461		struct shminfo shminfo;
1462	} u;
1463	union {
1464		struct shmid_ds_old32 shmid_ds32;
1465		struct shm_info32 shm_info32;
1466		struct shminfo32 shminfo32;
1467	} u32;
1468	size_t sz;
1469
1470	if (uap->cmd == IPC_SET) {
1471		if ((error = copyin(uap->buf, &u32.shmid_ds32,
1472		    sizeof(u32.shmid_ds32))))
1473			goto done;
1474		freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm,
1475		    &u.shmid_ds.shm_perm);
1476		CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
1477		CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
1478		CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
1479		CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
1480		CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
1481		CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
1482		CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
1483	}
1484
1485	error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
1486	if (error)
1487		goto done;
1488
1489	/* Cases in which we need to copyout */
1490	switch (uap->cmd) {
1491	case IPC_INFO:
1492		CP(u.shminfo, u32.shminfo32, shmmax);
1493		CP(u.shminfo, u32.shminfo32, shmmin);
1494		CP(u.shminfo, u32.shminfo32, shmmni);
1495		CP(u.shminfo, u32.shminfo32, shmseg);
1496		CP(u.shminfo, u32.shminfo32, shmall);
1497		error = copyout(&u32.shminfo32, uap->buf,
1498		    sizeof(u32.shminfo32));
1499		break;
1500	case SHM_INFO:
1501		CP(u.shm_info, u32.shm_info32, used_ids);
1502		CP(u.shm_info, u32.shm_info32, shm_rss);
1503		CP(u.shm_info, u32.shm_info32, shm_tot);
1504		CP(u.shm_info, u32.shm_info32, shm_swp);
1505		CP(u.shm_info, u32.shm_info32, swap_attempts);
1506		CP(u.shm_info, u32.shm_info32, swap_successes);
1507		error = copyout(&u32.shm_info32, uap->buf,
1508		    sizeof(u32.shm_info32));
1509		break;
1510	case SHM_STAT:
1511	case IPC_STAT:
1512		memset(&u32.shmid_ds32, 0, sizeof(u32.shmid_ds32));
1513		freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm,
1514		    &u32.shmid_ds32.shm_perm);
1515		if (u.shmid_ds.shm_segsz > INT32_MAX)
1516			u32.shmid_ds32.shm_segsz = INT32_MAX;
1517		else
1518			CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
1519		CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
1520		CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
1521		CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
1522		CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
1523		CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
1524		CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
1525		u32.shmid_ds32.shm_internal = 0;
1526		error = copyout(&u32.shmid_ds32, uap->buf,
1527		    sizeof(u32.shmid_ds32));
1528		break;
1529	}
1530
1531done:
1532	if (error) {
1533		/* Invalidate the return value */
1534		td->td_retval[0] = -1;
1535	}
1536	return (error);
1537}
1538#endif
1539
1540int
1541freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap)
1542{
1543	int error;
1544	union {
1545		struct shmid_ds shmid_ds;
1546		struct shm_info shm_info;
1547		struct shminfo shminfo;
1548	} u;
1549	union {
1550		struct shmid_ds32 shmid_ds32;
1551		struct shm_info32 shm_info32;
1552		struct shminfo32 shminfo32;
1553	} u32;
1554	size_t sz;
1555
1556	if (uap->cmd == IPC_SET) {
1557		if ((error = copyin(uap->buf, &u32.shmid_ds32,
1558		    sizeof(u32.shmid_ds32))))
1559			goto done;
1560		freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm,
1561		    &u.shmid_ds.shm_perm);
1562		CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
1563		CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
1564		CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
1565		CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
1566		CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
1567		CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
1568		CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
1569	}
1570
1571	error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
1572	if (error)
1573		goto done;
1574
1575	/* Cases in which we need to copyout */
1576	switch (uap->cmd) {
1577	case IPC_INFO:
1578		CP(u.shminfo, u32.shminfo32, shmmax);
1579		CP(u.shminfo, u32.shminfo32, shmmin);
1580		CP(u.shminfo, u32.shminfo32, shmmni);
1581		CP(u.shminfo, u32.shminfo32, shmseg);
1582		CP(u.shminfo, u32.shminfo32, shmall);
1583		error = copyout(&u32.shminfo32, uap->buf,
1584		    sizeof(u32.shminfo32));
1585		break;
1586	case SHM_INFO:
1587		CP(u.shm_info, u32.shm_info32, used_ids);
1588		CP(u.shm_info, u32.shm_info32, shm_rss);
1589		CP(u.shm_info, u32.shm_info32, shm_tot);
1590		CP(u.shm_info, u32.shm_info32, shm_swp);
1591		CP(u.shm_info, u32.shm_info32, swap_attempts);
1592		CP(u.shm_info, u32.shm_info32, swap_successes);
1593		error = copyout(&u32.shm_info32, uap->buf,
1594		    sizeof(u32.shm_info32));
1595		break;
1596	case SHM_STAT:
1597	case IPC_STAT:
1598		freebsd32_ipcperm_out(&u.shmid_ds.shm_perm,
1599		    &u32.shmid_ds32.shm_perm);
1600		if (u.shmid_ds.shm_segsz > INT32_MAX)
1601			u32.shmid_ds32.shm_segsz = INT32_MAX;
1602		else
1603			CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
1604		CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
1605		CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
1606		CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
1607		CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
1608		CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
1609		CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
1610		error = copyout(&u32.shmid_ds32, uap->buf,
1611		    sizeof(u32.shmid_ds32));
1612		break;
1613	}
1614
1615done:
1616	if (error) {
1617		/* Invalidate the return value */
1618		td->td_retval[0] = -1;
1619	}
1620	return (error);
1621}
1622#endif
1623
1624#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
1625    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
1626
#ifndef _SYS_SYSPROTO_H_
/* Argument block for freebsd7_shmctl(). */
struct freebsd7_shmctl_args {
	int shmid;			/* segment identifier */
	int cmd;			/* requested command */
	struct shmid_ds_old *buf;	/* user buffer, old layout */
};
#endif
1634int
1635freebsd7_shmctl(struct thread *td, struct freebsd7_shmctl_args *uap)
1636{
1637	int error;
1638	struct shmid_ds_old old;
1639	struct shmid_ds buf;
1640	size_t bufsz;
1641
1642	/*
1643	 * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support
1644	 * Linux binaries.  If we see the call come through the FreeBSD ABI,
1645	 * return an error back to the user since we do not to support this.
1646	 */
1647	if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
1648	    uap->cmd == SHM_STAT)
1649		return (EINVAL);
1650
1651	/* IPC_SET needs to copyin the buffer before calling kern_shmctl */
1652	if (uap->cmd == IPC_SET) {
1653		if ((error = copyin(uap->buf, &old, sizeof(old))))
1654			goto done;
1655		ipcperm_old2new(&old.shm_perm, &buf.shm_perm);
1656		CP(old, buf, shm_segsz);
1657		CP(old, buf, shm_lpid);
1658		CP(old, buf, shm_cpid);
1659		CP(old, buf, shm_nattch);
1660		CP(old, buf, shm_atime);
1661		CP(old, buf, shm_dtime);
1662		CP(old, buf, shm_ctime);
1663	}
1664
1665	error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
1666	if (error)
1667		goto done;
1668
1669	/* Cases in which we need to copyout */
1670	switch (uap->cmd) {
1671	case IPC_STAT:
1672		memset(&old, 0, sizeof(old));
1673		ipcperm_new2old(&buf.shm_perm, &old.shm_perm);
1674		if (buf.shm_segsz > INT_MAX)
1675			old.shm_segsz = INT_MAX;
1676		else
1677			CP(buf, old, shm_segsz);
1678		CP(buf, old, shm_lpid);
1679		CP(buf, old, shm_cpid);
1680		if (buf.shm_nattch > SHRT_MAX)
1681			old.shm_nattch = SHRT_MAX;
1682		else
1683			CP(buf, old, shm_nattch);
1684		CP(buf, old, shm_atime);
1685		CP(buf, old, shm_dtime);
1686		CP(buf, old, shm_ctime);
1687		old.shm_internal = NULL;
1688		error = copyout(&old, uap->buf, sizeof(old));
1689		break;
1690	}
1691
1692done:
1693	if (error) {
1694		/* Invalidate the return value */
1695		td->td_retval[0] = -1;
1696	}
1697	return (error);
1698}
1699
1700#endif	/* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
1701	   COMPAT_FREEBSD7 */
1702
1703static int
1704sysvshm_modload(struct module *module, int cmd, void *arg)
1705{
1706	int error = 0;
1707
1708	switch (cmd) {
1709	case MOD_LOAD:
1710		error = shminit();
1711		if (error != 0)
1712			shmunload();
1713		break;
1714	case MOD_UNLOAD:
1715		error = shmunload();
1716		break;
1717	case MOD_SHUTDOWN:
1718		break;
1719	default:
1720		error = EINVAL;
1721		break;
1722	}
1723	return (error);
1724}
1725
1726static moduledata_t sysvshm_mod = {
1727	"sysvshm",
1728	&sysvshm_modload,
1729	NULL
1730};
1731
1732DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST);
1733MODULE_VERSION(sysvshm, 1);
1734