/*-
 * Copyright (c) 2006, 2011 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for shared swap-backed anonymous memory objects via
 * shm_open(2) and shm_unlink(2).  While most of the implementation is
 * here, vm_mmap.c contains mapping logic changes.
 *
 * TODO:
 *
 * (1) Need to export data to a userland tool via a sysctl.  Should ipcs(1)
 *     and ipcrm(1) be expanded or should new tools to manage both POSIX
 *     kernel semaphores and POSIX shared memory be written?
 *
 * (2) Add support for this file type to fstat(1).
 *
 * (3) Resource limits?  Does this need its own resource limits or are the
 *     existing limits in mmap(2) sufficient?
 */

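/*
 * Illustrative userland usage, not part of this file: the sketch below
 * assumes a made-up object name "/myshm".  A named object is created,
 * sized with ftruncate(2), mapped, and finally unlinked; passing SHM_ANON
 * instead of a path yields an anonymous object with no dictionary entry
 * (see kern_shm_open() below).
 *
 *	int fd = shm_open("/myshm", O_RDWR | O_CREAT, 0600);
 *	if (fd >= 0 && ftruncate(fd, 4096) == 0) {
 *		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, 0);
 *		if (p != MAP_FAILED)
 *			p[0] = 1;
 *	}
 *	shm_unlink("/myshm");
 */
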
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/uipc_shm.c 302151 2016-06-23 20:59:13Z jilles $");

#include "opt_capsicum.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/uio.h>
#include <sys/signal.h>
#include <sys/jail.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/user.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

struct shm_mapping {
	char		*sm_path;
	Fnv32_t		sm_fnv;
	struct shmfd	*sm_shmfd;
	LIST_ENTRY(shm_mapping) sm_link;
};

static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;
static struct unrhdr *shm_ino_unr;
static dev_t shm_dev_ino;

#define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])
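
/*
 * Example, for illustration only: shm_init() below sizes the table with
 * hashinit(1024, ...), which stores a power-of-two mask in shm_hash
 * (1023 for 1024 buckets), so a path whose FNV-1 hash happened to be
 * 0x9c33f1c6 would land in bucket 0x9c33f1c6 & 1023 == 0x1c6.
 */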

static void	shm_init(void *arg);
static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);

static fo_rdwr_t	shm_read;
static fo_rdwr_t	shm_write;
static fo_truncate_t	shm_truncate;
static fo_stat_t	shm_stat;
static fo_close_t	shm_close;
static fo_chmod_t	shm_chmod;
static fo_chown_t	shm_chown;
static fo_seek_t	shm_seek;
static fo_fill_kinfo_t	shm_fill_kinfo;
static fo_mmap_t	shm_mmap;

/* File descriptor operations. */
struct fileops shm_ops = {
	.fo_read = shm_read,
	.fo_write = shm_write,
	.fo_truncate = shm_truncate,
	.fo_ioctl = invfo_ioctl,
	.fo_poll = invfo_poll,
	.fo_kqfilter = invfo_kqfilter,
	.fo_stat = shm_stat,
	.fo_close = shm_close,
	.fo_chmod = shm_chmod,
	.fo_chown = shm_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = shm_seek,
	.fo_fill_kinfo = shm_fill_kinfo,
	.fo_mmap = shm_mmap,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

FEATURE(posix_shm, "POSIX shared memory");

static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
{
	vm_page_t m;
	vm_pindex_t idx;
	size_t tlen;
	int error, offset, rv;

	idx = OFF_TO_IDX(uio->uio_offset);
	offset = uio->uio_offset & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	VM_OBJECT_WLOCK(obj);

	/*
	 * Read I/O without either a corresponding resident page or swap
	 * page: use zero_region.  This is intended to avoid instantiating
	 * pages on read from a sparse region.
	 */
	if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL &&
	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
		VM_OBJECT_WUNLOCK(obj);
		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
	}

	/*
	 * Parallel reads of the page content from disk are prevented
	 * by exclusive busy.
	 *
	 * Although the tmpfs vnode lock is held here, it is
	 * nonetheless safe to sleep waiting for a free page.  The
	 * pageout daemon does not need to acquire the tmpfs vnode
	 * lock to page out obj's pages because obj is an OBJT_SWAP
	 * type object.
	 */
	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL);
	if (m->valid != VM_PAGE_BITS_ALL) {
		if (vm_pager_has_page(obj, idx, NULL, NULL)) {
			rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL);
			if (rv != VM_PAGER_OK) {
				printf(
	    "uiomove_object: vm_obj %p idx %ju valid %x pager error %d\n",
				    obj, (uintmax_t)idx, m->valid, rv);
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				VM_OBJECT_WUNLOCK(obj);
				return (EIO);
			}
		} else
			vm_page_zero_invalid(m, TRUE);
	}
	vm_page_xunbusy(m);
	vm_page_lock(m);
	vm_page_hold(m);
	if (m->queue == PQ_NONE) {
		vm_page_deactivate(m);
	} else {
		/* Requeue to maintain LRU ordering. */
		vm_page_requeue(m);
	}
	vm_page_unlock(m);
	VM_OBJECT_WUNLOCK(obj);
	error = uiomove_fromphys(&m, offset, tlen, uio);
	if (uio->uio_rw == UIO_WRITE && error == 0) {
		VM_OBJECT_WLOCK(obj);
		vm_page_dirty(m);
		vm_pager_page_unswapped(m);
		VM_OBJECT_WUNLOCK(obj);
	}
	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);

	return (error);
}

int
uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
{
	ssize_t resid;
	size_t len;
	int error;

	error = 0;
	while ((resid = uio->uio_resid) > 0) {
		if (obj_size <= uio->uio_offset)
			break;
		len = MIN(obj_size - uio->uio_offset, resid);
		if (len == 0)
			break;
		error = uiomove_object_page(obj, len, uio);
		if (error != 0 || resid == uio->uio_resid)
			break;
	}
	return (error);
}

static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
	struct shmfd *shmfd;
	off_t foffset;
	int error;

	shmfd = fp->f_data;
	foffset = foffset_lock(fp, 0);
	error = 0;
	switch (whence) {
	case L_INCR:
		if (foffset < 0 ||
		    (offset > 0 && foffset > OFF_MAX - offset)) {
			error = EOVERFLOW;
			break;
		}
		offset += foffset;
		break;
	case L_XTND:
		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
			error = EOVERFLOW;
			break;
		}
		offset += shmfd->shm_size;
		break;
	case L_SET:
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (offset < 0 || offset > shmfd->shm_size)
			error = EINVAL;
		else
			td->td_uretoff.tdu_off = offset;
	}
	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
	return (error);
}

static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct shmfd *shmfd;
	void *rl_cookie;
	int error;

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	foffset_lock_uio(fp, uio, flags);
	if ((flags & FOF_OFFSET) == 0) {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
		    &shmfd->shm_mtx);
	} else {
		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
	}

	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;
#ifdef MAC
	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif
	return (shm_dotruncate(shmfd, length));
}

static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
#ifdef MAC
	int error;
#endif

	shmfd = fp->f_data;

#ifdef MAC
	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
#endif

	/*
	 * Attempt to return sane values for fstat() on a memory file
	 * descriptor.
	 */
	bzero(sb, sizeof(*sb));
	sb->st_blksize = PAGE_SIZE;
	sb->st_size = shmfd->shm_size;
	sb->st_blocks = howmany(sb->st_size, sb->st_blksize);
	mtx_lock(&shm_timestamp_lock);
	sb->st_atim = shmfd->shm_atime;
	sb->st_ctim = shmfd->shm_ctime;
	sb->st_mtim = shmfd->shm_mtime;
	sb->st_birthtim = shmfd->shm_birthtime;
	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
	sb->st_uid = shmfd->shm_uid;
	sb->st_gid = shmfd->shm_gid;
	mtx_unlock(&shm_timestamp_lock);
	sb->st_dev = shm_dev_ino;
	sb->st_ino = shmfd->shm_ino;

	return (0);
}

static int
shm_close(struct file *fp, struct thread *td)
{
	struct shmfd *shmfd;

	shmfd = fp->f_data;
	fp->f_data = NULL;
	shm_drop(shmfd);

	return (0);
}

int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	vm_object_t object;
	vm_page_t m;
	vm_pindex_t idx, nobjsize;
	vm_ooffset_t delta;
	int base, rv;

	object = shmfd->shm_object;
	VM_OBJECT_WLOCK(object);
	if (length == shmfd->shm_size) {
		VM_OBJECT_WUNLOCK(object);
		return (0);
	}
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		/*
		 * Disallow any requests to shrink the size if this
		 * object is mapped into the kernel.
		 */
		if (shmfd->shm_kmappings > 0) {
			VM_OBJECT_WUNLOCK(object);
			return (EBUSY);
		}

		/*
		 * Zero the truncated part of the last page.
		 */
		base = length & PAGE_MASK;
		if (base != 0) {
			idx = OFF_TO_IDX(length);
retry:
			m = vm_page_lookup(object, idx);
			if (m != NULL) {
				if (vm_page_sleep_if_busy(m, "shmtrc"))
					goto retry;
			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
				m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
				if (m == NULL) {
					VM_OBJECT_WUNLOCK(object);
					VM_WAIT;
					VM_OBJECT_WLOCK(object);
					goto retry;
				} else if (m->valid != VM_PAGE_BITS_ALL)
					rv = vm_pager_get_pages(object, &m, 1,
					    NULL, NULL);
				else
					/* A cached page was reactivated. */
					rv = VM_PAGER_OK;
				vm_page_lock(m);
				if (rv == VM_PAGER_OK) {
					vm_page_deactivate(m);
					vm_page_unlock(m);
					vm_page_xunbusy(m);
				} else {
					vm_page_free(m);
					vm_page_unlock(m);
					VM_OBJECT_WUNLOCK(object);
					return (EIO);
				}
			}
			if (m != NULL) {
				pmap_zero_page_area(m, base, PAGE_SIZE - base);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("shm_dotruncate: page %p is invalid", m));
				vm_page_dirty(m);
				vm_pager_page_unswapped(m);
			}
		}
		delta = ptoa(object->size - nobjsize);

		/* Toss in-memory pages. */
		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    0);

		/* Toss pages from swap. */
		if (object->type == OBJT_SWAP)
			swap_pager_freespace(object, nobjsize, delta);

		/* Free the swap accounted for the shm object. */
		swap_release_by_cred(delta, object->cred);
		object->charge -= delta;
	} else {
		/* Attempt to reserve swap for the size increase. */
		delta = ptoa(nobjsize - object->size);
		if (!swap_reserve_by_cred(delta, object->cred)) {
			VM_OBJECT_WUNLOCK(object);
			return (ENOMEM);
		}
		object->charge += delta;
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	VM_OBJECT_WUNLOCK(object);
	return (0);
}

/*
 * shmfd object management including creation and reference counting
 * routines.
 */
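/*
 * Sketch of the expected lifecycle, for illustration: shm_alloc() returns
 * an object with shm_refs == 1; each additional holder pairs shm_hold()
 * with a later shm_drop(), and the final shm_drop() frees the object.
 *
 *	shmfd = shm_alloc(ucred, 0600);		refs == 1
 *	shm_hold(shmfd);			refs == 2
 *	shm_drop(shmfd);			refs == 1
 *	shm_drop(shmfd);			freed
 */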
struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode)
{
	struct shmfd *shmfd;
	int ino;

	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
	shmfd->shm_size = 0;
	shmfd->shm_uid = ucred->cr_uid;
	shmfd->shm_gid = ucred->cr_gid;
	shmfd->shm_mode = mode;
	shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
	    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
	KASSERT(shmfd->shm_object != NULL, ("shm_alloc: vm_pager_allocate"));
	shmfd->shm_object->pg_color = 0;
	VM_OBJECT_WLOCK(shmfd->shm_object);
	vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
	vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT);
	VM_OBJECT_WUNLOCK(shmfd->shm_object);
	vfs_timestamp(&shmfd->shm_birthtime);
	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
	    shmfd->shm_birthtime;
	ino = alloc_unr(shm_ino_unr);
	if (ino == -1)
		shmfd->shm_ino = 0;
	else
		shmfd->shm_ino = ino;
	refcount_init(&shmfd->shm_refs, 1);
	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
	rangelock_init(&shmfd->shm_rl);
#ifdef MAC
	mac_posixshm_init(shmfd);
	mac_posixshm_create(ucred, shmfd);
#endif

	return (shmfd);
}

struct shmfd *
shm_hold(struct shmfd *shmfd)
{

	refcount_acquire(&shmfd->shm_refs);
	return (shmfd);
}

void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
		mac_posixshm_destroy(shmfd);
#endif
		rangelock_destroy(&shmfd->shm_rl);
		mtx_destroy(&shmfd->shm_mtx);
		vm_object_deallocate(shmfd->shm_object);
		if (shmfd->shm_ino != 0)
			free_unr(shm_ino_unr, shmfd->shm_ino);
		free(shmfd, M_SHMFD);
	}
}

/*
 * Determine if the credentials have sufficient permissions for a
 * specified combination of FREAD and FWRITE.
 */
int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
	accmode_t accmode;
	int error;

	accmode = 0;
	if (flags & FREAD)
		accmode |= VREAD;
	if (flags & FWRITE)
		accmode |= VWRITE;
	mtx_lock(&shm_timestamp_lock);
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
	    accmode, ucred, NULL);
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Dictionary management.  We maintain an in-kernel dictionary to map
 * paths to shmfd objects.  We use the FNV hash on the path to store
 * the mappings in a hash table.
 */
static void
shm_init(void *arg)
{

	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
	sx_init(&shm_dict_lock, "shm dictionary");
	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
	shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL);
	KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized"));
	shm_dev_ino = devfs_alloc_cdp_inode();
	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
}
SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);

static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
	struct shm_mapping *map;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0)
			return (map->sm_shmfd);
	}

	return (NULL);
}

static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
	struct shm_mapping *map;

	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
	map->sm_path = path;
	map->sm_fnv = fnv;
	map->sm_shmfd = shm_hold(shmfd);
	shmfd->shm_path = path;
	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}

static int
shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
	struct shm_mapping *map;
	int error;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0) {
#ifdef MAC
			error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
			if (error)
				return (error);
#endif
			error = shm_access(map->sm_shmfd, ucred,
			    FREAD | FWRITE);
			if (error)
				return (error);
			map->sm_shmfd->shm_path = NULL;
			LIST_REMOVE(map, sm_link);
			shm_drop(map->sm_shmfd);
			free(map->sm_path, M_SHMFD);
			free(map, M_SHMFD);
			return (0);
		}
	}

	return (ENOENT);
}

int
kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode,
    struct filecaps *fcaps)
{
	struct filedesc *fdp;
	struct shmfd *shmfd;
	struct file *fp;
	char *path;
	const char *pr_path;
	size_t pr_pathlen;
	Fnv32_t fnv;
	mode_t cmode;
	int fd, error;

#ifdef CAPABILITY_MODE
	/*
	 * In capability mode, shm_open(2) is only allowed for
	 * anonymous objects.
	 */
	if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON))
		return (ECAPMODE);
#endif

	if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
		return (EINVAL);

	if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
		return (EINVAL);

	fdp = td->td_proc->p_fd;
	cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;

	error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps);
	if (error)
		return (error);

	/* A SHM_ANON path pointer creates an anonymous object. */
	if (userpath == SHM_ANON) {
		/* A read-only anonymous object is pointless. */
		if ((flags & O_ACCMODE) == O_RDONLY) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (EINVAL);
		}
		shmfd = shm_alloc(td->td_ucred, cmode);
	} else {
		path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
		pr_path = td->td_ucred->cr_prison->pr_path;

		/* Construct a full pathname for jailed callers. */
		pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
		    : strlcpy(path, pr_path, MAXPATHLEN);
		error = copyinstr(userpath, path + pr_pathlen,
		    MAXPATHLEN - pr_pathlen, NULL);
#ifdef KTRACE
		if (error == 0 && KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(path);
#endif
		/* Require paths to start with a '/' character. */
		if (error == 0 && path[pr_pathlen] != '/')
			error = EINVAL;
		if (error) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			free(path, M_SHMFD);
			return (error);
		}

		fnv = fnv_32_str(path, FNV1_32_INIT);
		sx_xlock(&shm_dict_lock);
		shmfd = shm_lookup(path, fnv);
		if (shmfd == NULL) {
			/* Object does not yet exist, create it if requested. */
			if (flags & O_CREAT) {
#ifdef MAC
				error = mac_posixshm_check_create(td->td_ucred,
				    path);
				if (error == 0) {
#endif
					shmfd = shm_alloc(td->td_ucred, cmode);
					shm_insert(path, fnv, shmfd);
#ifdef MAC
				}
#endif
			} else {
				free(path, M_SHMFD);
				error = ENOENT;
			}
		} else {
			/*
			 * Object already exists, obtain a new
			 * reference if requested and permitted.
			 */
			free(path, M_SHMFD);
			if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
				error = EEXIST;
			else {
#ifdef MAC
				error = mac_posixshm_check_open(td->td_ucred,
				    shmfd, FFLAGS(flags & O_ACCMODE));
				if (error == 0)
#endif
				error = shm_access(shmfd, td->td_ucred,
				    FFLAGS(flags & O_ACCMODE));
			}

			/*
			 * Truncate the file back to zero length if
			 * O_TRUNC was specified and the object was
			 * opened with read/write.
			 */
			if (error == 0 &&
			    (flags & (O_ACCMODE | O_TRUNC)) ==
			    (O_RDWR | O_TRUNC)) {
#ifdef MAC
				error = mac_posixshm_check_truncate(
					td->td_ucred, fp->f_cred, shmfd);
				if (error == 0)
#endif
					shm_dotruncate(shmfd, 0);
			}
			if (error == 0)
				shm_hold(shmfd);
		}
		sx_xunlock(&shm_dict_lock);

		if (error) {
			fdclose(td, fp, fd);
			fdrop(fp, td);
			return (error);
		}
	}

	finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);

	td->td_retval[0] = fd;
	fdrop(fp, td);

	return (0);
}

/* System calls. */
int
sys_shm_open(struct thread *td, struct shm_open_args *uap)
{

	return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL));
}

int
sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
{
	char *path;
	const char *pr_path;
	size_t pr_pathlen;
	Fnv32_t fnv;
	int error;

	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	pr_path = td->td_ucred->cr_prison->pr_path;
	pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
	    : strlcpy(path, pr_path, MAXPATHLEN);
	error = copyinstr(uap->path, path + pr_pathlen, MAXPATHLEN - pr_pathlen,
	    NULL);
	if (error) {
		free(path, M_TEMP);
		return (error);
	}
#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI))
		ktrnamei(path);
#endif
	fnv = fnv_32_str(path, FNV1_32_INIT);
	sx_xlock(&shm_dict_lock);
	error = shm_remove(path, fnv, td->td_ucred);
	sx_xunlock(&shm_dict_lock);
	free(path, M_TEMP);

	return (error);
}

int
shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
    vm_ooffset_t foff, struct thread *td)
{
	struct shmfd *shmfd;
	vm_prot_t maxprot;
	int error;

	shmfd = fp->f_data;
	maxprot = VM_PROT_NONE;

	/* FREAD should always be set. */
	if ((fp->f_flag & FREAD) != 0)
		maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
	if ((fp->f_flag & FWRITE) != 0)
		maxprot |= VM_PROT_WRITE;

	/* Don't permit shared writable mappings on read-only descriptors. */
	if ((flags & MAP_SHARED) != 0 &&
	    (maxprot & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	maxprot &= cap_maxprot;

#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags);
	if (error != 0)
		return (error);
#endif

	/*
	 * XXXRW: This validation is probably insufficient, and subject to
	 * sign errors.  It should be fixed.
	 */
	if (foff >= shmfd->shm_size ||
	    foff + objsize > round_page(shmfd->shm_size))
		return (EINVAL);

	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_atime);
	mtx_unlock(&shm_timestamp_lock);
	vm_object_reference(shmfd->shm_object);

	error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
	    shmfd->shm_object, foff, FALSE, td);
	if (error != 0)
		vm_object_deallocate(shmfd->shm_object);
	return (error);
}

static int
shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	error = 0;
	shmfd = fp->f_data;
	mtx_lock(&shm_timestamp_lock);
	/*
	 * SUSv4 says that x bits of permission need not be affected.
	 * Be consistent with our shm_open there.
	 */
#ifdef MAC
	error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
	if (error != 0)
		goto out;
#endif
	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
	    shmfd->shm_gid, VADMIN, active_cred, NULL);
	if (error != 0)
		goto out;
	shmfd->shm_mode = mode & ACCESSPERMS;
out:
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

static int
shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	error = 0;
	shmfd = fp->f_data;
	mtx_lock(&shm_timestamp_lock);
#ifdef MAC
	error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
	if (error != 0)
		goto out;
#endif
	if (uid == (uid_t)-1)
		uid = shmfd->shm_uid;
	if (gid == (gid_t)-1)
		gid = shmfd->shm_gid;
	if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
	    (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
		goto out;
	shmfd->shm_uid = uid;
	shmfd->shm_gid = gid;
out:
	mtx_unlock(&shm_timestamp_lock);
	return (error);
}

/*
 * Helper routines to allow the backing object of a shared memory file
 * descriptor to be mapped in the kernel.
 */
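/*
 * Illustrative only: an in-kernel consumer holding a DTYPE_SHM file
 * pointer fp might wire and map one page of the backing object as below,
 * assuming the object has already been sized (e.g. via shm_dotruncate()):
 *
 *	void *mem;
 *	if (shm_map(fp, PAGE_SIZE, 0, &mem) == 0) {
 *		memset(mem, 0, PAGE_SIZE);
 *		(void)shm_unmap(fp, mem, PAGE_SIZE);
 *	}
 */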
int
shm_map(struct file *fp, size_t size, off_t offset, void **memp)
{
	struct shmfd *shmfd;
	vm_offset_t kva, ofs;
	vm_object_t obj;
	int rv;

	if (fp->f_type != DTYPE_SHM)
		return (EINVAL);
	shmfd = fp->f_data;
	obj = shmfd->shm_object;
	VM_OBJECT_WLOCK(obj);
	/*
	 * XXXRW: This validation is probably insufficient, and subject to
	 * sign errors.  It should be fixed.
	 */
	if (offset >= shmfd->shm_size ||
	    offset + size > round_page(shmfd->shm_size)) {
		VM_OBJECT_WUNLOCK(obj);
		return (EINVAL);
	}

	shmfd->shm_kmappings++;
	vm_object_reference_locked(obj);
	VM_OBJECT_WUNLOCK(obj);

	/* Map the object into the kernel_map and wire it. */
	kva = vm_map_min(kernel_map);
	ofs = offset & PAGE_MASK;
	offset = trunc_page(offset);
	size = round_page(size + ofs);
	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_READ | VM_PROT_WRITE, 0);
	if (rv == KERN_SUCCESS) {
		rv = vm_map_wire(kernel_map, kva, kva + size,
		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
		if (rv == KERN_SUCCESS) {
			*memp = (void *)(kva + ofs);
			return (0);
		}
		vm_map_remove(kernel_map, kva, kva + size);
	} else
		vm_object_deallocate(obj);

	/* On failure, drop our mapping reference. */
	VM_OBJECT_WLOCK(obj);
	shmfd->shm_kmappings--;
	VM_OBJECT_WUNLOCK(obj);

	return (vm_mmap_to_errno(rv));
}

/*
 * We require the caller to unmap the entire entry.  This allows us to
 * safely decrement shm_kmappings when a mapping is removed.
 */
int
shm_unmap(struct file *fp, void *mem, size_t size)
{
	struct shmfd *shmfd;
	vm_map_entry_t entry;
	vm_offset_t kva, ofs;
	vm_object_t obj;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_t map;
	int rv;

	if (fp->f_type != DTYPE_SHM)
		return (EINVAL);
	shmfd = fp->f_data;
	kva = (vm_offset_t)mem;
	ofs = kva & PAGE_MASK;
	kva = trunc_page(kva);
	size = round_page(size + ofs);
	map = kernel_map;
	rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
	    &obj, &pindex, &prot, &wired);
	if (rv != KERN_SUCCESS)
		return (EINVAL);
	if (entry->start != kva || entry->end != kva + size) {
		vm_map_lookup_done(map, entry);
		return (EINVAL);
	}
	vm_map_lookup_done(map, entry);
	if (obj != shmfd->shm_object)
		return (EINVAL);
	vm_map_remove(map, kva, kva + size);
	VM_OBJECT_WLOCK(obj);
	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
	shmfd->shm_kmappings--;
	VM_OBJECT_WUNLOCK(obj);
	return (0);
}

static int
shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
	const char *path, *pr_path;
	struct shmfd *shmfd;
	size_t pr_pathlen;

	kif->kf_type = KF_TYPE_SHM;
	shmfd = fp->f_data;

	mtx_lock(&shm_timestamp_lock);
	kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode;	/* XXX */
	mtx_unlock(&shm_timestamp_lock);
	kif->kf_un.kf_file.kf_file_size = shmfd->shm_size;
	if (shmfd->shm_path != NULL) {
		sx_slock(&shm_dict_lock);
		if (shmfd->shm_path != NULL) {
			path = shmfd->shm_path;
			pr_path = curthread->td_ucred->cr_prison->pr_path;
			if (strcmp(pr_path, "/") != 0) {
				/* Return the jail-rooted pathname. */
				pr_pathlen = strlen(pr_path);
				if (strncmp(path, pr_path, pr_pathlen) == 0 &&
				    path[pr_pathlen] == '/')
					path += pr_pathlen;
			}
			strlcpy(kif->kf_path, path, sizeof(kif->kf_path));
		}
		sx_sunlock(&shm_dict_lock);
	}
	return (0);
}
1107