1/*- 2 * Copyright (c) 2006, 2011 Robert N. M. Watson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27/* 28 * Support for shared swap-backed anonymous memory objects via 29 * shm_open(2) and shm_unlink(2). While most of the implementation is 30 * here, vm_mmap.c contains mapping logic changes. 31 * 32 * TODO: 33 * 34 * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1) 35 * and ipcrm(1) be expanded or should new tools to manage both POSIX 36 * kernel semaphores and POSIX shared memory be written? 37 * 38 * (2) Add support for this file type to fstat(1). 39 * 40 * (3) Resource limits? Does this need its own resource limits or are the 41 * existing limits in mmap(2) sufficient? 42 */ 43 44#include <sys/cdefs.h> 45__FBSDID("$FreeBSD: stable/11/sys/kern/uipc_shm.c 351142 2019-08-16 21:01:35Z kevans $"); 46 47#include "opt_capsicum.h" 48#include "opt_ktrace.h" 49 50#include <sys/param.h> 51#include <sys/capsicum.h> 52#include <sys/conf.h> 53#include <sys/fcntl.h> 54#include <sys/file.h> 55#include <sys/filedesc.h> 56#include <sys/fnv_hash.h> 57#include <sys/kernel.h> 58#include <sys/uio.h> 59#include <sys/signal.h> 60#include <sys/jail.h> 61#include <sys/ktrace.h> 62#include <sys/lock.h> 63#include <sys/malloc.h> 64#include <sys/mman.h> 65#include <sys/mutex.h> 66#include <sys/priv.h> 67#include <sys/proc.h> 68#include <sys/refcount.h> 69#include <sys/resourcevar.h> 70#include <sys/rwlock.h> 71#include <sys/stat.h> 72#include <sys/syscallsubr.h> 73#include <sys/sysctl.h> 74#include <sys/sysproto.h> 75#include <sys/systm.h> 76#include <sys/sx.h> 77#include <sys/time.h> 78#include <sys/vnode.h> 79#include <sys/unistd.h> 80#include <sys/user.h> 81 82#include <security/mac/mac_framework.h> 83 84#include <vm/vm.h> 85#include <vm/vm_param.h> 86#include <vm/pmap.h> 87#include <vm/vm_extern.h> 88#include <vm/vm_map.h> 89#include <vm/vm_kern.h> 90#include <vm/vm_object.h> 91#include <vm/vm_page.h> 92#include <vm/vm_pageout.h> 93#include <vm/vm_pager.h> 94#include <vm/swap_pager.h> 95 96struct shm_mapping { 97 char *sm_path; 98 Fnv32_t sm_fnv; 99 struct shmfd *sm_shmfd; 100 LIST_ENTRY(shm_mapping) sm_link; 101}; 102 103static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); 104static LIST_HEAD(, shm_mapping) *shm_dictionary; 105static struct sx shm_dict_lock; 106static struct mtx shm_timestamp_lock; 107static u_long shm_hash; 108static struct unrhdr *shm_ino_unr; 109static dev_t shm_dev_ino; 110 111#define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) 112 113static void shm_init(void *arg); 114static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); 115static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); 116static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); 117 118static fo_rdwr_t shm_read; 119static fo_rdwr_t shm_write; 120static fo_truncate_t shm_truncate; 121static fo_stat_t shm_stat; 122static fo_close_t shm_close; 123static fo_chmod_t shm_chmod; 124static fo_chown_t shm_chown; 125static fo_seek_t shm_seek; 126static fo_fill_kinfo_t shm_fill_kinfo; 127static fo_mmap_t shm_mmap; 128 129/* File descriptor operations. */ 130struct fileops shm_ops = { 131 .fo_read = shm_read, 132 .fo_write = shm_write, 133 .fo_truncate = shm_truncate, 134 .fo_ioctl = invfo_ioctl, 135 .fo_poll = invfo_poll, 136 .fo_kqfilter = invfo_kqfilter, 137 .fo_stat = shm_stat, 138 .fo_close = shm_close, 139 .fo_chmod = shm_chmod, 140 .fo_chown = shm_chown, 141 .fo_sendfile = vn_sendfile, 142 .fo_seek = shm_seek, 143 .fo_fill_kinfo = shm_fill_kinfo, 144 .fo_mmap = shm_mmap, 145 .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE 146}; 147 148FEATURE(posix_shm, "POSIX shared memory"); 149 150static int 151uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) 152{ 153 vm_page_t m; 154 vm_pindex_t idx; 155 size_t tlen; 156 int error, offset, rv; 157 158 idx = OFF_TO_IDX(uio->uio_offset); 159 offset = uio->uio_offset & PAGE_MASK; 160 tlen = MIN(PAGE_SIZE - offset, len); 161 162 VM_OBJECT_WLOCK(obj); 163 164 /* 165 * Read I/O without either a corresponding resident page or swap 166 * page: use zero_region. This is intended to avoid instantiating 167 * pages on read from a sparse region. 168 */ 169 if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && 170 !vm_pager_has_page(obj, idx, NULL, NULL)) { 171 VM_OBJECT_WUNLOCK(obj); 172 return (uiomove(__DECONST(void *, zero_region), tlen, uio)); 173 } 174 175 /* 176 * Parallel reads of the page content from disk are prevented 177 * by exclusive busy. 178 * 179 * Although the tmpfs vnode lock is held here, it is 180 * nonetheless safe to sleep waiting for a free page. The 181 * pageout daemon does not need to acquire the tmpfs vnode 182 * lock to page out tobj's pages because tobj is a OBJT_SWAP 183 * type object. 184 */ 185 m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); 186 if (m->valid != VM_PAGE_BITS_ALL) { 187 vm_page_xbusy(m); 188 if (vm_pager_has_page(obj, idx, NULL, NULL)) { 189 rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL); 190 if (rv != VM_PAGER_OK) { 191 printf( 192 "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", 193 obj, idx, m->valid, rv); 194 vm_page_lock(m); 195 vm_page_free(m); 196 vm_page_unlock(m); 197 VM_OBJECT_WUNLOCK(obj); 198 return (EIO); 199 } 200 } else 201 vm_page_zero_invalid(m, TRUE); 202 vm_page_xunbusy(m); 203 } 204 vm_page_lock(m); 205 vm_page_hold(m); 206 if (m->queue != PQ_ACTIVE) 207 vm_page_activate(m); 208 else 209 vm_page_reference(m); 210 vm_page_unlock(m); 211 VM_OBJECT_WUNLOCK(obj); 212 error = uiomove_fromphys(&m, offset, tlen, uio); 213 if (uio->uio_rw == UIO_WRITE && error == 0) { 214 VM_OBJECT_WLOCK(obj); 215 vm_page_dirty(m); 216 vm_pager_page_unswapped(m); 217 VM_OBJECT_WUNLOCK(obj); 218 } 219 vm_page_lock(m); 220 vm_page_unhold(m); 221 vm_page_unlock(m); 222 223 return (error); 224} 225 226int 227uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) 228{ 229 ssize_t resid; 230 size_t len; 231 int error; 232 233 error = 0; 234 while ((resid = uio->uio_resid) > 0) { 235 if (obj_size <= uio->uio_offset) 236 break; 237 len = MIN(obj_size - uio->uio_offset, resid); 238 if (len == 0) 239 break; 240 error = uiomove_object_page(obj, len, uio); 241 if (error != 0 || resid == uio->uio_resid) 242 break; 243 } 244 return (error); 245} 246 247static int 248shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) 249{ 250 struct shmfd *shmfd; 251 off_t foffset; 252 int error; 253 254 shmfd = fp->f_data; 255 foffset = foffset_lock(fp, 0); 256 error = 0; 257 switch (whence) { 258 case L_INCR: 259 if (foffset < 0 || 260 (offset > 0 && foffset > OFF_MAX - offset)) { 261 error = EOVERFLOW; 262 break; 263 } 264 offset += foffset; 265 break; 266 case L_XTND: 267 if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { 268 error = EOVERFLOW; 269 break; 270 } 271 offset += shmfd->shm_size; 272 break; 273 case L_SET: 274 break; 275 default: 276 error = EINVAL; 277 } 278 if (error == 0) { 279 if (offset < 0 || offset > shmfd->shm_size) 280 error = EINVAL; 281 else 282 td->td_uretoff.tdu_off = offset; 283 } 284 foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); 285 return (error); 286} 287 288static int 289shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 290 int flags, struct thread *td) 291{ 292 struct shmfd *shmfd; 293 void *rl_cookie; 294 int error; 295 296 shmfd = fp->f_data; 297#ifdef MAC 298 error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); 299 if (error) 300 return (error); 301#endif 302 foffset_lock_uio(fp, uio, flags); 303 rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, 304 uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); 305 error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); 306 rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 307 foffset_unlock_uio(fp, uio, flags); 308 return (error); 309} 310 311static int 312shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 313 int flags, struct thread *td) 314{ 315 struct shmfd *shmfd; 316 void *rl_cookie; 317 int error; 318 319 shmfd = fp->f_data; 320#ifdef MAC 321 error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); 322 if (error) 323 return (error); 324#endif 325 foffset_lock_uio(fp, uio, flags); 326 if ((flags & FOF_OFFSET) == 0) { 327 rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 328 &shmfd->shm_mtx); 329 } else { 330 rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, 331 uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); 332 } 333 334 error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); 335 rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 336 foffset_unlock_uio(fp, uio, flags); 337 return (error); 338} 339 340static int 341shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, 342 struct thread *td) 343{ 344 struct shmfd *shmfd; 345#ifdef MAC 346 int error; 347#endif 348 349 shmfd = fp->f_data; 350#ifdef MAC 351 error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); 352 if (error) 353 return (error); 354#endif 355 return (shm_dotruncate(shmfd, length)); 356} 357 358static int 359shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, 360 struct thread *td) 361{ 362 struct shmfd *shmfd; 363#ifdef MAC 364 int error; 365#endif 366 367 shmfd = fp->f_data; 368 369#ifdef MAC 370 error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); 371 if (error) 372 return (error); 373#endif 374 375 /* 376 * Attempt to return sanish values for fstat() on a memory file 377 * descriptor. 378 */ 379 bzero(sb, sizeof(*sb)); 380 sb->st_blksize = PAGE_SIZE; 381 sb->st_size = shmfd->shm_size; 382 sb->st_blocks = howmany(sb->st_size, sb->st_blksize); 383 mtx_lock(&shm_timestamp_lock); 384 sb->st_atim = shmfd->shm_atime; 385 sb->st_ctim = shmfd->shm_ctime; 386 sb->st_mtim = shmfd->shm_mtime; 387 sb->st_birthtim = shmfd->shm_birthtime; 388 sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ 389 sb->st_uid = shmfd->shm_uid; 390 sb->st_gid = shmfd->shm_gid; 391 mtx_unlock(&shm_timestamp_lock); 392 sb->st_dev = shm_dev_ino; 393 sb->st_ino = shmfd->shm_ino; 394 395 return (0); 396} 397 398static int 399shm_close(struct file *fp, struct thread *td) 400{ 401 struct shmfd *shmfd; 402 403 shmfd = fp->f_data; 404 fp->f_data = NULL; 405 shm_drop(shmfd); 406 407 return (0); 408} 409 410int 411shm_dotruncate(struct shmfd *shmfd, off_t length) 412{ 413 vm_object_t object; 414 vm_page_t m; 415 vm_pindex_t idx, nobjsize; 416 vm_ooffset_t delta; 417 int base, rv; 418 419 KASSERT(length >= 0, ("shm_dotruncate: length < 0")); 420 object = shmfd->shm_object; 421 VM_OBJECT_WLOCK(object); 422 if (length == shmfd->shm_size) { 423 VM_OBJECT_WUNLOCK(object); 424 return (0); 425 } 426 nobjsize = OFF_TO_IDX(length + PAGE_MASK); 427 428 /* Are we shrinking? If so, trim the end. */ 429 if (length < shmfd->shm_size) { 430 /* 431 * Disallow any requests to shrink the size if this 432 * object is mapped into the kernel. 433 */ 434 if (shmfd->shm_kmappings > 0) { 435 VM_OBJECT_WUNLOCK(object); 436 return (EBUSY); 437 } 438 439 /* 440 * Zero the truncated part of the last page. 441 */ 442 base = length & PAGE_MASK; 443 if (base != 0) { 444 idx = OFF_TO_IDX(length); 445retry: 446 m = vm_page_lookup(object, idx); 447 if (m != NULL) { 448 if (vm_page_sleep_if_busy(m, "shmtrc")) 449 goto retry; 450 } else if (vm_pager_has_page(object, idx, NULL, NULL)) { 451 m = vm_page_alloc(object, idx, 452 VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); 453 if (m == NULL) 454 goto retry; 455 rv = vm_pager_get_pages(object, &m, 1, NULL, 456 NULL); 457 vm_page_lock(m); 458 if (rv == VM_PAGER_OK) { 459 vm_page_deactivate(m); 460 vm_page_unlock(m); 461 vm_page_xunbusy(m); 462 } else { 463 vm_page_free(m); 464 vm_page_unlock(m); 465 VM_OBJECT_WUNLOCK(object); 466 return (EIO); 467 } 468 } 469 if (m != NULL) { 470 pmap_zero_page_area(m, base, PAGE_SIZE - base); 471 KASSERT(m->valid == VM_PAGE_BITS_ALL, 472 ("shm_dotruncate: page %p is invalid", m)); 473 vm_page_dirty(m); 474 vm_pager_page_unswapped(m); 475 } 476 } 477 delta = IDX_TO_OFF(object->size - nobjsize); 478 479 /* Toss in memory pages. */ 480 if (nobjsize < object->size) 481 vm_object_page_remove(object, nobjsize, object->size, 482 0); 483 484 /* Toss pages from swap. */ 485 if (object->type == OBJT_SWAP) 486 swap_pager_freespace(object, nobjsize, delta); 487 488 /* Free the swap accounted for shm */ 489 swap_release_by_cred(delta, object->cred); 490 object->charge -= delta; 491 } else { 492 /* Try to reserve additional swap space. */ 493 delta = IDX_TO_OFF(nobjsize - object->size); 494 if (!swap_reserve_by_cred(delta, object->cred)) { 495 VM_OBJECT_WUNLOCK(object); 496 return (ENOMEM); 497 } 498 object->charge += delta; 499 } 500 shmfd->shm_size = length; 501 mtx_lock(&shm_timestamp_lock); 502 vfs_timestamp(&shmfd->shm_ctime); 503 shmfd->shm_mtime = shmfd->shm_ctime; 504 mtx_unlock(&shm_timestamp_lock); 505 object->size = nobjsize; 506 VM_OBJECT_WUNLOCK(object); 507 return (0); 508} 509 510/* 511 * shmfd object management including creation and reference counting 512 * routines. 513 */ 514struct shmfd * 515shm_alloc(struct ucred *ucred, mode_t mode) 516{ 517 struct shmfd *shmfd; 518 int ino; 519 520 shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); 521 shmfd->shm_size = 0; 522 shmfd->shm_uid = ucred->cr_uid; 523 shmfd->shm_gid = ucred->cr_gid; 524 shmfd->shm_mode = mode; 525 shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, 526 shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); 527 KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); 528 shmfd->shm_object->pg_color = 0; 529 VM_OBJECT_WLOCK(shmfd->shm_object); 530 vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); 531 vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); 532 VM_OBJECT_WUNLOCK(shmfd->shm_object); 533 vfs_timestamp(&shmfd->shm_birthtime); 534 shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = 535 shmfd->shm_birthtime; 536 ino = alloc_unr(shm_ino_unr); 537 if (ino == -1) 538 shmfd->shm_ino = 0; 539 else 540 shmfd->shm_ino = ino; 541 refcount_init(&shmfd->shm_refs, 1); 542 mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); 543 rangelock_init(&shmfd->shm_rl); 544#ifdef MAC 545 mac_posixshm_init(shmfd); 546 mac_posixshm_create(ucred, shmfd); 547#endif 548 549 return (shmfd); 550} 551 552struct shmfd * 553shm_hold(struct shmfd *shmfd) 554{ 555 556 refcount_acquire(&shmfd->shm_refs); 557 return (shmfd); 558} 559 560void 561shm_drop(struct shmfd *shmfd) 562{ 563 564 if (refcount_release(&shmfd->shm_refs)) { 565#ifdef MAC 566 mac_posixshm_destroy(shmfd); 567#endif 568 rangelock_destroy(&shmfd->shm_rl); 569 mtx_destroy(&shmfd->shm_mtx); 570 vm_object_deallocate(shmfd->shm_object); 571 if (shmfd->shm_ino != 0) 572 free_unr(shm_ino_unr, shmfd->shm_ino); 573 free(shmfd, M_SHMFD); 574 } 575} 576 577/* 578 * Determine if the credentials have sufficient permissions for a 579 * specified combination of FREAD and FWRITE. 580 */ 581int 582shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) 583{ 584 accmode_t accmode; 585 int error; 586 587 accmode = 0; 588 if (flags & FREAD) 589 accmode |= VREAD; 590 if (flags & FWRITE) 591 accmode |= VWRITE; 592 mtx_lock(&shm_timestamp_lock); 593 error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, 594 accmode, ucred, NULL); 595 mtx_unlock(&shm_timestamp_lock); 596 return (error); 597} 598 599/* 600 * Dictionary management. We maintain an in-kernel dictionary to map 601 * paths to shmfd objects. We use the FNV hash on the path to store 602 * the mappings in a hash table. 603 */ 604static void 605shm_init(void *arg) 606{ 607 608 mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); 609 sx_init(&shm_dict_lock, "shm dictionary"); 610 shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); 611 shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL); 612 KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized")); 613 shm_dev_ino = devfs_alloc_cdp_inode(); 614 KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); 615} 616SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); 617 618static struct shmfd * 619shm_lookup(char *path, Fnv32_t fnv) 620{ 621 struct shm_mapping *map; 622 623 LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 624 if (map->sm_fnv != fnv) 625 continue; 626 if (strcmp(map->sm_path, path) == 0) 627 return (map->sm_shmfd); 628 } 629 630 return (NULL); 631} 632 633static void 634shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) 635{ 636 struct shm_mapping *map; 637 638 map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); 639 map->sm_path = path; 640 map->sm_fnv = fnv; 641 map->sm_shmfd = shm_hold(shmfd); 642 shmfd->shm_path = path; 643 LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); 644} 645 646static int 647shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) 648{ 649 struct shm_mapping *map; 650 int error; 651 652 LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 653 if (map->sm_fnv != fnv) 654 continue; 655 if (strcmp(map->sm_path, path) == 0) { 656#ifdef MAC 657 error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); 658 if (error) 659 return (error); 660#endif 661 error = shm_access(map->sm_shmfd, ucred, 662 FREAD | FWRITE); 663 if (error) 664 return (error); 665 map->sm_shmfd->shm_path = NULL; 666 LIST_REMOVE(map, sm_link); 667 shm_drop(map->sm_shmfd); 668 free(map->sm_path, M_SHMFD); 669 free(map, M_SHMFD); 670 return (0); 671 } 672 } 673 674 return (ENOENT); 675} 676 677int 678kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, 679 struct filecaps *fcaps) 680{ 681 struct filedesc *fdp; 682 struct shmfd *shmfd; 683 struct file *fp; 684 char *path; 685 const char *pr_path; 686 size_t pr_pathlen; 687 Fnv32_t fnv; 688 mode_t cmode; 689 int fd, error; 690 691#ifdef CAPABILITY_MODE 692 /* 693 * shm_open(2) is only allowed for anonymous objects. 694 */ 695 if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) 696 return (ECAPMODE); 697#endif 698 699 if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) 700 return (EINVAL); 701 702 if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) 703 return (EINVAL); 704 705 fdp = td->td_proc->p_fd; 706 cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; 707 708 /* 709 * shm_open(2) created shm should always have O_CLOEXEC set, as mandated 710 * by POSIX. We allow it to be unset here so that an in-kernel 711 * interface may be written as a thin layer around shm, optionally not 712 * setting CLOEXEC. For shm_open(2), O_CLOEXEC is set unconditionally 713 * in sys_shm_open() to keep this implementation compliant. 714 */ 715 error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps); 716 if (error) 717 return (error); 718 719 /* A SHM_ANON path pointer creates an anonymous object. */ 720 if (userpath == SHM_ANON) { 721 /* A read-only anonymous object is pointless. */ 722 if ((flags & O_ACCMODE) == O_RDONLY) { 723 fdclose(td, fp, fd); 724 fdrop(fp, td); 725 return (EINVAL); 726 } 727 shmfd = shm_alloc(td->td_ucred, cmode); 728 } else { 729 path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); 730 pr_path = td->td_ucred->cr_prison->pr_path; 731 732 /* Construct a full pathname for jailed callers. */ 733 pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 734 : strlcpy(path, pr_path, MAXPATHLEN); 735 error = copyinstr(userpath, path + pr_pathlen, 736 MAXPATHLEN - pr_pathlen, NULL); 737#ifdef KTRACE 738 if (error == 0 && KTRPOINT(curthread, KTR_NAMEI)) 739 ktrnamei(path); 740#endif 741 /* Require paths to start with a '/' character. */ 742 if (error == 0 && path[pr_pathlen] != '/') 743 error = EINVAL; 744 if (error) { 745 fdclose(td, fp, fd); 746 fdrop(fp, td); 747 free(path, M_SHMFD); 748 return (error); 749 } 750 751 fnv = fnv_32_str(path, FNV1_32_INIT); 752 sx_xlock(&shm_dict_lock); 753 shmfd = shm_lookup(path, fnv); 754 if (shmfd == NULL) { 755 /* Object does not yet exist, create it if requested. */ 756 if (flags & O_CREAT) { 757#ifdef MAC 758 error = mac_posixshm_check_create(td->td_ucred, 759 path); 760 if (error == 0) { 761#endif 762 shmfd = shm_alloc(td->td_ucred, cmode); 763 shm_insert(path, fnv, shmfd); 764#ifdef MAC 765 } 766#endif 767 } else { 768 free(path, M_SHMFD); 769 error = ENOENT; 770 } 771 } else { 772 /* 773 * Object already exists, obtain a new 774 * reference if requested and permitted. 775 */ 776 free(path, M_SHMFD); 777 if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 778 error = EEXIST; 779 else { 780#ifdef MAC 781 error = mac_posixshm_check_open(td->td_ucred, 782 shmfd, FFLAGS(flags & O_ACCMODE)); 783 if (error == 0) 784#endif 785 error = shm_access(shmfd, td->td_ucred, 786 FFLAGS(flags & O_ACCMODE)); 787 } 788 789 /* 790 * Truncate the file back to zero length if 791 * O_TRUNC was specified and the object was 792 * opened with read/write. 793 */ 794 if (error == 0 && 795 (flags & (O_ACCMODE | O_TRUNC)) == 796 (O_RDWR | O_TRUNC)) { 797#ifdef MAC 798 error = mac_posixshm_check_truncate( 799 td->td_ucred, fp->f_cred, shmfd); 800 if (error == 0) 801#endif 802 shm_dotruncate(shmfd, 0); 803 } 804 if (error == 0) 805 shm_hold(shmfd); 806 } 807 sx_xunlock(&shm_dict_lock); 808 809 if (error) { 810 fdclose(td, fp, fd); 811 fdrop(fp, td); 812 return (error); 813 } 814 } 815 816 finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); 817 818 td->td_retval[0] = fd; 819 fdrop(fp, td); 820 821 return (0); 822} 823 824/* System calls. */ 825int 826sys_shm_open(struct thread *td, struct shm_open_args *uap) 827{ 828 829 return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, uap->mode, 830 NULL)); 831} 832 833int 834sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) 835{ 836 char *path; 837 const char *pr_path; 838 size_t pr_pathlen; 839 Fnv32_t fnv; 840 int error; 841 842 path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 843 pr_path = td->td_ucred->cr_prison->pr_path; 844 pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 845 : strlcpy(path, pr_path, MAXPATHLEN); 846 error = copyinstr(uap->path, path + pr_pathlen, MAXPATHLEN - pr_pathlen, 847 NULL); 848 if (error) { 849 free(path, M_TEMP); 850 return (error); 851 } 852#ifdef KTRACE 853 if (KTRPOINT(curthread, KTR_NAMEI)) 854 ktrnamei(path); 855#endif 856 fnv = fnv_32_str(path, FNV1_32_INIT); 857 sx_xlock(&shm_dict_lock); 858 error = shm_remove(path, fnv, td->td_ucred); 859 sx_xunlock(&shm_dict_lock); 860 free(path, M_TEMP); 861 862 return (error); 863} 864 865int 866shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, 867 vm_prot_t prot, vm_prot_t cap_maxprot, int flags, 868 vm_ooffset_t foff, struct thread *td) 869{ 870 struct shmfd *shmfd; 871 vm_prot_t maxprot; 872 int error; 873 874 shmfd = fp->f_data; 875 maxprot = VM_PROT_NONE; 876 877 /* FREAD should always be set. */ 878 if ((fp->f_flag & FREAD) != 0) 879 maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; 880 if ((fp->f_flag & FWRITE) != 0) 881 maxprot |= VM_PROT_WRITE; 882 883 /* Don't permit shared writable mappings on read-only descriptors. */ 884 if ((flags & MAP_SHARED) != 0 && 885 (maxprot & VM_PROT_WRITE) == 0 && 886 (prot & VM_PROT_WRITE) != 0) 887 return (EACCES); 888 maxprot &= cap_maxprot; 889 890 /* See comment in vn_mmap(). */ 891 if ( 892#ifdef _LP64 893 objsize > OFF_MAX || 894#endif 895 foff < 0 || foff > OFF_MAX - objsize) 896 return (EINVAL); 897 898#ifdef MAC 899 error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); 900 if (error != 0) 901 return (error); 902#endif 903 904 mtx_lock(&shm_timestamp_lock); 905 vfs_timestamp(&shmfd->shm_atime); 906 mtx_unlock(&shm_timestamp_lock); 907 vm_object_reference(shmfd->shm_object); 908 909 error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, 910 shmfd->shm_object, foff, FALSE, td); 911 if (error != 0) 912 vm_object_deallocate(shmfd->shm_object); 913 return (error); 914} 915 916static int 917shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 918 struct thread *td) 919{ 920 struct shmfd *shmfd; 921 int error; 922 923 error = 0; 924 shmfd = fp->f_data; 925 mtx_lock(&shm_timestamp_lock); 926 /* 927 * SUSv4 says that x bits of permission need not be affected. 928 * Be consistent with our shm_open there. 929 */ 930#ifdef MAC 931 error = mac_posixshm_check_setmode(active_cred, shmfd, mode); 932 if (error != 0) 933 goto out; 934#endif 935 error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, 936 shmfd->shm_gid, VADMIN, active_cred, NULL); 937 if (error != 0) 938 goto out; 939 shmfd->shm_mode = mode & ACCESSPERMS; 940out: 941 mtx_unlock(&shm_timestamp_lock); 942 return (error); 943} 944 945static int 946shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 947 struct thread *td) 948{ 949 struct shmfd *shmfd; 950 int error; 951 952 error = 0; 953 shmfd = fp->f_data; 954 mtx_lock(&shm_timestamp_lock); 955#ifdef MAC 956 error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); 957 if (error != 0) 958 goto out; 959#endif 960 if (uid == (uid_t)-1) 961 uid = shmfd->shm_uid; 962 if (gid == (gid_t)-1) 963 gid = shmfd->shm_gid; 964 if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || 965 (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && 966 (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) 967 goto out; 968 shmfd->shm_uid = uid; 969 shmfd->shm_gid = gid; 970out: 971 mtx_unlock(&shm_timestamp_lock); 972 return (error); 973} 974 975/* 976 * Helper routines to allow the backing object of a shared memory file 977 * descriptor to be mapped in the kernel. 978 */ 979int 980shm_map(struct file *fp, size_t size, off_t offset, void **memp) 981{ 982 struct shmfd *shmfd; 983 vm_offset_t kva, ofs; 984 vm_object_t obj; 985 int rv; 986 987 if (fp->f_type != DTYPE_SHM) 988 return (EINVAL); 989 shmfd = fp->f_data; 990 obj = shmfd->shm_object; 991 VM_OBJECT_WLOCK(obj); 992 /* 993 * XXXRW: This validation is probably insufficient, and subject to 994 * sign errors. It should be fixed. 995 */ 996 if (offset >= shmfd->shm_size || 997 offset + size > round_page(shmfd->shm_size)) { 998 VM_OBJECT_WUNLOCK(obj); 999 return (EINVAL); 1000 } 1001 1002 shmfd->shm_kmappings++; 1003 vm_object_reference_locked(obj); 1004 VM_OBJECT_WUNLOCK(obj); 1005 1006 /* Map the object into the kernel_map and wire it. */ 1007 kva = vm_map_min(kernel_map); 1008 ofs = offset & PAGE_MASK; 1009 offset = trunc_page(offset); 1010 size = round_page(size + ofs); 1011 rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, 1012 VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, 1013 VM_PROT_READ | VM_PROT_WRITE, 0); 1014 if (rv == KERN_SUCCESS) { 1015 rv = vm_map_wire(kernel_map, kva, kva + size, 1016 VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); 1017 if (rv == KERN_SUCCESS) { 1018 *memp = (void *)(kva + ofs); 1019 return (0); 1020 } 1021 vm_map_remove(kernel_map, kva, kva + size); 1022 } else 1023 vm_object_deallocate(obj); 1024 1025 /* On failure, drop our mapping reference. */ 1026 VM_OBJECT_WLOCK(obj); 1027 shmfd->shm_kmappings--; 1028 VM_OBJECT_WUNLOCK(obj); 1029 1030 return (vm_mmap_to_errno(rv)); 1031} 1032 1033/* 1034 * We require the caller to unmap the entire entry. This allows us to 1035 * safely decrement shm_kmappings when a mapping is removed. 1036 */ 1037int 1038shm_unmap(struct file *fp, void *mem, size_t size) 1039{ 1040 struct shmfd *shmfd; 1041 vm_map_entry_t entry; 1042 vm_offset_t kva, ofs; 1043 vm_object_t obj; 1044 vm_pindex_t pindex; 1045 vm_prot_t prot; 1046 boolean_t wired; 1047 vm_map_t map; 1048 int rv; 1049 1050 if (fp->f_type != DTYPE_SHM) 1051 return (EINVAL); 1052 shmfd = fp->f_data; 1053 kva = (vm_offset_t)mem; 1054 ofs = kva & PAGE_MASK; 1055 kva = trunc_page(kva); 1056 size = round_page(size + ofs); 1057 map = kernel_map; 1058 rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, 1059 &obj, &pindex, &prot, &wired); 1060 if (rv != KERN_SUCCESS) 1061 return (EINVAL); 1062 if (entry->start != kva || entry->end != kva + size) { 1063 vm_map_lookup_done(map, entry); 1064 return (EINVAL); 1065 } 1066 vm_map_lookup_done(map, entry); 1067 if (obj != shmfd->shm_object) 1068 return (EINVAL); 1069 vm_map_remove(map, kva, kva + size); 1070 VM_OBJECT_WLOCK(obj); 1071 KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); 1072 shmfd->shm_kmappings--; 1073 VM_OBJECT_WUNLOCK(obj); 1074 return (0); 1075} 1076 1077static int 1078shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 1079{ 1080 const char *path, *pr_path; 1081 struct shmfd *shmfd; 1082 size_t pr_pathlen; 1083 1084 kif->kf_type = KF_TYPE_SHM; 1085 shmfd = fp->f_data; 1086 1087 mtx_lock(&shm_timestamp_lock); 1088 kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; /* XXX */ 1089 mtx_unlock(&shm_timestamp_lock); 1090 kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; 1091 if (shmfd->shm_path != NULL) { 1092 sx_slock(&shm_dict_lock); 1093 if (shmfd->shm_path != NULL) { 1094 path = shmfd->shm_path; 1095 pr_path = curthread->td_ucred->cr_prison->pr_path; 1096 if (strcmp(pr_path, "/") != 0) { 1097 /* Return the jail-rooted pathname. */ 1098 pr_pathlen = strlen(pr_path); 1099 if (strncmp(path, pr_path, pr_pathlen) == 0 && 1100 path[pr_pathlen] == '/') 1101 path += pr_pathlen; 1102 } 1103 strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); 1104 } 1105 sx_sunlock(&shm_dict_lock); 1106 } 1107 return (0); 1108} 1109