/* uipc_shm.c revision 269495 */
1/*- 2 * Copyright (c) 2006, 2011 Robert N. M. Watson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27/* 28 * Support for shared swap-backed anonymous memory objects via 29 * shm_open(2) and shm_unlink(2). While most of the implementation is 30 * here, vm_mmap.c contains mapping logic changes. 31 * 32 * TODO: 33 * 34 * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1) 35 * and ipcrm(1) be expanded or should new tools to manage both POSIX 36 * kernel semaphores and POSIX shared memory be written? 37 * 38 * (2) Add support for this file type to fstat(1). 39 * 40 * (3) Resource limits? 
Does this need its own resource limits or are the 41 * existing limits in mmap(2) sufficient? 42 */ 43 44#include <sys/cdefs.h> 45__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_shm.c 269495 2014-08-04 01:14:27Z kib $"); 46 47#include "opt_capsicum.h" 48 49#include <sys/param.h> 50#include <sys/capability.h> 51#include <sys/fcntl.h> 52#include <sys/file.h> 53#include <sys/filedesc.h> 54#include <sys/fnv_hash.h> 55#include <sys/kernel.h> 56#include <sys/lock.h> 57#include <sys/malloc.h> 58#include <sys/mman.h> 59#include <sys/mutex.h> 60#include <sys/priv.h> 61#include <sys/proc.h> 62#include <sys/refcount.h> 63#include <sys/resourcevar.h> 64#include <sys/rwlock.h> 65#include <sys/stat.h> 66#include <sys/sysctl.h> 67#include <sys/sysproto.h> 68#include <sys/systm.h> 69#include <sys/sx.h> 70#include <sys/time.h> 71#include <sys/vnode.h> 72#include <sys/unistd.h> 73 74#include <security/mac/mac_framework.h> 75 76#include <vm/vm.h> 77#include <vm/vm_param.h> 78#include <vm/pmap.h> 79#include <vm/vm_extern.h> 80#include <vm/vm_map.h> 81#include <vm/vm_kern.h> 82#include <vm/vm_object.h> 83#include <vm/vm_page.h> 84#include <vm/vm_pageout.h> 85#include <vm/vm_pager.h> 86#include <vm/swap_pager.h> 87 88struct shm_mapping { 89 char *sm_path; 90 Fnv32_t sm_fnv; 91 struct shmfd *sm_shmfd; 92 LIST_ENTRY(shm_mapping) sm_link; 93}; 94 95static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); 96static LIST_HEAD(, shm_mapping) *shm_dictionary; 97static struct sx shm_dict_lock; 98static struct mtx shm_timestamp_lock; 99static u_long shm_hash; 100 101#define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) 102 103static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); 104static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); 105static void shm_dict_init(void *arg); 106static void shm_drop(struct shmfd *shmfd); 107static struct shmfd *shm_hold(struct shmfd *shmfd); 108static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); 
109static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); 110static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); 111static int shm_dotruncate(struct shmfd *shmfd, off_t length); 112 113static fo_rdwr_t shm_read; 114static fo_rdwr_t shm_write; 115static fo_truncate_t shm_truncate; 116static fo_ioctl_t shm_ioctl; 117static fo_poll_t shm_poll; 118static fo_kqfilter_t shm_kqfilter; 119static fo_stat_t shm_stat; 120static fo_close_t shm_close; 121static fo_chmod_t shm_chmod; 122static fo_chown_t shm_chown; 123static fo_seek_t shm_seek; 124 125/* File descriptor operations. */ 126static struct fileops shm_ops = { 127 .fo_read = shm_read, 128 .fo_write = shm_write, 129 .fo_truncate = shm_truncate, 130 .fo_ioctl = shm_ioctl, 131 .fo_poll = shm_poll, 132 .fo_kqfilter = shm_kqfilter, 133 .fo_stat = shm_stat, 134 .fo_close = shm_close, 135 .fo_chmod = shm_chmod, 136 .fo_chown = shm_chown, 137 .fo_sendfile = vn_sendfile, 138 .fo_seek = shm_seek, 139 .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE 140}; 141 142FEATURE(posix_shm, "POSIX shared memory"); 143 144static int 145uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) 146{ 147 vm_page_t m; 148 vm_pindex_t idx; 149 size_t tlen; 150 int error, offset, rv; 151 152 idx = OFF_TO_IDX(uio->uio_offset); 153 offset = uio->uio_offset & PAGE_MASK; 154 tlen = MIN(PAGE_SIZE - offset, len); 155 156 VM_OBJECT_WLOCK(obj); 157 158 /* 159 * Parallel reads of the page content from disk are prevented 160 * by exclusive busy. 161 * 162 * Although the tmpfs vnode lock is held here, it is 163 * nonetheless safe to sleep waiting for a free page. The 164 * pageout daemon does not need to acquire the tmpfs vnode 165 * lock to page out tobj's pages because tobj is a OBJT_SWAP 166 * type object. 
167 */ 168 m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL); 169 if (m->valid != VM_PAGE_BITS_ALL) { 170 if (vm_pager_has_page(obj, idx, NULL, NULL)) { 171 rv = vm_pager_get_pages(obj, &m, 1, 0); 172 m = vm_page_lookup(obj, idx); 173 if (m == NULL) { 174 printf( 175 "uiomove_object: vm_obj %p idx %jd null lookup rv %d\n", 176 obj, idx, rv); 177 VM_OBJECT_WUNLOCK(obj); 178 return (EIO); 179 } 180 if (rv != VM_PAGER_OK) { 181 printf( 182 "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", 183 obj, idx, m->valid, rv); 184 vm_page_lock(m); 185 vm_page_free(m); 186 vm_page_unlock(m); 187 VM_OBJECT_WUNLOCK(obj); 188 return (EIO); 189 } 190 } else 191 vm_page_zero_invalid(m, TRUE); 192 } 193 vm_page_xunbusy(m); 194 vm_page_lock(m); 195 vm_page_hold(m); 196 vm_page_unlock(m); 197 VM_OBJECT_WUNLOCK(obj); 198 error = uiomove_fromphys(&m, offset, tlen, uio); 199 if (uio->uio_rw == UIO_WRITE && error == 0) { 200 VM_OBJECT_WLOCK(obj); 201 vm_page_dirty(m); 202 vm_pager_page_unswapped(m); 203 VM_OBJECT_WUNLOCK(obj); 204 } 205 vm_page_lock(m); 206 vm_page_unhold(m); 207 if (m->queue == PQ_NONE) { 208 vm_page_deactivate(m); 209 } else { 210 /* Requeue to maintain LRU ordering. 
*/ 211 vm_page_requeue(m); 212 } 213 vm_page_unlock(m); 214 215 return (error); 216} 217 218int 219uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) 220{ 221 ssize_t resid; 222 size_t len; 223 int error; 224 225 error = 0; 226 while ((resid = uio->uio_resid) > 0) { 227 if (obj_size <= uio->uio_offset) 228 break; 229 len = MIN(obj_size - uio->uio_offset, resid); 230 if (len == 0) 231 break; 232 error = uiomove_object_page(obj, len, uio); 233 if (error != 0 || resid == uio->uio_resid) 234 break; 235 } 236 return (error); 237} 238 239static int 240shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) 241{ 242 struct shmfd *shmfd; 243 off_t foffset; 244 int error; 245 246 shmfd = fp->f_data; 247 foffset = foffset_lock(fp, 0); 248 error = 0; 249 switch (whence) { 250 case L_INCR: 251 if (foffset < 0 || 252 (offset > 0 && foffset > OFF_MAX - offset)) { 253 error = EOVERFLOW; 254 break; 255 } 256 offset += foffset; 257 break; 258 case L_XTND: 259 if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { 260 error = EOVERFLOW; 261 break; 262 } 263 offset += shmfd->shm_size; 264 break; 265 case L_SET: 266 break; 267 default: 268 error = EINVAL; 269 } 270 if (error == 0) { 271 if (offset < 0 || offset > shmfd->shm_size) 272 error = EINVAL; 273 else 274 *(off_t *)(td->td_retval) = offset; 275 } 276 foffset_unlock(fp, offset, error != 0 ? 
FOF_NOUPDATE : 0); 277 return (error); 278} 279 280static int 281shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 282 int flags, struct thread *td) 283{ 284 struct shmfd *shmfd; 285 void *rl_cookie; 286 int error; 287 288 shmfd = fp->f_data; 289 foffset_lock_uio(fp, uio, flags); 290 rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, 291 uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); 292#ifdef MAC 293 error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); 294 if (error) 295 return (error); 296#endif 297 error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); 298 rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 299 foffset_unlock_uio(fp, uio, flags); 300 return (error); 301} 302 303static int 304shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 305 int flags, struct thread *td) 306{ 307 struct shmfd *shmfd; 308 void *rl_cookie; 309 int error; 310 311 shmfd = fp->f_data; 312#ifdef MAC 313 error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); 314 if (error) 315 return (error); 316#endif 317 foffset_lock_uio(fp, uio, flags); 318 if ((flags & FOF_OFFSET) == 0) { 319 rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, 320 &shmfd->shm_mtx); 321 } else { 322 rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, 323 uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); 324 } 325 326 error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); 327 rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); 328 foffset_unlock_uio(fp, uio, flags); 329 return (error); 330} 331 332static int 333shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, 334 struct thread *td) 335{ 336 struct shmfd *shmfd; 337#ifdef MAC 338 int error; 339#endif 340 341 shmfd = fp->f_data; 342#ifdef MAC 343 error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); 344 if (error) 345 return (error); 346#endif 347 return (shm_dotruncate(shmfd, length)); 348} 349 
350static int 351shm_ioctl(struct file *fp, u_long com, void *data, 352 struct ucred *active_cred, struct thread *td) 353{ 354 355 return (EOPNOTSUPP); 356} 357 358static int 359shm_poll(struct file *fp, int events, struct ucred *active_cred, 360 struct thread *td) 361{ 362 363 return (EOPNOTSUPP); 364} 365 366static int 367shm_kqfilter(struct file *fp, struct knote *kn) 368{ 369 370 return (EOPNOTSUPP); 371} 372 373static int 374shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, 375 struct thread *td) 376{ 377 struct shmfd *shmfd; 378#ifdef MAC 379 int error; 380#endif 381 382 shmfd = fp->f_data; 383 384#ifdef MAC 385 error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); 386 if (error) 387 return (error); 388#endif 389 390 /* 391 * Attempt to return sanish values for fstat() on a memory file 392 * descriptor. 393 */ 394 bzero(sb, sizeof(*sb)); 395 sb->st_blksize = PAGE_SIZE; 396 sb->st_size = shmfd->shm_size; 397 sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize; 398 mtx_lock(&shm_timestamp_lock); 399 sb->st_atim = shmfd->shm_atime; 400 sb->st_ctim = shmfd->shm_ctime; 401 sb->st_mtim = shmfd->shm_mtime; 402 sb->st_birthtim = shmfd->shm_birthtime; 403 sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ 404 sb->st_uid = shmfd->shm_uid; 405 sb->st_gid = shmfd->shm_gid; 406 mtx_unlock(&shm_timestamp_lock); 407 408 return (0); 409} 410 411static int 412shm_close(struct file *fp, struct thread *td) 413{ 414 struct shmfd *shmfd; 415 416 shmfd = fp->f_data; 417 fp->f_data = NULL; 418 shm_drop(shmfd); 419 420 return (0); 421} 422 423static int 424shm_dotruncate(struct shmfd *shmfd, off_t length) 425{ 426 vm_object_t object; 427 vm_page_t m, ma[1]; 428 vm_pindex_t idx, nobjsize; 429 vm_ooffset_t delta; 430 int base, rv; 431 432 object = shmfd->shm_object; 433 VM_OBJECT_WLOCK(object); 434 if (length == shmfd->shm_size) { 435 VM_OBJECT_WUNLOCK(object); 436 return (0); 437 } 438 nobjsize = OFF_TO_IDX(length + PAGE_MASK); 439 440 
/* Are we shrinking? If so, trim the end. */ 441 if (length < shmfd->shm_size) { 442 /* 443 * Disallow any requests to shrink the size if this 444 * object is mapped into the kernel. 445 */ 446 if (shmfd->shm_kmappings > 0) { 447 VM_OBJECT_WUNLOCK(object); 448 return (EBUSY); 449 } 450 451 /* 452 * Zero the truncated part of the last page. 453 */ 454 base = length & PAGE_MASK; 455 if (base != 0) { 456 idx = OFF_TO_IDX(length); 457retry: 458 m = vm_page_lookup(object, idx); 459 if (m != NULL) { 460 if (vm_page_sleep_if_busy(m, "shmtrc")) 461 goto retry; 462 } else if (vm_pager_has_page(object, idx, NULL, NULL)) { 463 m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL); 464 if (m == NULL) { 465 VM_OBJECT_WUNLOCK(object); 466 VM_WAIT; 467 VM_OBJECT_WLOCK(object); 468 goto retry; 469 } else if (m->valid != VM_PAGE_BITS_ALL) { 470 ma[0] = m; 471 rv = vm_pager_get_pages(object, ma, 1, 472 0); 473 m = vm_page_lookup(object, idx); 474 } else 475 /* A cached page was reactivated. */ 476 rv = VM_PAGER_OK; 477 vm_page_lock(m); 478 if (rv == VM_PAGER_OK) { 479 vm_page_deactivate(m); 480 vm_page_unlock(m); 481 vm_page_xunbusy(m); 482 } else { 483 vm_page_free(m); 484 vm_page_unlock(m); 485 VM_OBJECT_WUNLOCK(object); 486 return (EIO); 487 } 488 } 489 if (m != NULL) { 490 pmap_zero_page_area(m, base, PAGE_SIZE - base); 491 KASSERT(m->valid == VM_PAGE_BITS_ALL, 492 ("shm_dotruncate: page %p is invalid", m)); 493 vm_page_dirty(m); 494 vm_pager_page_unswapped(m); 495 } 496 } 497 delta = ptoa(object->size - nobjsize); 498 499 /* Toss in memory pages. */ 500 if (nobjsize < object->size) 501 vm_object_page_remove(object, nobjsize, object->size, 502 0); 503 504 /* Toss pages from swap. 
*/ 505 if (object->type == OBJT_SWAP) 506 swap_pager_freespace(object, nobjsize, delta); 507 508 /* Free the swap accounted for shm */ 509 swap_release_by_cred(delta, object->cred); 510 object->charge -= delta; 511 } else { 512 /* Attempt to reserve the swap */ 513 delta = ptoa(nobjsize - object->size); 514 if (!swap_reserve_by_cred(delta, object->cred)) { 515 VM_OBJECT_WUNLOCK(object); 516 return (ENOMEM); 517 } 518 object->charge += delta; 519 } 520 shmfd->shm_size = length; 521 mtx_lock(&shm_timestamp_lock); 522 vfs_timestamp(&shmfd->shm_ctime); 523 shmfd->shm_mtime = shmfd->shm_ctime; 524 mtx_unlock(&shm_timestamp_lock); 525 object->size = nobjsize; 526 VM_OBJECT_WUNLOCK(object); 527 return (0); 528} 529 530/* 531 * shmfd object management including creation and reference counting 532 * routines. 533 */ 534static struct shmfd * 535shm_alloc(struct ucred *ucred, mode_t mode) 536{ 537 struct shmfd *shmfd; 538 539 shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); 540 shmfd->shm_size = 0; 541 shmfd->shm_uid = ucred->cr_uid; 542 shmfd->shm_gid = ucred->cr_gid; 543 shmfd->shm_mode = mode; 544 shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, 545 shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); 546 KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); 547 VM_OBJECT_WLOCK(shmfd->shm_object); 548 vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); 549 vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT); 550 VM_OBJECT_WUNLOCK(shmfd->shm_object); 551 vfs_timestamp(&shmfd->shm_birthtime); 552 shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = 553 shmfd->shm_birthtime; 554 refcount_init(&shmfd->shm_refs, 1); 555 mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); 556 rangelock_init(&shmfd->shm_rl); 557#ifdef MAC 558 mac_posixshm_init(shmfd); 559 mac_posixshm_create(ucred, shmfd); 560#endif 561 562 return (shmfd); 563} 564 565static struct shmfd * 566shm_hold(struct shmfd *shmfd) 567{ 568 569 refcount_acquire(&shmfd->shm_refs); 570 
return (shmfd); 571} 572 573static void 574shm_drop(struct shmfd *shmfd) 575{ 576 577 if (refcount_release(&shmfd->shm_refs)) { 578#ifdef MAC 579 mac_posixshm_destroy(shmfd); 580#endif 581 rangelock_destroy(&shmfd->shm_rl); 582 mtx_destroy(&shmfd->shm_mtx); 583 vm_object_deallocate(shmfd->shm_object); 584 free(shmfd, M_SHMFD); 585 } 586} 587 588/* 589 * Determine if the credentials have sufficient permissions for a 590 * specified combination of FREAD and FWRITE. 591 */ 592static int 593shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) 594{ 595 accmode_t accmode; 596 int error; 597 598 accmode = 0; 599 if (flags & FREAD) 600 accmode |= VREAD; 601 if (flags & FWRITE) 602 accmode |= VWRITE; 603 mtx_lock(&shm_timestamp_lock); 604 error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, 605 accmode, ucred, NULL); 606 mtx_unlock(&shm_timestamp_lock); 607 return (error); 608} 609 610/* 611 * Dictionary management. We maintain an in-kernel dictionary to map 612 * paths to shmfd objects. We use the FNV hash on the path to store 613 * the mappings in a hash table. 
614 */ 615static void 616shm_dict_init(void *arg) 617{ 618 619 mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); 620 sx_init(&shm_dict_lock, "shm dictionary"); 621 shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); 622} 623SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL); 624 625static struct shmfd * 626shm_lookup(char *path, Fnv32_t fnv) 627{ 628 struct shm_mapping *map; 629 630 LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 631 if (map->sm_fnv != fnv) 632 continue; 633 if (strcmp(map->sm_path, path) == 0) 634 return (map->sm_shmfd); 635 } 636 637 return (NULL); 638} 639 640static void 641shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) 642{ 643 struct shm_mapping *map; 644 645 map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); 646 map->sm_path = path; 647 map->sm_fnv = fnv; 648 map->sm_shmfd = shm_hold(shmfd); 649 shmfd->shm_path = path; 650 LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); 651} 652 653static int 654shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) 655{ 656 struct shm_mapping *map; 657 int error; 658 659 LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { 660 if (map->sm_fnv != fnv) 661 continue; 662 if (strcmp(map->sm_path, path) == 0) { 663#ifdef MAC 664 error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); 665 if (error) 666 return (error); 667#endif 668 error = shm_access(map->sm_shmfd, ucred, 669 FREAD | FWRITE); 670 if (error) 671 return (error); 672 map->sm_shmfd->shm_path = NULL; 673 LIST_REMOVE(map, sm_link); 674 shm_drop(map->sm_shmfd); 675 free(map->sm_path, M_SHMFD); 676 free(map, M_SHMFD); 677 return (0); 678 } 679 } 680 681 return (ENOENT); 682} 683 684/* System calls. */ 685int 686sys_shm_open(struct thread *td, struct shm_open_args *uap) 687{ 688 struct filedesc *fdp; 689 struct shmfd *shmfd; 690 struct file *fp; 691 char *path; 692 Fnv32_t fnv; 693 mode_t cmode; 694 int fd, error; 695 696#ifdef CAPABILITY_MODE 697 /* 698 * shm_open(2) is only allowed for anonymous objects. 
699 */ 700 if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON)) 701 return (ECAPMODE); 702#endif 703 704 if ((uap->flags & O_ACCMODE) != O_RDONLY && 705 (uap->flags & O_ACCMODE) != O_RDWR) 706 return (EINVAL); 707 708 if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) 709 return (EINVAL); 710 711 fdp = td->td_proc->p_fd; 712 cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS; 713 714 error = falloc(td, &fp, &fd, O_CLOEXEC); 715 if (error) 716 return (error); 717 718 /* A SHM_ANON path pointer creates an anonymous object. */ 719 if (uap->path == SHM_ANON) { 720 /* A read-only anonymous object is pointless. */ 721 if ((uap->flags & O_ACCMODE) == O_RDONLY) { 722 fdclose(fdp, fp, fd, td); 723 fdrop(fp, td); 724 return (EINVAL); 725 } 726 shmfd = shm_alloc(td->td_ucred, cmode); 727 } else { 728 path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); 729 error = copyinstr(uap->path, path, MAXPATHLEN, NULL); 730 731 /* Require paths to start with a '/' character. */ 732 if (error == 0 && path[0] != '/') 733 error = EINVAL; 734 if (error) { 735 fdclose(fdp, fp, fd, td); 736 fdrop(fp, td); 737 free(path, M_SHMFD); 738 return (error); 739 } 740 741 fnv = fnv_32_str(path, FNV1_32_INIT); 742 sx_xlock(&shm_dict_lock); 743 shmfd = shm_lookup(path, fnv); 744 if (shmfd == NULL) { 745 /* Object does not yet exist, create it if requested. */ 746 if (uap->flags & O_CREAT) { 747#ifdef MAC 748 error = mac_posixshm_check_create(td->td_ucred, 749 path); 750 if (error == 0) { 751#endif 752 shmfd = shm_alloc(td->td_ucred, cmode); 753 shm_insert(path, fnv, shmfd); 754#ifdef MAC 755 } 756#endif 757 } else { 758 free(path, M_SHMFD); 759 error = ENOENT; 760 } 761 } else { 762 /* 763 * Object already exists, obtain a new 764 * reference if requested and permitted. 
765 */ 766 free(path, M_SHMFD); 767 if ((uap->flags & (O_CREAT | O_EXCL)) == 768 (O_CREAT | O_EXCL)) 769 error = EEXIST; 770 else { 771#ifdef MAC 772 error = mac_posixshm_check_open(td->td_ucred, 773 shmfd, FFLAGS(uap->flags & O_ACCMODE)); 774 if (error == 0) 775#endif 776 error = shm_access(shmfd, td->td_ucred, 777 FFLAGS(uap->flags & O_ACCMODE)); 778 } 779 780 /* 781 * Truncate the file back to zero length if 782 * O_TRUNC was specified and the object was 783 * opened with read/write. 784 */ 785 if (error == 0 && 786 (uap->flags & (O_ACCMODE | O_TRUNC)) == 787 (O_RDWR | O_TRUNC)) { 788#ifdef MAC 789 error = mac_posixshm_check_truncate( 790 td->td_ucred, fp->f_cred, shmfd); 791 if (error == 0) 792#endif 793 shm_dotruncate(shmfd, 0); 794 } 795 if (error == 0) 796 shm_hold(shmfd); 797 } 798 sx_xunlock(&shm_dict_lock); 799 800 if (error) { 801 fdclose(fdp, fp, fd, td); 802 fdrop(fp, td); 803 return (error); 804 } 805 } 806 807 finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); 808 809 td->td_retval[0] = fd; 810 fdrop(fp, td); 811 812 return (0); 813} 814 815int 816sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) 817{ 818 char *path; 819 Fnv32_t fnv; 820 int error; 821 822 path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 823 error = copyinstr(uap->path, path, MAXPATHLEN, NULL); 824 if (error) { 825 free(path, M_TEMP); 826 return (error); 827 } 828 829 fnv = fnv_32_str(path, FNV1_32_INIT); 830 sx_xlock(&shm_dict_lock); 831 error = shm_remove(path, fnv, td->td_ucred); 832 sx_xunlock(&shm_dict_lock); 833 free(path, M_TEMP); 834 835 return (error); 836} 837 838/* 839 * mmap() helper to validate mmap() requests against shm object state 840 * and give mmap() the vm_object to use for the mapping. 841 */ 842int 843shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff, 844 vm_object_t *obj) 845{ 846 847 /* 848 * XXXRW: This validation is probably insufficient, and subject to 849 * sign errors. It should be fixed. 
850 */ 851 if (foff >= shmfd->shm_size || 852 foff + objsize > round_page(shmfd->shm_size)) 853 return (EINVAL); 854 855 mtx_lock(&shm_timestamp_lock); 856 vfs_timestamp(&shmfd->shm_atime); 857 mtx_unlock(&shm_timestamp_lock); 858 vm_object_reference(shmfd->shm_object); 859 *obj = shmfd->shm_object; 860 return (0); 861} 862 863static int 864shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 865 struct thread *td) 866{ 867 struct shmfd *shmfd; 868 int error; 869 870 error = 0; 871 shmfd = fp->f_data; 872 mtx_lock(&shm_timestamp_lock); 873 /* 874 * SUSv4 says that x bits of permission need not be affected. 875 * Be consistent with our shm_open there. 876 */ 877#ifdef MAC 878 error = mac_posixshm_check_setmode(active_cred, shmfd, mode); 879 if (error != 0) 880 goto out; 881#endif 882 error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, 883 shmfd->shm_gid, VADMIN, active_cred, NULL); 884 if (error != 0) 885 goto out; 886 shmfd->shm_mode = mode & ACCESSPERMS; 887out: 888 mtx_unlock(&shm_timestamp_lock); 889 return (error); 890} 891 892static int 893shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 894 struct thread *td) 895{ 896 struct shmfd *shmfd; 897 int error; 898 899 error = 0; 900 shmfd = fp->f_data; 901 mtx_lock(&shm_timestamp_lock); 902#ifdef MAC 903 error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); 904 if (error != 0) 905 goto out; 906#endif 907 if (uid == (uid_t)-1) 908 uid = shmfd->shm_uid; 909 if (gid == (gid_t)-1) 910 gid = shmfd->shm_gid; 911 if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || 912 (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && 913 (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) 914 goto out; 915 shmfd->shm_uid = uid; 916 shmfd->shm_gid = gid; 917out: 918 mtx_unlock(&shm_timestamp_lock); 919 return (error); 920} 921 922/* 923 * Helper routines to allow the backing object of a shared memory file 924 * descriptor to be mapped in the kernel. 
925 */ 926int 927shm_map(struct file *fp, size_t size, off_t offset, void **memp) 928{ 929 struct shmfd *shmfd; 930 vm_offset_t kva, ofs; 931 vm_object_t obj; 932 int rv; 933 934 if (fp->f_type != DTYPE_SHM) 935 return (EINVAL); 936 shmfd = fp->f_data; 937 obj = shmfd->shm_object; 938 VM_OBJECT_WLOCK(obj); 939 /* 940 * XXXRW: This validation is probably insufficient, and subject to 941 * sign errors. It should be fixed. 942 */ 943 if (offset >= shmfd->shm_size || 944 offset + size > round_page(shmfd->shm_size)) { 945 VM_OBJECT_WUNLOCK(obj); 946 return (EINVAL); 947 } 948 949 shmfd->shm_kmappings++; 950 vm_object_reference_locked(obj); 951 VM_OBJECT_WUNLOCK(obj); 952 953 /* Map the object into the kernel_map and wire it. */ 954 kva = vm_map_min(kernel_map); 955 ofs = offset & PAGE_MASK; 956 offset = trunc_page(offset); 957 size = round_page(size + ofs); 958 rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, 959 VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, 960 VM_PROT_READ | VM_PROT_WRITE, 0); 961 if (rv == KERN_SUCCESS) { 962 rv = vm_map_wire(kernel_map, kva, kva + size, 963 VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); 964 if (rv == KERN_SUCCESS) { 965 *memp = (void *)(kva + ofs); 966 return (0); 967 } 968 vm_map_remove(kernel_map, kva, kva + size); 969 } else 970 vm_object_deallocate(obj); 971 972 /* On failure, drop our mapping reference. */ 973 VM_OBJECT_WLOCK(obj); 974 shmfd->shm_kmappings--; 975 VM_OBJECT_WUNLOCK(obj); 976 977 return (vm_mmap_to_errno(rv)); 978} 979 980/* 981 * We require the caller to unmap the entire entry. This allows us to 982 * safely decrement shm_kmappings when a mapping is removed. 
983 */ 984int 985shm_unmap(struct file *fp, void *mem, size_t size) 986{ 987 struct shmfd *shmfd; 988 vm_map_entry_t entry; 989 vm_offset_t kva, ofs; 990 vm_object_t obj; 991 vm_pindex_t pindex; 992 vm_prot_t prot; 993 boolean_t wired; 994 vm_map_t map; 995 int rv; 996 997 if (fp->f_type != DTYPE_SHM) 998 return (EINVAL); 999 shmfd = fp->f_data; 1000 kva = (vm_offset_t)mem; 1001 ofs = kva & PAGE_MASK; 1002 kva = trunc_page(kva); 1003 size = round_page(size + ofs); 1004 map = kernel_map; 1005 rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, 1006 &obj, &pindex, &prot, &wired); 1007 if (rv != KERN_SUCCESS) 1008 return (EINVAL); 1009 if (entry->start != kva || entry->end != kva + size) { 1010 vm_map_lookup_done(map, entry); 1011 return (EINVAL); 1012 } 1013 vm_map_lookup_done(map, entry); 1014 if (obj != shmfd->shm_object) 1015 return (EINVAL); 1016 vm_map_remove(map, kva, kva + size); 1017 VM_OBJECT_WLOCK(obj); 1018 KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); 1019 shmfd->shm_kmappings--; 1020 VM_OBJECT_WUNLOCK(obj); 1021 return (0); 1022} 1023 1024void 1025shm_path(struct shmfd *shmfd, char *path, size_t size) 1026{ 1027 1028 if (shmfd->shm_path == NULL) 1029 return; 1030 sx_slock(&shm_dict_lock); 1031 if (shmfd->shm_path != NULL) 1032 strlcpy(path, shmfd->shm_path, size); 1033 sx_sunlock(&shm_dict_lock); 1034} 1035