/*-
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	Virtual memory mapping module.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/vm/vm_map.c 355049 2019-11-24 06:54:17Z dougm $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/file.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/shm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

/*
 *	Virtual memory maps provide for the mapping, protection,
 *	and sharing of virtual memory objects.  In addition,
 *	this module provides for an efficient virtual copy of
 *	memory from one map to another.
 *
 *	Synchronization is required prior to most operations.
 *
 *	Maps consist of an ordered doubly-linked list of simple
 *	entries; a self-adjusting binary search tree of these
 *	entries is used to speed up lookups.
 *
 *	Since portions of maps are specified by start/end addresses,
 *	which may not align with existing map entries, all
 *	routines merely "clip" entries to these start/end values.
 *	[That is, an entry is split into two, bordering at a
 *	start or end value.]  Note that these clippings may not
 *	always be necessary (as the two resulting entries are then
 *	not changed); however, the clipping is done for convenience.
 *
 *	As mentioned above, virtual copy operations are performed
 *	by copying VM object references from one map to
 *	another, and then marking both regions as copy-on-write.
 */

static struct mtx map_sleep_mtx;
static uma_zone_t mapentzone;
static uma_zone_t kmapentzone;
static uma_zone_t mapzone;
static uma_zone_t vmspace_zone;
static int vmspace_zinit(void *mem, int size, int flags);
static int vm_map_zinit(void *mem, int size, int flags);
static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
    vm_offset_t max);
static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
    vm_map_entry_t gap_entry);
static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
#ifdef INVARIANTS
static void vm_map_zdtor(void *mem, int size, void *arg);
static void vmspace_zdtor(void *mem, int size, void *arg);
#endif
static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
    vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
    int cow);
static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
    vm_offset_t failed_addr);

#define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
    ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
    !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
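
/*
 * Illustrative sketch (editor's note, not compiled): ENTRY_CHARGED()
 * reports whether swap accounting has already been charged for an
 * entry, either on the entry itself or on its backing object.  A
 * typical caller pattern, modeled on vm_map_protect() below, is:
 *
 *	if (!ENTRY_CHARGED(entry) &&
 *	    !swap_reserve(entry->end - entry->start))
 *		return (KERN_RESOURCE_SHORTAGE);
 *
 * An entry with MAP_ENTRY_NEEDS_COPY set is not considered charged
 * through its object, because the object's charge belongs to the
 * mapping that will keep the original pages.
 */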

/*
 * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
 * stable.
 */
#define	PROC_VMSPACE_LOCK(p) do { } while (0)
#define	PROC_VMSPACE_UNLOCK(p) do { } while (0)

/*
 *	VM_MAP_RANGE_CHECK:	[ internal use only ]
 *
 *	Asserts that the starting and ending region
 *	addresses fall within the valid range of the map.
 */
#define	VM_MAP_RANGE_CHECK(map, start, end)	\
	{					\
	if (start < vm_map_min(map))		\
		start = vm_map_min(map);	\
	if (end > vm_map_max(map))		\
		end = vm_map_max(map);		\
	if (start > end)			\
		start = end;			\
	}

/*
 *	vm_map_startup:
 *
 *	Initialize the vm_map module.  Must be called before
 *	any other vm_map routines.
 *
 *	Map and entry structures are allocated from the general
 *	purpose memory pool with some exceptions:
 *
 *	- The kernel map and kmem submap are allocated statically.
 *	- Kernel map entries are allocated out of a static pool.
 *
 *	These restrictions are necessary since malloc() uses the
 *	maps and requires map entries.
 */

void
vm_map_startup(void)
{
	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
#ifdef INVARIANTS
	    vm_map_zdtor,
#else
	    NULL,
#endif
	    vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_prealloc(mapzone, MAX_KMAP);
	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
#ifdef INVARIANTS
	    vmspace_zdtor,
#else
	    NULL,
#endif
	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}

static int
vmspace_zinit(void *mem, int size, int flags)
{
	struct vmspace *vm;

	vm = (struct vmspace *)mem;

	vm->vm_map.pmap = NULL;
	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
	PMAP_LOCK_INIT(vmspace_pmap(vm));
	return (0);
}

static int
vm_map_zinit(void *mem, int size, int flags)
{
	vm_map_t map;

	map = (vm_map_t)mem;
	memset(map, 0, sizeof(*map));
	mtx_init(&map->system_mtx, "vm map (system)", NULL,
	    MTX_DEF | MTX_DUPOK);
	sx_init(&map->lock, "vm map (user)");
	return (0);
}

#ifdef INVARIANTS
static void
vmspace_zdtor(void *mem, int size, void *arg)
{
	struct vmspace *vm;

	vm = (struct vmspace *)mem;

	vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
}
static void
vm_map_zdtor(void *mem, int size, void *arg)
{
	vm_map_t map;

	map = (vm_map_t)mem;
	KASSERT(map->nentries == 0,
	    ("map %p nentries == %d on free.",
	    map, map->nentries));
	KASSERT(map->size == 0,
	    ("map %p size == %lu on free.",
	    map, (unsigned long)map->size));
}
#endif	/* INVARIANTS */
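
/*
 * Editor's note (illustrative, not compiled): the map and vmspace
 * zones are created with UMA_ZONE_NOFREE and an init function, so the
 * locks set up in vm_map_zinit() remain valid across allocation
 * cycles.  This "type stability" is what lets PROC_VMSPACE_LOCK()
 * above be a no-op:
 *
 *	vm = uma_zalloc(vmspace_zone, M_WAITOK);  (locks already live)
 *	...
 *	uma_zfree(vmspace_zone, vm);	(item recycled, locks kept intact)
 */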

/*
 * Allocate a vmspace structure, including a vm_map and pmap,
 * and initialize those structures.  The refcnt is set to 1.
 *
 * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
 */
struct vmspace *
vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
{
	struct vmspace *vm;

	vm = uma_zalloc(vmspace_zone, M_WAITOK);
	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
	if (pinit == NULL)
		pinit = &pmap_pinit;
	if (!pinit(vmspace_pmap(vm))) {
		uma_zfree(vmspace_zone, vm);
		return (NULL);
	}
	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
	vm->vm_refcnt = 1;
	vm->vm_shm = NULL;
	vm->vm_swrss = 0;
	vm->vm_tsize = 0;
	vm->vm_dsize = 0;
	vm->vm_ssize = 0;
	vm->vm_taddr = 0;
	vm->vm_daddr = 0;
	vm->vm_maxsaddr = 0;
	return (vm);
}

#ifdef RACCT
static void
vmspace_container_reset(struct proc *p)
{

	PROC_LOCK(p);
	racct_set(p, RACCT_DATA, 0);
	racct_set(p, RACCT_STACK, 0);
	racct_set(p, RACCT_RSS, 0);
	racct_set(p, RACCT_MEMLOCK, 0);
	racct_set(p, RACCT_VMEM, 0);
	PROC_UNLOCK(p);
}
#endif

static inline void
vmspace_dofree(struct vmspace *vm)
{

	CTR1(KTR_VM, "vmspace_free: %p", vm);

	/*
	 * Make sure any SysV shm is freed, it might not have been in
	 * exit1().
	 */
	shmexit(vm);

	/*
	 * Lock the map, to wait out all other references to it.
	 * Delete all of the mappings and pages they hold, then call
	 * the pmap module to reclaim anything left.
	 */
	(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
	    vm_map_max(&vm->vm_map));

	pmap_release(vmspace_pmap(vm));
	vm->vm_map.pmap = NULL;
	uma_zfree(vmspace_zone, vm);
}

void
vmspace_free(struct vmspace *vm)
{

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "vmspace_free() called");

	if (vm->vm_refcnt == 0)
		panic("vmspace_free: attempt to free already freed vmspace");

	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
		vmspace_dofree(vm);
}

void
vmspace_exitfree(struct proc *p)
{
	struct vmspace *vm;

	PROC_VMSPACE_LOCK(p);
	vm = p->p_vmspace;
	p->p_vmspace = NULL;
	PROC_VMSPACE_UNLOCK(p);
	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
	vmspace_free(vm);
}

void
vmspace_exit(struct thread *td)
{
	int refcnt;
	struct vmspace *vm;
	struct proc *p;

	/*
	 * Release user portion of address space.
	 * This releases references to vnodes,
	 * which could cause I/O if the file has been unlinked.
	 * Need to do this early enough that we can still sleep.
	 *
	 * The last exiting process to reach this point releases as
	 * much of the environment as it can.  vmspace_dofree() is the
	 * slower fallback in case another process had a temporary
	 * reference to the vmspace.
	 */

	p = td->td_proc;
	vm = p->p_vmspace;
	atomic_add_int(&vmspace0.vm_refcnt, 1);
	do {
		refcnt = vm->vm_refcnt;
		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
			/* Switch now since other proc might free vmspace */
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = &vmspace0;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
	if (refcnt == 1) {
		if (p->p_vmspace != vm) {
			/* vmspace not yet freed, switch back */
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = vm;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
		pmap_remove_pages(vmspace_pmap(vm));
		/* Switch now since this proc will free vmspace */
		PROC_VMSPACE_LOCK(p);
		p->p_vmspace = &vmspace0;
		PROC_VMSPACE_UNLOCK(p);
		pmap_activate(td);
		vmspace_dofree(vm);
	}
#ifdef RACCT
	if (racct_enable)
		vmspace_container_reset(p);
#endif
}

/* Acquire reference to vmspace owned by another process. */

struct vmspace *
vmspace_acquire_ref(struct proc *p)
{
	struct vmspace *vm;
	int refcnt;

	PROC_VMSPACE_LOCK(p);
	vm = p->p_vmspace;
	if (vm == NULL) {
		PROC_VMSPACE_UNLOCK(p);
		return (NULL);
	}
	do {
		refcnt = vm->vm_refcnt;
		if (refcnt <= 0) {	/* Avoid 0->1 transition */
			PROC_VMSPACE_UNLOCK(p);
			return (NULL);
		}
	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
	if (vm != p->p_vmspace) {
		PROC_VMSPACE_UNLOCK(p);
		vmspace_free(vm);
		return (NULL);
	}
	PROC_VMSPACE_UNLOCK(p);
	return (vm);
}

/*
 * Switch between vmspaces in an AIO kernel process.
 *
 * The new vmspace is either the vmspace of a user process obtained
 * from an active AIO request or the initial vmspace of the AIO kernel
 * process (when it is idling).  Because user processes will block to
 * drain any active AIO requests before proceeding in exit() or
 * execve(), the reference count for vmspaces from AIO requests can
 * never be 0.  Similarly, AIO kernel processes hold an extra
 * reference on their initial vmspace for the life of the process.  As
 * a result, the 'newvm' vmspace always has a non-zero reference
 * count.  This permits an additional reference on 'newvm' to be
 * acquired via a simple atomic increment rather than the loop in
 * vmspace_acquire_ref() above.
 */
void
vmspace_switch_aio(struct vmspace *newvm)
{
	struct vmspace *oldvm;

	/* XXX: Need some way to assert that this is an aio daemon. */

	KASSERT(newvm->vm_refcnt > 0,
	    ("vmspace_switch_aio: newvm unreferenced"));

	oldvm = curproc->p_vmspace;
	if (oldvm == newvm)
		return;

	/*
	 * Point to the new address space and refer to it.
	 */
	curproc->p_vmspace = newvm;
	atomic_add_int(&newvm->vm_refcnt, 1);

	/* Activate the new mapping. */
	pmap_activate(curthread);

	vmspace_free(oldvm);
}
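
/*
 * Illustrative sketch (editor's note, not compiled): a typical
 * consumer of vmspace_acquire_ref() pins another process's address
 * space, works on its map, and then drops the reference:
 *
 *	struct vmspace *vm;
 *
 *	vm = vmspace_acquire_ref(p);
 *	if (vm == NULL)
 *		return (ESRCH);
 *	vm_map_lock_read(&vm->vm_map);
 *	(inspect the map here)
 *	vm_map_unlock_read(&vm->vm_map);
 *	vmspace_free(vm);
 */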

void
_vm_map_lock(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_lock_flags_(&map->system_mtx, 0, file, line);
	else
		sx_xlock_(&map->lock, file, line);
	map->timestamp++;
}

static void
vm_map_process_deferred(void)
{
	struct thread *td;
	vm_map_entry_t entry, next;
	vm_object_t object;

	td = curthread;
	entry = td->td_map_def_user;
	td->td_map_def_user = NULL;
	while (entry != NULL) {
		next = entry->next;
		if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
			/*
			 * Decrement the object's writemappings and
			 * possibly the vnode's v_writecount.
			 */
			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
			    ("Submap with writecount"));
			object = entry->object.vm_object;
			KASSERT(object != NULL, ("No object for writecount"));
			vnode_pager_release_writecount(object, entry->start,
			    entry->end);
		}
		vm_map_entry_deallocate(entry, FALSE);
		entry = next;
	}
}

void
_vm_map_unlock(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
	else {
		sx_xunlock_(&map->lock, file, line);
		vm_map_process_deferred();
	}
}

void
_vm_map_lock_read(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_lock_flags_(&map->system_mtx, 0, file, line);
	else
		sx_slock_(&map->lock, file, line);
}

void
_vm_map_unlock_read(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
	else {
		sx_sunlock_(&map->lock, file, line);
		vm_map_process_deferred();
	}
}

int
_vm_map_trylock(vm_map_t map, const char *file, int line)
{
	int error;

	error = map->system_map ?
	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
	    !sx_try_xlock_(&map->lock, file, line);
	if (error == 0)
		map->timestamp++;
	return (error == 0);
}

int
_vm_map_trylock_read(vm_map_t map, const char *file, int line)
{
	int error;

	error = map->system_map ?
	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
	    !sx_try_slock_(&map->lock, file, line);
	return (error == 0);
}
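
/*
 * Illustrative sketch (editor's note, not compiled): a failed
 * vm_map_lock_upgrade() leaves the map unlocked, so callers must be
 * prepared to relock and revalidate:
 *
 *	vm_map_lock_read(map);
 *	...
 *	if (vm_map_lock_upgrade(map)) {
 *		(lock was lost; the map may have changed)
 *		vm_map_lock(map);
 *		(revalidate any cached entries before continuing)
 *	}
 *	...
 *	vm_map_unlock(map);
 */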

/*
 *	_vm_map_lock_upgrade:	[ internal use only ]
 *
 *	Tries to upgrade a read (shared) lock on the specified map to a write
 *	(exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
 *	non-zero value if the upgrade fails.  If the upgrade fails, the map is
 *	returned without a read or write lock held.
 *
 *	Requires that the map be read locked.
 */
int
_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
{
	unsigned int last_timestamp;

	if (map->system_map) {
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	} else {
		if (!sx_try_upgrade_(&map->lock, file, line)) {
			last_timestamp = map->timestamp;
			sx_sunlock_(&map->lock, file, line);
			vm_map_process_deferred();
			/*
			 * If the map's timestamp does not change while the
			 * map is unlocked, then the upgrade succeeds.
			 */
			sx_xlock_(&map->lock, file, line);
			if (last_timestamp != map->timestamp) {
				sx_xunlock_(&map->lock, file, line);
				return (1);
			}
		}
	}
	map->timestamp++;
	return (0);
}

void
_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
{

	if (map->system_map) {
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	} else
		sx_downgrade_(&map->lock, file, line);
}

/*
 *	vm_map_locked:
 *
 *	Returns a non-zero value if the caller holds a write (exclusive) lock
 *	on the specified map and the value "0" otherwise.
 */
int
vm_map_locked(vm_map_t map)
{

	if (map->system_map)
		return (mtx_owned(&map->system_mtx));
	else
		return (sx_xlocked(&map->lock));
}

#ifdef INVARIANTS
static void
_vm_map_assert_locked(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	else
		sx_assert_(&map->lock, SA_XLOCKED, file, line);
}

#define	VM_MAP_ASSERT_LOCKED(map) \
    _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
#else
#define	VM_MAP_ASSERT_LOCKED(map)
#endif

/*
 *	_vm_map_unlock_and_wait:
 *
 *	Atomically releases the lock on the specified map and puts the calling
 *	thread to sleep.  The calling thread will remain asleep until either
 *	vm_map_wakeup() is performed on the map or the specified timeout is
 *	exceeded.
 *
 *	WARNING!  This function does not perform deferred deallocations of
 *	objects and map entries.  Therefore, the calling thread is expected to
 *	reacquire the map lock after reawakening and later perform an ordinary
 *	unlock operation, such as vm_map_unlock(), before completing its
 *	operation on the map.
 */
int
_vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
{

	mtx_lock(&map_sleep_mtx);
	if (map->system_map)
		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
	else
		sx_xunlock_(&map->lock, file, line);
	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
	    timo));
}
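
/*
 * Illustrative sketch (editor's note, not compiled): the usual
 * pattern rechecks its wait condition after reacquiring the lock,
 * since the map is unlocked while sleeping; "condition_holds" is a
 * placeholder for, e.g., an entry being in transition:
 *
 *	vm_map_lock(map);
 *	while (condition_holds(map)) {
 *		(void)vm_map_unlock_and_wait(map, 0);
 *		vm_map_lock(map);
 *	}
 *	...
 *	vm_map_unlock(map);
 */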

/*
 *	vm_map_wakeup:
 *
 *	Awaken any threads that have slept on the map using
 *	vm_map_unlock_and_wait().
 */
void
vm_map_wakeup(vm_map_t map)
{

	/*
	 * Acquire and release map_sleep_mtx to prevent a wakeup()
	 * from being performed (and lost) between the map unlock
	 * and the msleep() in _vm_map_unlock_and_wait().
	 */
	mtx_lock(&map_sleep_mtx);
	mtx_unlock(&map_sleep_mtx);
	wakeup(&map->root);
}

void
vm_map_busy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	map->busy++;
}

void
vm_map_unbusy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
		wakeup(&map->busy);
	}
}

void
vm_map_wait_busy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	while (map->busy) {
		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
		if (map->system_map)
			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
		else
			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
	}
	map->timestamp++;
}

long
vmspace_resident_count(struct vmspace *vmspace)
{
	return pmap_resident_count(vmspace_pmap(vmspace));
}

/*
 *	vm_map_create:
 *
 *	Creates and returns a new empty VM map with
 *	the given physical map structure, and having
 *	the given lower and upper address bounds.
 */
vm_map_t
vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
{
	vm_map_t result;

	result = uma_zalloc(mapzone, M_WAITOK);
	CTR1(KTR_VM, "vm_map_create: %p", result);
	_vm_map_init(result, pmap, min, max);
	return (result);
}

/*
 * Initialize an existing vm_map structure
 * such as that in the vmspace structure.
 */
static void
_vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
{

	map->header.next = map->header.prev = &map->header;
	map->needs_wakeup = FALSE;
	map->system_map = 0;
	map->pmap = pmap;
	map->header.end = min;
	map->header.start = max;
	map->flags = 0;
	map->root = NULL;
	map->timestamp = 0;
	map->busy = 0;
}

void
vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
{

	_vm_map_init(map, pmap, min, max);
	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
	sx_init(&map->lock, "user map");
}

/*
 *	vm_map_entry_dispose:	[ internal use only ]
 *
 *	Inverse of vm_map_entry_create.
 */
static void
vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
{
	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
}

/*
 *	vm_map_entry_create:	[ internal use only ]
 *
 *	Allocates a VM map entry for insertion.
 *	No entry fields are filled in.
 */
static vm_map_entry_t
vm_map_entry_create(vm_map_t map)
{
	vm_map_entry_t new_entry;

	if (map->system_map)
		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
	else
		new_entry = uma_zalloc(mapentzone, M_WAITOK);
	if (new_entry == NULL)
		panic("vm_map_entry_create: kernel resources exhausted");
	return (new_entry);
}

/*
 *	vm_map_entry_set_behavior:
 *
 *	Set the expected access behavior, either normal, random, or
 *	sequential.
 */
static inline void
vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
{
	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
	    (behavior & MAP_ENTRY_BEHAV_MASK);
}

/*
 *	vm_map_entry_set_max_free:
 *
 *	Set the max_free field in a vm_map_entry.
 */
static inline void
vm_map_entry_set_max_free(vm_map_entry_t entry)
{

	entry->max_free = entry->adj_free;
	if (entry->left != NULL && entry->left->max_free > entry->max_free)
		entry->max_free = entry->left->max_free;
	if (entry->right != NULL && entry->right->max_free > entry->max_free)
		entry->max_free = entry->right->max_free;
}

/*
 *	vm_map_entry_splay:
 *
 *	The Sleator and Tarjan top-down splay algorithm with the
 *	following variation.  Max_free must be computed bottom-up, so
 *	on the downward pass, maintain the left and right spines in
 *	reverse order.  Then, make a second pass up each side to fix
 *	the pointers and compute max_free.  The time bound is O(log n)
 *	amortized.
 *
 *	The new root is the vm_map_entry containing "addr", or else an
 *	adjacent entry (lower or higher) if addr is not in the tree.
 *
 *	The map must be locked, and leaves it so.
 *
 *	Returns: the new root.
 */
static vm_map_entry_t
vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
{
	vm_map_entry_t llist, rlist;
	vm_map_entry_t ltree, rtree;
	vm_map_entry_t y;

	/* Special case of empty tree. */
	if (root == NULL)
		return (root);

	/*
	 * Pass One: Splay down the tree until we find addr or a NULL
	 * pointer where addr would go.  llist and rlist are the two
	 * sides in reverse order (bottom-up), with llist linked by
	 * the right pointer and rlist linked by the left pointer in
	 * the vm_map_entry.  Wait until Pass Two to set max_free on
	 * the two spines.
	 */
	llist = NULL;
	rlist = NULL;
	for (;;) {
		/* root is never NULL in here. */
		if (addr < root->start) {
			y = root->left;
			if (y == NULL)
				break;
			if (addr < y->start && y->left != NULL) {
				/* Rotate right and put y on rlist. */
				root->left = y->right;
				y->right = root;
				vm_map_entry_set_max_free(root);
				root = y->left;
				y->left = rlist;
				rlist = y;
			} else {
				/* Put root on rlist. */
				root->left = rlist;
				rlist = root;
				root = y;
			}
		} else if (addr >= root->end) {
			y = root->right;
			if (y == NULL)
				break;
			if (addr >= y->end && y->right != NULL) {
				/* Rotate left and put y on llist. */
				root->right = y->left;
				y->left = root;
				vm_map_entry_set_max_free(root);
				root = y->right;
				y->right = llist;
				llist = y;
			} else {
				/* Put root on llist. */
				root->right = llist;
				llist = root;
				root = y;
			}
		} else
			break;
	}

	/*
	 * Pass Two: Walk back up the two spines, flip the pointers
	 * and set max_free.  The subtrees of the root go at the
	 * bottom of llist and rlist.
	 */
	ltree = root->left;
	while (llist != NULL) {
		y = llist->right;
		llist->right = ltree;
		vm_map_entry_set_max_free(llist);
		ltree = llist;
		llist = y;
	}
	rtree = root->right;
	while (rlist != NULL) {
		y = rlist->left;
		rlist->left = rtree;
		vm_map_entry_set_max_free(rlist);
		rtree = rlist;
		rlist = y;
	}

	/*
	 * Final assembly: add ltree and rtree as subtrees of root.
	 */
	root->left = ltree;
	root->right = rtree;
	vm_map_entry_set_max_free(root);

	return (root);
}
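
/*
 * Worked example (editor's note): after a splay, the root is either
 * the entry containing "addr" or one of its neighbors, so a lookup
 * reduces to one comparison at the root:
 *
 *	map->root = root = vm_map_entry_splay(addr, map->root);
 *	contained = (addr >= root->start && addr < root->end);
 *
 * vm_map_lookup_entry() below uses exactly this, falling back to
 * root->prev when "addr" precedes the root.
 */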

/*
 *	vm_map_entry_{un,}link:
 *
 *	Insert/remove entries from maps.
 */
static void
vm_map_entry_link(vm_map_t map,
		  vm_map_entry_t after_where,
		  vm_map_entry_t entry)
{

	CTR4(KTR_VM,
	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
	    map->nentries, entry, after_where);
	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(after_where->end <= entry->start,
	    ("vm_map_entry_link: prev end %jx new start %jx overlap",
	    (uintmax_t)after_where->end, (uintmax_t)entry->start));
	KASSERT(entry->end <= after_where->next->start,
	    ("vm_map_entry_link: new end %jx next start %jx overlap",
	    (uintmax_t)entry->end, (uintmax_t)after_where->next->start));

	map->nentries++;
	entry->prev = after_where;
	entry->next = after_where->next;
	entry->next->prev = entry;
	after_where->next = entry;

	if (after_where != &map->header) {
		if (after_where != map->root)
			vm_map_entry_splay(after_where->start, map->root);
		entry->right = after_where->right;
		entry->left = after_where;
		after_where->right = NULL;
		after_where->adj_free = entry->start - after_where->end;
		vm_map_entry_set_max_free(after_where);
	} else {
		entry->right = map->root;
		entry->left = NULL;
	}
	entry->adj_free = entry->next->start - entry->end;
	vm_map_entry_set_max_free(entry);
	map->root = entry;
}

static void
vm_map_entry_unlink(vm_map_t map,
		    vm_map_entry_t entry)
{
	vm_map_entry_t next, prev, root;

	VM_MAP_ASSERT_LOCKED(map);
	if (entry != map->root)
		vm_map_entry_splay(entry->start, map->root);
	if (entry->left == NULL)
		root = entry->right;
	else {
		root = vm_map_entry_splay(entry->start, entry->left);
		root->right = entry->right;
		root->adj_free = entry->next->start - root->end;
		vm_map_entry_set_max_free(root);
	}
	map->root = root;

	prev = entry->prev;
	next = entry->next;
	next->prev = prev;
	prev->next = next;
	map->nentries--;
	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
	    map->nentries, entry);
}

/*
 *	vm_map_entry_resize_free:
 *
 *	Recompute the amount of free space following a vm_map_entry
 *	and propagate that value up the tree.  Call this function after
 *	resizing a map entry in-place, that is, without a call to
 *	vm_map_entry_link() or _unlink().
 *
 *	The map must be locked, and leaves it so.
 */
static void
vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
{

	/*
	 * Using splay trees without parent pointers, propagating
	 * max_free up the tree is done by moving the entry to the
	 * root and making the change there.
	 */
	if (entry != map->root)
		map->root = vm_map_entry_splay(entry->start, map->root);

	entry->adj_free = entry->next->start - entry->end;
	vm_map_entry_set_max_free(entry);
}
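
/*
 * Illustrative sketch (editor's note): growing an entry in place must
 * be followed by a free-space recomputation, as vm_map_insert() does
 * when it coalesces new space into its predecessor:
 *
 *	prev_entry->end = end;
 *	vm_map_entry_resize_free(map, prev_entry);
 */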

/*
 *	vm_map_lookup_entry:	[ internal use only ]
 *
 *	Finds the map entry containing (or
 *	immediately preceding) the specified address
 *	in the given map; the entry is returned
 *	in the "entry" parameter.  The boolean
 *	result indicates whether the address is
 *	actually contained in the map.
 */
boolean_t
vm_map_lookup_entry(
	vm_map_t map,
	vm_offset_t address,
	vm_map_entry_t *entry)	/* OUT */
{
	vm_map_entry_t cur;
	boolean_t locked;

	/*
	 * If the map is empty, then the map entry immediately preceding
	 * "address" is the map's header.
	 */
	cur = map->root;
	if (cur == NULL)
		*entry = &map->header;
	else if (address >= cur->start && cur->end > address) {
		*entry = cur;
		return (TRUE);
	} else if ((locked = vm_map_locked(map)) ||
	    sx_try_upgrade(&map->lock)) {
		/*
		 * Splay requires a write lock on the map.  However, it only
		 * restructures the binary search tree; it does not otherwise
		 * change the map.  Thus, the map's timestamp need not change
		 * on a temporary upgrade.
		 */
		map->root = cur = vm_map_entry_splay(address, cur);
		if (!locked)
			sx_downgrade(&map->lock);

		/*
		 * If "address" is contained within a map entry, the new root
		 * is that map entry.  Otherwise, the new root is a map entry
		 * immediately before or after "address".
		 */
		if (address >= cur->start) {
			*entry = cur;
			if (cur->end > address)
				return (TRUE);
		} else
			*entry = cur->prev;
	} else
		/*
		 * Since the map is only locked for read access, perform a
		 * standard binary search tree lookup for "address".
		 */
		for (;;) {
			if (address < cur->start) {
				if (cur->left == NULL) {
					*entry = cur->prev;
					break;
				}
				cur = cur->left;
			} else if (cur->end > address) {
				*entry = cur;
				return (TRUE);
			} else {
				if (cur->right == NULL) {
					*entry = cur;
					break;
				}
				cur = cur->right;
			}
		}
	return (FALSE);
}

/*
 *	vm_map_insert:
 *
 *	Inserts the given whole VM object into the target
 *	map at the specified address range.  The object's
 *	size should match that of the address range.
 *
 *	Requires that the map be locked, and leaves it so.
 *
 *	If object is non-NULL, ref count must be bumped by caller
 *	prior to making call to account for the new entry.
 */
int
vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_map_entry_t new_entry, prev_entry, temp_entry;
	struct ucred *cred;
	vm_eflags_t protoeflags;
	vm_inherit_t inheritance;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT((object != kmem_object && object != kernel_object) ||
	    (cow & MAP_COPY_ON_WRITE) == 0,
	    ("vm_map_insert: kmem or kernel object and COW"));
	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
	    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
	KASSERT((prot & ~max) == 0,
	    ("prot %#x is not subset of max_prot %#x", prot, max));

	/*
	 * Check that the start and end points are not bogus.
	 */
	if (start < vm_map_min(map) || end > vm_map_max(map) ||
	    start >= end)
		return (KERN_INVALID_ADDRESS);

	/*
	 * Find the entry prior to the proposed starting address; if it's part
	 * of an existing entry, this range is bogus.
	 */
	if (vm_map_lookup_entry(map, start, &temp_entry))
		return (KERN_NO_SPACE);

	prev_entry = temp_entry;

	/*
	 * Assert that the next entry doesn't overlap the end point.
	 */
	if (prev_entry->next->start < end)
		return (KERN_NO_SPACE);

	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
	    max != VM_PROT_NONE))
		return (KERN_INVALID_ARGUMENT);

	protoeflags = 0;
	if (cow & MAP_COPY_ON_WRITE)
		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
	if (cow & MAP_NOFAULT)
		protoeflags |= MAP_ENTRY_NOFAULT;
	if (cow & MAP_DISABLE_SYNCER)
		protoeflags |= MAP_ENTRY_NOSYNC;
	if (cow & MAP_DISABLE_COREDUMP)
		protoeflags |= MAP_ENTRY_NOCOREDUMP;
	if (cow & MAP_STACK_GROWS_DOWN)
		protoeflags |= MAP_ENTRY_GROWS_DOWN;
	if (cow & MAP_STACK_GROWS_UP)
		protoeflags |= MAP_ENTRY_GROWS_UP;
	if (cow & MAP_VN_WRITECOUNT)
		protoeflags |= MAP_ENTRY_VN_WRITECNT;
	if ((cow & MAP_CREATE_GUARD) != 0)
		protoeflags |= MAP_ENTRY_GUARD;
	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
	if (cow & MAP_INHERIT_SHARE)
		inheritance = VM_INHERIT_SHARE;
	else
		inheritance = VM_INHERIT_DEFAULT;

	cred = NULL;
	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
		goto charged;
	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
			return (KERN_RESOURCE_SHORTAGE);
		KASSERT(object == NULL ||
		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
		    object->cred == NULL,
		    ("overcommit: vm_map_insert o %p", object));
		cred = curthread->td_ucred;
	}

charged:
	/* Expand the kernel pmap, if necessary. */
	if (map == kernel_map && end > kernel_vm_end)
		pmap_growkernel(end);
	if (object != NULL) {
		/*
		 * OBJ_ONEMAPPING must be cleared unless this mapping
		 * is trivially proven to be the only mapping for any
		 * of the object's pages.  (Object granularity
		 * reference counting is insufficient to recognize
		 * aliases with precision.)
		 */
		VM_OBJECT_WLOCK(object);
		if (object->ref_count > 1 || object->shadow_count != 0)
			vm_object_clear_flag(object, OBJ_ONEMAPPING);
		VM_OBJECT_WUNLOCK(object);
	} else if (prev_entry != &map->header &&
	    prev_entry->eflags == protoeflags &&
	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
	    prev_entry->end == start && prev_entry->wired_count == 0 &&
	    (prev_entry->cred == cred ||
	    (prev_entry->object.vm_object != NULL &&
	    prev_entry->object.vm_object->cred == cred)) &&
	    vm_object_coalesce(prev_entry->object.vm_object,
	    prev_entry->offset,
	    (vm_size_t)(prev_entry->end - prev_entry->start),
	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
		/*
		 * We were able to extend the object.  Determine if we
		 * can extend the previous map entry to include the
		 * new range as well.
		 */
		if (prev_entry->inheritance == inheritance &&
		    prev_entry->protection == prot &&
		    prev_entry->max_protection == max) {
			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
				map->size += end - prev_entry->end;
			prev_entry->end = end;
			vm_map_entry_resize_free(map, prev_entry);
			vm_map_simplify_entry(map, prev_entry);
			return (KERN_SUCCESS);
		}

		/*
		 * If we can extend the object but cannot extend the
		 * map entry, we have to create a new map entry.  We
		 * must bump the ref count on the extended object to
		 * account for it.  object may be NULL.
		 */
		object = prev_entry->object.vm_object;
		offset = prev_entry->offset +
		    (prev_entry->end - prev_entry->start);
		vm_object_reference(object);
		if (cred != NULL && object != NULL && object->cred != NULL &&
		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
			/* Object already accounts for this uid. */
			cred = NULL;
		}
	}
	if (cred != NULL)
		crhold(cred);

	/*
	 * Create a new entry
	 */
	new_entry = vm_map_entry_create(map);
	new_entry->start = start;
	new_entry->end = end;
	new_entry->cred = NULL;

	new_entry->eflags = protoeflags;
	new_entry->object.vm_object = object;
	new_entry->offset = offset;

	new_entry->inheritance = inheritance;
	new_entry->protection = prot;
	new_entry->max_protection = max;
	new_entry->wired_count = 0;
	new_entry->wiring_thread = NULL;
	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
	new_entry->next_read = start;

	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
	new_entry->cred = cred;

	/*
	 * Insert the new entry into the list
	 */
	vm_map_entry_link(map, prev_entry, new_entry);
	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
		map->size += new_entry->end - new_entry->start;

	/*
	 * Try to coalesce the new entry with both the previous and next
	 * entries in the list.  Previously, we only attempted to coalesce
	 * with the previous entry when object is NULL.  Here, we handle the
	 * other cases, which are less common.
	 */
	vm_map_simplify_entry(map, new_entry);

	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
		    end - start, cow & MAP_PREFAULT_PARTIAL);
	}

	return (KERN_SUCCESS);
}
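
/*
 * Illustrative sketch (editor's note, not compiled): mapping a whole
 * VM object at a caller-chosen range; the caller supplies the object
 * reference that the new entry consumes, and takes it back on
 * failure:
 *
 *	vm_object_reference(object);
 *	vm_map_lock(map);
 *	rv = vm_map_insert(map, object, 0, start, start + size,
 *	    VM_PROT_RW, VM_PROT_RW, 0);
 *	vm_map_unlock(map);
 *	if (rv != KERN_SUCCESS)
 *		vm_object_deallocate(object);
 */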

/*
 *	vm_map_findspace:
 *
 *	Find the first fit (lowest VM address) for "length" free bytes
 *	beginning at address >= start in the given map.
 *
 *	In a vm_map_entry, "adj_free" is the amount of free space
 *	adjacent (higher address) to this entry, and "max_free" is the
 *	maximum amount of contiguous free space in its subtree.  This
 *	allows finding a free region in one path down the tree, so
 *	O(log n) amortized with splay trees.
 *
 *	The map must be locked, and leaves it so.
 *
 *	Returns: 0 on success, and starting address in *addr,
 *		 1 if insufficient space.
 */
int
vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
    vm_offset_t *addr)	/* OUT */
{
	vm_map_entry_t entry;
	vm_offset_t st;

	/*
	 * Request must fit within min/max VM address and must avoid
	 * address wrap.
	 */
	start = MAX(start, vm_map_min(map));
	if (start + length > vm_map_max(map) || start + length < start)
		return (1);

	/* Empty tree means wide open address space. */
	if (map->root == NULL) {
		*addr = start;
		return (0);
	}

	/*
	 * After splay, if start comes before root node, then there
	 * must be a gap from start to the root.
	 */
	map->root = vm_map_entry_splay(start, map->root);
	if (start + length <= map->root->start) {
		*addr = start;
		return (0);
	}

	/*
	 * Root is the last node that might begin its gap before
	 * start, and this is the last comparison where address
	 * wrap might be a problem.
	 */
	st = (start > map->root->end) ? start : map->root->end;
	if (length <= map->root->end + map->root->adj_free - st) {
		*addr = st;
		return (0);
	}

	/* With max_free, can immediately tell if no solution. */
	entry = map->root->right;
	if (entry == NULL || length > entry->max_free)
		return (1);

	/*
	 * Search the right subtree in the order: left subtree, root,
	 * right subtree (first fit).  The previous splay implies that
	 * all regions in the right subtree have addresses > start.
	 */
	while (entry != NULL) {
		if (entry->left != NULL && entry->left->max_free >= length)
			entry = entry->left;
		else if (entry->adj_free >= length) {
			*addr = entry->end;
			return (0);
		} else
			entry = entry->right;
	}

	/* Can't get here, so panic if we do. */
	panic("vm_map_findspace: max_free corrupt");
}

int
vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_size_t length, vm_prot_t prot,
    vm_prot_t max, int cow)
{
	vm_offset_t end;
	int result;

	end = start + length;
	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
	    object == NULL,
	    ("vm_map_fixed: non-NULL backing object for stack"));
	vm_map_lock(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	if ((cow & MAP_CHECK_EXCL) == 0)
		vm_map_delete(map, start, end);
	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
		result = vm_map_stack_locked(map, start, length, sgrowsiz,
		    prot, max, cow);
	} else {
		result = vm_map_insert(map, object, offset, start, end,
		    prot, max, cow);
	}
	vm_map_unlock(map);
	return (result);
}
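
/*
 * Illustrative sketch (editor's note, not compiled): vm_map_fixed()
 * normally replaces whatever is mapped at [start, start + length);
 * passing MAP_CHECK_EXCL skips the vm_map_delete() call, so an
 * existing mapping makes the request fail instead:
 *
 *	rv = vm_map_fixed(map, object, 0, start, length,
 *	    VM_PROT_READ, VM_PROT_ALL, MAP_CHECK_EXCL);
 *	if (rv == KERN_NO_SPACE)
 *		(the range was already occupied)
 */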

/*
 *	vm_map_find finds an unallocated region in the target address
 *	map with the given length.  The search is defined to be
 *	first-fit from the specified address; the region found is
 *	returned in the same parameter.
 *
 *	If object is non-NULL, ref count must be bumped by caller
 *	prior to making call to account for the new entry.
 */
int
vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr,	/* IN/OUT */
    vm_size_t length, vm_offset_t max_addr, int find_space,
    vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_offset_t alignment, initial_addr, start;
	int result;

	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
	    object == NULL,
	    ("vm_map_find: non-NULL backing object for stack"));
	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
	    (object->flags & OBJ_COLORED) == 0))
		find_space = VMFS_ANY_SPACE;
	if (find_space >> 8 != 0) {
		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
		alignment = (vm_offset_t)1 << (find_space >> 8);
	} else
		alignment = 0;
	initial_addr = *addr;
again:
	start = initial_addr;
	vm_map_lock(map);
	do {
		if (find_space != VMFS_NO_SPACE) {
			if (vm_map_findspace(map, start, length, addr) ||
			    (max_addr != 0 && *addr + length > max_addr)) {
				vm_map_unlock(map);
				if (find_space == VMFS_OPTIMAL_SPACE) {
					find_space = VMFS_ANY_SPACE;
					goto again;
				}
				return (KERN_NO_SPACE);
			}
			switch (find_space) {
			case VMFS_SUPER_SPACE:
			case VMFS_OPTIMAL_SPACE:
				pmap_align_superpage(object, offset, addr,
				    length);
				break;
			case VMFS_ANY_SPACE:
				break;
			default:
				if ((*addr & (alignment - 1)) != 0) {
					*addr &= ~(alignment - 1);
					*addr += alignment;
				}
				break;
			}

			start = *addr;
		} else if ((cow & MAP_REMAP) != 0) {
			if (start < vm_map_min(map) ||
			    start + length > vm_map_max(map) ||
			    start + length <= length) {
				result = KERN_INVALID_ADDRESS;
				break;
			}
			vm_map_delete(map, start, start + length);
		}
		if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
			result = vm_map_stack_locked(map, start, length,
			    sgrowsiz, prot, max, cow);
		} else {
			result = vm_map_insert(map, object, offset, start,
			    start + length, prot, max, cow);
		}
	} while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE &&
	    find_space != VMFS_ANY_SPACE);
	vm_map_unlock(map);
	return (result);
}
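
/*
 * Illustrative sketch (editor's note, not compiled): "find_space"
 * encodes either a VMFS_* policy or an explicit alignment; the
 * VMFS_ALIGNED_SPACE(x) form stores the log2 of the alignment in the
 * bits decoded by "find_space >> 8" above.  For example, to request
 * an aligned region:
 *
 *	addr = vm_map_min(map);
 *	rv = vm_map_find(map, NULL, 0, &addr, size, 0,
 *	    VMFS_ALIGNED_SPACE(SUPERPAGE_SHIFT), VM_PROT_RW,
 *	    VM_PROT_RW, 0);
 *
 * SUPERPAGE_SHIFT is a placeholder for the machine-dependent
 * superpage shift.
 */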

/*
 *	vm_map_find_min() is a variant of vm_map_find() that takes an
 *	additional parameter (min_addr) and treats the given address
 *	(*addr) differently.  Specifically, it treats *addr as a hint
 *	and not as the minimum address where the mapping is created.
 *
 *	This function works in two phases.  First, it tries to
 *	allocate above the hint.  If that fails and the hint is
 *	greater than min_addr, it performs a second pass, replacing
 *	the hint with min_addr as the minimum address for the
 *	allocation.
 */
int
vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
    vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
    int cow)
{
	vm_offset_t hint;
	int rv;

	hint = *addr;
	for (;;) {
		rv = vm_map_find(map, object, offset, addr, length, max_addr,
		    find_space, prot, max, cow);
		if (rv == KERN_SUCCESS || min_addr >= hint)
			return (rv);
		*addr = hint = min_addr;
	}
}

/*
 *	vm_map_simplify_entry:
 *
 *	Simplify the given map entry by merging with either neighbor.  This
 *	routine also has the ability to merge with both neighbors.
 *
 *	The map must be locked.
 *
 *	This routine guarantees that the passed entry remains valid (though
 *	possibly extended).  When merging, this routine may delete one or
 *	both neighbors.
 */
void
vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
{
	vm_map_entry_t next, prev;
	vm_size_t prevsize, esize;

	if ((entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP |
	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0)
		return;

	prev = entry->prev;
	if (prev != &map->header) {
		prevsize = prev->end - prev->start;
		if ((prev->end == entry->start) &&
		    (prev->object.vm_object == entry->object.vm_object) &&
		    (!prev->object.vm_object ||
		    (prev->offset + prevsize == entry->offset)) &&
		    (prev->eflags == entry->eflags) &&
		    (prev->protection == entry->protection) &&
		    (prev->max_protection == entry->max_protection) &&
		    (prev->inheritance == entry->inheritance) &&
		    (prev->wired_count == entry->wired_count) &&
		    (prev->cred == entry->cred)) {
			vm_map_entry_unlink(map, prev);
			entry->start = prev->start;
			entry->offset = prev->offset;
			if (entry->prev != &map->header)
				vm_map_entry_resize_free(map, entry->prev);

			/*
			 * If the backing object is a vnode object,
			 * vm_object_deallocate() calls vrele().
			 * However, vrele() does not lock the vnode
			 * because the vnode has additional
			 * references.  Thus, the map lock can be kept
			 * without causing a lock-order reversal with
			 * the vnode lock.
			 *
			 * Since we count the number of virtual page
			 * mappings in object->un_pager.vnp.writemappings,
			 * the writemappings value should not be adjusted
			 * when the entry is disposed of.
			 */
			if (prev->object.vm_object)
				vm_object_deallocate(prev->object.vm_object);
			if (prev->cred != NULL)
				crfree(prev->cred);
			vm_map_entry_dispose(map, prev);
		}
	}

	next = entry->next;
	if (next != &map->header) {
		esize = entry->end - entry->start;
		if ((entry->end == next->start) &&
		    (next->object.vm_object == entry->object.vm_object) &&
		    (!entry->object.vm_object ||
		    (entry->offset + esize == next->offset)) &&
		    (next->eflags == entry->eflags) &&
		    (next->protection == entry->protection) &&
		    (next->max_protection == entry->max_protection) &&
		    (next->inheritance == entry->inheritance) &&
		    (next->wired_count == entry->wired_count) &&
		    (next->cred == entry->cred)) {
			vm_map_entry_unlink(map, next);
			entry->end = next->end;
			vm_map_entry_resize_free(map, entry);

			/*
			 * See comment above.
			 */
			if (next->object.vm_object)
				vm_object_deallocate(next->object.vm_object);
			if (next->cred != NULL)
				crfree(next->cred);
			vm_map_entry_dispose(map, next);
		}
	}
}

/*
 *	vm_map_clip_start:	[ internal use only ]
 *
 *	Asserts that the given entry begins at or after
 *	the specified address; if necessary,
 *	it splits the entry into two.
 */
#define vm_map_clip_start(map, entry, startaddr) \
{ \
	if (startaddr > entry->start) \
		_vm_map_clip_start(map, entry, startaddr); \
}

/*
 *	This routine is called only when it is known that
 *	the entry must be split.
 */
static void
_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
{
	vm_map_entry_t new_entry;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(entry->end > start && entry->start < start,
	    ("_vm_map_clip_start: invalid clip of entry %p", entry));

	/*
	 * Split off the front portion -- note that we must insert the new
	 * entry BEFORE this one, so that this entry has the specified
	 * starting address.
	 */
	vm_map_simplify_entry(map, entry);

	/*
	 * If there is no object backing this entry, we might as well create
	 * one now.  If we defer it, an object can get created after the map
	 * is clipped, and individual objects will be created for the split-up
	 * map.  This is a bit of a hack, but is also about the best place to
	 * put this improvement.
	 */
	if (entry->object.vm_object == NULL && !map->system_map &&
	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
		vm_object_t object;
		object = vm_object_allocate(OBJT_DEFAULT,
		    atop(entry->end - entry->start));
		entry->object.vm_object = object;
		entry->offset = 0;
		if (entry->cred != NULL) {
			object->cred = entry->cred;
			object->charge = entry->end - entry->start;
			entry->cred = NULL;
		}
	} else if (entry->object.vm_object != NULL &&
	    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
	    entry->cred != NULL) {
		VM_OBJECT_WLOCK(entry->object.vm_object);
		KASSERT(entry->object.vm_object->cred == NULL,
		    ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
		entry->object.vm_object->cred = entry->cred;
		entry->object.vm_object->charge = entry->end - entry->start;
		VM_OBJECT_WUNLOCK(entry->object.vm_object);
		entry->cred = NULL;
	}

	new_entry = vm_map_entry_create(map);
	*new_entry = *entry;

	new_entry->end = start;
	entry->offset += (start - entry->start);
	entry->start = start;
	if (new_entry->cred != NULL)
		crhold(entry->cred);

	vm_map_entry_link(map, entry->prev, new_entry);

	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
		vm_object_reference(new_entry->object.vm_object);
		/*
		 * The object->un_pager.vnp.writemappings for the
		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
		 * kept as is here.  The virtual pages are
		 * re-distributed among the clipped entries, so the sum is
		 * left the same.
		 */
	}
}
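
/*
 * Worked example (editor's note): clipping an entry that spans
 * [0x1000, 0x5000) at address 0x3000:
 *
 *	vm_map_clip_start(map, entry, 0x3000);
 *
 * leaves a new, earlier entry for [0x1000, 0x3000) and "entry" itself
 * covering [0x3000, 0x5000), with entry->offset advanced by 0x2000 so
 * that it still names the same pages of the backing object.
 */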

/*
 *	vm_map_clip_end:	[ internal use only ]
 *
 *	Asserts that the given entry ends at or before
 *	the specified address; if necessary,
 *	it splits the entry into two.
 */
#define vm_map_clip_end(map, entry, endaddr) \
{ \
	if ((endaddr) < (entry->end)) \
		_vm_map_clip_end((map), (entry), (endaddr)); \
}

/*
 *	This routine is called only when it is known that
 *	the entry must be split.
 */
static void
_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
{
	vm_map_entry_t new_entry;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(entry->start < end && entry->end > end,
	    ("_vm_map_clip_end: invalid clip of entry %p", entry));

	/*
	 * If there is no object backing this entry, we might as well create
	 * one now.  If we defer it, an object can get created after the map
	 * is clipped, and individual objects will be created for the split-up
	 * map.  This is a bit of a hack, but is also about the best place to
	 * put this improvement.
	 */
	if (entry->object.vm_object == NULL && !map->system_map &&
	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
		vm_object_t object;
		object = vm_object_allocate(OBJT_DEFAULT,
		    atop(entry->end - entry->start));
		entry->object.vm_object = object;
		entry->offset = 0;
		if (entry->cred != NULL) {
			object->cred = entry->cred;
			object->charge = entry->end - entry->start;
			entry->cred = NULL;
		}
	} else if (entry->object.vm_object != NULL &&
	    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
	    entry->cred != NULL) {
		VM_OBJECT_WLOCK(entry->object.vm_object);
		KASSERT(entry->object.vm_object->cred == NULL,
		    ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
		entry->object.vm_object->cred = entry->cred;
		entry->object.vm_object->charge = entry->end - entry->start;
		VM_OBJECT_WUNLOCK(entry->object.vm_object);
		entry->cred = NULL;
	}

	/*
	 * Create a new entry and insert it AFTER the specified entry
	 */
	new_entry = vm_map_entry_create(map);
	*new_entry = *entry;

	new_entry->start = entry->end = end;
	new_entry->offset += (end - entry->start);
	if (new_entry->cred != NULL)
		crhold(entry->cred);

	vm_map_entry_link(map, entry, new_entry);

	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
		vm_object_reference(new_entry->object.vm_object);
	}
}

/*
 *	vm_map_submap:		[ kernel use only ]
 *
 *	Mark the given range as handled by a subordinate map.
 *
 *	This range must have been created with vm_map_find,
 *	and no other operations may have been performed on this
 *	range prior to calling vm_map_submap.
 *
 *	Only a limited number of operations can be performed
 *	within this range after calling vm_map_submap:
 *		vm_fault
 *	[Don't try vm_map_copy!]
 *
 *	To remove a submapping, one must first remove the
 *	range from the superior map, and then destroy the
 *	submap (if desired).  [Better yet, don't try it.]
 */
int
vm_map_submap(
	vm_map_t map,
	vm_offset_t start,
	vm_offset_t end,
	vm_map_t submap)
{
	vm_map_entry_t entry;
	int result = KERN_INVALID_ARGUMENT;

	vm_map_lock(map);

	VM_MAP_RANGE_CHECK(map, start, end);

	if (vm_map_lookup_entry(map, start, &entry)) {
		vm_map_clip_start(map, entry, start);
	} else
		entry = entry->next;

	vm_map_clip_end(map, entry, end);

	if ((entry->start == start) && (entry->end == end) &&
	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
	    (entry->object.vm_object == NULL)) {
		entry->object.sub_map = submap;
		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
		result = KERN_SUCCESS;
	}
	vm_map_unlock(map);

	return (result);
}
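
/*
 * Illustrative sketch (editor's note, not compiled): installing a
 * subordinate map over a range previously reserved in the parent map:
 *
 *	submap = vm_map_create(kernel_pmap, start, end);
 *	if (vm_map_submap(parent_map, start, end, submap) != KERN_SUCCESS)
 *		(the range was not a clean, unbacked reservation)
 */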
*/ 1969 if (p->psind > 0 && ((addr + ptoa(tmpidx)) & 1970 (pagesizes[p->psind] - 1)) == 0) { 1971 mask = atop(pagesizes[p->psind]) - 1; 1972 if (tmpidx + mask < psize && 1973 vm_page_ps_test(p, PS_ALL_VALID, NULL)) { 1974 p += mask; 1975 threshold += mask; 1976 } 1977 } 1978 } else if (p_start != NULL) { 1979 pmap_enter_object(map->pmap, start, addr + 1980 ptoa(tmpidx), p_start, prot); 1981 p_start = NULL; 1982 } 1983 } 1984 if (p_start != NULL) 1985 pmap_enter_object(map->pmap, start, addr + ptoa(psize), 1986 p_start, prot); 1987 VM_OBJECT_RUNLOCK(object); 1988} 1989 1990/* 1991 * vm_map_protect: 1992 * 1993 * Sets the protection of the specified address 1994 * region in the target map. If "set_max" is 1995 * specified, the maximum protection is to be set; 1996 * otherwise, only the current protection is affected. 1997 */ 1998int 1999vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, 2000 vm_prot_t new_prot, boolean_t set_max) 2001{ 2002 vm_map_entry_t current, entry, in_tran; 2003 vm_object_t obj; 2004 struct ucred *cred; 2005 vm_prot_t old_prot; 2006 2007 if (start == end) 2008 return (KERN_SUCCESS); 2009 2010again: 2011 in_tran = NULL; 2012 vm_map_lock(map); 2013 2014 /* 2015 * Ensure that we are not concurrently wiring pages. vm_map_wire() may 2016 * need to fault pages into the map and will drop the map lock while 2017 * doing so, and the VM object may end up in an inconsistent state if we 2018 * update the protection on the map entry in between faults. 2019 */ 2020 vm_map_wait_busy(map); 2021 2022 VM_MAP_RANGE_CHECK(map, start, end); 2023 2024 if (vm_map_lookup_entry(map, start, &entry)) { 2025 vm_map_clip_start(map, entry, start); 2026 } else { 2027 entry = entry->next; 2028 } 2029 2030 /* 2031 * Make a first pass to check for protection violations. 2032 */ 2033 for (current = entry; current->start < end; current = current->next) { 2034 if ((current->eflags & MAP_ENTRY_GUARD) != 0) 2035 continue; 2036 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { 2037 vm_map_unlock(map); 2038 return (KERN_INVALID_ARGUMENT); 2039 } 2040 if ((new_prot & current->max_protection) != new_prot) { 2041 vm_map_unlock(map); 2042 return (KERN_PROTECTION_FAILURE); 2043 } 2044 if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0) 2045 in_tran = current; 2046 } 2047 2048 /* 2049 * Postpone the operation until all in-transition map entries have 2050 * stabilized. An in-transition entry might already have its pages 2051 * wired and wired_count incremented, but not yet have its 2052 * MAP_ENTRY_USER_WIRED flag set. In which case, we would fail to call 2053 * vm_fault_copy_entry() in the final loop below. 2054 */ 2055 if (in_tran != NULL) { 2056 in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 2057 vm_map_unlock_and_wait(map, 0); 2058 goto again; 2059 } 2060 2061 /* 2062 * Do an accounting pass for private read-only mappings that 2063 * now will do cow due to allowed write (e.g. 
debugger sets 2064 * breakpoint on text segment) 2065 */ 2066 for (current = entry; current->start < end; current = current->next) { 2067 2068 vm_map_clip_end(map, current, end); 2069 2070 if (set_max || 2071 ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 || 2072 ENTRY_CHARGED(current) || 2073 (current->eflags & MAP_ENTRY_GUARD) != 0) { 2074 continue; 2075 } 2076 2077 cred = curthread->td_ucred; 2078 obj = current->object.vm_object; 2079 2080 if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) { 2081 if (!swap_reserve(current->end - current->start)) { 2082 vm_map_unlock(map); 2083 return (KERN_RESOURCE_SHORTAGE); 2084 } 2085 crhold(cred); 2086 current->cred = cred; 2087 continue; 2088 } 2089 2090 VM_OBJECT_WLOCK(obj); 2091 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) { 2092 VM_OBJECT_WUNLOCK(obj); 2093 continue; 2094 } 2095 2096 /* 2097 * Charge for the whole object allocation now, since 2098 * we cannot distinguish between non-charged and 2099 * charged clipped mapping of the same object later. 2100 */ 2101 KASSERT(obj->charge == 0, 2102 ("vm_map_protect: object %p overcharged (entry %p)", 2103 obj, current)); 2104 if (!swap_reserve(ptoa(obj->size))) { 2105 VM_OBJECT_WUNLOCK(obj); 2106 vm_map_unlock(map); 2107 return (KERN_RESOURCE_SHORTAGE); 2108 } 2109 2110 crhold(cred); 2111 obj->cred = cred; 2112 obj->charge = ptoa(obj->size); 2113 VM_OBJECT_WUNLOCK(obj); 2114 } 2115 2116 /* 2117 * Go back and fix up protections. [Note that clipping is not 2118 * necessary the second time.] 2119 */ 2120 for (current = entry; current->start < end; current = current->next) { 2121 if ((current->eflags & MAP_ENTRY_GUARD) != 0) 2122 continue; 2123 2124 old_prot = current->protection; 2125 2126 if (set_max) 2127 current->protection = 2128 (current->max_protection = new_prot) & 2129 old_prot; 2130 else 2131 current->protection = new_prot; 2132 2133 /* 2134 * For user wired map entries, the normal lazy evaluation of 2135 * write access upgrades through soft page faults is 2136 * undesirable. Instead, immediately copy any pages that are 2137 * copy-on-write and enable write access in the physical map. 2138 */ 2139 if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 && 2140 (current->protection & VM_PROT_WRITE) != 0 && 2141 (old_prot & VM_PROT_WRITE) == 0) 2142 vm_fault_copy_entry(map, map, current, current, NULL); 2143 2144 /* 2145 * When restricting access, update the physical map. Worry 2146 * about copy-on-write here. 2147 */ 2148 if ((old_prot & ~current->protection) != 0) { 2149#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ 2150 VM_PROT_ALL) 2151 pmap_protect(map->pmap, current->start, 2152 current->end, 2153 current->protection & MASK(current)); 2154#undef MASK 2155 } 2156 vm_map_simplify_entry(map, current); 2157 } 2158 vm_map_unlock(map); 2159 return (KERN_SUCCESS); 2160} 2161 2162/* 2163 * vm_map_madvise: 2164 * 2165 * This routine traverses a process's map handling the madvise 2166 * system call. Advisories are classified as either those affecting 2167 * the vm_map_entry structure, or those affecting the underlying 2168 * objects. 2169 */ 2170int 2171vm_map_madvise( 2172 vm_map_t map, 2173 vm_offset_t start, 2174 vm_offset_t end, 2175 int behav) 2176{ 2177 vm_map_entry_t current, entry; 2178 int modify_map = 0; 2179 2180 /* 2181 * Some madvise calls directly modify the vm_map_entry, in which case 2182 * we need to use an exclusive lock on the map and we need to perform 2183 * various clipping operations.
Otherwise we only need a read-lock 2184 * on the map. 2185 */ 2186 switch(behav) { 2187 case MADV_NORMAL: 2188 case MADV_SEQUENTIAL: 2189 case MADV_RANDOM: 2190 case MADV_NOSYNC: 2191 case MADV_AUTOSYNC: 2192 case MADV_NOCORE: 2193 case MADV_CORE: 2194 if (start == end) 2195 return (KERN_SUCCESS); 2196 modify_map = 1; 2197 vm_map_lock(map); 2198 break; 2199 case MADV_WILLNEED: 2200 case MADV_DONTNEED: 2201 case MADV_FREE: 2202 if (start == end) 2203 return (KERN_SUCCESS); 2204 vm_map_lock_read(map); 2205 break; 2206 default: 2207 return (KERN_INVALID_ARGUMENT); 2208 } 2209 2210 /* 2211 * Locate starting entry and clip if necessary. 2212 */ 2213 VM_MAP_RANGE_CHECK(map, start, end); 2214 2215 if (vm_map_lookup_entry(map, start, &entry)) { 2216 if (modify_map) 2217 vm_map_clip_start(map, entry, start); 2218 } else { 2219 entry = entry->next; 2220 } 2221 2222 if (modify_map) { 2223 /* 2224 * madvise behaviors that are implemented in the vm_map_entry. 2225 * 2226 * We clip the vm_map_entry so that behavioral changes are 2227 * limited to the specified address range. 2228 */ 2229 for (current = entry; current->start < end; 2230 current = current->next) { 2231 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) 2232 continue; 2233 2234 vm_map_clip_end(map, current, end); 2235 2236 switch (behav) { 2237 case MADV_NORMAL: 2238 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); 2239 break; 2240 case MADV_SEQUENTIAL: 2241 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); 2242 break; 2243 case MADV_RANDOM: 2244 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); 2245 break; 2246 case MADV_NOSYNC: 2247 current->eflags |= MAP_ENTRY_NOSYNC; 2248 break; 2249 case MADV_AUTOSYNC: 2250 current->eflags &= ~MAP_ENTRY_NOSYNC; 2251 break; 2252 case MADV_NOCORE: 2253 current->eflags |= MAP_ENTRY_NOCOREDUMP; 2254 break; 2255 case MADV_CORE: 2256 current->eflags &= ~MAP_ENTRY_NOCOREDUMP; 2257 break; 2258 default: 2259 break; 2260 } 2261 vm_map_simplify_entry(map, current); 2262 } 2263 vm_map_unlock(map); 2264 } else { 2265 vm_pindex_t pstart, pend; 2266 2267 /* 2268 * madvise behaviors that are implemented in the underlying 2269 * vm_object. 2270 * 2271 * Since we don't clip the vm_map_entry, we have to clip 2272 * the vm_object pindex and count. 2273 */ 2274 for (current = entry; current->start < end; 2275 current = current->next) { 2276 vm_offset_t useEnd, useStart; 2277 2278 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) 2279 continue; 2280 2281 /* 2282 * MADV_FREE would otherwise rewind time to 2283 * the creation of the shadow object. Because 2284 * we hold the VM map read-locked, neither the 2285 * entry's object nor the presence of a 2286 * backing object can change. 2287 */ 2288 if (behav == MADV_FREE && 2289 current->object.vm_object != NULL && 2290 current->object.vm_object->backing_object != NULL) 2291 continue; 2292 2293 pstart = OFF_TO_IDX(current->offset); 2294 pend = pstart + atop(current->end - current->start); 2295 useStart = current->start; 2296 useEnd = current->end; 2297 2298 if (current->start < start) { 2299 pstart += atop(start - current->start); 2300 useStart = start; 2301 } 2302 if (current->end > end) { 2303 pend -= atop(current->end - end); 2304 useEnd = end; 2305 } 2306 2307 if (pstart >= pend) 2308 continue; 2309 2310 /* 2311 * Perform the pmap_advise() before clearing 2312 * PGA_REFERENCED in vm_page_advise(). 
Otherwise, a 2313 * concurrent pmap operation, such as pmap_remove(), 2314 * could clear a reference in the pmap and set 2315 * PGA_REFERENCED on the page before the pmap_advise() 2316 * had completed. Consequently, the page would appear 2317 * referenced based upon an old reference that 2318 * occurred before this pmap_advise() ran. 2319 */ 2320 if (behav == MADV_DONTNEED || behav == MADV_FREE) 2321 pmap_advise(map->pmap, useStart, useEnd, 2322 behav); 2323 2324 vm_object_madvise(current->object.vm_object, pstart, 2325 pend, behav); 2326 2327 /* 2328 * Pre-populate paging structures in the 2329 * WILLNEED case. For wired entries, the 2330 * paging structures are already populated. 2331 */ 2332 if (behav == MADV_WILLNEED && 2333 current->wired_count == 0) { 2334 vm_map_pmap_enter(map, 2335 useStart, 2336 current->protection, 2337 current->object.vm_object, 2338 pstart, 2339 ptoa(pend - pstart), 2340 MAP_PREFAULT_MADVISE 2341 ); 2342 } 2343 } 2344 vm_map_unlock_read(map); 2345 } 2346 return (0); 2347} 2348 2349 2350/* 2351 * vm_map_inherit: 2352 * 2353 * Sets the inheritance of the specified address 2354 * range in the target map. Inheritance 2355 * affects how the map will be shared with 2356 * child maps at the time of vmspace_fork. 2357 */ 2358int 2359vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, 2360 vm_inherit_t new_inheritance) 2361{ 2362 vm_map_entry_t entry; 2363 vm_map_entry_t temp_entry; 2364 2365 switch (new_inheritance) { 2366 case VM_INHERIT_NONE: 2367 case VM_INHERIT_COPY: 2368 case VM_INHERIT_SHARE: 2369 case VM_INHERIT_ZERO: 2370 break; 2371 default: 2372 return (KERN_INVALID_ARGUMENT); 2373 } 2374 if (start == end) 2375 return (KERN_SUCCESS); 2376 vm_map_lock(map); 2377 VM_MAP_RANGE_CHECK(map, start, end); 2378 if (vm_map_lookup_entry(map, start, &temp_entry)) { 2379 entry = temp_entry; 2380 vm_map_clip_start(map, entry, start); 2381 } else 2382 entry = temp_entry->next; 2383 while (entry->start < end) { 2384 vm_map_clip_end(map, entry, end); 2385 if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || 2386 new_inheritance != VM_INHERIT_ZERO) 2387 entry->inheritance = new_inheritance; 2388 vm_map_simplify_entry(map, entry); 2389 entry = entry->next; 2390 } 2391 vm_map_unlock(map); 2392 return (KERN_SUCCESS); 2393} 2394 2395/* 2396 * vm_map_unwire: 2397 * 2398 * Implements both kernel and user unwiring. 2399 */ 2400int 2401vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, 2402 int flags) 2403{ 2404 vm_map_entry_t entry, first_entry, tmp_entry; 2405 vm_offset_t saved_start; 2406 unsigned int last_timestamp; 2407 int rv; 2408 boolean_t need_wakeup, result, user_unwire; 2409 2410 if (start == end) 2411 return (KERN_SUCCESS); 2412 user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; 2413 vm_map_lock(map); 2414 VM_MAP_RANGE_CHECK(map, start, end); 2415 if (!vm_map_lookup_entry(map, start, &first_entry)) { 2416 if (flags & VM_MAP_WIRE_HOLESOK) 2417 first_entry = first_entry->next; 2418 else { 2419 vm_map_unlock(map); 2420 return (KERN_INVALID_ADDRESS); 2421 } 2422 } 2423 last_timestamp = map->timestamp; 2424 entry = first_entry; 2425 while (entry->start < end) { 2426 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 2427 /* 2428 * We have not yet clipped the entry. 2429 */ 2430 saved_start = (start >= entry->start) ? start : 2431 entry->start; 2432 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 2433 if (vm_map_unlock_and_wait(map, 0)) { 2434 /* 2435 * Allow interruption of user unwiring? 
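 * For now the return value of vm_map_unlock_and_wait() is
 * ignored and the loop simply retries once the map lock is
 * reacquired below.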
 2436 */ 2437 } 2438 vm_map_lock(map); 2439 if (last_timestamp + 1 != map->timestamp) { 2440 /* 2441 * Look again for the entry because the map was 2442 * modified while it was unlocked. 2443 * Specifically, the entry may have been 2444 * clipped, merged, or deleted. 2445 */ 2446 if (!vm_map_lookup_entry(map, saved_start, 2447 &tmp_entry)) { 2448 if (flags & VM_MAP_WIRE_HOLESOK) 2449 tmp_entry = tmp_entry->next; 2450 else { 2451 if (saved_start == start) { 2452 /* 2453 * first_entry has been deleted. 2454 */ 2455 vm_map_unlock(map); 2456 return (KERN_INVALID_ADDRESS); 2457 } 2458 end = saved_start; 2459 rv = KERN_INVALID_ADDRESS; 2460 goto done; 2461 } 2462 } 2463 if (entry == first_entry) 2464 first_entry = tmp_entry; 2465 else 2466 first_entry = NULL; 2467 entry = tmp_entry; 2468 } 2469 last_timestamp = map->timestamp; 2470 continue; 2471 } 2472 vm_map_clip_start(map, entry, start); 2473 vm_map_clip_end(map, entry, end); 2474 /* 2475 * Mark the entry in case the map lock is released. (See 2476 * above.) 2477 */ 2478 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && 2479 entry->wiring_thread == NULL, 2480 ("owned map entry %p", entry)); 2481 entry->eflags |= MAP_ENTRY_IN_TRANSITION; 2482 entry->wiring_thread = curthread; 2483 /* 2484 * Check the map for holes in the specified region. 2485 * If VM_MAP_WIRE_HOLESOK was specified, skip this check. 2486 */ 2487 if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && 2488 (entry->end < end && entry->next->start > entry->end)) { 2489 end = entry->end; 2490 rv = KERN_INVALID_ADDRESS; 2491 goto done; 2492 } 2493 /* 2494 * If system unwiring, require that the entry is system wired. 2495 */ 2496 if (!user_unwire && 2497 vm_map_entry_system_wired_count(entry) == 0) { 2498 end = entry->end; 2499 rv = KERN_INVALID_ARGUMENT; 2500 goto done; 2501 } 2502 entry = entry->next; 2503 } 2504 rv = KERN_SUCCESS; 2505done: 2506 need_wakeup = FALSE; 2507 if (first_entry == NULL) { 2508 result = vm_map_lookup_entry(map, start, &first_entry); 2509 if (!result && (flags & VM_MAP_WIRE_HOLESOK)) 2510 first_entry = first_entry->next; 2511 else 2512 KASSERT(result, ("vm_map_unwire: lookup failed")); 2513 } 2514 for (entry = first_entry; entry->start < end; entry = entry->next) { 2515 /* 2516 * If VM_MAP_WIRE_HOLESOK was specified, an empty 2517 * space in the unwired region could have been mapped 2518 * while the map lock was dropped for draining 2519 * MAP_ENTRY_IN_TRANSITION. Moreover, another thread 2520 * could be simultaneously wiring this new mapping 2521 * entry. Detect these cases and skip any entries 2522 * marked as in transition not by us.
2523 */ 2524 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || 2525 entry->wiring_thread != curthread) { 2526 KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, 2527 ("vm_map_unwire: !HOLESOK and new/changed entry")); 2528 continue; 2529 } 2530 2531 if (rv == KERN_SUCCESS && (!user_unwire || 2532 (entry->eflags & MAP_ENTRY_USER_WIRED))) { 2533 if (user_unwire) 2534 entry->eflags &= ~MAP_ENTRY_USER_WIRED; 2535 if (entry->wired_count == 1) 2536 vm_map_entry_unwire(map, entry); 2537 else 2538 entry->wired_count--; 2539 } 2540 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, 2541 ("vm_map_unwire: in-transition flag missing %p", entry)); 2542 KASSERT(entry->wiring_thread == curthread, 2543 ("vm_map_unwire: alien wire %p", entry)); 2544 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; 2545 entry->wiring_thread = NULL; 2546 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { 2547 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; 2548 need_wakeup = TRUE; 2549 } 2550 vm_map_simplify_entry(map, entry); 2551 } 2552 vm_map_unlock(map); 2553 if (need_wakeup) 2554 vm_map_wakeup(map); 2555 return (rv); 2556} 2557 2558/* 2559 * vm_map_wire_entry_failure: 2560 * 2561 * Handle a wiring failure on the given entry. 2562 * 2563 * The map should be locked. 2564 */ 2565static void 2566vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, 2567 vm_offset_t failed_addr) 2568{ 2569 2570 VM_MAP_ASSERT_LOCKED(map); 2571 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 && 2572 entry->wired_count == 1, 2573 ("vm_map_wire_entry_failure: entry %p isn't being wired", entry)); 2574 KASSERT(failed_addr < entry->end, 2575 ("vm_map_wire_entry_failure: entry %p was fully wired", entry)); 2576 2577 /* 2578 * If any pages at the start of this entry were successfully wired, 2579 * then unwire them. 2580 */ 2581 if (failed_addr > entry->start) { 2582 pmap_unwire(map->pmap, entry->start, failed_addr); 2583 vm_object_unwire(entry->object.vm_object, entry->offset, 2584 failed_addr - entry->start, PQ_ACTIVE); 2585 } 2586 2587 /* 2588 * Assign an out-of-range value to represent the failure to wire this 2589 * entry. 2590 */ 2591 entry->wired_count = -1; 2592} 2593 2594/* 2595 * vm_map_wire: 2596 * 2597 * Implements both kernel and user wiring. 2598 */ 2599int 2600vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, 2601 int flags) 2602{ 2603 vm_map_entry_t entry, first_entry, tmp_entry; 2604 vm_offset_t faddr, saved_end, saved_start; 2605 unsigned int last_timestamp; 2606 int rv; 2607 boolean_t need_wakeup, result, user_wire; 2608 vm_prot_t prot; 2609 2610 if (start == end) 2611 return (KERN_SUCCESS); 2612 prot = 0; 2613 if (flags & VM_MAP_WIRE_WRITE) 2614 prot |= VM_PROT_WRITE; 2615 user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; 2616 vm_map_lock(map); 2617 VM_MAP_RANGE_CHECK(map, start, end); 2618 if (!vm_map_lookup_entry(map, start, &first_entry)) { 2619 if (flags & VM_MAP_WIRE_HOLESOK) 2620 first_entry = first_entry->next; 2621 else { 2622 vm_map_unlock(map); 2623 return (KERN_INVALID_ADDRESS); 2624 } 2625 } 2626 last_timestamp = map->timestamp; 2627 entry = first_entry; 2628 while (entry->start < end) { 2629 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 2630 /* 2631 * We have not yet clipped the entry. 2632 */ 2633 saved_start = (start >= entry->start) ? start : 2634 entry->start; 2635 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 2636 if (vm_map_unlock_and_wait(map, 0)) { 2637 /* 2638 * Allow interruption of user wiring? 
2639 */ 2640 } 2641 vm_map_lock(map); 2642 if (last_timestamp + 1 != map->timestamp) { 2643 /* 2644 * Look again for the entry because the map was 2645 * modified while it was unlocked. 2646 * Specifically, the entry may have been 2647 * clipped, merged, or deleted. 2648 */ 2649 if (!vm_map_lookup_entry(map, saved_start, 2650 &tmp_entry)) { 2651 if (flags & VM_MAP_WIRE_HOLESOK) 2652 tmp_entry = tmp_entry->next; 2653 else { 2654 if (saved_start == start) { 2655 /* 2656 * first_entry has been deleted. 2657 */ 2658 vm_map_unlock(map); 2659 return (KERN_INVALID_ADDRESS); 2660 } 2661 end = saved_start; 2662 rv = KERN_INVALID_ADDRESS; 2663 goto done; 2664 } 2665 } 2666 if (entry == first_entry) 2667 first_entry = tmp_entry; 2668 else 2669 first_entry = NULL; 2670 entry = tmp_entry; 2671 } 2672 last_timestamp = map->timestamp; 2673 continue; 2674 } 2675 vm_map_clip_start(map, entry, start); 2676 vm_map_clip_end(map, entry, end); 2677 /* 2678 * Mark the entry in case the map lock is released. (See 2679 * above.) 2680 */ 2681 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && 2682 entry->wiring_thread == NULL, 2683 ("owned map entry %p", entry)); 2684 entry->eflags |= MAP_ENTRY_IN_TRANSITION; 2685 entry->wiring_thread = curthread; 2686 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 2687 || (entry->protection & prot) != prot) { 2688 entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; 2689 if ((flags & VM_MAP_WIRE_HOLESOK) == 0) { 2690 end = entry->end; 2691 rv = KERN_INVALID_ADDRESS; 2692 goto done; 2693 } 2694 goto next_entry; 2695 } 2696 if (entry->wired_count == 0) { 2697 entry->wired_count++; 2698 saved_start = entry->start; 2699 saved_end = entry->end; 2700 2701 /* 2702 * Release the map lock, relying on the in-transition 2703 * mark. Mark the map busy for fork. 2704 */ 2705 vm_map_busy(map); 2706 vm_map_unlock(map); 2707 2708 faddr = saved_start; 2709 do { 2710 /* 2711 * Simulate a fault to get the page and enter 2712 * it into the physical map. 2713 */ 2714 if ((rv = vm_fault(map, faddr, VM_PROT_NONE, 2715 VM_FAULT_WIRE)) != KERN_SUCCESS) 2716 break; 2717 } while ((faddr += PAGE_SIZE) < saved_end); 2718 vm_map_lock(map); 2719 vm_map_unbusy(map); 2720 if (last_timestamp + 1 != map->timestamp) { 2721 /* 2722 * Look again for the entry because the map was 2723 * modified while it was unlocked. The entry 2724 * may have been clipped, but NOT merged or 2725 * deleted. 2726 */ 2727 result = vm_map_lookup_entry(map, saved_start, 2728 &tmp_entry); 2729 KASSERT(result, ("vm_map_wire: lookup failed")); 2730 if (entry == first_entry) 2731 first_entry = tmp_entry; 2732 else 2733 first_entry = NULL; 2734 entry = tmp_entry; 2735 while (entry->end < saved_end) { 2736 /* 2737 * In case of failure, handle entries 2738 * that were not fully wired here; 2739 * fully wired entries are handled 2740 * later. 2741 */ 2742 if (rv != KERN_SUCCESS && 2743 faddr < entry->end) 2744 vm_map_wire_entry_failure(map, 2745 entry, faddr); 2746 entry = entry->next; 2747 } 2748 } 2749 last_timestamp = map->timestamp; 2750 if (rv != KERN_SUCCESS) { 2751 vm_map_wire_entry_failure(map, entry, faddr); 2752 end = entry->end; 2753 goto done; 2754 } 2755 } else if (!user_wire || 2756 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { 2757 entry->wired_count++; 2758 } 2759 /* 2760 * Check the map for holes in the specified region. 2761 * If VM_MAP_WIRE_HOLESOK was specified, skip this check. 
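 * (A hole is a gap between consecutive entries, that is,
 * entry->end != entry->next->start.)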
2762 */ 2763 next_entry: 2764 if ((flags & VM_MAP_WIRE_HOLESOK) == 0 && 2765 entry->end < end && entry->next->start > entry->end) { 2766 end = entry->end; 2767 rv = KERN_INVALID_ADDRESS; 2768 goto done; 2769 } 2770 entry = entry->next; 2771 } 2772 rv = KERN_SUCCESS; 2773done: 2774 need_wakeup = FALSE; 2775 if (first_entry == NULL) { 2776 result = vm_map_lookup_entry(map, start, &first_entry); 2777 if (!result && (flags & VM_MAP_WIRE_HOLESOK)) 2778 first_entry = first_entry->next; 2779 else 2780 KASSERT(result, ("vm_map_wire: lookup failed")); 2781 } 2782 for (entry = first_entry; entry->start < end; entry = entry->next) { 2783 /* 2784 * If VM_MAP_WIRE_HOLESOK was specified, an empty 2785 * space in the unwired region could have been mapped 2786 * while the map lock was dropped for faulting in the 2787 * pages or draining MAP_ENTRY_IN_TRANSITION. 2788 * Moreover, another thread could be simultaneously 2789 * wiring this new mapping entry. Detect these cases 2790 * and skip any entries marked as in transition not by us. 2791 */ 2792 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || 2793 entry->wiring_thread != curthread) { 2794 KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, 2795 ("vm_map_wire: !HOLESOK and new/changed entry")); 2796 continue; 2797 } 2798 2799 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) 2800 goto next_entry_done; 2801 2802 if (rv == KERN_SUCCESS) { 2803 if (user_wire) 2804 entry->eflags |= MAP_ENTRY_USER_WIRED; 2805 } else if (entry->wired_count == -1) { 2806 /* 2807 * Wiring failed on this entry. Thus, unwiring is 2808 * unnecessary. 2809 */ 2810 entry->wired_count = 0; 2811 } else if (!user_wire || 2812 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { 2813 /* 2814 * Undo the wiring. Wiring succeeded on this entry 2815 * but failed on a later entry. 2816 */ 2817 if (entry->wired_count == 1) 2818 vm_map_entry_unwire(map, entry); 2819 else 2820 entry->wired_count--; 2821 } 2822 next_entry_done: 2823 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, 2824 ("vm_map_wire: in-transition flag missing %p", entry)); 2825 KASSERT(entry->wiring_thread == curthread, 2826 ("vm_map_wire: alien wire %p", entry)); 2827 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION | 2828 MAP_ENTRY_WIRE_SKIPPED); 2829 entry->wiring_thread = NULL; 2830 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { 2831 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; 2832 need_wakeup = TRUE; 2833 } 2834 vm_map_simplify_entry(map, entry); 2835 } 2836 vm_map_unlock(map); 2837 if (need_wakeup) 2838 vm_map_wakeup(map); 2839 return (rv); 2840} 2841 2842/* 2843 * vm_map_sync 2844 * 2845 * Push any dirty cached pages in the address range to their pager. 2846 * If syncio is TRUE, dirty pages are written synchronously. 2847 * If invalidate is TRUE, any cached pages are freed as well. 2848 * 2849 * If the size of the region from start to end is zero, we are 2850 * supposed to flush all modified pages within the region containing 2851 * start. Unfortunately, a region can be split or coalesced with 2852 * neighboring regions, making it difficult to determine what the 2853 * original region was. Therefore, we approximate this requirement by 2854 * flushing the current region containing start. 2855 * 2856 * Returns an error if any part of the specified range is not mapped. 
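 *
 *	Illustrative sketch of a caller (hypothetical, in the style of an
 *	msync(2) implementation; variable names are invented):
 *
 *		rv = vm_map_sync(map, addr, addr + size, TRUE, FALSE);
 *		if (rv != KERN_SUCCESS)
 *			return (vm_mmap_to_errno(rv));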
 2857 */ 2858int 2859vm_map_sync( 2860 vm_map_t map, 2861 vm_offset_t start, 2862 vm_offset_t end, 2863 boolean_t syncio, 2864 boolean_t invalidate) 2865{ 2866 vm_map_entry_t current; 2867 vm_map_entry_t entry; 2868 vm_size_t size; 2869 vm_object_t object; 2870 vm_ooffset_t offset; 2871 unsigned int last_timestamp; 2872 boolean_t failed; 2873 2874 vm_map_lock_read(map); 2875 VM_MAP_RANGE_CHECK(map, start, end); 2876 if (!vm_map_lookup_entry(map, start, &entry)) { 2877 vm_map_unlock_read(map); 2878 return (KERN_INVALID_ADDRESS); 2879 } else if (start == end) { 2880 start = entry->start; 2881 end = entry->end; 2882 } 2883 /* 2884 * Make a first pass to check for user-wired memory and holes. 2885 */ 2886 for (current = entry; current->start < end; current = current->next) { 2887 if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { 2888 vm_map_unlock_read(map); 2889 return (KERN_INVALID_ARGUMENT); 2890 } 2891 if (end > current->end && 2892 current->end != current->next->start) { 2893 vm_map_unlock_read(map); 2894 return (KERN_INVALID_ADDRESS); 2895 } 2896 } 2897 2898 if (invalidate) 2899 pmap_remove(map->pmap, start, end); 2900 failed = FALSE; 2901 2902 /* 2903 * Make a second pass, cleaning/uncaching pages from the indicated 2904 * objects as we go. 2905 */ 2906 for (current = entry; current->start < end;) { 2907 offset = current->offset + (start - current->start); 2908 size = (end <= current->end ? end : current->end) - start; 2909 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { 2910 vm_map_t smap; 2911 vm_map_entry_t tentry; 2912 vm_size_t tsize; 2913 2914 smap = current->object.sub_map; 2915 vm_map_lock_read(smap); 2916 (void) vm_map_lookup_entry(smap, offset, &tentry); 2917 tsize = tentry->end - offset; 2918 if (tsize < size) 2919 size = tsize; 2920 object = tentry->object.vm_object; 2921 offset = tentry->offset + (offset - tentry->start); 2922 vm_map_unlock_read(smap); 2923 } else { 2924 object = current->object.vm_object; 2925 } 2926 vm_object_reference(object); 2927 last_timestamp = map->timestamp; 2928 vm_map_unlock_read(map); 2929 if (!vm_object_sync(object, offset, size, syncio, invalidate)) 2930 failed = TRUE; 2931 start += size; 2932 vm_object_deallocate(object); 2933 vm_map_lock_read(map); 2934 if (last_timestamp == map->timestamp || 2935 !vm_map_lookup_entry(map, start, &current)) 2936 current = current->next; 2937 } 2938 2939 vm_map_unlock_read(map); 2940 return (failed ? KERN_FAILURE : KERN_SUCCESS); 2941} 2942 2943/* 2944 * vm_map_entry_unwire: [ internal use only ] 2945 * 2946 * Make the region specified by this entry pageable. 2947 * 2948 * The map in question should be locked. 2949 * [This is the reason for this routine's existence.] 2950 */ 2951static void 2952vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) 2953{ 2954 2955 VM_MAP_ASSERT_LOCKED(map); 2956 KASSERT(entry->wired_count > 0, 2957 ("vm_map_entry_unwire: entry %p isn't wired", entry)); 2958 pmap_unwire(map->pmap, entry->start, entry->end); 2959 vm_object_unwire(entry->object.vm_object, entry->offset, entry->end - 2960 entry->start, PQ_ACTIVE); 2961 entry->wired_count = 0; 2962} 2963 2964static void 2965vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map) 2966{ 2967 2968 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) 2969 vm_object_deallocate(entry->object.vm_object); 2970 uma_zfree(system_map ? kmapentzone : mapentzone, entry); 2971} 2972 2973/* 2974 * vm_map_entry_delete: [ internal use only ] 2975 * 2976 * Deallocate the given entry from the target map.
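 *
 *	For user maps the entry is not freed immediately; it is queued on
 *	curthread->td_map_def_user and released by
 *	vm_map_process_deferred() after the map lock is dropped.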
2977 */ 2978static void 2979vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) 2980{ 2981 vm_object_t object; 2982 vm_pindex_t offidxstart, offidxend, count, size1; 2983 vm_size_t size; 2984 2985 vm_map_entry_unlink(map, entry); 2986 object = entry->object.vm_object; 2987 2988 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) { 2989 MPASS(entry->cred == NULL); 2990 MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0); 2991 MPASS(object == NULL); 2992 vm_map_entry_deallocate(entry, map->system_map); 2993 return; 2994 } 2995 2996 size = entry->end - entry->start; 2997 map->size -= size; 2998 2999 if (entry->cred != NULL) { 3000 swap_release_by_cred(size, entry->cred); 3001 crfree(entry->cred); 3002 } 3003 3004 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && 3005 (object != NULL)) { 3006 KASSERT(entry->cred == NULL || object->cred == NULL || 3007 (entry->eflags & MAP_ENTRY_NEEDS_COPY), 3008 ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry)); 3009 count = atop(size); 3010 offidxstart = OFF_TO_IDX(entry->offset); 3011 offidxend = offidxstart + count; 3012 VM_OBJECT_WLOCK(object); 3013 if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT | 3014 OBJ_ONEMAPPING)) == OBJ_ONEMAPPING || 3015 object == kernel_object || object == kmem_object)) { 3016 vm_object_collapse(object); 3017 3018 /* 3019 * The option OBJPR_NOTMAPPED can be passed here 3020 * because vm_map_delete() already performed 3021 * pmap_remove() on the only mapping to this range 3022 * of pages. 3023 */ 3024 vm_object_page_remove(object, offidxstart, offidxend, 3025 OBJPR_NOTMAPPED); 3026 if (object->type == OBJT_SWAP) 3027 swap_pager_freespace(object, offidxstart, 3028 count); 3029 if (offidxend >= object->size && 3030 offidxstart < object->size) { 3031 size1 = object->size; 3032 object->size = offidxstart; 3033 if (object->cred != NULL) { 3034 size1 -= object->size; 3035 KASSERT(object->charge >= ptoa(size1), 3036 ("object %p charge < 0", object)); 3037 swap_release_by_cred(ptoa(size1), 3038 object->cred); 3039 object->charge -= ptoa(size1); 3040 } 3041 } 3042 } 3043 VM_OBJECT_WUNLOCK(object); 3044 } else 3045 entry->object.vm_object = NULL; 3046 if (map->system_map) 3047 vm_map_entry_deallocate(entry, TRUE); 3048 else { 3049 entry->next = curthread->td_map_def_user; 3050 curthread->td_map_def_user = entry; 3051 } 3052} 3053 3054/* 3055 * vm_map_delete: [ internal use only ] 3056 * 3057 * Deallocates the given address range from the target 3058 * map. 3059 */ 3060int 3061vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) 3062{ 3063 vm_map_entry_t entry; 3064 vm_map_entry_t first_entry; 3065 3066 VM_MAP_ASSERT_LOCKED(map); 3067 if (start == end) 3068 return (KERN_SUCCESS); 3069 3070 /* 3071 * Find the start of the region, and clip it 3072 */ 3073 if (!vm_map_lookup_entry(map, start, &first_entry)) 3074 entry = first_entry->next; 3075 else { 3076 entry = first_entry; 3077 vm_map_clip_start(map, entry, start); 3078 } 3079 3080 /* 3081 * Step through all entries in this region 3082 */ 3083 while (entry->start < end) { 3084 vm_map_entry_t next; 3085 3086 /* 3087 * Wait for wiring or unwiring of an entry to complete. 3088 * Also wait for any system wirings to disappear on 3089 * user maps. 
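 * System wirings are expected to be transient, so waiting for
 * them to drain is preferable to failing the deletion.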
3090 */ 3091 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || 3092 (vm_map_pmap(map) != kernel_pmap && 3093 vm_map_entry_system_wired_count(entry) != 0)) { 3094 unsigned int last_timestamp; 3095 vm_offset_t saved_start; 3096 vm_map_entry_t tmp_entry; 3097 3098 saved_start = entry->start; 3099 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 3100 last_timestamp = map->timestamp; 3101 (void) vm_map_unlock_and_wait(map, 0); 3102 vm_map_lock(map); 3103 if (last_timestamp + 1 != map->timestamp) { 3104 /* 3105 * Look again for the entry because the map was 3106 * modified while it was unlocked. 3107 * Specifically, the entry may have been 3108 * clipped, merged, or deleted. 3109 */ 3110 if (!vm_map_lookup_entry(map, saved_start, 3111 &tmp_entry)) 3112 entry = tmp_entry->next; 3113 else { 3114 entry = tmp_entry; 3115 vm_map_clip_start(map, entry, 3116 saved_start); 3117 } 3118 } 3119 continue; 3120 } 3121 vm_map_clip_end(map, entry, end); 3122 3123 next = entry->next; 3124 3125 /* 3126 * Unwire before removing addresses from the pmap; otherwise, 3127 * unwiring will put the entries back in the pmap. 3128 */ 3129 if (entry->wired_count != 0) 3130 vm_map_entry_unwire(map, entry); 3131 3132 /* 3133 * Remove mappings for the pages, but only if the 3134 * mappings could exist. For instance, it does not 3135 * make sense to call pmap_remove() for guard entries. 3136 */ 3137 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || 3138 entry->object.vm_object != NULL) 3139 pmap_remove(map->pmap, entry->start, entry->end); 3140 3141 /* 3142 * Delete the entry only after removing all pmap 3143 * entries pointing to its pages. (Otherwise, its 3144 * page frames may be reallocated, and any modify bits 3145 * will be set in the wrong object!) 3146 */ 3147 vm_map_entry_delete(map, entry); 3148 entry = next; 3149 } 3150 return (KERN_SUCCESS); 3151} 3152 3153/* 3154 * vm_map_remove: 3155 * 3156 * Remove the given address range from the target map. 3157 * This is the exported form of vm_map_delete. 3158 */ 3159int 3160vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) 3161{ 3162 int result; 3163 3164 vm_map_lock(map); 3165 VM_MAP_RANGE_CHECK(map, start, end); 3166 result = vm_map_delete(map, start, end); 3167 vm_map_unlock(map); 3168 return (result); 3169} 3170 3171/* 3172 * vm_map_check_protection: 3173 * 3174 * Assert that the target map allows the specified privilege on the 3175 * entire address region given. The entire region must be allocated. 3176 * 3177 * WARNING! This code does not and should not check whether the 3178 * contents of the region is accessible. For example a smaller file 3179 * might be mapped into a larger address space. 3180 * 3181 * NOTE! This code is also called by munmap(). 3182 * 3183 * The map must be locked. A read lock is sufficient. 3184 */ 3185boolean_t 3186vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, 3187 vm_prot_t protection) 3188{ 3189 vm_map_entry_t entry; 3190 vm_map_entry_t tmp_entry; 3191 3192 if (!vm_map_lookup_entry(map, start, &tmp_entry)) 3193 return (FALSE); 3194 entry = tmp_entry; 3195 3196 while (start < end) { 3197 /* 3198 * No holes allowed! 3199 */ 3200 if (start < entry->start) 3201 return (FALSE); 3202 /* 3203 * Check protection associated with entry. 
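 * For example, an entry granting VM_PROT_READ | VM_PROT_WRITE
 * satisfies a check for VM_PROT_READ but not one for
 * VM_PROT_READ | VM_PROT_EXECUTE.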
3204 */ 3205 if ((entry->protection & protection) != protection) 3206 return (FALSE); 3207 /* go to next entry */ 3208 start = entry->end; 3209 entry = entry->next; 3210 } 3211 return (TRUE); 3212} 3213 3214/* 3215 * vm_map_copy_entry: 3216 * 3217 * Copies the contents of the source entry to the destination 3218 * entry. The entries *must* be aligned properly. 3219 */ 3220static void 3221vm_map_copy_entry( 3222 vm_map_t src_map, 3223 vm_map_t dst_map, 3224 vm_map_entry_t src_entry, 3225 vm_map_entry_t dst_entry, 3226 vm_ooffset_t *fork_charge) 3227{ 3228 vm_object_t src_object; 3229 vm_map_entry_t fake_entry; 3230 vm_offset_t size; 3231 struct ucred *cred; 3232 int charged; 3233 3234 VM_MAP_ASSERT_LOCKED(dst_map); 3235 3236 if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) 3237 return; 3238 3239 if (src_entry->wired_count == 0 || 3240 (src_entry->protection & VM_PROT_WRITE) == 0) { 3241 /* 3242 * If the source entry is marked needs_copy, it is already 3243 * write-protected. 3244 */ 3245 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && 3246 (src_entry->protection & VM_PROT_WRITE) != 0) { 3247 pmap_protect(src_map->pmap, 3248 src_entry->start, 3249 src_entry->end, 3250 src_entry->protection & ~VM_PROT_WRITE); 3251 } 3252 3253 /* 3254 * Make a copy of the object. 3255 */ 3256 size = src_entry->end - src_entry->start; 3257 if ((src_object = src_entry->object.vm_object) != NULL) { 3258 VM_OBJECT_WLOCK(src_object); 3259 charged = ENTRY_CHARGED(src_entry); 3260 if (src_object->handle == NULL && 3261 (src_object->type == OBJT_DEFAULT || 3262 src_object->type == OBJT_SWAP)) { 3263 vm_object_collapse(src_object); 3264 if ((src_object->flags & (OBJ_NOSPLIT | 3265 OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { 3266 vm_object_split(src_entry); 3267 src_object = 3268 src_entry->object.vm_object; 3269 } 3270 } 3271 vm_object_reference_locked(src_object); 3272 vm_object_clear_flag(src_object, OBJ_ONEMAPPING); 3273 if (src_entry->cred != NULL && 3274 !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { 3275 KASSERT(src_object->cred == NULL, 3276 ("OVERCOMMIT: vm_map_copy_entry: cred %p", 3277 src_object)); 3278 src_object->cred = src_entry->cred; 3279 src_object->charge = size; 3280 } 3281 VM_OBJECT_WUNLOCK(src_object); 3282 dst_entry->object.vm_object = src_object; 3283 if (charged) { 3284 cred = curthread->td_ucred; 3285 crhold(cred); 3286 dst_entry->cred = cred; 3287 *fork_charge += size; 3288 if (!(src_entry->eflags & 3289 MAP_ENTRY_NEEDS_COPY)) { 3290 crhold(cred); 3291 src_entry->cred = cred; 3292 *fork_charge += size; 3293 } 3294 } 3295 src_entry->eflags |= MAP_ENTRY_COW | 3296 MAP_ENTRY_NEEDS_COPY; 3297 dst_entry->eflags |= MAP_ENTRY_COW | 3298 MAP_ENTRY_NEEDS_COPY; 3299 dst_entry->offset = src_entry->offset; 3300 if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) { 3301 /* 3302 * MAP_ENTRY_VN_WRITECNT cannot 3303 * indicate write reference from 3304 * src_entry, since the entry is 3305 * marked as needs copy. Allocate a 3306 * fake entry that is used to 3307 * decrement object->un_pager.vnp.writecount 3308 * at the appropriate time. Attach 3309 * fake_entry to the deferred list. 
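 * vm_map_process_deferred() recognizes MAP_ENTRY_VN_WRITECNT
 * on the fake entry and calls
 * vnode_pager_release_writecount() for its range.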
3310 */ 3311 fake_entry = vm_map_entry_create(dst_map); 3312 fake_entry->eflags = MAP_ENTRY_VN_WRITECNT; 3313 src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT; 3314 vm_object_reference(src_object); 3315 fake_entry->object.vm_object = src_object; 3316 fake_entry->start = src_entry->start; 3317 fake_entry->end = src_entry->end; 3318 fake_entry->next = curthread->td_map_def_user; 3319 curthread->td_map_def_user = fake_entry; 3320 } 3321 3322 pmap_copy(dst_map->pmap, src_map->pmap, 3323 dst_entry->start, dst_entry->end - dst_entry->start, 3324 src_entry->start); 3325 } else { 3326 dst_entry->object.vm_object = NULL; 3327 dst_entry->offset = 0; 3328 if (src_entry->cred != NULL) { 3329 dst_entry->cred = curthread->td_ucred; 3330 crhold(dst_entry->cred); 3331 *fork_charge += size; 3332 } 3333 } 3334 } else { 3335 /* 3336 * We don't want to make writeable wired pages copy-on-write. 3337 * Immediately copy these pages into the new map by simulating 3338 * page faults. The new pages are pageable. 3339 */ 3340 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, 3341 fork_charge); 3342 } 3343} 3344 3345/* 3346 * vmspace_map_entry_forked: 3347 * Update the newly-forked vmspace each time a map entry is inherited 3348 * or copied. The values for vm_dsize and vm_tsize are approximate 3349 * (and mostly-obsolete ideas in the face of mmap(2) et al.) 3350 */ 3351static void 3352vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, 3353 vm_map_entry_t entry) 3354{ 3355 vm_size_t entrysize; 3356 vm_offset_t newend; 3357 3358 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) 3359 return; 3360 entrysize = entry->end - entry->start; 3361 vm2->vm_map.size += entrysize; 3362 if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { 3363 vm2->vm_ssize += btoc(entrysize); 3364 } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && 3365 entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { 3366 newend = MIN(entry->end, 3367 (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); 3368 vm2->vm_dsize += btoc(newend - entry->start); 3369 } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && 3370 entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { 3371 newend = MIN(entry->end, 3372 (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); 3373 vm2->vm_tsize += btoc(newend - entry->start); 3374 } 3375} 3376 3377/* 3378 * vmspace_fork: 3379 * Create a new process vmspace structure and vm_map 3380 * based on those of an existing process. The new map 3381 * is based on the old map, according to the inheritance 3382 * values on the regions in that map. 3383 * 3384 * XXX It might be worth coalescing the entries added to the new vmspace. 3385 * 3386 * The source map must not be locked. 3387 */ 3388struct vmspace * 3389vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) 3390{ 3391 struct vmspace *vm2; 3392 vm_map_t new_map, old_map; 3393 vm_map_entry_t new_entry, old_entry; 3394 vm_object_t object; 3395 int locked; 3396 vm_inherit_t inh; 3397 3398 old_map = &vm1->vm_map; 3399 /* Copy immutable fields of vm1 to vm2. 
*/ 3400 vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map), 3401 pmap_pinit); 3402 if (vm2 == NULL) 3403 return (NULL); 3404 vm2->vm_taddr = vm1->vm_taddr; 3405 vm2->vm_daddr = vm1->vm_daddr; 3406 vm2->vm_maxsaddr = vm1->vm_maxsaddr; 3407 vm_map_lock(old_map); 3408 if (old_map->busy) 3409 vm_map_wait_busy(old_map); 3410 new_map = &vm2->vm_map; 3411 locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */ 3412 KASSERT(locked, ("vmspace_fork: lock failed")); 3413 3414 old_entry = old_map->header.next; 3415 3416 while (old_entry != &old_map->header) { 3417 if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) 3418 panic("vm_map_fork: encountered a submap"); 3419 3420 inh = old_entry->inheritance; 3421 if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 && 3422 inh != VM_INHERIT_NONE) 3423 inh = VM_INHERIT_COPY; 3424 3425 switch (inh) { 3426 case VM_INHERIT_NONE: 3427 break; 3428 3429 case VM_INHERIT_SHARE: 3430 /* 3431 * Clone the entry, creating the shared object if necessary. 3432 */ 3433 object = old_entry->object.vm_object; 3434 if (object == NULL) { 3435 object = vm_object_allocate(OBJT_DEFAULT, 3436 atop(old_entry->end - old_entry->start)); 3437 old_entry->object.vm_object = object; 3438 old_entry->offset = 0; 3439 if (old_entry->cred != NULL) { 3440 object->cred = old_entry->cred; 3441 object->charge = old_entry->end - 3442 old_entry->start; 3443 old_entry->cred = NULL; 3444 } 3445 } 3446 3447 /* 3448 * Add the reference before calling vm_object_shadow 3449 * to insure that a shadow object is created. 3450 */ 3451 vm_object_reference(object); 3452 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { 3453 vm_object_shadow(&old_entry->object.vm_object, 3454 &old_entry->offset, 3455 old_entry->end - old_entry->start); 3456 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; 3457 /* Transfer the second reference too. */ 3458 vm_object_reference( 3459 old_entry->object.vm_object); 3460 3461 /* 3462 * As in vm_map_simplify_entry(), the 3463 * vnode lock will not be acquired in 3464 * this call to vm_object_deallocate(). 3465 */ 3466 vm_object_deallocate(object); 3467 object = old_entry->object.vm_object; 3468 } 3469 VM_OBJECT_WLOCK(object); 3470 vm_object_clear_flag(object, OBJ_ONEMAPPING); 3471 if (old_entry->cred != NULL) { 3472 KASSERT(object->cred == NULL, ("vmspace_fork both cred")); 3473 object->cred = old_entry->cred; 3474 object->charge = old_entry->end - old_entry->start; 3475 old_entry->cred = NULL; 3476 } 3477 3478 /* 3479 * Assert the correct state of the vnode 3480 * v_writecount while the object is locked, to 3481 * not relock it later for the assertion 3482 * correctness. 3483 */ 3484 if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT && 3485 object->type == OBJT_VNODE) { 3486 KASSERT(((struct vnode *)object->handle)-> 3487 v_writecount > 0, 3488 ("vmspace_fork: v_writecount %p", object)); 3489 KASSERT(object->un_pager.vnp.writemappings > 0, 3490 ("vmspace_fork: vnp.writecount %p", 3491 object)); 3492 } 3493 VM_OBJECT_WUNLOCK(object); 3494 3495 /* 3496 * Clone the entry, referencing the shared object. 
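 * The child's entry starts with the wiring and in-transition
 * state cleared; wired pages are not inherited across fork.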
3497 */ 3498 new_entry = vm_map_entry_create(new_map); 3499 *new_entry = *old_entry; 3500 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | 3501 MAP_ENTRY_IN_TRANSITION); 3502 new_entry->wiring_thread = NULL; 3503 new_entry->wired_count = 0; 3504 if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) { 3505 vnode_pager_update_writecount(object, 3506 new_entry->start, new_entry->end); 3507 } 3508 3509 /* 3510 * Insert the entry into the new map -- we know we're 3511 * inserting at the end of the new map. 3512 */ 3513 vm_map_entry_link(new_map, new_map->header.prev, 3514 new_entry); 3515 vmspace_map_entry_forked(vm1, vm2, new_entry); 3516 3517 /* 3518 * Update the physical map 3519 */ 3520 pmap_copy(new_map->pmap, old_map->pmap, 3521 new_entry->start, 3522 (old_entry->end - old_entry->start), 3523 old_entry->start); 3524 break; 3525 3526 case VM_INHERIT_COPY: 3527 /* 3528 * Clone the entry and link into the map. 3529 */ 3530 new_entry = vm_map_entry_create(new_map); 3531 *new_entry = *old_entry; 3532 /* 3533 * Copied entry is COW over the old object. 3534 */ 3535 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | 3536 MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT); 3537 new_entry->wiring_thread = NULL; 3538 new_entry->wired_count = 0; 3539 new_entry->object.vm_object = NULL; 3540 new_entry->cred = NULL; 3541 vm_map_entry_link(new_map, new_map->header.prev, 3542 new_entry); 3543 vmspace_map_entry_forked(vm1, vm2, new_entry); 3544 vm_map_copy_entry(old_map, new_map, old_entry, 3545 new_entry, fork_charge); 3546 break; 3547 3548 case VM_INHERIT_ZERO: 3549 /* 3550 * Create a new anonymous mapping entry modelled from 3551 * the old one. 3552 */ 3553 new_entry = vm_map_entry_create(new_map); 3554 memset(new_entry, 0, sizeof(*new_entry)); 3555 3556 new_entry->start = old_entry->start; 3557 new_entry->end = old_entry->end; 3558 new_entry->eflags = old_entry->eflags & 3559 ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | 3560 MAP_ENTRY_VN_WRITECNT); 3561 new_entry->protection = old_entry->protection; 3562 new_entry->max_protection = old_entry->max_protection; 3563 new_entry->inheritance = VM_INHERIT_ZERO; 3564 3565 vm_map_entry_link(new_map, new_map->header.prev, 3566 new_entry); 3567 vmspace_map_entry_forked(vm1, vm2, new_entry); 3568 3569 new_entry->cred = curthread->td_ucred; 3570 crhold(new_entry->cred); 3571 *fork_charge += (new_entry->end - new_entry->start); 3572 3573 break; 3574 } 3575 old_entry = old_entry->next; 3576 } 3577 /* 3578 * Use inlined vm_map_unlock() to postpone handling the deferred 3579 * map entries, which cannot be done until both old_map and 3580 * new_map locks are released. 3581 */ 3582 sx_xunlock(&old_map->lock); 3583 sx_xunlock(&new_map->lock); 3584 vm_map_process_deferred(); 3585 3586 return (vm2); 3587} 3588 3589/* 3590 * Create a process's stack for exec_new_vmspace(). This function is never 3591 * asked to wire the newly created stack. 3592 */ 3593int 3594vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, 3595 vm_prot_t prot, vm_prot_t max, int cow) 3596{ 3597 vm_size_t growsize, init_ssize; 3598 rlim_t vmemlim; 3599 int rv; 3600 3601 MPASS((map->flags & MAP_WIREFUTURE) == 0); 3602 growsize = sgrowsiz; 3603 init_ssize = (max_ssize < growsize) ? 
max_ssize : growsize; 3604 vm_map_lock(map); 3605 vmemlim = lim_cur(curthread, RLIMIT_VMEM); 3606 /* If we would blow our VMEM resource limit, no go */ 3607 if (map->size + init_ssize > vmemlim) { 3608 rv = KERN_NO_SPACE; 3609 goto out; 3610 } 3611 rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot, 3612 max, cow); 3613out: 3614 vm_map_unlock(map); 3615 return (rv); 3616} 3617 3618static int stack_guard_page = 1; 3619SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN, 3620 &stack_guard_page, 0, 3621 "Specifies the number of guard pages for a stack that grows"); 3622 3623static int 3624vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, 3625 vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow) 3626{ 3627 vm_map_entry_t new_entry, prev_entry; 3628 vm_offset_t bot, gap_bot, gap_top, top; 3629 vm_size_t init_ssize, sgp; 3630 int orient, rv; 3631 3632 /* 3633 * The stack orientation is piggybacked with the cow argument. 3634 * Extract it into orient and mask the cow argument so that we 3635 * don't pass it around further. 3636 */ 3637 orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP); 3638 KASSERT(orient != 0, ("No stack grow direction")); 3639 KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP), 3640 ("bi-dir stack")); 3641 3642 if (addrbos < vm_map_min(map) || 3643 addrbos + max_ssize > vm_map_max(map) || 3644 addrbos + max_ssize <= addrbos) 3645 return (KERN_INVALID_ADDRESS); 3646 sgp = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 : 3647 (vm_size_t)stack_guard_page * PAGE_SIZE; 3648 if (sgp >= max_ssize) 3649 return (KERN_INVALID_ARGUMENT); 3650 3651 init_ssize = growsize; 3652 if (max_ssize < init_ssize + sgp) 3653 init_ssize = max_ssize - sgp; 3654 3655 /* If addr is already mapped, no go */ 3656 if (vm_map_lookup_entry(map, addrbos, &prev_entry)) 3657 return (KERN_NO_SPACE); 3658 3659 /* 3660 * If we can't accommodate max_ssize in the current mapping, no go. 3661 */ 3662 if (prev_entry->next->start < addrbos + max_ssize) 3663 return (KERN_NO_SPACE); 3664 3665 /* 3666 * We initially map a stack of only init_ssize. We will grow as 3667 * needed later. Depending on the orientation of the stack (i.e. 3668 * the grow direction) we either map at the top of the range, the 3669 * bottom of the range or in the middle. 3670 * 3671 * Note: we would normally expect prot and max to be VM_PROT_ALL, 3672 * and cow to be 0. Possibly we should eliminate these as input 3673 * parameters, and just pass these values here in the insert call. 
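 *
 *	For MAP_STACK_GROWS_DOWN the layout within
 *	[addrbos, addrbos + max_ssize) is:
 *
 *		[addrbos, bot):  guard gap, grown into on demand
 *		[bot, top):      initial stack of init_ssize bytes
 *
 *	and the mirror image of this for MAP_STACK_GROWS_UP.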
3674 */ 3675 if (orient == MAP_STACK_GROWS_DOWN) { 3676 bot = addrbos + max_ssize - init_ssize; 3677 top = bot + init_ssize; 3678 gap_bot = addrbos; 3679 gap_top = bot; 3680 } else /* if (orient == MAP_STACK_GROWS_UP) */ { 3681 bot = addrbos; 3682 top = bot + init_ssize; 3683 gap_bot = top; 3684 gap_top = addrbos + max_ssize; 3685 } 3686 rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); 3687 if (rv != KERN_SUCCESS) 3688 return (rv); 3689 new_entry = prev_entry->next; 3690 KASSERT(new_entry->end == top || new_entry->start == bot, 3691 ("Bad entry start/end for new stack entry")); 3692 KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 || 3693 (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, 3694 ("new entry lacks MAP_ENTRY_GROWS_DOWN")); 3695 KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || 3696 (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, 3697 ("new entry lacks MAP_ENTRY_GROWS_UP")); 3698 if (gap_bot == gap_top) 3699 return (KERN_SUCCESS); 3700 rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE, 3701 VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ? 3702 MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP)); 3703 if (rv == KERN_SUCCESS) { 3704 /* 3705 * Gap can never successfully handle a fault, so 3706 * read-ahead logic is never used for it. Re-use 3707 * next_read of the gap entry to store 3708 * stack_guard_page for vm_map_growstack(). 3709 */ 3710 if (orient == MAP_STACK_GROWS_DOWN) 3711 new_entry->prev->next_read = sgp; 3712 else 3713 new_entry->next->next_read = sgp; 3714 } else { 3715 (void)vm_map_delete(map, bot, top); 3716 } 3717 return (rv); 3718} 3719 3720/* 3721 * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we 3722 * successfully grow the stack. 3723 */ 3724static int 3725vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry) 3726{ 3727 vm_map_entry_t stack_entry; 3728 struct proc *p; 3729 struct vmspace *vm; 3730 struct ucred *cred; 3731 vm_offset_t gap_end, gap_start, grow_start; 3732 size_t grow_amount, guard, max_grow; 3733 rlim_t lmemlim, stacklim, vmemlim; 3734 int rv, rv1; 3735 bool gap_deleted, grow_down, is_procstack; 3736#ifdef notyet 3737 uint64_t limit; 3738#endif 3739#ifdef RACCT 3740 int error; 3741#endif 3742 3743 p = curproc; 3744 vm = p->p_vmspace; 3745 3746 /* 3747 * Disallow stack growth when the access is performed by a 3748 * debugger or AIO daemon. The reason is that the wrong 3749 * resource limits are applied. 3750 */ 3751 if (p != initproc && (map != &p->p_vmspace->vm_map || 3752 p->p_textvp == NULL)) 3753 return (KERN_FAILURE); 3754 3755 MPASS(!map->system_map); 3756 3757 lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK); 3758 stacklim = lim_cur(curthread, RLIMIT_STACK); 3759 vmemlim = lim_cur(curthread, RLIMIT_VMEM); 3760retry: 3761 /* If addr is not in a hole for a stack grow area, no need to grow. 
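 * Otherwise, find the adjoining stack entry: for a
 * MAP_ENTRY_STACK_GAP_DN gap the stack sits just above the gap,
 * for a MAP_ENTRY_STACK_GAP_UP gap just below it.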
*/ 3762 if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry)) 3763 return (KERN_FAILURE); 3764 if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0) 3765 return (KERN_SUCCESS); 3766 if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) { 3767 stack_entry = gap_entry->next; 3768 if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 || 3769 stack_entry->start != gap_entry->end) 3770 return (KERN_FAILURE); 3771 grow_amount = round_page(stack_entry->start - addr); 3772 grow_down = true; 3773 } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) { 3774 stack_entry = gap_entry->prev; 3775 if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 || 3776 stack_entry->end != gap_entry->start) 3777 return (KERN_FAILURE); 3778 grow_amount = round_page(addr + 1 - stack_entry->end); 3779 grow_down = false; 3780 } else { 3781 return (KERN_FAILURE); 3782 } 3783 guard = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 : 3784 gap_entry->next_read; 3785 max_grow = gap_entry->end - gap_entry->start; 3786 if (guard > max_grow) 3787 return (KERN_NO_SPACE); 3788 max_grow -= guard; 3789 if (grow_amount > max_grow) 3790 return (KERN_NO_SPACE); 3791 3792 /* 3793 * If this is the main process stack, see if we're over the stack 3794 * limit. 3795 */ 3796 is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr && 3797 addr < (vm_offset_t)p->p_sysent->sv_usrstack; 3798 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) 3799 return (KERN_NO_SPACE); 3800 3801#ifdef RACCT 3802 if (racct_enable) { 3803 PROC_LOCK(p); 3804 if (is_procstack && racct_set(p, RACCT_STACK, 3805 ctob(vm->vm_ssize) + grow_amount)) { 3806 PROC_UNLOCK(p); 3807 return (KERN_NO_SPACE); 3808 } 3809 PROC_UNLOCK(p); 3810 } 3811#endif 3812 3813 grow_amount = roundup(grow_amount, sgrowsiz); 3814 if (grow_amount > max_grow) 3815 grow_amount = max_grow; 3816 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { 3817 grow_amount = trunc_page((vm_size_t)stacklim) - 3818 ctob(vm->vm_ssize); 3819 } 3820 3821#ifdef notyet 3822 PROC_LOCK(p); 3823 limit = racct_get_available(p, RACCT_STACK); 3824 PROC_UNLOCK(p); 3825 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) 3826 grow_amount = limit - ctob(vm->vm_ssize); 3827#endif 3828 3829 if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) { 3830 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) { 3831 rv = KERN_NO_SPACE; 3832 goto out; 3833 } 3834#ifdef RACCT 3835 if (racct_enable) { 3836 PROC_LOCK(p); 3837 if (racct_set(p, RACCT_MEMLOCK, 3838 ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { 3839 PROC_UNLOCK(p); 3840 rv = KERN_NO_SPACE; 3841 goto out; 3842 } 3843 PROC_UNLOCK(p); 3844 } 3845#endif 3846 } 3847 3848 /* If we would blow our VMEM resource limit, no go */ 3849 if (map->size + grow_amount > vmemlim) { 3850 rv = KERN_NO_SPACE; 3851 goto out; 3852 } 3853#ifdef RACCT 3854 if (racct_enable) { 3855 PROC_LOCK(p); 3856 if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { 3857 PROC_UNLOCK(p); 3858 rv = KERN_NO_SPACE; 3859 goto out; 3860 } 3861 PROC_UNLOCK(p); 3862 } 3863#endif 3864 3865 if (vm_map_lock_upgrade(map)) { 3866 gap_entry = NULL; 3867 vm_map_lock_read(map); 3868 goto retry; 3869 } 3870 3871 if (grow_down) { 3872 grow_start = gap_entry->end - grow_amount; 3873 if (gap_entry->start + grow_amount == gap_entry->end) { 3874 gap_start = gap_entry->start; 3875 gap_end = gap_entry->end; 3876 vm_map_entry_delete(map, gap_entry); 3877 gap_deleted = true; 3878 } else { 3879 MPASS(gap_entry->start < gap_entry->end - grow_amount); 3880 
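			/*
			 * Shrink the gap from its top; the vacated range is
			 * claimed by the new stack entry inserted below.
			 */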
			gap_entry->end -= grow_amount;
			vm_map_entry_resize_free(map, gap_entry);
			gap_deleted = false;
		}
		rv = vm_map_insert(map, NULL, 0, grow_start,
		    grow_start + grow_amount,
		    stack_entry->protection, stack_entry->max_protection,
		    MAP_STACK_GROWS_DOWN);
		if (rv != KERN_SUCCESS) {
			if (gap_deleted) {
				rv1 = vm_map_insert(map, NULL, 0, gap_start,
				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
				MPASS(rv1 == KERN_SUCCESS);
			} else {
				gap_entry->end += grow_amount;
				vm_map_entry_resize_free(map, gap_entry);
			}
		}
	} else {
		grow_start = stack_entry->end;
		cred = stack_entry->cred;
		if (cred == NULL && stack_entry->object.vm_object != NULL)
			cred = stack_entry->object.vm_object->cred;
		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
			rv = KERN_NO_SPACE;
		/* Grow the underlying object if applicable. */
		else if (stack_entry->object.vm_object == NULL ||
		    vm_object_coalesce(stack_entry->object.vm_object,
		    stack_entry->offset,
		    (vm_size_t)(stack_entry->end - stack_entry->start),
		    (vm_size_t)grow_amount, cred != NULL)) {
			if (gap_entry->start + grow_amount == gap_entry->end)
				vm_map_entry_delete(map, gap_entry);
			else
				gap_entry->start += grow_amount;
			stack_entry->end += grow_amount;
			map->size += grow_amount;
			vm_map_entry_resize_free(map, stack_entry);
			rv = KERN_SUCCESS;
		} else
			rv = KERN_FAILURE;
	}
	if (rv == KERN_SUCCESS && is_procstack)
		vm->vm_ssize += btoc(grow_amount);

	/*
	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
	 */
	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
		vm_map_unlock(map);
		vm_map_wire(map, grow_start, grow_start + grow_amount,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		vm_map_lock_read(map);
	} else
		vm_map_lock_downgrade(map);

out:
#ifdef RACCT
	if (racct_enable && rv != KERN_SUCCESS) {
		PROC_LOCK(p);
		error = racct_set(p, RACCT_VMEM, map->size);
		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
		if (!old_mlock) {
			error = racct_set(p, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)));
			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
		}
		error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
		PROC_UNLOCK(p);
	}
#endif

	return (rv);
}

/*
 * Unshare the specified VM space for exec.  If other processes are
 * mapped to it, then create a new one.  The new vmspace starts out
 * empty.
 */
int
vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;

	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
	    ("vmspace_exec recursed"));
	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
	if (newvmspace == NULL)
		return (ENOMEM);
	newvmspace->vm_swrss = oldvmspace->vm_swrss;
	/*
	 * This code is written this way for prototype purposes.  The
	 * goal is to avoid running down the vmspace here, but to let
	 * the other processes that are still using the vmspace run it
	 * down eventually.  Even though there is little or no chance
	 * of blocking here, it is a good idea to keep this form for
	 * future mods.
	 */
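	/*
	 * Caller-side sketch (illustrative only; the exact cleanup
	 * lives in the exec code, not here): the TDP_EXECVMSPC flag
	 * set below tells the exec path that a vmspace swap happened,
	 * so that it can drop the old reference once it is safe to do
	 * so, along the lines of:
	 *
	 *	if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
	 *		td->td_pflags &= ~TDP_EXECVMSPC;
	 *		vmspace_free(oldvmspace);
	 *	}
	 */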
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	curthread->td_pflags |= TDP_EXECVMSPC;
	return (0);
}

/*
 * Unshare the specified VM space for forcing COW.  This
 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
 */
int
vmspace_unshare(struct proc *p)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;
	vm_ooffset_t fork_charge;

	if (oldvmspace->vm_refcnt == 1)
		return (0);
	fork_charge = 0;
	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
	if (newvmspace == NULL)
		return (ENOMEM);
	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
		vmspace_free(newvmspace);
		return (ENOMEM);
	}
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	vmspace_free(oldvmspace);
	return (0);
}

/*
 * vm_map_lookup:
 *
 *	Finds the VM object, offset, and
 *	protection for a given virtual address in the
 *	specified map, assuming a page fault of the
 *	type specified.
 *
 *	Leaves the map in question locked for read; return
 *	values are guaranteed until a vm_map_lookup_done
 *	call is performed.  Note that the map argument
 *	is in/out; the returned map must be used in
 *	the call to vm_map_lookup_done.
 *
 *	A handle (out_entry) is returned for use in
 *	vm_map_lookup_done, to make that fast.
 *
 *	If a lookup is requested with "write protection"
 *	specified, the map may be changed to perform virtual
 *	copying operations, although the data referenced will
 *	remain the same.
 */
int
vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
	      vm_offset_t vaddr,
	      vm_prot_t fault_typea,
	      vm_map_entry_t *out_entry,	/* OUT */
	      vm_object_t *object,		/* OUT */
	      vm_pindex_t *pindex,		/* OUT */
	      vm_prot_t *out_prot,		/* OUT */
	      boolean_t *wired)			/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type = fault_typea;
	vm_object_t eobject;
	vm_size_t size;
	struct ucred *cred;

RetryLookup:

	vm_map_lock_read(map);

RetryLookupLocked:
	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
		vm_map_unlock_read(map);
		return (KERN_INVALID_ADDRESS);
	}

	entry = *out_entry;

	/*
	 * Handle submaps.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
		vm_map_t old_map = map;

		*var_map = map = entry->object.sub_map;
		vm_map_unlock_read(old_map);
		goto RetryLookup;
	}

	/*
	 * Check whether this task is allowed to have this page.
	 */
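	/*
	 * Example (a sketch): for a write fault, fault_type contains
	 * VM_PROT_WRITE; if the entry allows only VM_PROT_READ, then
	 * (fault_type & prot) != fault_type below and the lookup fails
	 * with KERN_PROTECTION_FAILURE.
	 */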
	prot = entry->protection;
	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
		if (prot == VM_PROT_NONE && map != kernel_map &&
		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
			goto RetryLookupLocked;
	}
	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}
	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
	    ("entry %p flags %x", entry, entry->eflags));
	if ((fault_typea & VM_PROT_COPY) != 0 &&
	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
	    (entry->eflags & MAP_ENTRY_COW) == 0) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;
	size = entry->end - entry->start;
	/*
	 * If the entry was copy-on-write, we either make a shadow
	 * object now (for a write) or demote the permissions we hand
	 * back (for a read).
	 */
	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * If we want to write the page, we may as well handle that
		 * now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just demote the
		 * permissions allowed.
		 */
		if ((fault_type & VM_PROT_WRITE) != 0 ||
		    (fault_typea & VM_PROT_COPY) != 0) {
			/*
			 * Make a new object, and place it in the object
			 * chain.  Note that no new references have appeared
			 * -- one just moved from the map to the new
			 * object.
			 */
			if (vm_map_lock_upgrade(map))
				goto RetryLookup;

			if (entry->cred == NULL) {
				/*
				 * The debugger owner is charged for
				 * the memory.
				 */
				cred = curthread->td_ucred;
				crhold(cred);
				if (!swap_reserve_by_cred(size, cred)) {
					crfree(cred);
					vm_map_unlock(map);
					return (KERN_RESOURCE_SHORTAGE);
				}
				entry->cred = cred;
			}
			vm_object_shadow(&entry->object.vm_object,
			    &entry->offset, size);
			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
			eobject = entry->object.vm_object;
			if (eobject->cred != NULL) {
				/*
				 * The object was not shadowed.
				 */
				swap_release_by_cred(size, entry->cred);
				crfree(entry->cred);
				entry->cred = NULL;
			} else if (entry->cred != NULL) {
				VM_OBJECT_WLOCK(eobject);
				eobject->cred = entry->cred;
				eobject->charge = size;
				VM_OBJECT_WUNLOCK(eobject);
				entry->cred = NULL;
			}

			vm_map_lock_downgrade(map);
		} else {
			/*
			 * We're attempting to read a copy-on-write page --
			 * don't allow writes.
			 */
			prot &= ~VM_PROT_WRITE;
		}
	}

	/*
	 * Create an object if necessary.
	 */
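	/*
	 * A sketch of the case handled below: anonymous memory mapped
	 * via vm_map_insert(map, NULL, 0, ...) carries no VM object
	 * until it is first touched; the first fault reaches this
	 * point and a default (anonymous) object sized to the whole
	 * entry is allocated lazily.
	 */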
	if (entry->object.vm_object == NULL &&
	    !map->system_map) {
		if (vm_map_lock_upgrade(map))
			goto RetryLookup;
		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
		    atop(size));
		entry->offset = 0;
		if (entry->cred != NULL) {
			VM_OBJECT_WLOCK(entry->object.vm_object);
			entry->object.vm_object->cred = entry->cred;
			entry->object.vm_object->charge = size;
			VM_OBJECT_WUNLOCK(entry->object.vm_object);
			entry->cred = NULL;
		}
		vm_map_lock_downgrade(map);
	}

	/*
	 * Return the object/offset from this entry.  If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}

/*
 * vm_map_lookup_locked:
 *
 *	Lookup the faulting address.  A version of vm_map_lookup that returns
 *	KERN_FAILURE instead of blocking on map lock or memory allocation.
 */
int
vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
		     vm_offset_t vaddr,
		     vm_prot_t fault_typea,
		     vm_map_entry_t *out_entry,	/* OUT */
		     vm_object_t *object,	/* OUT */
		     vm_pindex_t *pindex,	/* OUT */
		     vm_prot_t *out_prot,	/* OUT */
		     boolean_t *wired)		/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type = fault_typea;

	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry))
		return (KERN_INVALID_ADDRESS);

	entry = *out_entry;

	/*
	 * Fail if the entry refers to a submap.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
		return (KERN_FAILURE);

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
	if ((fault_type & prot) != fault_type)
		return (KERN_PROTECTION_FAILURE);

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;

	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * Fail if the entry was copy-on-write for a write fault.
		 */
		if (fault_type & VM_PROT_WRITE)
			return (KERN_FAILURE);
		/*
		 * We're attempting to read a copy-on-write page --
		 * don't allow writes.
		 */
		prot &= ~VM_PROT_WRITE;
	}

	/*
	 * Fail if an object should be created.
	 */
	if (entry->object.vm_object == NULL && !map->system_map)
		return (KERN_FAILURE);

	/*
	 * Return the object/offset from this entry.  If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}

/*
 * vm_map_lookup_done:
 *
 *	Releases locks acquired by a vm_map_lookup
 *	(according to the handle returned by that lookup).
 */
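/*
 * Typical pairing (an illustrative sketch, not code from this file):
 *
 *	result = vm_map_lookup(&map, vaddr, fault_type, &entry,
 *	    &object, &pindex, &prot, &wired);
 *	if (result != KERN_SUCCESS)
 *		return (result);
 *	... fault in the page from object/pindex ...
 *	vm_map_lookup_done(map, entry);
 *
 * The map passed to vm_map_lookup_done() must be the one returned
 * through the in/out map argument, which may differ from the caller's
 * original map when a submap was traversed.
 */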
void
vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
{
	/*
	 * Unlock the main-level map.
	 */
	vm_map_unlock_read(map);
}

vm_offset_t
vm_map_max_KBI(const struct vm_map *map)
{

	return (vm_map_max(map));
}

vm_offset_t
vm_map_min_KBI(const struct vm_map *map)
{

	return (vm_map_min(map));
}

pmap_t
vm_map_pmap_KBI(vm_map_t map)
{

	return (map->pmap);
}

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

static void
vm_map_print(vm_map_t map)
{
	vm_map_entry_t entry;

	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
	    (void *)map,
	    (void *)map->pmap, map->nentries, map->timestamp);

	db_indent += 2;
	for (entry = map->header.next; entry != &map->header;
	    entry = entry->next) {
		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
		    (void *)entry, (void *)entry->start, (void *)entry->end,
		    entry->eflags);
		{
			static char *inheritance_name[4] =
			    {"share", "copy", "none", "donate_copy"};

			db_iprintf(" prot=%x/%x/%s",
			    entry->protection,
			    entry->max_protection,
			    inheritance_name[(int)(unsigned char)
			    entry->inheritance]);
			if (entry->wired_count != 0)
				db_printf(", wired");
		}
		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
			db_printf(", share=%p, offset=0x%jx\n",
			    (void *)entry->object.sub_map,
			    (uintmax_t)entry->offset);
			if ((entry->prev == &map->header) ||
			    (entry->prev->object.sub_map !=
			    entry->object.sub_map)) {
				db_indent += 2;
				vm_map_print((vm_map_t)entry->object.sub_map);
				db_indent -= 2;
			}
		} else {
			if (entry->cred != NULL)
				db_printf(", ruid %d", entry->cred->cr_ruid);
			db_printf(", object=%p, offset=0x%jx",
			    (void *)entry->object.vm_object,
			    (uintmax_t)entry->offset);
			if (entry->object.vm_object &&
			    entry->object.vm_object->cred)
				db_printf(", obj ruid %d charge %jx",
				    entry->object.vm_object->cred->cr_ruid,
				    (uintmax_t)entry->object.vm_object->charge);
			if (entry->eflags & MAP_ENTRY_COW)
				db_printf(", copy (%s)",
				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ?
				    "needed" : "done");
			db_printf("\n");

			if ((entry->prev == &map->header) ||
			    (entry->prev->object.vm_object !=
			    entry->object.vm_object)) {
				db_indent += 2;
				vm_object_print((db_expr_t)(intptr_t)
				    entry->object.vm_object,
				    0, 0, (char *)0);
				db_indent -= 2;
			}
		}
	}
	db_indent -= 2;
}

DB_SHOW_COMMAND(map, map)
{

	if (!have_addr) {
		db_printf("usage: show map <addr>\n");
		return;
	}
	vm_map_print((vm_map_t)addr);
}

DB_SHOW_COMMAND(procvm, procvm)
{
	struct proc *p;

	if (have_addr) {
		p = db_lookup_proc(addr);
	} else {
		p = curproc;
	}

	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
	    (void *)vmspace_pmap(p->p_vmspace));

	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
}

#endif /* DDB */
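/*
 * Example DDB usage (a sketch of the commands defined above): at the
 * debugger prompt,
 *
 *	db> show procvm
 *	db> show map <addr>
 *
 * print the current process's vmspace and the vm_map at the given
 * address, respectively.
 */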