1/* 2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * @OSF_COPYRIGHT@ 30 */ 31/* 32 * Mach Operating System 33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University 34 * All Rights Reserved. 35 * 36 * Permission to use, copy, modify and distribute this software and its 37 * documentation is hereby granted, provided that both the copyright 38 * notice and this permission notice appear in all copies of the 39 * software, derivative works or modified versions, and any portions 40 * thereof, and that both notices appear in supporting documentation. 41 * 42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR 44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 45 * 46 * Carnegie Mellon requests users of this software to return to 47 * 48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 49 * School of Computer Science 50 * Carnegie Mellon University 51 * Pittsburgh PA 15213-3890 52 * 53 * any improvements or extensions that they make and grant Carnegie Mellon 54 * the rights to redistribute these changes. 55 */ 56/* 57 */ 58/* 59 * File: vm_fault.c 60 * Author: Avadis Tevanian, Jr., Michael Wayne Young 61 * 62 * Page fault handling module. 63 */ 64 65#include <mach_cluster_stats.h> 66#include <mach_pagemap.h> 67#include <mach_kdb.h> 68#include <libkern/OSAtomic.h> 69 70#include <mach/mach_types.h> 71#include <mach/kern_return.h> 72#include <mach/message.h> /* for error codes */ 73#include <mach/vm_param.h> 74#include <mach/vm_behavior.h> 75#include <mach/memory_object.h> 76 /* For memory_object_data_{request,unlock} */ 77#include <mach/sdt.h> 78 79#include <kern/kern_types.h> 80#include <kern/host_statistics.h> 81#include <kern/counters.h> 82#include <kern/task.h> 83#include <kern/thread.h> 84#include <kern/sched_prim.h> 85#include <kern/host.h> 86#include <kern/xpr.h> 87#include <kern/mach_param.h> 88#include <kern/macro_help.h> 89#include <kern/zalloc.h> 90#include <kern/misc_protos.h> 91 92#include <ppc/proc_reg.h> 93 94#include <vm/vm_fault.h> 95#include <vm/vm_map.h> 96#include <vm/vm_object.h> 97#include <vm/vm_page.h> 98#include <vm/vm_kern.h> 99#include <vm/pmap.h> 100#include <vm/vm_pageout.h> 101#include <vm/vm_protos.h> 102#include <vm/vm_external.h> 103#include <vm/memory_object.h> 104#include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */ 105 106#include <sys/kdebug.h> 107 108#define VM_FAULT_CLASSIFY 0 109 110/* Zero-filled pages are marked "m->zero_fill" and put on the 111 * special zero-fill inactive queue only if they belong to 112 * an object at least this big. 113 */ 114#define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000) 115 116#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */ 117 118int vm_object_pagein_throttle = 16; 119 120extern int cs_debug; 121 122#if MACH_KDB 123extern struct db_watchpoint *db_watchpoint_list; 124#endif /* MACH_KDB */ 125 126 127/* Forward declarations of internal routines. */ 128extern kern_return_t vm_fault_wire_fast( 129 vm_map_t map, 130 vm_map_offset_t va, 131 vm_map_entry_t entry, 132 pmap_t pmap, 133 vm_map_offset_t pmap_addr); 134 135extern void vm_fault_continue(void); 136 137extern void vm_fault_copy_cleanup( 138 vm_page_t page, 139 vm_page_t top_page); 140 141extern void vm_fault_copy_dst_cleanup( 142 vm_page_t page); 143 144#if VM_FAULT_CLASSIFY 145extern void vm_fault_classify(vm_object_t object, 146 vm_object_offset_t offset, 147 vm_prot_t fault_type); 148 149extern void vm_fault_classify_init(void); 150#endif 151 152 153unsigned long vm_cs_validates = 0; 154unsigned long vm_cs_revalidates = 0; 155unsigned long vm_cs_query_modified = 0; 156unsigned long vm_cs_validated_dirtied = 0; 157 158#if CONFIG_ENFORCE_SIGNED_CODE 159#if SECURE_KERNEL 160const int cs_enforcement_disable=0; 161#else 162int cs_enforcement_disable=1; 163#endif 164#endif 165 166/* 167 * Routine: vm_fault_init 168 * Purpose: 169 * Initialize our private data structures. 170 */ 171void 172vm_fault_init(void) 173{ 174#if !SECURE_KERNEL 175#if CONFIG_ENFORCE_SIGNED_CODE 176 PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable, sizeof (cs_enforcement_disable)); 177#endif 178 PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug)); 179#endif 180} 181 182/* 183 * Routine: vm_fault_cleanup 184 * Purpose: 185 * Clean up the result of vm_fault_page. 186 * Results: 187 * The paging reference for "object" is released. 188 * "object" is unlocked. 189 * If "top_page" is not null, "top_page" is 190 * freed and the paging reference for the object 191 * containing it is released. 192 * 193 * In/out conditions: 194 * "object" must be locked. 195 */ 196void 197vm_fault_cleanup( 198 register vm_object_t object, 199 register vm_page_t top_page) 200{ 201 vm_object_paging_end(object); 202 vm_object_unlock(object); 203 204 if (top_page != VM_PAGE_NULL) { 205 object = top_page->object; 206 207 vm_object_lock(object); 208 VM_PAGE_FREE(top_page); 209 vm_object_paging_end(object); 210 vm_object_unlock(object); 211 } 212} 213 214#if MACH_CLUSTER_STATS 215#define MAXCLUSTERPAGES 16 216struct { 217 unsigned long pages_in_cluster; 218 unsigned long pages_at_higher_offsets; 219 unsigned long pages_at_lower_offsets; 220} cluster_stats_in[MAXCLUSTERPAGES]; 221#define CLUSTER_STAT(clause) clause 222#define CLUSTER_STAT_HIGHER(x) \ 223 ((cluster_stats_in[(x)].pages_at_higher_offsets)++) 224#define CLUSTER_STAT_LOWER(x) \ 225 ((cluster_stats_in[(x)].pages_at_lower_offsets)++) 226#define CLUSTER_STAT_CLUSTER(x) \ 227 ((cluster_stats_in[(x)].pages_in_cluster)++) 228#else /* MACH_CLUSTER_STATS */ 229#define CLUSTER_STAT(clause) 230#endif /* MACH_CLUSTER_STATS */ 231 232#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0) 233 234 235boolean_t vm_page_deactivate_behind = TRUE; 236/* 237 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior 238 */ 239int vm_default_ahead = 0; 240int vm_default_behind = MAX_UPL_TRANSFER; 241 242#define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024) 243 244/* 245 * vm_page_is_sequential 246 * 247 * Determine if sequential access is in progress 248 * in accordance with the behavior specified. 249 * Update state to indicate current access pattern. 250 * 251 * object must have at least the shared lock held 252 */ 253static 254void 255vm_fault_is_sequential( 256 vm_object_t object, 257 vm_object_offset_t offset, 258 vm_behavior_t behavior) 259{ 260 vm_object_offset_t last_alloc; 261 int sequential; 262 int orig_sequential; 263 264 last_alloc = object->last_alloc; 265 sequential = object->sequential; 266 orig_sequential = sequential; 267 268 switch (behavior) { 269 case VM_BEHAVIOR_RANDOM: 270 /* 271 * reset indicator of sequential behavior 272 */ 273 sequential = 0; 274 break; 275 276 case VM_BEHAVIOR_SEQUENTIAL: 277 if (offset && last_alloc == offset - PAGE_SIZE_64) { 278 /* 279 * advance indicator of sequential behavior 280 */ 281 if (sequential < MAX_SEQUENTIAL_RUN) 282 sequential += PAGE_SIZE; 283 } else { 284 /* 285 * reset indicator of sequential behavior 286 */ 287 sequential = 0; 288 } 289 break; 290 291 case VM_BEHAVIOR_RSEQNTL: 292 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) { 293 /* 294 * advance indicator of sequential behavior 295 */ 296 if (sequential > -MAX_SEQUENTIAL_RUN) 297 sequential -= PAGE_SIZE; 298 } else { 299 /* 300 * reset indicator of sequential behavior 301 */ 302 sequential = 0; 303 } 304 break; 305 306 case VM_BEHAVIOR_DEFAULT: 307 default: 308 if (offset && last_alloc == (offset - PAGE_SIZE_64)) { 309 /* 310 * advance indicator of sequential behavior 311 */ 312 if (sequential < 0) 313 sequential = 0; 314 if (sequential < MAX_SEQUENTIAL_RUN) 315 sequential += PAGE_SIZE; 316 317 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) { 318 /* 319 * advance indicator of sequential behavior 320 */ 321 if (sequential > 0) 322 sequential = 0; 323 if (sequential > -MAX_SEQUENTIAL_RUN) 324 sequential -= PAGE_SIZE; 325 } else { 326 /* 327 * reset indicator of sequential behavior 328 */ 329 sequential = 0; 330 } 331 break; 332 } 333 if (sequential != orig_sequential) { 334 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) { 335 /* 336 * if someone else has already updated object->sequential 337 * don't bother trying to update it or object->last_alloc 338 */ 339 return; 340 } 341 } 342 /* 343 * I'd like to do this with a OSCompareAndSwap64, but that 344 * doesn't exist for PPC... however, it shouldn't matter 345 * that much... last_alloc is maintained so that we can determine 346 * if a sequential access pattern is taking place... if only 347 * one thread is banging on this object, no problem with the unprotected 348 * update... if 2 or more threads are banging away, we run the risk of 349 * someone seeing a mangled update... however, in the face of multiple 350 * accesses, no sequential access pattern can develop anyway, so we 351 * haven't lost any real info. 352 */ 353 object->last_alloc = offset; 354} 355 356 357/* 358 * vm_page_deactivate_behind 359 * 360 * Determine if sequential access is in progress 361 * in accordance with the behavior specified. If 362 * so, compute a potential page to deactivate and 363 * deactivate it. 364 * 365 * object must be locked. 366 * 367 * return TRUE if we actually deactivate a page 368 */ 369static 370boolean_t 371vm_fault_deactivate_behind( 372 vm_object_t object, 373 vm_object_offset_t offset, 374 vm_behavior_t behavior) 375{ 376 vm_page_t m = NULL; 377 int sequential_run; 378 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; 379 380#if TRACEFAULTPAGE 381 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */ 382#endif 383 384 if (object == kernel_object || vm_page_deactivate_behind == FALSE) { 385 /* 386 * Do not deactivate pages from the kernel object: they 387 * are not intended to become pageable. 388 * or we've disabled the deactivate behind mechanism 389 */ 390 return FALSE; 391 } 392 if ((sequential_run = object->sequential)) { 393 if (sequential_run < 0) { 394 sequential_behavior = VM_BEHAVIOR_RSEQNTL; 395 sequential_run = 0 - sequential_run; 396 } else { 397 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; 398 } 399 } 400 switch (behavior) { 401 case VM_BEHAVIOR_RANDOM: 402 break; 403 case VM_BEHAVIOR_SEQUENTIAL: 404 if (sequential_run >= (int)PAGE_SIZE) 405 m = vm_page_lookup(object, offset - PAGE_SIZE_64); 406 break; 407 case VM_BEHAVIOR_RSEQNTL: 408 if (sequential_run >= (int)PAGE_SIZE) 409 m = vm_page_lookup(object, offset + PAGE_SIZE_64); 410 break; 411 case VM_BEHAVIOR_DEFAULT: 412 default: 413 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64; 414 415 /* 416 * determine if the run of sequential accesss has been 417 * long enough on an object with default access behavior 418 * to consider it for deactivation 419 */ 420 if ((uint64_t)sequential_run >= behind) { 421 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) { 422 if (offset >= behind) 423 m = vm_page_lookup(object, offset - behind); 424 } else { 425 if (offset < -behind) 426 m = vm_page_lookup(object, offset + behind); 427 } 428 } 429 break; 430 } 431 } 432 if (m) { 433 if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) { 434 pmap_clear_reference(m->phys_page); 435 m->deactivated = TRUE; 436#if TRACEFAULTPAGE 437 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ 438#endif 439 return TRUE; 440 } 441 } 442 return FALSE; 443} 444 445 446/* 447 * check for various conditions that would 448 * prevent us from creating a ZF page... 449 * cleanup is based on being called from vm_fault_page 450 * 451 * object must be locked 452 * object == m->object 453 */ 454static vm_fault_return_t 455vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state) 456{ 457 if (object->shadow_severed) { 458 /* 459 * the shadow chain was severed 460 * just have to return an error at this point 461 */ 462 if (m != VM_PAGE_NULL) 463 VM_PAGE_FREE(m); 464 vm_fault_cleanup(object, first_m); 465 466 thread_interrupt_level(interruptible_state); 467 468 return (VM_FAULT_MEMORY_ERROR); 469 } 470 if (vm_backing_store_low) { 471 /* 472 * are we protecting the system from 473 * backing store exhaustion. If so 474 * sleep unless we are privileged. 475 */ 476 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { 477 478 if (m != VM_PAGE_NULL) 479 VM_PAGE_FREE(m); 480 vm_fault_cleanup(object, first_m); 481 482 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); 483 484 thread_block(THREAD_CONTINUE_NULL); 485 thread_interrupt_level(interruptible_state); 486 487 return (VM_FAULT_RETRY); 488 } 489 } 490 if (VM_PAGE_ZFILL_THROTTLED()) { 491 /* 492 * we're throttling zero-fills... 493 * treat this as if we couldn't grab a page 494 */ 495 if (m != VM_PAGE_NULL) 496 VM_PAGE_FREE(m); 497 vm_fault_cleanup(object, first_m); 498 499 thread_interrupt_level(interruptible_state); 500 501 return (VM_FAULT_MEMORY_SHORTAGE); 502 } 503 return (VM_FAULT_SUCCESS); 504} 505 506 507/* 508 * do the work to zero fill a page and 509 * inject it into the correct paging queue 510 * 511 * m->object must be locked 512 * page queue lock must NOT be held 513 */ 514static int 515vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) 516{ 517 int my_fault = DBG_ZERO_FILL_FAULT; 518 519 /* 520 * This is is a zero-fill page fault... 521 * 522 * Checking the page lock is a waste of 523 * time; this page was absent, so 524 * it can't be page locked by a pager. 525 * 526 * we also consider it undefined 527 * with respect to instruction 528 * execution. i.e. it is the responsibility 529 * of higher layers to call for an instruction 530 * sync after changing the contents and before 531 * sending a program into this area. We 532 * choose this approach for performance 533 */ 534 m->pmapped = TRUE; 535 536 m->cs_validated = FALSE; 537 m->cs_tainted = FALSE; 538 539 if (no_zero_fill == TRUE) 540 my_fault = DBG_NZF_PAGE_FAULT; 541 else { 542 vm_page_zero_fill(m); 543 544 VM_STAT_INCR(zero_fill_count); 545 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); 546 } 547 assert(!m->laundry); 548 assert(m->object != kernel_object); 549 //assert(m->pageq.next == NULL && m->pageq.prev == NULL); 550 551 if (!IP_VALID(memory_manager_default) && 552 (m->object->purgable == VM_PURGABLE_DENY || 553 m->object->purgable == VM_PURGABLE_NONVOLATILE || 554 m->object->purgable == VM_PURGABLE_VOLATILE )) { 555 vm_page_lock_queues(); 556 557 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); 558 m->throttled = TRUE; 559 vm_page_throttled_count++; 560 561 vm_page_unlock_queues(); 562 } else { 563 if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) { 564 m->zero_fill = TRUE; 565 OSAddAtomic(1, (SInt32 *)&vm_zf_count); 566 } 567 } 568 return (my_fault); 569} 570 571 572/* 573 * Routine: vm_fault_page 574 * Purpose: 575 * Find the resident page for the virtual memory 576 * specified by the given virtual memory object 577 * and offset. 578 * Additional arguments: 579 * The required permissions for the page is given 580 * in "fault_type". Desired permissions are included 581 * in "protection". 582 * fault_info is passed along to determine pagein cluster 583 * limits... it contains the expected reference pattern, 584 * cluster size if available, etc... 585 * 586 * If the desired page is known to be resident (for 587 * example, because it was previously wired down), asserting 588 * the "unwiring" parameter will speed the search. 589 * 590 * If the operation can be interrupted (by thread_abort 591 * or thread_terminate), then the "interruptible" 592 * parameter should be asserted. 593 * 594 * Results: 595 * The page containing the proper data is returned 596 * in "result_page". 597 * 598 * In/out conditions: 599 * The source object must be locked and referenced, 600 * and must donate one paging reference. The reference 601 * is not affected. The paging reference and lock are 602 * consumed. 603 * 604 * If the call succeeds, the object in which "result_page" 605 * resides is left locked and holding a paging reference. 606 * If this is not the original object, a busy page in the 607 * original object is returned in "top_page", to prevent other 608 * callers from pursuing this same data, along with a paging 609 * reference for the original object. The "top_page" should 610 * be destroyed when this guarantee is no longer required. 611 * The "result_page" is also left busy. It is not removed 612 * from the pageout queues. 613 */ 614 615vm_fault_return_t 616vm_fault_page( 617 /* Arguments: */ 618 vm_object_t first_object, /* Object to begin search */ 619 vm_object_offset_t first_offset, /* Offset into object */ 620 vm_prot_t fault_type, /* What access is requested */ 621 boolean_t must_be_resident,/* Must page be resident? */ 622 /* Modifies in place: */ 623 vm_prot_t *protection, /* Protection for mapping */ 624 /* Returns: */ 625 vm_page_t *result_page, /* Page found, if successful */ 626 vm_page_t *top_page, /* Page in top object, if 627 * not result_page. */ 628 int *type_of_fault, /* if non-null, fill in with type of fault 629 * COW, zero-fill, etc... returned in trace point */ 630 /* More arguments: */ 631 kern_return_t *error_code, /* code if page is in error */ 632 boolean_t no_zero_fill, /* don't zero fill absent pages */ 633#if MACH_PAGEMAP 634 boolean_t data_supply, /* treat as data_supply if 635 * it is a write fault and a full 636 * page is provided */ 637#else 638 __unused boolean_t data_supply, 639#endif 640 vm_object_fault_info_t fault_info) 641{ 642 vm_page_t m; 643 vm_object_t object; 644 vm_object_offset_t offset; 645 vm_page_t first_m; 646 vm_object_t next_object; 647 vm_object_t copy_object; 648 boolean_t look_for_page; 649 vm_prot_t access_required = fault_type; 650 vm_prot_t wants_copy_flag; 651 CLUSTER_STAT(int pages_at_higher_offsets;) 652 CLUSTER_STAT(int pages_at_lower_offsets;) 653 kern_return_t wait_result; 654 boolean_t interruptible_state; 655 vm_fault_return_t error; 656 int my_fault; 657 uint32_t try_failed_count; 658 int interruptible; /* how may fault be interrupted? */ 659 memory_object_t pager; 660 661/* 662 * MACH page map - an optional optimization where a bit map is maintained 663 * by the VM subsystem for internal objects to indicate which pages of 664 * the object currently reside on backing store. This existence map 665 * duplicates information maintained by the vnode pager. It is 666 * created at the time of the first pageout against the object, i.e. 667 * at the same time pager for the object is created. The optimization 668 * is designed to eliminate pager interaction overhead, if it is 669 * 'known' that the page does not exist on backing store. 670 * 671 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is 672 * either marked as paged out in the existence map for the object or no 673 * existence map exists for the object. MUST_ASK_PAGER() is one of the 674 * criteria in the decision to invoke the pager. It is also used as one 675 * of the criteria to terminate the scan for adjacent pages in a clustered 676 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for 677 * permanent objects. Note also that if the pager for an internal object 678 * has not been created, the pager is not invoked regardless of the value 679 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object 680 * for which a pager has been created. 681 * 682 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset 683 * is marked as paged out in the existence map for the object. PAGED_OUT() 684 * PAGED_OUT() is used to determine if a page has already been pushed 685 * into a copy object in order to avoid a redundant page out operation. 686 */ 687#if MACH_PAGEMAP 688#define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \ 689 != VM_EXTERNAL_STATE_ABSENT) 690#define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \ 691 == VM_EXTERNAL_STATE_EXISTS) 692#else 693#define MUST_ASK_PAGER(o, f) (TRUE) 694#define PAGED_OUT(o, f) (FALSE) 695#endif 696 697/* 698 * Recovery actions 699 */ 700#define PREPARE_RELEASE_PAGE(m) \ 701 MACRO_BEGIN \ 702 vm_page_lock_queues(); \ 703 MACRO_END 704 705#define DO_RELEASE_PAGE(m) \ 706 MACRO_BEGIN \ 707 PAGE_WAKEUP_DONE(m); \ 708 if (!m->active && !m->inactive && !m->throttled)\ 709 vm_page_activate(m); \ 710 vm_page_unlock_queues(); \ 711 MACRO_END 712 713#define RELEASE_PAGE(m) \ 714 MACRO_BEGIN \ 715 PREPARE_RELEASE_PAGE(m); \ 716 DO_RELEASE_PAGE(m); \ 717 MACRO_END 718 719#if TRACEFAULTPAGE 720 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */ 721#endif 722 723 724#if MACH_KDB 725 /* 726 * If there are watchpoints set, then 727 * we don't want to give away write permission 728 * on a read fault. Make the task write fault, 729 * so that the watchpoint code notices the access. 730 */ 731 if (db_watchpoint_list) { 732 /* 733 * If we aren't asking for write permission, 734 * then don't give it away. We're using write 735 * faults to set the dirty bit. 736 */ 737 if (!(fault_type & VM_PROT_WRITE)) 738 *protection &= ~VM_PROT_WRITE; 739 } 740#endif /* MACH_KDB */ 741 742 interruptible = fault_info->interruptible; 743 interruptible_state = thread_interrupt_level(interruptible); 744 745 /* 746 * INVARIANTS (through entire routine): 747 * 748 * 1) At all times, we must either have the object 749 * lock or a busy page in some object to prevent 750 * some other thread from trying to bring in 751 * the same page. 752 * 753 * Note that we cannot hold any locks during the 754 * pager access or when waiting for memory, so 755 * we use a busy page then. 756 * 757 * 2) To prevent another thread from racing us down the 758 * shadow chain and entering a new page in the top 759 * object before we do, we must keep a busy page in 760 * the top object while following the shadow chain. 761 * 762 * 3) We must increment paging_in_progress on any object 763 * for which we have a busy page before dropping 764 * the object lock 765 * 766 * 4) We leave busy pages on the pageout queues. 767 * If the pageout daemon comes across a busy page, 768 * it will remove the page from the pageout queues. 769 */ 770 771 object = first_object; 772 offset = first_offset; 773 first_m = VM_PAGE_NULL; 774 access_required = fault_type; 775 776 777 XPR(XPR_VM_FAULT, 778 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n", 779 (integer_t)object, offset, fault_type, *protection, 0); 780 781 /* 782 * default type of fault 783 */ 784 my_fault = DBG_CACHE_HIT_FAULT; 785 786 while (TRUE) { 787#if TRACEFAULTPAGE 788 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ 789#endif 790 if (!object->alive) { 791 /* 792 * object is no longer valid 793 * clean up and return error 794 */ 795 vm_fault_cleanup(object, first_m); 796 thread_interrupt_level(interruptible_state); 797 798 return (VM_FAULT_MEMORY_ERROR); 799 } 800 801 /* 802 * See whether the page at 'offset' is resident 803 */ 804 m = vm_page_lookup(object, offset); 805#if TRACEFAULTPAGE 806 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ 807#endif 808 if (m != VM_PAGE_NULL) { 809 810 if (m->busy) { 811 /* 812 * The page is being brought in, 813 * wait for it and then retry. 814 * 815 * A possible optimization: if the page 816 * is known to be resident, we can ignore 817 * pages that are absent (regardless of 818 * whether they're busy). 819 */ 820#if TRACEFAULTPAGE 821 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ 822#endif 823 wait_result = PAGE_SLEEP(object, m, interruptible); 824 XPR(XPR_VM_FAULT, 825 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n", 826 (integer_t)object, offset, 827 (integer_t)m, 0, 0); 828 counter(c_vm_fault_page_block_busy_kernel++); 829 830 if (wait_result != THREAD_AWAKENED) { 831 vm_fault_cleanup(object, first_m); 832 thread_interrupt_level(interruptible_state); 833 834 if (wait_result == THREAD_RESTART) 835 return (VM_FAULT_RETRY); 836 else 837 return (VM_FAULT_INTERRUPTED); 838 } 839 continue; 840 } 841 842 if (m->phys_page == vm_page_guard_addr) { 843 /* 844 * Guard page: off limits ! 845 */ 846 if (fault_type == VM_PROT_NONE) { 847 /* 848 * The fault is not requesting any 849 * access to the guard page, so it must 850 * be just to wire or unwire it. 851 * Let's pretend it succeeded... 852 */ 853 m->busy = TRUE; 854 *result_page = m; 855 assert(first_m == VM_PAGE_NULL); 856 *top_page = first_m; 857 if (type_of_fault) 858 *type_of_fault = DBG_GUARD_FAULT; 859 return VM_FAULT_SUCCESS; 860 } else { 861 /* 862 * The fault requests access to the 863 * guard page: let's deny that ! 864 */ 865 vm_fault_cleanup(object, first_m); 866 thread_interrupt_level(interruptible_state); 867 return VM_FAULT_MEMORY_ERROR; 868 } 869 } 870 871 if (m->error) { 872 /* 873 * The page is in error, give up now. 874 */ 875#if TRACEFAULTPAGE 876 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */ 877#endif 878 if (error_code) 879 *error_code = KERN_MEMORY_ERROR; 880 VM_PAGE_FREE(m); 881 882 vm_fault_cleanup(object, first_m); 883 thread_interrupt_level(interruptible_state); 884 885 return (VM_FAULT_MEMORY_ERROR); 886 } 887 if (m->restart) { 888 /* 889 * The pager wants us to restart 890 * at the top of the chain, 891 * typically because it has moved the 892 * page to another pager, then do so. 893 */ 894#if TRACEFAULTPAGE 895 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ 896#endif 897 VM_PAGE_FREE(m); 898 899 vm_fault_cleanup(object, first_m); 900 thread_interrupt_level(interruptible_state); 901 902 return (VM_FAULT_RETRY); 903 } 904 if (m->absent) { 905 /* 906 * The page isn't busy, but is absent, 907 * therefore it's deemed "unavailable". 908 * 909 * Remove the non-existent page (unless it's 910 * in the top object) and move on down to the 911 * next object (if there is one). 912 */ 913#if TRACEFAULTPAGE 914 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */ 915#endif 916 next_object = object->shadow; 917 918 if (next_object == VM_OBJECT_NULL) { 919 /* 920 * Absent page at bottom of shadow 921 * chain; zero fill the page we left 922 * busy in the first object, and free 923 * the absent page. 924 */ 925 assert(!must_be_resident); 926 927 /* 928 * check for any conditions that prevent 929 * us from creating a new zero-fill page 930 * vm_fault_check will do all of the 931 * fault cleanup in the case of an error condition 932 * including resetting the thread_interrupt_level 933 */ 934 error = vm_fault_check(object, m, first_m, interruptible_state); 935 936 if (error != VM_FAULT_SUCCESS) 937 return (error); 938 939 XPR(XPR_VM_FAULT, 940 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n", 941 (integer_t)object, offset, 942 (integer_t)m, 943 (integer_t)first_object, 0); 944 945 if (object != first_object) { 946 /* 947 * free the absent page we just found 948 */ 949 VM_PAGE_FREE(m); 950 951 /* 952 * drop reference and lock on current object 953 */ 954 vm_object_paging_end(object); 955 vm_object_unlock(object); 956 957 /* 958 * grab the original page we 959 * 'soldered' in place and 960 * retake lock on 'first_object' 961 */ 962 m = first_m; 963 first_m = VM_PAGE_NULL; 964 965 object = first_object; 966 offset = first_offset; 967 968 vm_object_lock(object); 969 } else { 970 /* 971 * we're going to use the absent page we just found 972 * so convert it to a 'busy' page 973 */ 974 m->absent = FALSE; 975 m->busy = TRUE; 976 } 977 /* 978 * zero-fill the page and put it on 979 * the correct paging queue 980 */ 981 my_fault = vm_fault_zero_page(m, no_zero_fill); 982 983 break; 984 } else { 985 if (must_be_resident) 986 vm_object_paging_end(object); 987 else if (object != first_object) { 988 vm_object_paging_end(object); 989 VM_PAGE_FREE(m); 990 } else { 991 first_m = m; 992 m->absent = FALSE; 993 m->busy = TRUE; 994 995 vm_page_lockspin_queues(); 996 VM_PAGE_QUEUES_REMOVE(m); 997 vm_page_unlock_queues(); 998 } 999 XPR(XPR_VM_FAULT, 1000 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n", 1001 (integer_t)object, offset, 1002 (integer_t)next_object, 1003 offset+object->shadow_offset,0); 1004 1005 offset += object->shadow_offset; 1006 fault_info->lo_offset += object->shadow_offset; 1007 fault_info->hi_offset += object->shadow_offset; 1008 access_required = VM_PROT_READ; 1009 1010 vm_object_lock(next_object); 1011 vm_object_unlock(object); 1012 object = next_object; 1013 vm_object_paging_begin(object); 1014 1015 /* 1016 * reset to default type of fault 1017 */ 1018 my_fault = DBG_CACHE_HIT_FAULT; 1019 1020 continue; 1021 } 1022 } 1023 if ((m->cleaning) 1024 && ((object != first_object) || (object->copy != VM_OBJECT_NULL)) 1025 && (fault_type & VM_PROT_WRITE)) { 1026 /* 1027 * This is a copy-on-write fault that will 1028 * cause us to revoke access to this page, but 1029 * this page is in the process of being cleaned 1030 * in a clustered pageout. We must wait until 1031 * the cleaning operation completes before 1032 * revoking access to the original page, 1033 * otherwise we might attempt to remove a 1034 * wired mapping. 1035 */ 1036#if TRACEFAULTPAGE 1037 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */ 1038#endif 1039 XPR(XPR_VM_FAULT, 1040 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n", 1041 (integer_t)object, offset, 1042 (integer_t)m, 0, 0); 1043 /* 1044 * take an extra ref so that object won't die 1045 */ 1046 vm_object_reference_locked(object); 1047 1048 vm_fault_cleanup(object, first_m); 1049 1050 counter(c_vm_fault_page_block_backoff_kernel++); 1051 vm_object_lock(object); 1052 assert(object->ref_count > 0); 1053 1054 m = vm_page_lookup(object, offset); 1055 1056 if (m != VM_PAGE_NULL && m->cleaning) { 1057 PAGE_ASSERT_WAIT(m, interruptible); 1058 1059 vm_object_unlock(object); 1060 wait_result = thread_block(THREAD_CONTINUE_NULL); 1061 vm_object_deallocate(object); 1062 1063 goto backoff; 1064 } else { 1065 vm_object_unlock(object); 1066 1067 vm_object_deallocate(object); 1068 thread_interrupt_level(interruptible_state); 1069 1070 return (VM_FAULT_RETRY); 1071 } 1072 } 1073 if (type_of_fault == NULL && m->speculative) { 1074 /* 1075 * If we were passed a non-NULL pointer for 1076 * "type_of_fault", than we came from 1077 * vm_fault... we'll let it deal with 1078 * this condition, since it 1079 * needs to see m->speculative to correctly 1080 * account the pageins, otherwise... 1081 * take it off the speculative queue, we'll 1082 * let the caller of vm_fault_page deal 1083 * with getting it onto the correct queue 1084 */ 1085 vm_page_lockspin_queues(); 1086 VM_PAGE_QUEUES_REMOVE(m); 1087 vm_page_unlock_queues(); 1088 } 1089 1090 if (m->encrypted) { 1091 /* 1092 * ENCRYPTED SWAP: 1093 * the user needs access to a page that we 1094 * encrypted before paging it out. 1095 * Decrypt the page now. 1096 * Keep it busy to prevent anyone from 1097 * accessing it during the decryption. 1098 */ 1099 m->busy = TRUE; 1100 vm_page_decrypt(m, 0); 1101 assert(object == m->object); 1102 assert(m->busy); 1103 PAGE_WAKEUP_DONE(m); 1104 1105 /* 1106 * Retry from the top, in case 1107 * something changed while we were 1108 * decrypting. 1109 */ 1110 continue; 1111 } 1112 ASSERT_PAGE_DECRYPTED(m); 1113 1114 if (m->object->code_signed) { 1115 /* 1116 * CODE SIGNING: 1117 * We just paged in a page from a signed 1118 * memory object but we don't need to 1119 * validate it now. We'll validate it if 1120 * when it gets mapped into a user address 1121 * space for the first time or when the page 1122 * gets copied to another object as a result 1123 * of a copy-on-write. 1124 */ 1125 } 1126 1127 /* 1128 * We mark the page busy and leave it on 1129 * the pageout queues. If the pageout 1130 * deamon comes across it, then it will 1131 * remove the page from the queue, but not the object 1132 */ 1133#if TRACEFAULTPAGE 1134 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ 1135#endif 1136 XPR(XPR_VM_FAULT, 1137 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n", 1138 (integer_t)object, offset, (integer_t)m, 0, 0); 1139 assert(!m->busy); 1140 assert(!m->absent); 1141 1142 m->busy = TRUE; 1143 break; 1144 } 1145 1146 1147 /* 1148 * we get here when there is no page present in the object at 1149 * the offset we're interested in... we'll allocate a page 1150 * at this point if the pager associated with 1151 * this object can provide the data or we're the top object... 1152 * object is locked; m == NULL 1153 */ 1154 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply); 1155 1156#if TRACEFAULTPAGE 1157 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */ 1158#endif 1159 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) { 1160 /* 1161 * Allocate a new page for this object/offset pair 1162 */ 1163 m = vm_page_grab(); 1164#if TRACEFAULTPAGE 1165 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ 1166#endif 1167 if (m == VM_PAGE_NULL) { 1168 1169 vm_fault_cleanup(object, first_m); 1170 thread_interrupt_level(interruptible_state); 1171 1172 return (VM_FAULT_MEMORY_SHORTAGE); 1173 } 1174 vm_page_insert(m, object, offset); 1175 } 1176 if (look_for_page && !must_be_resident) { 1177 kern_return_t rc; 1178 1179 /* 1180 * If the memory manager is not ready, we 1181 * cannot make requests. 1182 */ 1183 if (!object->pager_ready) { 1184#if TRACEFAULTPAGE 1185 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ 1186#endif 1187 if (m != VM_PAGE_NULL) 1188 VM_PAGE_FREE(m); 1189 1190 XPR(XPR_VM_FAULT, 1191 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n", 1192 (integer_t)object, offset, 0, 0, 0); 1193 1194 /* 1195 * take an extra ref so object won't die 1196 */ 1197 vm_object_reference_locked(object); 1198 vm_fault_cleanup(object, first_m); 1199 counter(c_vm_fault_page_block_backoff_kernel++); 1200 1201 vm_object_lock(object); 1202 assert(object->ref_count > 0); 1203 1204 if (!object->pager_ready) { 1205 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible); 1206 1207 vm_object_unlock(object); 1208 if (wait_result == THREAD_WAITING) 1209 wait_result = thread_block(THREAD_CONTINUE_NULL); 1210 vm_object_deallocate(object); 1211 1212 goto backoff; 1213 } else { 1214 vm_object_unlock(object); 1215 vm_object_deallocate(object); 1216 thread_interrupt_level(interruptible_state); 1217 1218 return (VM_FAULT_RETRY); 1219 } 1220 } 1221 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) { 1222 /* 1223 * If there are too many outstanding page 1224 * requests pending on this external object, we 1225 * wait for them to be resolved now. 1226 */ 1227#if TRACEFAULTPAGE 1228 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ 1229#endif 1230 if (m != VM_PAGE_NULL) 1231 VM_PAGE_FREE(m); 1232 /* 1233 * take an extra ref so object won't die 1234 */ 1235 vm_object_reference_locked(object); 1236 1237 vm_fault_cleanup(object, first_m); 1238 1239 counter(c_vm_fault_page_block_backoff_kernel++); 1240 1241 vm_object_lock(object); 1242 assert(object->ref_count > 0); 1243 1244 if (object->paging_in_progress > vm_object_pagein_throttle) { 1245 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible); 1246 1247 vm_object_unlock(object); 1248 wait_result = thread_block(THREAD_CONTINUE_NULL); 1249 vm_object_deallocate(object); 1250 1251 goto backoff; 1252 } else { 1253 vm_object_unlock(object); 1254 vm_object_deallocate(object); 1255 thread_interrupt_level(interruptible_state); 1256 1257 return (VM_FAULT_RETRY); 1258 } 1259 } 1260 if (m != VM_PAGE_NULL) { 1261 /* 1262 * Indicate that the page is waiting for data 1263 * from the memory manager. 1264 */ 1265 m->list_req_pending = TRUE; 1266 m->absent = TRUE; 1267 } 1268 1269#if TRACEFAULTPAGE 1270 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */ 1271#endif 1272 1273 /* 1274 * It's possible someone called vm_object_destroy while we weren't 1275 * holding the object lock. If that has happened, then bail out 1276 * here. 1277 */ 1278 1279 pager = object->pager; 1280 1281 if (pager == MEMORY_OBJECT_NULL) { 1282 vm_fault_cleanup(object, first_m); 1283 thread_interrupt_level(interruptible_state); 1284 return VM_FAULT_MEMORY_ERROR; 1285 } 1286 1287 /* 1288 * We have an absent page in place for the faulting offset, 1289 * so we can release the object lock. 1290 */ 1291 1292 vm_object_unlock(object); 1293 1294 /* 1295 * If this object uses a copy_call strategy, 1296 * and we are interested in a copy of this object 1297 * (having gotten here only by following a 1298 * shadow chain), then tell the memory manager 1299 * via a flag added to the desired_access 1300 * parameter, so that it can detect a race 1301 * between our walking down the shadow chain 1302 * and its pushing pages up into a copy of 1303 * the object that it manages. 1304 */ 1305 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) 1306 wants_copy_flag = VM_PROT_WANTS_COPY; 1307 else 1308 wants_copy_flag = VM_PROT_NONE; 1309 1310 XPR(XPR_VM_FAULT, 1311 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n", 1312 (integer_t)object, offset, (integer_t)m, 1313 access_required | wants_copy_flag, 0); 1314 1315 /* 1316 * Call the memory manager to retrieve the data. 1317 */ 1318 rc = memory_object_data_request( 1319 pager, 1320 offset + object->paging_offset, 1321 PAGE_SIZE, 1322 access_required | wants_copy_flag, 1323 (memory_object_fault_info_t)fault_info); 1324 1325#if TRACEFAULTPAGE 1326 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */ 1327#endif 1328 vm_object_lock(object); 1329 1330 if (rc != KERN_SUCCESS) { 1331 1332 vm_fault_cleanup(object, first_m); 1333 thread_interrupt_level(interruptible_state); 1334 1335 return ((rc == MACH_SEND_INTERRUPTED) ? 1336 VM_FAULT_INTERRUPTED : 1337 VM_FAULT_MEMORY_ERROR); 1338 } 1339 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) { 1340 1341 vm_fault_cleanup(object, first_m); 1342 thread_interrupt_level(interruptible_state); 1343 1344 return (VM_FAULT_INTERRUPTED); 1345 } 1346 if (m == VM_PAGE_NULL && object->phys_contiguous) { 1347 /* 1348 * No page here means that the object we 1349 * initially looked up was "physically 1350 * contiguous" (i.e. device memory). However, 1351 * with Virtual VRAM, the object might not 1352 * be backed by that device memory anymore, 1353 * so we're done here only if the object is 1354 * still "phys_contiguous". 1355 * Otherwise, if the object is no longer 1356 * "phys_contiguous", we need to retry the 1357 * page fault against the object's new backing 1358 * store (different memory object). 1359 */ 1360 break; 1361 } 1362 /* 1363 * potentially a pagein fault 1364 * if we make it through the state checks 1365 * above, than we'll count it as such 1366 */ 1367 my_fault = DBG_PAGEIN_FAULT; 1368 1369 /* 1370 * Retry with same object/offset, since new data may 1371 * be in a different page (i.e., m is meaningless at 1372 * this point). 1373 */ 1374 continue; 1375 } 1376 1377 /* 1378 * We get here if the object has no pager, or an existence map 1379 * exists and indicates the page isn't present on the pager 1380 * or we're unwiring a page. If a pager exists, but there 1381 * is no existence map, then the m->absent case above handles 1382 * the ZF case when the pager can't provide the page 1383 */ 1384#if TRACEFAULTPAGE 1385 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ 1386#endif 1387 if (object == first_object) 1388 first_m = m; 1389 else 1390 assert(m == VM_PAGE_NULL); 1391 1392 XPR(XPR_VM_FAULT, 1393 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n", 1394 (integer_t)object, offset, (integer_t)m, 1395 (integer_t)object->shadow, 0); 1396 1397 next_object = object->shadow; 1398 1399 if (next_object == VM_OBJECT_NULL) { 1400 /* 1401 * we've hit the bottom of the shadown chain, 1402 * fill the page in the top object with zeros. 1403 */ 1404 assert(!must_be_resident); 1405 1406 if (object != first_object) { 1407 vm_object_paging_end(object); 1408 vm_object_unlock(object); 1409 1410 object = first_object; 1411 offset = first_offset; 1412 vm_object_lock(object); 1413 } 1414 m = first_m; 1415 assert(m->object == object); 1416 first_m = VM_PAGE_NULL; 1417 1418 /* 1419 * check for any conditions that prevent 1420 * us from creating a new zero-fill page 1421 * vm_fault_check will do all of the 1422 * fault cleanup in the case of an error condition 1423 * including resetting the thread_interrupt_level 1424 */ 1425 error = vm_fault_check(object, m, first_m, interruptible_state); 1426 1427 if (error != VM_FAULT_SUCCESS) 1428 return (error); 1429 1430 if (m == VM_PAGE_NULL) { 1431 m = vm_page_grab(); 1432 1433 if (m == VM_PAGE_NULL) { 1434 vm_fault_cleanup(object, VM_PAGE_NULL); 1435 thread_interrupt_level(interruptible_state); 1436 1437 return (VM_FAULT_MEMORY_SHORTAGE); 1438 } 1439 vm_page_insert(m, object, offset); 1440 } 1441 my_fault = vm_fault_zero_page(m, no_zero_fill); 1442 1443 break; 1444 1445 } else { 1446 /* 1447 * Move on to the next object. Lock the next 1448 * object before unlocking the current one. 1449 */ 1450 if ((object != first_object) || must_be_resident) 1451 vm_object_paging_end(object); 1452 1453 offset += object->shadow_offset; 1454 fault_info->lo_offset += object->shadow_offset; 1455 fault_info->hi_offset += object->shadow_offset; 1456 access_required = VM_PROT_READ; 1457 1458 vm_object_lock(next_object); 1459 vm_object_unlock(object); 1460 1461 object = next_object; 1462 vm_object_paging_begin(object); 1463 } 1464 } 1465 1466 /* 1467 * PAGE HAS BEEN FOUND. 1468 * 1469 * This page (m) is: 1470 * busy, so that we can play with it; 1471 * not absent, so that nobody else will fill it; 1472 * possibly eligible for pageout; 1473 * 1474 * The top-level page (first_m) is: 1475 * VM_PAGE_NULL if the page was found in the 1476 * top-level object; 1477 * busy, not absent, and ineligible for pageout. 1478 * 1479 * The current object (object) is locked. A paging 1480 * reference is held for the current and top-level 1481 * objects. 1482 */ 1483 1484#if TRACEFAULTPAGE 1485 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ 1486#endif 1487#if EXTRA_ASSERTIONS 1488 if (m != VM_PAGE_NULL) { 1489 assert(m->busy && !m->absent); 1490 assert((first_m == VM_PAGE_NULL) || 1491 (first_m->busy && !first_m->absent && 1492 !first_m->active && !first_m->inactive)); 1493 } 1494#endif /* EXTRA_ASSERTIONS */ 1495 1496 /* 1497 * ENCRYPTED SWAP: 1498 * If we found a page, we must have decrypted it before we 1499 * get here... 1500 */ 1501 if (m != VM_PAGE_NULL) { 1502 ASSERT_PAGE_DECRYPTED(m); 1503 } 1504 1505 XPR(XPR_VM_FAULT, 1506 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n", 1507 (integer_t)object, offset, (integer_t)m, 1508 (integer_t)first_object, (integer_t)first_m); 1509 1510 /* 1511 * If the page is being written, but isn't 1512 * already owned by the top-level object, 1513 * we have to copy it into a new page owned 1514 * by the top-level object. 1515 */ 1516 if ((object != first_object) && (m != VM_PAGE_NULL)) { 1517 1518#if TRACEFAULTPAGE 1519 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */ 1520#endif 1521 if (fault_type & VM_PROT_WRITE) { 1522 vm_page_t copy_m; 1523 1524 /* 1525 * We only really need to copy if we 1526 * want to write it. 1527 */ 1528 assert(!must_be_resident); 1529 1530 /* 1531 * are we protecting the system from 1532 * backing store exhaustion. If so 1533 * sleep unless we are privileged. 1534 */ 1535 if (vm_backing_store_low) { 1536 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { 1537 1538 RELEASE_PAGE(m); 1539 vm_fault_cleanup(object, first_m); 1540 1541 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); 1542 1543 thread_block(THREAD_CONTINUE_NULL); 1544 thread_interrupt_level(interruptible_state); 1545 1546 return (VM_FAULT_RETRY); 1547 } 1548 } 1549 /* 1550 * If we try to collapse first_object at this 1551 * point, we may deadlock when we try to get 1552 * the lock on an intermediate object (since we 1553 * have the bottom object locked). We can't 1554 * unlock the bottom object, because the page 1555 * we found may move (by collapse) if we do. 1556 * 1557 * Instead, we first copy the page. Then, when 1558 * we have no more use for the bottom object, 1559 * we unlock it and try to collapse. 1560 * 1561 * Note that we copy the page even if we didn't 1562 * need to... that's the breaks. 1563 */ 1564 1565 /* 1566 * Allocate a page for the copy 1567 */ 1568 copy_m = vm_page_grab(); 1569 1570 if (copy_m == VM_PAGE_NULL) { 1571 RELEASE_PAGE(m); 1572 1573 vm_fault_cleanup(object, first_m); 1574 thread_interrupt_level(interruptible_state); 1575 1576 return (VM_FAULT_MEMORY_SHORTAGE); 1577 } 1578 XPR(XPR_VM_FAULT, 1579 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n", 1580 (integer_t)object, offset, 1581 (integer_t)m, (integer_t)copy_m, 0); 1582 1583 vm_page_copy(m, copy_m); 1584 1585 /* 1586 * If another map is truly sharing this 1587 * page with us, we have to flush all 1588 * uses of the original page, since we 1589 * can't distinguish those which want the 1590 * original from those which need the 1591 * new copy. 1592 * 1593 * XXXO If we know that only one map has 1594 * access to this page, then we could 1595 * avoid the pmap_disconnect() call. 1596 */ 1597 if (m->pmapped) 1598 pmap_disconnect(m->phys_page); 1599 1600 assert(!m->cleaning); 1601 1602 /* 1603 * We no longer need the old page or object. 1604 */ 1605 PAGE_WAKEUP_DONE(m); 1606 vm_object_paging_end(object); 1607 vm_object_unlock(object); 1608 1609 my_fault = DBG_COW_FAULT; 1610 VM_STAT_INCR(cow_faults); 1611 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); 1612 current_task()->cow_faults++; 1613 1614 object = first_object; 1615 offset = first_offset; 1616 1617 vm_object_lock(object); 1618 /* 1619 * get rid of the place holder 1620 * page that we soldered in earlier 1621 */ 1622 VM_PAGE_FREE(first_m); 1623 first_m = VM_PAGE_NULL; 1624 1625 /* 1626 * and replace it with the 1627 * page we just copied into 1628 */ 1629 assert(copy_m->busy); 1630 vm_page_insert(copy_m, object, offset); 1631 copy_m->dirty = TRUE; 1632 1633 m = copy_m; 1634 /* 1635 * Now that we've gotten the copy out of the 1636 * way, let's try to collapse the top object. 1637 * But we have to play ugly games with 1638 * paging_in_progress to do that... 1639 */ 1640 vm_object_paging_end(object); 1641 vm_object_collapse(object, offset, TRUE); 1642 vm_object_paging_begin(object); 1643 1644 } else 1645 *protection &= (~VM_PROT_WRITE); 1646 } 1647 /* 1648 * Now check whether the page needs to be pushed into the 1649 * copy object. The use of asymmetric copy on write for 1650 * shared temporary objects means that we may do two copies to 1651 * satisfy the fault; one above to get the page from a 1652 * shadowed object, and one here to push it into the copy. 1653 */ 1654 try_failed_count = 0; 1655 1656 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) { 1657 vm_object_offset_t copy_offset; 1658 vm_page_t copy_m; 1659 1660#if TRACEFAULTPAGE 1661 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */ 1662#endif 1663 /* 1664 * If the page is being written, but hasn't been 1665 * copied to the copy-object, we have to copy it there. 1666 */ 1667 if ((fault_type & VM_PROT_WRITE) == 0) { 1668 *protection &= ~VM_PROT_WRITE; 1669 break; 1670 } 1671 1672 /* 1673 * If the page was guaranteed to be resident, 1674 * we must have already performed the copy. 1675 */ 1676 if (must_be_resident) 1677 break; 1678 1679 /* 1680 * Try to get the lock on the copy_object. 1681 */ 1682 if (!vm_object_lock_try(copy_object)) { 1683 1684 vm_object_unlock(object); 1685 try_failed_count++; 1686 1687 mutex_pause(try_failed_count); /* wait a bit */ 1688 vm_object_lock(object); 1689 1690 continue; 1691 } 1692 try_failed_count = 0; 1693 1694 /* 1695 * Make another reference to the copy-object, 1696 * to keep it from disappearing during the 1697 * copy. 1698 */ 1699 vm_object_reference_locked(copy_object); 1700 1701 /* 1702 * Does the page exist in the copy? 1703 */ 1704 copy_offset = first_offset - copy_object->shadow_offset; 1705 1706 if (copy_object->size <= copy_offset) 1707 /* 1708 * Copy object doesn't cover this page -- do nothing. 1709 */ 1710 ; 1711 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) { 1712 /* 1713 * Page currently exists in the copy object 1714 */ 1715 if (copy_m->busy) { 1716 /* 1717 * If the page is being brought 1718 * in, wait for it and then retry. 1719 */ 1720 RELEASE_PAGE(m); 1721 1722 /* 1723 * take an extra ref so object won't die 1724 */ 1725 vm_object_reference_locked(copy_object); 1726 vm_object_unlock(copy_object); 1727 vm_fault_cleanup(object, first_m); 1728 counter(c_vm_fault_page_block_backoff_kernel++); 1729 1730 vm_object_lock(copy_object); 1731 assert(copy_object->ref_count > 0); 1732 VM_OBJ_RES_DECR(copy_object); 1733 vm_object_lock_assert_exclusive(copy_object); 1734 copy_object->ref_count--; 1735 assert(copy_object->ref_count > 0); 1736 copy_m = vm_page_lookup(copy_object, copy_offset); 1737 /* 1738 * ENCRYPTED SWAP: 1739 * it's OK if the "copy_m" page is encrypted, 1740 * because we're not moving it nor handling its 1741 * contents. 1742 */ 1743 if (copy_m != VM_PAGE_NULL && copy_m->busy) { 1744 PAGE_ASSERT_WAIT(copy_m, interruptible); 1745 1746 vm_object_unlock(copy_object); 1747 wait_result = thread_block(THREAD_CONTINUE_NULL); 1748 vm_object_deallocate(copy_object); 1749 1750 goto backoff; 1751 } else { 1752 vm_object_unlock(copy_object); 1753 vm_object_deallocate(copy_object); 1754 thread_interrupt_level(interruptible_state); 1755 1756 return (VM_FAULT_RETRY); 1757 } 1758 } 1759 } 1760 else if (!PAGED_OUT(copy_object, copy_offset)) { 1761 /* 1762 * If PAGED_OUT is TRUE, then the page used to exist 1763 * in the copy-object, and has already been paged out. 1764 * We don't need to repeat this. If PAGED_OUT is 1765 * FALSE, then either we don't know (!pager_created, 1766 * for example) or it hasn't been paged out. 1767 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT) 1768 * We must copy the page to the copy object. 1769 */ 1770 1771 if (vm_backing_store_low) { 1772 /* 1773 * we are protecting the system from 1774 * backing store exhaustion. If so 1775 * sleep unless we are privileged. 1776 */ 1777 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { 1778 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); 1779 1780 RELEASE_PAGE(m); 1781 VM_OBJ_RES_DECR(copy_object); 1782 vm_object_lock_assert_exclusive(copy_object); 1783 copy_object->ref_count--; 1784 assert(copy_object->ref_count > 0); 1785 1786 vm_object_unlock(copy_object); 1787 vm_fault_cleanup(object, first_m); 1788 thread_block(THREAD_CONTINUE_NULL); 1789 thread_interrupt_level(interruptible_state); 1790 1791 return (VM_FAULT_RETRY); 1792 } 1793 } 1794 /* 1795 * Allocate a page for the copy 1796 */ 1797 copy_m = vm_page_alloc(copy_object, copy_offset); 1798 1799 if (copy_m == VM_PAGE_NULL) { 1800 RELEASE_PAGE(m); 1801 1802 VM_OBJ_RES_DECR(copy_object); 1803 vm_object_lock_assert_exclusive(copy_object); 1804 copy_object->ref_count--; 1805 assert(copy_object->ref_count > 0); 1806 1807 vm_object_unlock(copy_object); 1808 vm_fault_cleanup(object, first_m); 1809 thread_interrupt_level(interruptible_state); 1810 1811 return (VM_FAULT_MEMORY_SHORTAGE); 1812 } 1813 /* 1814 * Must copy page into copy-object. 1815 */ 1816 vm_page_copy(m, copy_m); 1817 1818 /* 1819 * If the old page was in use by any users 1820 * of the copy-object, it must be removed 1821 * from all pmaps. (We can't know which 1822 * pmaps use it.) 1823 */ 1824 if (m->pmapped) 1825 pmap_disconnect(m->phys_page); 1826 1827 /* 1828 * If there's a pager, then immediately 1829 * page out this page, using the "initialize" 1830 * option. Else, we use the copy. 1831 */ 1832 if ((!copy_object->pager_created) 1833#if MACH_PAGEMAP 1834 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT 1835#endif 1836 ) { 1837 1838 vm_page_lockspin_queues(); 1839 assert(!m->cleaning); 1840 vm_page_activate(copy_m); 1841 vm_page_unlock_queues(); 1842 1843 copy_m->dirty = TRUE; 1844 PAGE_WAKEUP_DONE(copy_m); 1845 } 1846 else { 1847 assert(copy_m->busy == TRUE); 1848 assert(!m->cleaning); 1849 1850 /* 1851 * dirty is protected by the object lock 1852 */ 1853 copy_m->dirty = TRUE; 1854 1855 /* 1856 * The page is already ready for pageout: 1857 * not on pageout queues and busy. 1858 * Unlock everything except the 1859 * copy_object itself. 1860 */ 1861 vm_object_unlock(object); 1862 1863 /* 1864 * Write the page to the copy-object, 1865 * flushing it from the kernel. 1866 */ 1867 vm_pageout_initialize_page(copy_m); 1868 1869 /* 1870 * Since the pageout may have 1871 * temporarily dropped the 1872 * copy_object's lock, we 1873 * check whether we'll have 1874 * to deallocate the hard way. 1875 */ 1876 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { 1877 vm_object_unlock(copy_object); 1878 vm_object_deallocate(copy_object); 1879 vm_object_lock(object); 1880 1881 continue; 1882 } 1883 /* 1884 * Pick back up the old object's 1885 * lock. [It is safe to do so, 1886 * since it must be deeper in the 1887 * object tree.] 1888 */ 1889 vm_object_lock(object); 1890 } 1891 /* 1892 * Because we're pushing a page upward 1893 * in the object tree, we must restart 1894 * any faults that are waiting here. 1895 * [Note that this is an expansion of 1896 * PAGE_WAKEUP that uses the THREAD_RESTART 1897 * wait result]. Can't turn off the page's 1898 * busy bit because we're not done with it. 1899 */ 1900 if (m->wanted) { 1901 m->wanted = FALSE; 1902 thread_wakeup_with_result((event_t) m, THREAD_RESTART); 1903 } 1904 } 1905 /* 1906 * The reference count on copy_object must be 1907 * at least 2: one for our extra reference, 1908 * and at least one from the outside world 1909 * (we checked that when we last locked 1910 * copy_object). 1911 */ 1912 vm_object_lock_assert_exclusive(copy_object); 1913 copy_object->ref_count--; 1914 assert(copy_object->ref_count > 0); 1915 1916 VM_OBJ_RES_DECR(copy_object); 1917 vm_object_unlock(copy_object); 1918 1919 break; 1920 } 1921 *result_page = m; 1922 *top_page = first_m; 1923 1924 XPR(XPR_VM_FAULT, 1925 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n", 1926 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0); 1927 1928 if (m != VM_PAGE_NULL) { 1929 if (my_fault == DBG_PAGEIN_FAULT) { 1930 1931 VM_STAT_INCR(pageins); 1932 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL); 1933 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL); 1934 current_task()->pageins++; 1935 1936 if (m->object->internal) { 1937 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL); 1938 } else { 1939 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL); 1940 } 1941 1942 /* 1943 * evaluate access pattern and update state 1944 * vm_fault_deactivate_behind depends on the 1945 * state being up to date 1946 */ 1947 vm_fault_is_sequential(object, offset, fault_info->behavior); 1948 1949 vm_fault_deactivate_behind(object, offset, fault_info->behavior); 1950 } 1951 if (type_of_fault) 1952 *type_of_fault = my_fault; 1953 } else 1954 vm_object_unlock(object); 1955 1956 thread_interrupt_level(interruptible_state); 1957 1958#if TRACEFAULTPAGE 1959 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */ 1960#endif 1961 return (VM_FAULT_SUCCESS); 1962 1963backoff: 1964 thread_interrupt_level(interruptible_state); 1965 1966 if (wait_result == THREAD_INTERRUPTED) 1967 return (VM_FAULT_INTERRUPTED); 1968 return (VM_FAULT_RETRY); 1969 1970#undef RELEASE_PAGE 1971} 1972 1973 1974 1975/* 1976 * CODE SIGNING: 1977 * When soft faulting a page, we have to validate the page if: 1978 * 1. the page is being mapped in user space 1979 * 2. the page hasn't already been found to be "tainted" 1980 * 3. the page belongs to a code-signed object 1981 * 4. the page has not been validated yet or has been mapped for write. 1982 */ 1983#define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \ 1984 ((pmap) != kernel_pmap /*1*/ && \ 1985 !(page)->cs_tainted /*2*/ && \ 1986 (page)->object->code_signed /*3*/ && \ 1987 (!(page)->cs_validated || (page)->wpmapped /*4*/)) 1988 1989 1990/* 1991 * page queue lock must NOT be held 1992 * m->object must be locked 1993 * 1994 * NOTE: m->object could be locked "shared" only if we are called 1995 * from vm_fault() as part of a soft fault. If so, we must be 1996 * careful not to modify the VM object in any way that is not 1997 * legal under a shared lock... 1998 */ 1999unsigned long cs_enter_tainted_rejected = 0; 2000unsigned long cs_enter_tainted_accepted = 0; 2001kern_return_t 2002vm_fault_enter(vm_page_t m, 2003 pmap_t pmap, 2004 vm_map_offset_t vaddr, 2005 vm_prot_t prot, 2006 boolean_t wired, 2007 boolean_t change_wiring, 2008 boolean_t no_cache, 2009 int *type_of_fault) 2010{ 2011 unsigned int cache_attr; 2012 kern_return_t kr; 2013 boolean_t previously_pmapped = m->pmapped; 2014 2015 vm_object_lock_assert_held(m->object); 2016#if DEBUG 2017 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED); 2018#endif /* DEBUG */ 2019 2020 if (m->phys_page == vm_page_guard_addr) { 2021 assert(m->fictitious); 2022 return KERN_SUCCESS; 2023 } 2024 2025 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK; 2026 2027 if (m->pmapped == FALSE) { 2028 /* 2029 * This is the first time this page is being 2030 * mapped in an address space (pmapped == FALSE). 2031 * 2032 * Part of that page may still be in the data cache 2033 * and not flushed to memory. In case we end up 2034 * accessing that page via the instruction cache, 2035 * we need to ensure that the 2 caches are in sync. 2036 */ 2037 pmap_sync_page_data_phys(m->phys_page); 2038 2039 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) { 2040 /* 2041 * found it in the cache, but this 2042 * is the first fault-in of the page (m->pmapped == FALSE) 2043 * so it must have come in as part of 2044 * a cluster... account 1 pagein against it 2045 */ 2046 VM_STAT_INCR(pageins); 2047 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL); 2048 2049 if (m->object->internal) { 2050 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL); 2051 } else { 2052 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL); 2053 } 2054 2055 current_task()->pageins++; 2056 2057 *type_of_fault = DBG_PAGEIN_FAULT; 2058 } 2059 VM_PAGE_CONSUME_CLUSTERED(m); 2060 2061 } else if (cache_attr != VM_WIMG_DEFAULT) 2062 pmap_sync_page_attributes_phys(m->phys_page); 2063 2064 if (*type_of_fault != DBG_COW_FAULT) { 2065 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL); 2066 2067 if (pmap == kernel_pmap) { 2068 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL); 2069 } 2070 } 2071 2072 if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) { 2073 vm_object_lock_assert_exclusive(m->object); 2074 2075 if (m->cs_validated) { 2076 vm_cs_revalidates++; 2077 } 2078 2079 /* VM map is locked, so 1 ref will remain on VM object */ 2080 vm_page_validate_cs(m); 2081 } 2082 2083 if (m->cs_tainted /* always invalidate a tainted page */ 2084#if CONFIG_ENFORCE_SIGNED_CODE 2085 /* 2086 * Code Signing enforcement invalidates an executable page that 2087 * has no code directory, and thus could not be validated. 2088 */ 2089 || ((prot & VM_PROT_EXECUTE) && !m->cs_validated ) 2090#endif 2091 ) { 2092 /* 2093 * CODE SIGNING: 2094 * This page has been tainted and can not be trusted. 2095 * Let's notify the current process and let it take any 2096 * necessary precautions before we enter the tainted page 2097 * into its address space. 2098 */ 2099 kr = KERN_SUCCESS; 2100#if CONFIG_ENFORCE_SIGNED_CODE 2101 if (!cs_enforcement_disable) { 2102#endif 2103 if (cs_invalid_page((addr64_t) vaddr)) { 2104 /* reject the tainted page: abort the page fault */ 2105 kr = KERN_MEMORY_ERROR; 2106 cs_enter_tainted_rejected++; 2107 } else { 2108 /* proceed with the tainted page */ 2109 kr = KERN_SUCCESS; 2110 cs_enter_tainted_accepted++; 2111 } 2112#if CONFIG_ENFORCE_SIGNED_CODE 2113 } 2114#endif 2115 if (cs_debug || kr != KERN_SUCCESS) { 2116 printf("CODESIGNING: vm_fault_enter(0x%llx): " 2117 "page %p obj %p off 0x%llx *** INVALID PAGE ***\n", 2118 (long long)vaddr, m, m->object, m->offset); 2119 } 2120 } else { 2121 /* proceed with the valid page */ 2122 kr = KERN_SUCCESS; 2123 } 2124 2125 if (kr == KERN_SUCCESS) { 2126 /* 2127 * NOTE: we may only hold the vm_object lock SHARED 2128 * at this point, but the update of pmapped is ok 2129 * since this is the ONLY bit updated behind the SHARED 2130 * lock... however, we need to figure out how to do an atomic 2131 * update on a bit field to make this less fragile... right 2132 * now I don't know how to coerce 'C' to give me the offset info 2133 * that's needed for an AtomicCompareAndSwap 2134 */ 2135 m->pmapped = TRUE; 2136 if (prot & VM_PROT_WRITE) { 2137 vm_object_lock_assert_exclusive(m->object); 2138 m->wpmapped = TRUE; 2139 } 2140 2141 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired); 2142 } 2143 2144 /* 2145 * Hold queues lock to manipulate 2146 * the page queues. Change wiring 2147 * case is obvious. 2148 */ 2149 if (change_wiring) { 2150 vm_page_lockspin_queues(); 2151 2152 if (wired) { 2153 if (kr == KERN_SUCCESS) { 2154 vm_page_wire(m); 2155 } 2156 } else { 2157 vm_page_unwire(m); 2158 } 2159 vm_page_unlock_queues(); 2160 2161 } else { 2162 if (kr != KERN_SUCCESS) { 2163 vm_page_lock_queues(); 2164 vm_page_deactivate(m); 2165 vm_page_unlock_queues(); 2166 } else { 2167 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) { 2168 vm_page_lockspin_queues(); 2169 /* 2170 * test again now that we hold the page queue lock 2171 */ 2172 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) { 2173 2174 /* 2175 * If this is a no_cache mapping and the page has never been 2176 * mapped before or was previously a no_cache page, then we 2177 * want to leave pages in the speculative state so that they 2178 * can be readily recycled if free memory runs low. Otherwise 2179 * the page is activated as normal. 2180 */ 2181 2182 if (no_cache && (!previously_pmapped || m->no_cache)) { 2183 m->no_cache = TRUE; 2184 2185 if (m->active || m->inactive) 2186 VM_PAGE_QUEUES_REMOVE(m); 2187 2188 if (!m->speculative) 2189 vm_page_speculate(m, TRUE); 2190 2191 } else if (!m->active && !m->inactive) 2192 vm_page_activate(m); 2193 2194 } 2195 2196 vm_page_unlock_queues(); 2197 } 2198 } 2199 } 2200 return kr; 2201} 2202 2203 2204/* 2205 * Routine: vm_fault 2206 * Purpose: 2207 * Handle page faults, including pseudo-faults 2208 * used to change the wiring status of pages. 2209 * Returns: 2210 * Explicit continuations have been removed. 2211 * Implementation: 2212 * vm_fault and vm_fault_page save mucho state 2213 * in the moral equivalent of a closure. The state 2214 * structure is allocated when first entering vm_fault 2215 * and deallocated when leaving vm_fault. 2216 */ 2217 2218extern int _map_enter_debug; 2219 2220unsigned long vm_fault_collapse_total = 0; 2221unsigned long vm_fault_collapse_skipped = 0; 2222 2223kern_return_t 2224vm_fault( 2225 vm_map_t map, 2226 vm_map_offset_t vaddr, 2227 vm_prot_t fault_type, 2228 boolean_t change_wiring, 2229 int interruptible, 2230 pmap_t caller_pmap, 2231 vm_map_offset_t caller_pmap_addr) 2232{ 2233 vm_map_version_t version; /* Map version for verificiation */ 2234 boolean_t wired; /* Should mapping be wired down? */ 2235 vm_object_t object; /* Top-level object */ 2236 vm_object_offset_t offset; /* Top-level offset */ 2237 vm_prot_t prot; /* Protection for mapping */ 2238 vm_object_t old_copy_object; /* Saved copy object */ 2239 vm_page_t result_page; /* Result of vm_fault_page */ 2240 vm_page_t top_page; /* Placeholder page */ 2241 kern_return_t kr; 2242 2243 vm_page_t m; /* Fast access to result_page */ 2244 kern_return_t error_code; 2245 vm_object_t cur_object; 2246 vm_object_offset_t cur_offset; 2247 vm_page_t cur_m; 2248 vm_object_t new_object; 2249 int type_of_fault; 2250 pmap_t pmap; 2251 boolean_t interruptible_state; 2252 vm_map_t real_map = map; 2253 vm_map_t original_map = map; 2254 vm_prot_t original_fault_type; 2255 struct vm_object_fault_info fault_info; 2256 boolean_t need_collapse = FALSE; 2257 int object_lock_type = 0; 2258 int cur_object_lock_type; 2259 vm_object_t top_object = VM_OBJECT_NULL; 2260 2261 2262 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, 2263 (int)((uint64_t)vaddr >> 32), 2264 (int)vaddr, 2265 0, 2266 0, 2267 0); 2268 2269 if (get_preemption_level() != 0) { 2270 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, 2271 (int)((uint64_t)vaddr >> 32), 2272 (int)vaddr, 2273 KERN_FAILURE, 2274 0, 2275 0); 2276 2277 return (KERN_FAILURE); 2278 } 2279 interruptible_state = thread_interrupt_level(interruptible); 2280 2281 VM_STAT_INCR(faults); 2282 current_task()->faults++; 2283 original_fault_type = fault_type; 2284 2285 if (fault_type & VM_PROT_WRITE) 2286 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2287 else 2288 object_lock_type = OBJECT_LOCK_SHARED; 2289 2290 cur_object_lock_type = OBJECT_LOCK_SHARED; 2291 2292RetryFault: 2293 /* 2294 * assume we will hit a page in the cache 2295 * otherwise, explicitly override with 2296 * the real fault type once we determine it 2297 */ 2298 type_of_fault = DBG_CACHE_HIT_FAULT; 2299 2300 /* 2301 * Find the backing store object and offset into 2302 * it to begin the search. 2303 */ 2304 fault_type = original_fault_type; 2305 map = original_map; 2306 vm_map_lock_read(map); 2307 2308 kr = vm_map_lookup_locked(&map, vaddr, fault_type, 2309 object_lock_type, &version, 2310 &object, &offset, &prot, &wired, 2311 &fault_info, 2312 &real_map); 2313 2314 if (kr != KERN_SUCCESS) { 2315 vm_map_unlock_read(map); 2316 goto done; 2317 } 2318 pmap = real_map->pmap; 2319 fault_info.interruptible = interruptible; 2320 2321 /* 2322 * If the page is wired, we must fault for the current protection 2323 * value, to avoid further faults. 2324 */ 2325 if (wired) { 2326 fault_type = prot | VM_PROT_WRITE; 2327 /* 2328 * since we're treating this fault as a 'write' 2329 * we must hold the top object lock exclusively 2330 */ 2331 if (object_lock_type == OBJECT_LOCK_SHARED) { 2332 2333 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2334 2335 if (vm_object_lock_upgrade(object) == FALSE) { 2336 /* 2337 * couldn't upgrade, so explictly 2338 * take the lock exclusively 2339 */ 2340 vm_object_lock(object); 2341 } 2342 } 2343 } 2344 2345#if VM_FAULT_CLASSIFY 2346 /* 2347 * Temporary data gathering code 2348 */ 2349 vm_fault_classify(object, offset, fault_type); 2350#endif 2351 /* 2352 * Fast fault code. The basic idea is to do as much as 2353 * possible while holding the map lock and object locks. 2354 * Busy pages are not used until the object lock has to 2355 * be dropped to do something (copy, zero fill, pmap enter). 2356 * Similarly, paging references aren't acquired until that 2357 * point, and object references aren't used. 2358 * 2359 * If we can figure out what to do 2360 * (zero fill, copy on write, pmap enter) while holding 2361 * the locks, then it gets done. Otherwise, we give up, 2362 * and use the original fault path (which doesn't hold 2363 * the map lock, and relies on busy pages). 2364 * The give up cases include: 2365 * - Have to talk to pager. 2366 * - Page is busy, absent or in error. 2367 * - Pager has locked out desired access. 2368 * - Fault needs to be restarted. 2369 * - Have to push page into copy object. 2370 * 2371 * The code is an infinite loop that moves one level down 2372 * the shadow chain each time. cur_object and cur_offset 2373 * refer to the current object being examined. object and offset 2374 * are the original object from the map. The loop is at the 2375 * top level if and only if object and cur_object are the same. 2376 * 2377 * Invariants: Map lock is held throughout. Lock is held on 2378 * original object and cur_object (if different) when 2379 * continuing or exiting loop. 2380 * 2381 */ 2382 2383 2384 /* 2385 * If this page is to be inserted in a copy delay object 2386 * for writing, and if the object has a copy, then the 2387 * copy delay strategy is implemented in the slow fault page. 2388 */ 2389 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY && 2390 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) 2391 goto handle_copy_delay; 2392 2393 cur_object = object; 2394 cur_offset = offset; 2395 2396 while (TRUE) { 2397 m = vm_page_lookup(cur_object, cur_offset); 2398 2399 if (m != VM_PAGE_NULL) { 2400 if (m->busy) { 2401 wait_result_t result; 2402 2403 /* 2404 * in order to do the PAGE_ASSERT_WAIT, we must 2405 * have object that 'm' belongs to locked exclusively 2406 */ 2407 if (object != cur_object) { 2408 vm_object_unlock(object); 2409 2410 if (cur_object_lock_type == OBJECT_LOCK_SHARED) { 2411 2412 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2413 2414 if (vm_object_lock_upgrade(cur_object) == FALSE) { 2415 /* 2416 * couldn't upgrade so go do a full retry 2417 * immediately since we've already dropped 2418 * the top object lock associated with this page 2419 * and the current one got dropped due to the 2420 * failed upgrade... the state is no longer valid 2421 */ 2422 vm_map_unlock_read(map); 2423 if (real_map != map) 2424 vm_map_unlock(real_map); 2425 2426 goto RetryFault; 2427 } 2428 } 2429 } else if (object_lock_type == OBJECT_LOCK_SHARED) { 2430 2431 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2432 2433 if (vm_object_lock_upgrade(object) == FALSE) { 2434 /* 2435 * couldn't upgrade, so explictly take the lock 2436 * exclusively and go relookup the page since we 2437 * will have dropped the object lock and 2438 * a different thread could have inserted 2439 * a page at this offset 2440 * no need for a full retry since we're 2441 * at the top level of the object chain 2442 */ 2443 vm_object_lock(object); 2444 2445 continue; 2446 } 2447 } 2448 vm_map_unlock_read(map); 2449 if (real_map != map) 2450 vm_map_unlock(real_map); 2451 2452 result = PAGE_ASSERT_WAIT(m, interruptible); 2453 2454 vm_object_unlock(cur_object); 2455 2456 if (result == THREAD_WAITING) { 2457 result = thread_block(THREAD_CONTINUE_NULL); 2458 2459 counter(c_vm_fault_page_block_busy_kernel++); 2460 } 2461 if (result == THREAD_AWAKENED || result == THREAD_RESTART) 2462 goto RetryFault; 2463 2464 kr = KERN_ABORTED; 2465 goto done; 2466 } 2467 if (m->phys_page == vm_page_guard_addr) { 2468 /* 2469 * Guard page: let the slow path deal with it 2470 */ 2471 break; 2472 } 2473 if (m->unusual && (m->error || m->restart || m->private || m->absent)) { 2474 /* 2475 * Unusual case... let the slow path deal with it 2476 */ 2477 break; 2478 } 2479 if (m->encrypted) { 2480 /* 2481 * ENCRYPTED SWAP: 2482 * We've soft-faulted (because it's not in the page 2483 * table) on an encrypted page. 2484 * Keep the page "busy" so that no one messes with 2485 * it during the decryption. 2486 * Release the extra locks we're holding, keep only 2487 * the page's VM object lock. 2488 * 2489 * in order to set 'busy' on 'm', we must 2490 * have object that 'm' belongs to locked exclusively 2491 */ 2492 if (object != cur_object) { 2493 vm_object_unlock(object); 2494 2495 if (cur_object_lock_type == OBJECT_LOCK_SHARED) { 2496 2497 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2498 2499 if (vm_object_lock_upgrade(cur_object) == FALSE) { 2500 /* 2501 * couldn't upgrade so go do a full retry 2502 * immediately since we've already dropped 2503 * the top object lock associated with this page 2504 * and the current one got dropped due to the 2505 * failed upgrade... the state is no longer valid 2506 */ 2507 vm_map_unlock_read(map); 2508 if (real_map != map) 2509 vm_map_unlock(real_map); 2510 2511 goto RetryFault; 2512 } 2513 } 2514 } else if (object_lock_type == OBJECT_LOCK_SHARED) { 2515 2516 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2517 2518 if (vm_object_lock_upgrade(object) == FALSE) { 2519 /* 2520 * couldn't upgrade, so explictly take the lock 2521 * exclusively and go relookup the page since we 2522 * will have dropped the object lock and 2523 * a different thread could have inserted 2524 * a page at this offset 2525 * no need for a full retry since we're 2526 * at the top level of the object chain 2527 */ 2528 vm_object_lock(object); 2529 2530 continue; 2531 } 2532 } 2533 m->busy = TRUE; 2534 2535 vm_map_unlock_read(map); 2536 if (real_map != map) 2537 vm_map_unlock(real_map); 2538 2539 vm_page_decrypt(m, 0); 2540 2541 assert(m->busy); 2542 PAGE_WAKEUP_DONE(m); 2543 2544 vm_object_unlock(cur_object); 2545 /* 2546 * Retry from the top, in case anything 2547 * changed while we were decrypting... 2548 */ 2549 goto RetryFault; 2550 } 2551 ASSERT_PAGE_DECRYPTED(m); 2552 2553 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) { 2554 /* 2555 * We might need to validate this page 2556 * against its code signature, so we 2557 * want to hold the VM object exclusively. 2558 */ 2559 if (object != cur_object) { 2560 if (cur_object_lock_type == OBJECT_LOCK_SHARED) { 2561 vm_object_unlock(object); 2562 vm_object_unlock(cur_object); 2563 2564 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2565 2566 vm_map_unlock_read(map); 2567 if (real_map != map) 2568 vm_map_unlock(real_map); 2569 2570 goto RetryFault; 2571 } 2572 2573 } else if (object_lock_type == OBJECT_LOCK_SHARED) { 2574 2575 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2576 2577 if (vm_object_lock_upgrade(object) == FALSE) { 2578 /* 2579 * couldn't upgrade, so explictly take the lock 2580 * exclusively and go relookup the page since we 2581 * will have dropped the object lock and 2582 * a different thread could have inserted 2583 * a page at this offset 2584 * no need for a full retry since we're 2585 * at the top level of the object chain 2586 */ 2587 vm_object_lock(object); 2588 2589 continue; 2590 } 2591 } 2592 } 2593 /* 2594 * Two cases of map in faults: 2595 * - At top level w/o copy object. 2596 * - Read fault anywhere. 2597 * --> must disallow write. 2598 */ 2599 2600 if (object == cur_object && object->copy == VM_OBJECT_NULL) { 2601 if ((fault_type & VM_PROT_WRITE) == 0) { 2602 /* 2603 * This is not a "write" fault, so we 2604 * might not have taken the object lock 2605 * exclusively and we might not be able 2606 * to update the "wpmapped" bit in 2607 * vm_fault_enter(). 2608 * Let's just grant read access to 2609 * the page for now and we'll 2610 * soft-fault again if we need write 2611 * access later... 2612 */ 2613 prot &= ~VM_PROT_WRITE; 2614 } 2615 goto FastPmapEnter; 2616 } 2617 2618 if ((fault_type & VM_PROT_WRITE) == 0) { 2619 2620 prot &= ~VM_PROT_WRITE; 2621 2622 if (object != cur_object) { 2623 /* 2624 * We still need to hold the top object 2625 * lock here to prevent a race between 2626 * a read fault (taking only "shared" 2627 * locks) and a write fault (taking 2628 * an "exclusive" lock on the top 2629 * object. 2630 * Otherwise, as soon as we release the 2631 * top lock, the write fault could 2632 * proceed and actually complete before 2633 * the read fault, and the copied page's 2634 * translation could then be overwritten 2635 * by the read fault's translation for 2636 * the original page. 2637 * 2638 * Let's just record what the top object 2639 * is and we'll release it later. 2640 */ 2641 top_object = object; 2642 2643 /* 2644 * switch to the object that has the new page 2645 */ 2646 object = cur_object; 2647 object_lock_type = cur_object_lock_type; 2648 } 2649FastPmapEnter: 2650 /* 2651 * prepare for the pmap_enter... 2652 * object and map are both locked 2653 * m contains valid data 2654 * object == m->object 2655 * cur_object == NULL or it's been unlocked 2656 * no paging references on either object or cur_object 2657 */ 2658#if MACH_KDB 2659 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0) 2660 prot &= ~VM_PROT_WRITE; 2661#endif 2662 if (caller_pmap) { 2663 kr = vm_fault_enter(m, 2664 caller_pmap, 2665 caller_pmap_addr, 2666 prot, 2667 wired, 2668 change_wiring, 2669 fault_info.no_cache, 2670 &type_of_fault); 2671 } else { 2672 kr = vm_fault_enter(m, 2673 pmap, 2674 vaddr, 2675 prot, 2676 wired, 2677 change_wiring, 2678 fault_info.no_cache, 2679 &type_of_fault); 2680 } 2681 2682 if (top_object != VM_OBJECT_NULL) { 2683 /* 2684 * It's safe to drop the top object 2685 * now that we've done our 2686 * vm_fault_enter(). Any other fault 2687 * in progress for that virtual 2688 * address will either find our page 2689 * and translation or put in a new page 2690 * and translation. 2691 */ 2692 vm_object_unlock(top_object); 2693 top_object = VM_OBJECT_NULL; 2694 } 2695 2696 if (need_collapse == TRUE) 2697 vm_object_collapse(object, offset, TRUE); 2698 2699 if (type_of_fault == DBG_PAGEIN_FAULT) { 2700 /* 2701 * evaluate access pattern and update state 2702 * vm_fault_deactivate_behind depends on the 2703 * state being up to date 2704 */ 2705 vm_fault_is_sequential(object, cur_offset, fault_info.behavior); 2706 2707 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior); 2708 } 2709 /* 2710 * That's it, clean up and return. 2711 */ 2712 if (m->busy) 2713 PAGE_WAKEUP_DONE(m); 2714 2715 vm_object_unlock(object); 2716 2717 vm_map_unlock_read(map); 2718 if (real_map != map) 2719 vm_map_unlock(real_map); 2720 2721 goto done; 2722 } 2723 /* 2724 * COPY ON WRITE FAULT 2725 * 2726 * If objects match, then 2727 * object->copy must not be NULL (else control 2728 * would be in previous code block), and we 2729 * have a potential push into the copy object 2730 * with which we can't cope with here. 2731 */ 2732 if (cur_object == object) { 2733 /* 2734 * must take the slow path to 2735 * deal with the copy push 2736 */ 2737 break; 2738 } 2739 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); 2740 2741 /* 2742 * This is now a shadow based copy on write 2743 * fault -- it requires a copy up the shadow 2744 * chain. 2745 * 2746 * Allocate a page in the original top level 2747 * object. Give up if allocate fails. Also 2748 * need to remember current page, as it's the 2749 * source of the copy. 2750 * 2751 * at this point we hold locks on both 2752 * object and cur_object... no need to take 2753 * paging refs or mark pages BUSY since 2754 * we don't drop either object lock until 2755 * the page has been copied and inserted 2756 */ 2757 cur_m = m; 2758 m = vm_page_grab(); 2759 2760 if (m == VM_PAGE_NULL) { 2761 /* 2762 * no free page currently available... 2763 * must take the slow path 2764 */ 2765 break; 2766 } 2767 /* 2768 * Now do the copy. Mark the source page busy... 2769 * 2770 * NOTE: This code holds the map lock across 2771 * the page copy. 2772 */ 2773 vm_page_copy(cur_m, m); 2774 vm_page_insert(m, object, offset); 2775 m->dirty = TRUE; 2776 2777 /* 2778 * Now cope with the source page and object 2779 */ 2780 if (object->ref_count > 1 && cur_m->pmapped) 2781 pmap_disconnect(cur_m->phys_page); 2782 2783 need_collapse = TRUE; 2784 2785 if (!cur_object->internal && 2786 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) { 2787 /* 2788 * The object from which we've just 2789 * copied a page is most probably backed 2790 * by a vnode. We don't want to waste too 2791 * much time trying to collapse the VM objects 2792 * and create a bottleneck when several tasks 2793 * map the same file. 2794 */ 2795 if (cur_object->copy == object) { 2796 /* 2797 * Shared mapping or no COW yet. 2798 * We can never collapse a copy 2799 * object into its backing object. 2800 */ 2801 need_collapse = FALSE; 2802 } else if (cur_object->copy == object->shadow && 2803 object->shadow->resident_page_count == 0) { 2804 /* 2805 * Shared mapping after a COW occurred. 2806 */ 2807 need_collapse = FALSE; 2808 } 2809 } 2810 vm_object_unlock(cur_object); 2811 2812 if (need_collapse == FALSE) 2813 vm_fault_collapse_skipped++; 2814 vm_fault_collapse_total++; 2815 2816 type_of_fault = DBG_COW_FAULT; 2817 VM_STAT_INCR(cow_faults); 2818 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); 2819 current_task()->cow_faults++; 2820 2821 goto FastPmapEnter; 2822 2823 } else { 2824 /* 2825 * No page at cur_object, cur_offset... m == NULL 2826 */ 2827 if (cur_object->pager_created) { 2828 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) { 2829 /* 2830 * May have to talk to a pager... 2831 * take the slow path. 2832 */ 2833 break; 2834 } 2835 /* 2836 * existence map present and indicates 2837 * that the pager doesn't have this page 2838 */ 2839 } 2840 if (cur_object->shadow == VM_OBJECT_NULL) { 2841 /* 2842 * Zero fill fault. Page gets 2843 * inserted into the original object. 2844 */ 2845 if (cur_object->shadow_severed) { 2846 2847 if (object != cur_object) 2848 vm_object_unlock(cur_object); 2849 vm_object_unlock(object); 2850 2851 vm_map_unlock_read(map); 2852 if (real_map != map) 2853 vm_map_unlock(real_map); 2854 2855 kr = KERN_MEMORY_ERROR; 2856 goto done; 2857 } 2858 if (VM_PAGE_ZFILL_THROTTLED()) { 2859 /* 2860 * drop all of our locks... 2861 * wait until the free queue is 2862 * pumped back up and then 2863 * redrive the fault 2864 */ 2865 if (object != cur_object) 2866 vm_object_unlock(cur_object); 2867 vm_object_unlock(object); 2868 vm_map_unlock_read(map); 2869 if (real_map != map) 2870 vm_map_unlock(real_map); 2871 2872 if (vm_page_wait((change_wiring) ? 2873 THREAD_UNINT : 2874 THREAD_ABORTSAFE)) 2875 goto RetryFault; 2876 2877 kr = KERN_ABORTED; 2878 goto done; 2879 } 2880 if (vm_backing_store_low) { 2881 /* 2882 * we are protecting the system from 2883 * backing store exhaustion... 2884 * must take the slow path if we're 2885 * not privileged 2886 */ 2887 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) 2888 break; 2889 } 2890 if (cur_object != object) { 2891 vm_object_unlock(cur_object); 2892 2893 cur_object = object; 2894 } 2895 if (object_lock_type == OBJECT_LOCK_SHARED) { 2896 2897 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2898 2899 if (vm_object_lock_upgrade(object) == FALSE) { 2900 /* 2901 * couldn't upgrade so do a full retry on the fault 2902 * since we dropped the object lock which 2903 * could allow another thread to insert 2904 * a page at this offset 2905 */ 2906 vm_map_unlock_read(map); 2907 if (real_map != map) 2908 vm_map_unlock(real_map); 2909 2910 goto RetryFault; 2911 } 2912 } 2913 m = vm_page_alloc(object, offset); 2914 2915 if (m == VM_PAGE_NULL) { 2916 /* 2917 * no free page currently available... 2918 * must take the slow path 2919 */ 2920 break; 2921 } 2922 2923 /* 2924 * Now zero fill page... 2925 * the page is probably going to 2926 * be written soon, so don't bother 2927 * to clear the modified bit 2928 * 2929 * NOTE: This code holds the map 2930 * lock across the zero fill. 2931 */ 2932 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill); 2933 2934 goto FastPmapEnter; 2935 } 2936 /* 2937 * On to the next level in the shadow chain 2938 */ 2939 cur_offset += cur_object->shadow_offset; 2940 new_object = cur_object->shadow; 2941 2942 /* 2943 * take the new_object's lock with the indicated state 2944 */ 2945 if (cur_object_lock_type == OBJECT_LOCK_SHARED) 2946 vm_object_lock_shared(new_object); 2947 else 2948 vm_object_lock(new_object); 2949 2950 if (cur_object != object) 2951 vm_object_unlock(cur_object); 2952 2953 cur_object = new_object; 2954 2955 continue; 2956 } 2957 } 2958 /* 2959 * Cleanup from fast fault failure. Drop any object 2960 * lock other than original and drop map lock. 2961 */ 2962 if (object != cur_object) 2963 vm_object_unlock(cur_object); 2964 2965 /* 2966 * must own the object lock exclusively at this point 2967 */ 2968 if (object_lock_type == OBJECT_LOCK_SHARED) { 2969 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2970 2971 if (vm_object_lock_upgrade(object) == FALSE) { 2972 /* 2973 * couldn't upgrade, so explictly 2974 * take the lock exclusively 2975 * no need to retry the fault at this 2976 * point since "vm_fault_page" will 2977 * completely re-evaluate the state 2978 */ 2979 vm_object_lock(object); 2980 } 2981 } 2982 2983handle_copy_delay: 2984 vm_map_unlock_read(map); 2985 if (real_map != map) 2986 vm_map_unlock(real_map); 2987 2988 /* 2989 * Make a reference to this object to 2990 * prevent its disposal while we are messing with 2991 * it. Once we have the reference, the map is free 2992 * to be diddled. Since objects reference their 2993 * shadows (and copies), they will stay around as well. 2994 */ 2995 vm_object_reference_locked(object); 2996 vm_object_paging_begin(object); 2997 2998 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0); 2999 3000 error_code = 0; 3001 3002 kr = vm_fault_page(object, offset, fault_type, 3003 (change_wiring && !wired), 3004 &prot, &result_page, &top_page, 3005 &type_of_fault, 3006 &error_code, map->no_zero_fill, 3007 FALSE, &fault_info); 3008 3009 /* 3010 * if kr != VM_FAULT_SUCCESS, then the paging reference 3011 * has been dropped and the object unlocked... the ref_count 3012 * is still held 3013 * 3014 * if kr == VM_FAULT_SUCCESS, then the paging reference 3015 * is still held along with the ref_count on the original object 3016 * 3017 * if m != NULL, then the object it belongs to 3018 * is returned locked with a paging reference 3019 * 3020 * if top_page != NULL, then it's BUSY and the 3021 * object it belongs to has a paging reference 3022 * but is returned unlocked 3023 */ 3024 if (kr != VM_FAULT_SUCCESS) { 3025 /* 3026 * we didn't succeed, lose the object reference immediately. 3027 */ 3028 vm_object_deallocate(object); 3029 3030 /* 3031 * See why we failed, and take corrective action. 3032 */ 3033 switch (kr) { 3034 case VM_FAULT_MEMORY_SHORTAGE: 3035 if (vm_page_wait((change_wiring) ? 3036 THREAD_UNINT : 3037 THREAD_ABORTSAFE)) 3038 goto RetryFault; 3039 /* 3040 * fall thru 3041 */ 3042 case VM_FAULT_INTERRUPTED: 3043 kr = KERN_ABORTED; 3044 goto done; 3045 case VM_FAULT_RETRY: 3046 goto RetryFault; 3047 case VM_FAULT_MEMORY_ERROR: 3048 if (error_code) 3049 kr = error_code; 3050 else 3051 kr = KERN_MEMORY_ERROR; 3052 goto done; 3053 } 3054 } 3055 m = result_page; 3056 3057 if (m != VM_PAGE_NULL) { 3058 assert((change_wiring && !wired) ? 3059 (top_page == VM_PAGE_NULL) : 3060 ((top_page == VM_PAGE_NULL) == (m->object == object))); 3061 } 3062 3063 /* 3064 * What to do with the resulting page from vm_fault_page 3065 * if it doesn't get entered into the physical map: 3066 */ 3067#define RELEASE_PAGE(m) \ 3068 MACRO_BEGIN \ 3069 PAGE_WAKEUP_DONE(m); \ 3070 vm_page_lockspin_queues(); \ 3071 if (!m->active && !m->inactive && !m->throttled)\ 3072 vm_page_activate(m); \ 3073 vm_page_unlock_queues(); \ 3074 MACRO_END 3075 3076 /* 3077 * We must verify that the maps have not changed 3078 * since our last lookup. 3079 */ 3080 if (m != VM_PAGE_NULL) { 3081 old_copy_object = m->object->copy; 3082 vm_object_unlock(m->object); 3083 } else 3084 old_copy_object = VM_OBJECT_NULL; 3085 3086 /* 3087 * no object locks are held at this point 3088 */ 3089 if ((map != original_map) || !vm_map_verify(map, &version)) { 3090 vm_object_t retry_object; 3091 vm_object_offset_t retry_offset; 3092 vm_prot_t retry_prot; 3093 3094 /* 3095 * To avoid trying to write_lock the map while another 3096 * thread has it read_locked (in vm_map_pageable), we 3097 * do not try for write permission. If the page is 3098 * still writable, we will get write permission. If it 3099 * is not, or has been marked needs_copy, we enter the 3100 * mapping without write permission, and will merely 3101 * take another fault. 3102 */ 3103 map = original_map; 3104 vm_map_lock_read(map); 3105 3106 kr = vm_map_lookup_locked(&map, vaddr, 3107 fault_type & ~VM_PROT_WRITE, 3108 OBJECT_LOCK_EXCLUSIVE, &version, 3109 &retry_object, &retry_offset, &retry_prot, 3110 &wired, 3111 &fault_info, 3112 &real_map); 3113 pmap = real_map->pmap; 3114 3115 if (kr != KERN_SUCCESS) { 3116 vm_map_unlock_read(map); 3117 3118 if (m != VM_PAGE_NULL) { 3119 /* 3120 * retake the lock so that 3121 * we can drop the paging reference 3122 * in vm_fault_cleanup and do the 3123 * PAGE_WAKEUP_DONE in RELEASE_PAGE 3124 */ 3125 vm_object_lock(m->object); 3126 3127 RELEASE_PAGE(m); 3128 3129 vm_fault_cleanup(m->object, top_page); 3130 } else { 3131 /* 3132 * retake the lock so that 3133 * we can drop the paging reference 3134 * in vm_fault_cleanup 3135 */ 3136 vm_object_lock(object); 3137 3138 vm_fault_cleanup(object, top_page); 3139 } 3140 vm_object_deallocate(object); 3141 3142 goto done; 3143 } 3144 vm_object_unlock(retry_object); 3145 3146 if ((retry_object != object) || (retry_offset != offset)) { 3147 3148 vm_map_unlock_read(map); 3149 if (real_map != map) 3150 vm_map_unlock(real_map); 3151 3152 if (m != VM_PAGE_NULL) { 3153 /* 3154 * retake the lock so that 3155 * we can drop the paging reference 3156 * in vm_fault_cleanup and do the 3157 * PAGE_WAKEUP_DONE in RELEASE_PAGE 3158 */ 3159 vm_object_lock(m->object); 3160 3161 RELEASE_PAGE(m); 3162 3163 vm_fault_cleanup(m->object, top_page); 3164 } else { 3165 /* 3166 * retake the lock so that 3167 * we can drop the paging reference 3168 * in vm_fault_cleanup 3169 */ 3170 vm_object_lock(object); 3171 3172 vm_fault_cleanup(object, top_page); 3173 } 3174 vm_object_deallocate(object); 3175 3176 goto RetryFault; 3177 } 3178 /* 3179 * Check whether the protection has changed or the object 3180 * has been copied while we left the map unlocked. 3181 */ 3182 prot &= retry_prot; 3183 } 3184 if (m != VM_PAGE_NULL) { 3185 vm_object_lock(m->object); 3186 3187 if (m->object->copy != old_copy_object) { 3188 /* 3189 * The copy object changed while the top-level object 3190 * was unlocked, so take away write permission. 3191 */ 3192 prot &= ~VM_PROT_WRITE; 3193 } 3194 } else 3195 vm_object_lock(object); 3196 3197 /* 3198 * If we want to wire down this page, but no longer have 3199 * adequate permissions, we must start all over. 3200 */ 3201 if (wired && (fault_type != (prot | VM_PROT_WRITE))) { 3202 3203 vm_map_verify_done(map, &version); 3204 if (real_map != map) 3205 vm_map_unlock(real_map); 3206 3207 if (m != VM_PAGE_NULL) { 3208 RELEASE_PAGE(m); 3209 3210 vm_fault_cleanup(m->object, top_page); 3211 } else 3212 vm_fault_cleanup(object, top_page); 3213 3214 vm_object_deallocate(object); 3215 3216 goto RetryFault; 3217 } 3218 if (m != VM_PAGE_NULL) { 3219 /* 3220 * Put this page into the physical map. 3221 * We had to do the unlock above because pmap_enter 3222 * may cause other faults. The page may be on 3223 * the pageout queues. If the pageout daemon comes 3224 * across the page, it will remove it from the queues. 3225 */ 3226 if (caller_pmap) { 3227 kr = vm_fault_enter(m, 3228 caller_pmap, 3229 caller_pmap_addr, 3230 prot, 3231 wired, 3232 change_wiring, 3233 fault_info.no_cache, 3234 &type_of_fault); 3235 } else { 3236 kr = vm_fault_enter(m, 3237 pmap, 3238 vaddr, 3239 prot, 3240 wired, 3241 change_wiring, 3242 fault_info.no_cache, 3243 &type_of_fault); 3244 } 3245 if (kr != KERN_SUCCESS) { 3246 /* abort this page fault */ 3247 vm_map_verify_done(map, &version); 3248 if (real_map != map) 3249 vm_map_unlock(real_map); 3250 PAGE_WAKEUP_DONE(m); 3251 vm_fault_cleanup(m->object, top_page); 3252 vm_object_deallocate(object); 3253 goto done; 3254 } 3255 } else { 3256 3257 vm_map_entry_t entry; 3258 vm_map_offset_t laddr; 3259 vm_map_offset_t ldelta, hdelta; 3260 3261 /* 3262 * do a pmap block mapping from the physical address 3263 * in the object 3264 */ 3265 3266#ifdef ppc 3267 /* While we do not worry about execution protection in */ 3268 /* general, certian pages may have instruction execution */ 3269 /* disallowed. We will check here, and if not allowed */ 3270 /* to execute, we return with a protection failure. */ 3271 3272 if ((fault_type & VM_PROT_EXECUTE) && 3273 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) { 3274 3275 vm_map_verify_done(map, &version); 3276 3277 if (real_map != map) 3278 vm_map_unlock(real_map); 3279 3280 vm_fault_cleanup(object, top_page); 3281 vm_object_deallocate(object); 3282 3283 kr = KERN_PROTECTION_FAILURE; 3284 goto done; 3285 } 3286#endif /* ppc */ 3287 3288 if (real_map != map) 3289 vm_map_unlock(real_map); 3290 3291 if (original_map != map) { 3292 vm_map_unlock_read(map); 3293 vm_map_lock_read(original_map); 3294 map = original_map; 3295 } 3296 real_map = map; 3297 3298 laddr = vaddr; 3299 hdelta = 0xFFFFF000; 3300 ldelta = 0xFFFFF000; 3301 3302 while (vm_map_lookup_entry(map, laddr, &entry)) { 3303 if (ldelta > (laddr - entry->vme_start)) 3304 ldelta = laddr - entry->vme_start; 3305 if (hdelta > (entry->vme_end - laddr)) 3306 hdelta = entry->vme_end - laddr; 3307 if (entry->is_sub_map) { 3308 3309 laddr = (laddr - entry->vme_start) 3310 + entry->offset; 3311 vm_map_lock_read(entry->object.sub_map); 3312 3313 if (map != real_map) 3314 vm_map_unlock_read(map); 3315 if (entry->use_pmap) { 3316 vm_map_unlock_read(real_map); 3317 real_map = entry->object.sub_map; 3318 } 3319 map = entry->object.sub_map; 3320 3321 } else { 3322 break; 3323 } 3324 } 3325 3326 if (vm_map_lookup_entry(map, laddr, &entry) && 3327 (entry->object.vm_object != NULL) && 3328 (entry->object.vm_object == object)) { 3329 3330 if (caller_pmap) { 3331 /* 3332 * Set up a block mapped area 3333 */ 3334 pmap_map_block(caller_pmap, 3335 (addr64_t)(caller_pmap_addr - ldelta), 3336 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) + 3337 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12, 3338 ((ldelta + hdelta) >> 12), prot, 3339 (VM_WIMG_MASK & (int)object->wimg_bits), 0); 3340 } else { 3341 /* 3342 * Set up a block mapped area 3343 */ 3344 pmap_map_block(real_map->pmap, 3345 (addr64_t)(vaddr - ldelta), 3346 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) + 3347 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12, 3348 ((ldelta + hdelta) >> 12), prot, 3349 (VM_WIMG_MASK & (int)object->wimg_bits), 0); 3350 } 3351 } 3352 } 3353 3354 /* 3355 * Unlock everything, and return 3356 */ 3357 vm_map_verify_done(map, &version); 3358 if (real_map != map) 3359 vm_map_unlock(real_map); 3360 3361 if (m != VM_PAGE_NULL) { 3362 PAGE_WAKEUP_DONE(m); 3363 3364 vm_fault_cleanup(m->object, top_page); 3365 } else 3366 vm_fault_cleanup(object, top_page); 3367 3368 vm_object_deallocate(object); 3369 3370#undef RELEASE_PAGE 3371 3372 kr = KERN_SUCCESS; 3373done: 3374 thread_interrupt_level(interruptible_state); 3375 3376 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, 3377 (int)((uint64_t)vaddr >> 32), 3378 (int)vaddr, 3379 kr, 3380 type_of_fault, 3381 0); 3382 3383 return (kr); 3384} 3385 3386/* 3387 * vm_fault_wire: 3388 * 3389 * Wire down a range of virtual addresses in a map. 3390 */ 3391kern_return_t 3392vm_fault_wire( 3393 vm_map_t map, 3394 vm_map_entry_t entry, 3395 pmap_t pmap, 3396 vm_map_offset_t pmap_addr) 3397{ 3398 3399 register vm_map_offset_t va; 3400 register vm_map_offset_t end_addr = entry->vme_end; 3401 register kern_return_t rc; 3402 3403 assert(entry->in_transition); 3404 3405 if ((entry->object.vm_object != NULL) && 3406 !entry->is_sub_map && 3407 entry->object.vm_object->phys_contiguous) { 3408 return KERN_SUCCESS; 3409 } 3410 3411 /* 3412 * Inform the physical mapping system that the 3413 * range of addresses may not fault, so that 3414 * page tables and such can be locked down as well. 3415 */ 3416 3417 pmap_pageable(pmap, pmap_addr, 3418 pmap_addr + (end_addr - entry->vme_start), FALSE); 3419 3420 /* 3421 * We simulate a fault to get the page and enter it 3422 * in the physical map. 3423 */ 3424 3425 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { 3426 if ((rc = vm_fault_wire_fast( 3427 map, va, entry, pmap, 3428 pmap_addr + (va - entry->vme_start) 3429 )) != KERN_SUCCESS) { 3430 rc = vm_fault(map, va, VM_PROT_NONE, TRUE, 3431 (pmap == kernel_pmap) ? 3432 THREAD_UNINT : THREAD_ABORTSAFE, 3433 pmap, pmap_addr + (va - entry->vme_start)); 3434 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL); 3435 } 3436 3437 if (rc != KERN_SUCCESS) { 3438 struct vm_map_entry tmp_entry = *entry; 3439 3440 /* unwire wired pages */ 3441 tmp_entry.vme_end = va; 3442 vm_fault_unwire(map, 3443 &tmp_entry, FALSE, pmap, pmap_addr); 3444 3445 return rc; 3446 } 3447 } 3448 return KERN_SUCCESS; 3449} 3450 3451/* 3452 * vm_fault_unwire: 3453 * 3454 * Unwire a range of virtual addresses in a map. 3455 */ 3456void 3457vm_fault_unwire( 3458 vm_map_t map, 3459 vm_map_entry_t entry, 3460 boolean_t deallocate, 3461 pmap_t pmap, 3462 vm_map_offset_t pmap_addr) 3463{ 3464 register vm_map_offset_t va; 3465 register vm_map_offset_t end_addr = entry->vme_end; 3466 vm_object_t object; 3467 struct vm_object_fault_info fault_info; 3468 3469 object = (entry->is_sub_map) 3470 ? VM_OBJECT_NULL : entry->object.vm_object; 3471 3472 /* 3473 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually 3474 * do anything since such memory is wired by default. So we don't have 3475 * anything to undo here. 3476 */ 3477 3478 if (object != VM_OBJECT_NULL && object->phys_contiguous) 3479 return; 3480 3481 fault_info.interruptible = THREAD_UNINT; 3482 fault_info.behavior = entry->behavior; 3483 fault_info.user_tag = entry->alias; 3484 fault_info.lo_offset = entry->offset; 3485 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset; 3486 fault_info.no_cache = entry->no_cache; 3487 3488 /* 3489 * Since the pages are wired down, we must be able to 3490 * get their mappings from the physical map system. 3491 */ 3492 3493 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { 3494 3495 if (object == VM_OBJECT_NULL) { 3496 if (pmap) { 3497 pmap_change_wiring(pmap, 3498 pmap_addr + (va - entry->vme_start), FALSE); 3499 } 3500 (void) vm_fault(map, va, VM_PROT_NONE, 3501 TRUE, THREAD_UNINT, pmap, pmap_addr); 3502 } else { 3503 vm_prot_t prot; 3504 vm_page_t result_page; 3505 vm_page_t top_page; 3506 vm_object_t result_object; 3507 vm_fault_return_t result; 3508 3509 fault_info.cluster_size = end_addr - va; 3510 3511 do { 3512 prot = VM_PROT_NONE; 3513 3514 vm_object_lock(object); 3515 vm_object_paging_begin(object); 3516 XPR(XPR_VM_FAULT, 3517 "vm_fault_unwire -> vm_fault_page\n", 3518 0,0,0,0,0); 3519 result = vm_fault_page( 3520 object, 3521 entry->offset + (va - entry->vme_start), 3522 VM_PROT_NONE, TRUE, 3523 &prot, &result_page, &top_page, 3524 (int *)0, 3525 NULL, map->no_zero_fill, 3526 FALSE, &fault_info); 3527 } while (result == VM_FAULT_RETRY); 3528 3529 /* 3530 * If this was a mapping to a file on a device that has been forcibly 3531 * unmounted, then we won't get a page back from vm_fault_page(). Just 3532 * move on to the next one in case the remaining pages are mapped from 3533 * different objects. During a forced unmount, the object is terminated 3534 * so the alive flag will be false if this happens. A forced unmount will 3535 * will occur when an external disk is unplugged before the user does an 3536 * eject, so we don't want to panic in that situation. 3537 */ 3538 3539 if (result == VM_FAULT_MEMORY_ERROR && !object->alive) 3540 continue; 3541 3542 if (result != VM_FAULT_SUCCESS) 3543 panic("vm_fault_unwire: failure"); 3544 3545 result_object = result_page->object; 3546 3547 if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) { 3548 pmap_change_wiring(pmap, 3549 pmap_addr + (va - entry->vme_start), FALSE); 3550 } 3551 if (deallocate) { 3552 assert(result_page->phys_page != 3553 vm_page_fictitious_addr); 3554 pmap_disconnect(result_page->phys_page); 3555 VM_PAGE_FREE(result_page); 3556 } else { 3557 vm_page_lockspin_queues(); 3558 vm_page_unwire(result_page); 3559 vm_page_unlock_queues(); 3560 PAGE_WAKEUP_DONE(result_page); 3561 } 3562 vm_fault_cleanup(result_object, top_page); 3563 } 3564 } 3565 3566 /* 3567 * Inform the physical mapping system that the range 3568 * of addresses may fault, so that page tables and 3569 * such may be unwired themselves. 3570 */ 3571 3572 pmap_pageable(pmap, pmap_addr, 3573 pmap_addr + (end_addr - entry->vme_start), TRUE); 3574 3575} 3576 3577/* 3578 * vm_fault_wire_fast: 3579 * 3580 * Handle common case of a wire down page fault at the given address. 3581 * If successful, the page is inserted into the associated physical map. 3582 * The map entry is passed in to avoid the overhead of a map lookup. 3583 * 3584 * NOTE: the given address should be truncated to the 3585 * proper page address. 3586 * 3587 * KERN_SUCCESS is returned if the page fault is handled; otherwise, 3588 * a standard error specifying why the fault is fatal is returned. 3589 * 3590 * The map in question must be referenced, and remains so. 3591 * Caller has a read lock on the map. 3592 * 3593 * This is a stripped version of vm_fault() for wiring pages. Anything 3594 * other than the common case will return KERN_FAILURE, and the caller 3595 * is expected to call vm_fault(). 3596 */ 3597kern_return_t 3598vm_fault_wire_fast( 3599 __unused vm_map_t map, 3600 vm_map_offset_t va, 3601 vm_map_entry_t entry, 3602 pmap_t pmap, 3603 vm_map_offset_t pmap_addr) 3604{ 3605 vm_object_t object; 3606 vm_object_offset_t offset; 3607 register vm_page_t m; 3608 vm_prot_t prot; 3609 thread_t thread = current_thread(); 3610 int type_of_fault; 3611 kern_return_t kr; 3612 3613 VM_STAT_INCR(faults); 3614 3615 if (thread != THREAD_NULL && thread->task != TASK_NULL) 3616 thread->task->faults++; 3617 3618/* 3619 * Recovery actions 3620 */ 3621 3622#undef RELEASE_PAGE 3623#define RELEASE_PAGE(m) { \ 3624 PAGE_WAKEUP_DONE(m); \ 3625 vm_page_lockspin_queues(); \ 3626 vm_page_unwire(m); \ 3627 vm_page_unlock_queues(); \ 3628} 3629 3630 3631#undef UNLOCK_THINGS 3632#define UNLOCK_THINGS { \ 3633 vm_object_paging_end(object); \ 3634 vm_object_unlock(object); \ 3635} 3636 3637#undef UNLOCK_AND_DEALLOCATE 3638#define UNLOCK_AND_DEALLOCATE { \ 3639 UNLOCK_THINGS; \ 3640 vm_object_deallocate(object); \ 3641} 3642/* 3643 * Give up and have caller do things the hard way. 3644 */ 3645 3646#define GIVE_UP { \ 3647 UNLOCK_AND_DEALLOCATE; \ 3648 return(KERN_FAILURE); \ 3649} 3650 3651 3652 /* 3653 * If this entry is not directly to a vm_object, bail out. 3654 */ 3655 if (entry->is_sub_map) 3656 return(KERN_FAILURE); 3657 3658 /* 3659 * Find the backing store object and offset into it. 3660 */ 3661 3662 object = entry->object.vm_object; 3663 offset = (va - entry->vme_start) + entry->offset; 3664 prot = entry->protection; 3665 3666 /* 3667 * Make a reference to this object to prevent its 3668 * disposal while we are messing with it. 3669 */ 3670 3671 vm_object_lock(object); 3672 vm_object_reference_locked(object); 3673 vm_object_paging_begin(object); 3674 3675 /* 3676 * INVARIANTS (through entire routine): 3677 * 3678 * 1) At all times, we must either have the object 3679 * lock or a busy page in some object to prevent 3680 * some other thread from trying to bring in 3681 * the same page. 3682 * 3683 * 2) Once we have a busy page, we must remove it from 3684 * the pageout queues, so that the pageout daemon 3685 * will not grab it away. 3686 * 3687 */ 3688 3689 /* 3690 * Look for page in top-level object. If it's not there or 3691 * there's something going on, give up. 3692 * ENCRYPTED SWAP: use the slow fault path, since we'll need to 3693 * decrypt the page before wiring it down. 3694 */ 3695 m = vm_page_lookup(object, offset); 3696 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) || 3697 (m->unusual && ( m->error || m->restart || m->absent))) { 3698 3699 GIVE_UP; 3700 } 3701 ASSERT_PAGE_DECRYPTED(m); 3702 3703 if (m->fictitious && 3704 m->phys_page == vm_page_guard_addr) { 3705 /* 3706 * Guard pages are fictitious pages and are never 3707 * entered into a pmap, so let's say it's been wired... 3708 */ 3709 kr = KERN_SUCCESS; 3710 goto done; 3711 } 3712 3713 /* 3714 * Wire the page down now. All bail outs beyond this 3715 * point must unwire the page. 3716 */ 3717 3718 vm_page_lockspin_queues(); 3719 vm_page_wire(m); 3720 vm_page_unlock_queues(); 3721 3722 /* 3723 * Mark page busy for other threads. 3724 */ 3725 assert(!m->busy); 3726 m->busy = TRUE; 3727 assert(!m->absent); 3728 3729 /* 3730 * Give up if the page is being written and there's a copy object 3731 */ 3732 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { 3733 RELEASE_PAGE(m); 3734 GIVE_UP; 3735 } 3736 3737 /* 3738 * Put this page into the physical map. 3739 */ 3740 type_of_fault = DBG_CACHE_HIT_FAULT; 3741 kr = vm_fault_enter(m, 3742 pmap, 3743 pmap_addr, 3744 prot, 3745 TRUE, 3746 FALSE, 3747 FALSE, 3748 &type_of_fault); 3749 3750done: 3751 /* 3752 * Unlock everything, and return 3753 */ 3754 3755 PAGE_WAKEUP_DONE(m); 3756 UNLOCK_AND_DEALLOCATE; 3757 3758 return kr; 3759 3760} 3761 3762/* 3763 * Routine: vm_fault_copy_cleanup 3764 * Purpose: 3765 * Release a page used by vm_fault_copy. 3766 */ 3767 3768void 3769vm_fault_copy_cleanup( 3770 vm_page_t page, 3771 vm_page_t top_page) 3772{ 3773 vm_object_t object = page->object; 3774 3775 vm_object_lock(object); 3776 PAGE_WAKEUP_DONE(page); 3777 vm_page_lockspin_queues(); 3778 if (!page->active && !page->inactive && !page->throttled) 3779 vm_page_activate(page); 3780 vm_page_unlock_queues(); 3781 vm_fault_cleanup(object, top_page); 3782} 3783 3784void 3785vm_fault_copy_dst_cleanup( 3786 vm_page_t page) 3787{ 3788 vm_object_t object; 3789 3790 if (page != VM_PAGE_NULL) { 3791 object = page->object; 3792 vm_object_lock(object); 3793 vm_page_lockspin_queues(); 3794 vm_page_unwire(page); 3795 vm_page_unlock_queues(); 3796 vm_object_paging_end(object); 3797 vm_object_unlock(object); 3798 } 3799} 3800 3801/* 3802 * Routine: vm_fault_copy 3803 * 3804 * Purpose: 3805 * Copy pages from one virtual memory object to another -- 3806 * neither the source nor destination pages need be resident. 3807 * 3808 * Before actually copying a page, the version associated with 3809 * the destination address map wil be verified. 3810 * 3811 * In/out conditions: 3812 * The caller must hold a reference, but not a lock, to 3813 * each of the source and destination objects and to the 3814 * destination map. 3815 * 3816 * Results: 3817 * Returns KERN_SUCCESS if no errors were encountered in 3818 * reading or writing the data. Returns KERN_INTERRUPTED if 3819 * the operation was interrupted (only possible if the 3820 * "interruptible" argument is asserted). Other return values 3821 * indicate a permanent error in copying the data. 3822 * 3823 * The actual amount of data copied will be returned in the 3824 * "copy_size" argument. In the event that the destination map 3825 * verification failed, this amount may be less than the amount 3826 * requested. 3827 */ 3828kern_return_t 3829vm_fault_copy( 3830 vm_object_t src_object, 3831 vm_object_offset_t src_offset, 3832 vm_map_size_t *copy_size, /* INOUT */ 3833 vm_object_t dst_object, 3834 vm_object_offset_t dst_offset, 3835 vm_map_t dst_map, 3836 vm_map_version_t *dst_version, 3837 int interruptible) 3838{ 3839 vm_page_t result_page; 3840 3841 vm_page_t src_page; 3842 vm_page_t src_top_page; 3843 vm_prot_t src_prot; 3844 3845 vm_page_t dst_page; 3846 vm_page_t dst_top_page; 3847 vm_prot_t dst_prot; 3848 3849 vm_map_size_t amount_left; 3850 vm_object_t old_copy_object; 3851 kern_return_t error = 0; 3852 3853 vm_map_size_t part_size; 3854 struct vm_object_fault_info fault_info_src; 3855 struct vm_object_fault_info fault_info_dst; 3856 3857 /* 3858 * In order not to confuse the clustered pageins, align 3859 * the different offsets on a page boundary. 3860 */ 3861 3862#define RETURN(x) \ 3863 MACRO_BEGIN \ 3864 *copy_size -= amount_left; \ 3865 MACRO_RETURN(x); \ 3866 MACRO_END 3867 3868 amount_left = *copy_size; 3869 3870 fault_info_src.interruptible = interruptible; 3871 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL; 3872 fault_info_src.user_tag = 0; 3873 fault_info_src.lo_offset = vm_object_trunc_page(src_offset); 3874 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left; 3875 fault_info_src.no_cache = FALSE; 3876 3877 fault_info_dst.interruptible = interruptible; 3878 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL; 3879 fault_info_dst.user_tag = 0; 3880 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset); 3881 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left; 3882 fault_info_dst.no_cache = FALSE; 3883 3884 do { /* while (amount_left > 0) */ 3885 /* 3886 * There may be a deadlock if both source and destination 3887 * pages are the same. To avoid this deadlock, the copy must 3888 * start by getting the destination page in order to apply 3889 * COW semantics if any. 3890 */ 3891 3892 RetryDestinationFault: ; 3893 3894 dst_prot = VM_PROT_WRITE|VM_PROT_READ; 3895 3896 vm_object_lock(dst_object); 3897 vm_object_paging_begin(dst_object); 3898 3899 fault_info_dst.cluster_size = amount_left; 3900 3901 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0); 3902 switch (vm_fault_page(dst_object, 3903 vm_object_trunc_page(dst_offset), 3904 VM_PROT_WRITE|VM_PROT_READ, 3905 FALSE, 3906 &dst_prot, &dst_page, &dst_top_page, 3907 (int *)0, 3908 &error, 3909 dst_map->no_zero_fill, 3910 FALSE, &fault_info_dst)) { 3911 case VM_FAULT_SUCCESS: 3912 break; 3913 case VM_FAULT_RETRY: 3914 goto RetryDestinationFault; 3915 case VM_FAULT_MEMORY_SHORTAGE: 3916 if (vm_page_wait(interruptible)) 3917 goto RetryDestinationFault; 3918 /* fall thru */ 3919 case VM_FAULT_INTERRUPTED: 3920 RETURN(MACH_SEND_INTERRUPTED); 3921 case VM_FAULT_MEMORY_ERROR: 3922 if (error) 3923 return (error); 3924 else 3925 return(KERN_MEMORY_ERROR); 3926 } 3927 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE); 3928 3929 old_copy_object = dst_page->object->copy; 3930 3931 /* 3932 * There exists the possiblity that the source and 3933 * destination page are the same. But we can't 3934 * easily determine that now. If they are the 3935 * same, the call to vm_fault_page() for the 3936 * destination page will deadlock. To prevent this we 3937 * wire the page so we can drop busy without having 3938 * the page daemon steal the page. We clean up the 3939 * top page but keep the paging reference on the object 3940 * holding the dest page so it doesn't go away. 3941 */ 3942 3943 vm_page_lockspin_queues(); 3944 vm_page_wire(dst_page); 3945 vm_page_unlock_queues(); 3946 PAGE_WAKEUP_DONE(dst_page); 3947 vm_object_unlock(dst_page->object); 3948 3949 if (dst_top_page != VM_PAGE_NULL) { 3950 vm_object_lock(dst_object); 3951 VM_PAGE_FREE(dst_top_page); 3952 vm_object_paging_end(dst_object); 3953 vm_object_unlock(dst_object); 3954 } 3955 3956 RetrySourceFault: ; 3957 3958 if (src_object == VM_OBJECT_NULL) { 3959 /* 3960 * No source object. We will just 3961 * zero-fill the page in dst_object. 3962 */ 3963 src_page = VM_PAGE_NULL; 3964 result_page = VM_PAGE_NULL; 3965 } else { 3966 vm_object_lock(src_object); 3967 src_page = vm_page_lookup(src_object, 3968 vm_object_trunc_page(src_offset)); 3969 if (src_page == dst_page) { 3970 src_prot = dst_prot; 3971 result_page = VM_PAGE_NULL; 3972 } else { 3973 src_prot = VM_PROT_READ; 3974 vm_object_paging_begin(src_object); 3975 3976 fault_info_src.cluster_size = amount_left; 3977 3978 XPR(XPR_VM_FAULT, 3979 "vm_fault_copy(2) -> vm_fault_page\n", 3980 0,0,0,0,0); 3981 switch (vm_fault_page( 3982 src_object, 3983 vm_object_trunc_page(src_offset), 3984 VM_PROT_READ, FALSE, 3985 &src_prot, 3986 &result_page, &src_top_page, 3987 (int *)0, &error, FALSE, 3988 FALSE, &fault_info_src)) { 3989 3990 case VM_FAULT_SUCCESS: 3991 break; 3992 case VM_FAULT_RETRY: 3993 goto RetrySourceFault; 3994 case VM_FAULT_MEMORY_SHORTAGE: 3995 if (vm_page_wait(interruptible)) 3996 goto RetrySourceFault; 3997 /* fall thru */ 3998 case VM_FAULT_INTERRUPTED: 3999 vm_fault_copy_dst_cleanup(dst_page); 4000 RETURN(MACH_SEND_INTERRUPTED); 4001 case VM_FAULT_MEMORY_ERROR: 4002 vm_fault_copy_dst_cleanup(dst_page); 4003 if (error) 4004 return (error); 4005 else 4006 return(KERN_MEMORY_ERROR); 4007 } 4008 4009 4010 assert((src_top_page == VM_PAGE_NULL) == 4011 (result_page->object == src_object)); 4012 } 4013 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE); 4014 vm_object_unlock(result_page->object); 4015 } 4016 4017 if (!vm_map_verify(dst_map, dst_version)) { 4018 if (result_page != VM_PAGE_NULL && src_page != dst_page) 4019 vm_fault_copy_cleanup(result_page, src_top_page); 4020 vm_fault_copy_dst_cleanup(dst_page); 4021 break; 4022 } 4023 4024 vm_object_lock(dst_page->object); 4025 4026 if (dst_page->object->copy != old_copy_object) { 4027 vm_object_unlock(dst_page->object); 4028 vm_map_verify_done(dst_map, dst_version); 4029 if (result_page != VM_PAGE_NULL && src_page != dst_page) 4030 vm_fault_copy_cleanup(result_page, src_top_page); 4031 vm_fault_copy_dst_cleanup(dst_page); 4032 break; 4033 } 4034 vm_object_unlock(dst_page->object); 4035 4036 /* 4037 * Copy the page, and note that it is dirty 4038 * immediately. 4039 */ 4040 4041 if (!page_aligned(src_offset) || 4042 !page_aligned(dst_offset) || 4043 !page_aligned(amount_left)) { 4044 4045 vm_object_offset_t src_po, 4046 dst_po; 4047 4048 src_po = src_offset - vm_object_trunc_page(src_offset); 4049 dst_po = dst_offset - vm_object_trunc_page(dst_offset); 4050 4051 if (dst_po > src_po) { 4052 part_size = PAGE_SIZE - dst_po; 4053 } else { 4054 part_size = PAGE_SIZE - src_po; 4055 } 4056 if (part_size > (amount_left)){ 4057 part_size = amount_left; 4058 } 4059 4060 if (result_page == VM_PAGE_NULL) { 4061 vm_page_part_zero_fill(dst_page, 4062 dst_po, part_size); 4063 } else { 4064 vm_page_part_copy(result_page, src_po, 4065 dst_page, dst_po, part_size); 4066 if(!dst_page->dirty){ 4067 vm_object_lock(dst_object); 4068 dst_page->dirty = TRUE; 4069 vm_object_unlock(dst_page->object); 4070 } 4071 4072 } 4073 } else { 4074 part_size = PAGE_SIZE; 4075 4076 if (result_page == VM_PAGE_NULL) 4077 vm_page_zero_fill(dst_page); 4078 else{ 4079 vm_page_copy(result_page, dst_page); 4080 if(!dst_page->dirty){ 4081 vm_object_lock(dst_object); 4082 dst_page->dirty = TRUE; 4083 vm_object_unlock(dst_page->object); 4084 } 4085 } 4086 4087 } 4088 4089 /* 4090 * Unlock everything, and return 4091 */ 4092 4093 vm_map_verify_done(dst_map, dst_version); 4094 4095 if (result_page != VM_PAGE_NULL && src_page != dst_page) 4096 vm_fault_copy_cleanup(result_page, src_top_page); 4097 vm_fault_copy_dst_cleanup(dst_page); 4098 4099 amount_left -= part_size; 4100 src_offset += part_size; 4101 dst_offset += part_size; 4102 } while (amount_left > 0); 4103 4104 RETURN(KERN_SUCCESS); 4105#undef RETURN 4106 4107 /*NOTREACHED*/ 4108} 4109 4110#if VM_FAULT_CLASSIFY 4111/* 4112 * Temporary statistics gathering support. 4113 */ 4114 4115/* 4116 * Statistics arrays: 4117 */ 4118#define VM_FAULT_TYPES_MAX 5 4119#define VM_FAULT_LEVEL_MAX 8 4120 4121int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX]; 4122 4123#define VM_FAULT_TYPE_ZERO_FILL 0 4124#define VM_FAULT_TYPE_MAP_IN 1 4125#define VM_FAULT_TYPE_PAGER 2 4126#define VM_FAULT_TYPE_COPY 3 4127#define VM_FAULT_TYPE_OTHER 4 4128 4129 4130void 4131vm_fault_classify(vm_object_t object, 4132 vm_object_offset_t offset, 4133 vm_prot_t fault_type) 4134{ 4135 int type, level = 0; 4136 vm_page_t m; 4137 4138 while (TRUE) { 4139 m = vm_page_lookup(object, offset); 4140 if (m != VM_PAGE_NULL) { 4141 if (m->busy || m->error || m->restart || m->absent) { 4142 type = VM_FAULT_TYPE_OTHER; 4143 break; 4144 } 4145 if (((fault_type & VM_PROT_WRITE) == 0) || 4146 ((level == 0) && object->copy == VM_OBJECT_NULL)) { 4147 type = VM_FAULT_TYPE_MAP_IN; 4148 break; 4149 } 4150 type = VM_FAULT_TYPE_COPY; 4151 break; 4152 } 4153 else { 4154 if (object->pager_created) { 4155 type = VM_FAULT_TYPE_PAGER; 4156 break; 4157 } 4158 if (object->shadow == VM_OBJECT_NULL) { 4159 type = VM_FAULT_TYPE_ZERO_FILL; 4160 break; 4161 } 4162 4163 offset += object->shadow_offset; 4164 object = object->shadow; 4165 level++; 4166 continue; 4167 } 4168 } 4169 4170 if (level > VM_FAULT_LEVEL_MAX) 4171 level = VM_FAULT_LEVEL_MAX; 4172 4173 vm_fault_stats[type][level] += 1; 4174 4175 return; 4176} 4177 4178/* cleanup routine to call from debugger */ 4179 4180void 4181vm_fault_classify_init(void) 4182{ 4183 int type, level; 4184 4185 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) { 4186 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) { 4187 vm_fault_stats[type][level] = 0; 4188 } 4189 } 4190 4191 return; 4192} 4193#endif /* VM_FAULT_CLASSIFY */ 4194 4195 4196extern int cs_validation; 4197 4198void 4199vm_page_validate_cs_mapped( 4200 vm_page_t page, 4201 const void *kaddr) 4202{ 4203 vm_object_t object; 4204 vm_object_offset_t offset; 4205 kern_return_t kr; 4206 memory_object_t pager; 4207 void *blobs; 4208 boolean_t validated, tainted; 4209 4210 assert(page->busy); 4211 vm_object_lock_assert_exclusive(page->object); 4212 4213 if (!cs_validation) { 4214 return; 4215 } 4216 4217 if (page->wpmapped && !page->cs_tainted) { 4218 /* 4219 * This page was mapped for "write" access sometime in the 4220 * past and could still be modifiable in the future. 4221 * Consider it tainted. 4222 * [ If the page was already found to be "tainted", no 4223 * need to re-validate. ] 4224 */ 4225 page->cs_validated = TRUE; 4226 page->cs_tainted = TRUE; 4227 if (cs_debug) { 4228 printf("CODESIGNING: vm_page_validate_cs: " 4229 "page %p obj %p off 0x%llx " 4230 "was modified\n", 4231 page, page->object, page->offset); 4232 } 4233 vm_cs_validated_dirtied++; 4234 } 4235 4236 if (page->cs_validated) { 4237 return; 4238 } 4239 4240 vm_cs_validates++; 4241 4242 object = page->object; 4243 assert(object->code_signed); 4244 offset = page->offset; 4245 4246 if (!object->alive || object->terminating || object->pager == NULL) { 4247 /* 4248 * The object is terminating and we don't have its pager 4249 * so we can't validate the data... 4250 */ 4251 return; 4252 } 4253 /* 4254 * Since we get here to validate a page that was brought in by 4255 * the pager, we know that this pager is all setup and ready 4256 * by now. 4257 */ 4258 assert(!object->internal); 4259 assert(object->pager != NULL); 4260 assert(object->pager_ready); 4261 4262 pager = object->pager; 4263 4264 kr = vnode_pager_get_object_cs_blobs(pager, &blobs); 4265 if (kr != KERN_SUCCESS) { 4266 blobs = NULL; 4267 } 4268 4269 /* verify the SHA1 hash for this page */ 4270 validated = cs_validate_page(blobs, 4271 offset + object->paging_offset, 4272 (const void *)kaddr, 4273 &tainted); 4274 4275 page->cs_validated = validated; 4276 if (validated) { 4277 page->cs_tainted = tainted; 4278 } 4279} 4280 4281void 4282vm_page_validate_cs( 4283 vm_page_t page) 4284{ 4285 vm_object_t object; 4286 vm_object_offset_t offset; 4287 vm_map_offset_t koffset; 4288 vm_map_size_t ksize; 4289 vm_offset_t kaddr; 4290 kern_return_t kr; 4291 boolean_t busy_page; 4292 4293 vm_object_lock_assert_held(page->object); 4294 4295 if (!cs_validation) { 4296 return; 4297 } 4298 4299 if (page->wpmapped && !page->cs_tainted) { 4300 vm_object_lock_assert_exclusive(page->object); 4301 4302 /* 4303 * This page was mapped for "write" access sometime in the 4304 * past and could still be modifiable in the future. 4305 * Consider it tainted. 4306 * [ If the page was already found to be "tainted", no 4307 * need to re-validate. ] 4308 */ 4309 page->cs_validated = TRUE; 4310 page->cs_tainted = TRUE; 4311 if (cs_debug) { 4312 printf("CODESIGNING: vm_page_validate_cs: " 4313 "page %p obj %p off 0x%llx " 4314 "was modified\n", 4315 page, page->object, page->offset); 4316 } 4317 vm_cs_validated_dirtied++; 4318 } 4319 4320 if (page->cs_validated) { 4321 return; 4322 } 4323 4324 vm_object_lock_assert_exclusive(page->object); 4325 4326 object = page->object; 4327 assert(object->code_signed); 4328 offset = page->offset; 4329 4330 busy_page = page->busy; 4331 if (!busy_page) { 4332 /* keep page busy while we map (and unlock) the VM object */ 4333 page->busy = TRUE; 4334 } 4335 4336 /* 4337 * Take a paging reference on the VM object 4338 * to protect it from collapse or bypass, 4339 * and keep it from disappearing too. 4340 */ 4341 vm_object_paging_begin(object); 4342 4343 /* map the page in the kernel address space */ 4344 koffset = 0; 4345 ksize = PAGE_SIZE_64; 4346 kr = vm_paging_map_object(&koffset, 4347 page, 4348 object, 4349 offset, 4350 &ksize, 4351 VM_PROT_READ, 4352 FALSE); /* can't unlock object ! */ 4353 if (kr != KERN_SUCCESS) { 4354 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr); 4355 } 4356 kaddr = CAST_DOWN(vm_offset_t, koffset); 4357 4358 /* validate the mapped page */ 4359 vm_page_validate_cs_mapped(page, (const void *) kaddr); 4360 4361 assert(page->busy); 4362 assert(object == page->object); 4363 vm_object_lock_assert_exclusive(object); 4364 4365 if (!busy_page) { 4366 PAGE_WAKEUP_DONE(page); 4367 } 4368 if (koffset != 0) { 4369 /* unmap the map from the kernel address space */ 4370 vm_paging_unmap_object(object, koffset, koffset + ksize); 4371 koffset = 0; 4372 ksize = 0; 4373 kaddr = 0; 4374 } 4375 vm_object_paging_end(object); 4376} 4377