1/* 2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * @OSF_COPYRIGHT@ 30 */ 31/* 32 * Mach Operating System 33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University 34 * All Rights Reserved. 35 * 36 * Permission to use, copy, modify and distribute this software and its 37 * documentation is hereby granted, provided that both the copyright 38 * notice and this permission notice appear in all copies of the 39 * software, derivative works or modified versions, and any portions 40 * thereof, and that both notices appear in supporting documentation. 41 * 42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 43 * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR 44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 45 * 46 * Carnegie Mellon requests users of this software to return to 47 * 48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 49 * School of Computer Science 50 * Carnegie Mellon University 51 * Pittsburgh PA 15213-3890 52 * 53 * any improvements or extensions that they make and grant Carnegie Mellon 54 * the rights to redistribute these changes. 55 */ 56/* 57 */ 58/* 59 * File: vm_fault.c 60 * Author: Avadis Tevanian, Jr., Michael Wayne Young 61 * 62 * Page fault handling module. 63 */ 64 65#include <mach_cluster_stats.h> 66#include <mach_pagemap.h> 67#include <libkern/OSAtomic.h> 68 69#include <mach/mach_types.h> 70#include <mach/kern_return.h> 71#include <mach/message.h> /* for error codes */ 72#include <mach/vm_param.h> 73#include <mach/vm_behavior.h> 74#include <mach/memory_object.h> 75 /* For memory_object_data_{request,unlock} */ 76#include <mach/sdt.h> 77 78#include <kern/kern_types.h> 79#include <kern/host_statistics.h> 80#include <kern/counters.h> 81#include <kern/task.h> 82#include <kern/thread.h> 83#include <kern/sched_prim.h> 84#include <kern/host.h> 85#include <kern/xpr.h> 86#include <kern/mach_param.h> 87#include <kern/macro_help.h> 88#include <kern/zalloc.h> 89#include <kern/misc_protos.h> 90 91#include <vm/vm_fault.h> 92#include <vm/vm_map.h> 93#include <vm/vm_object.h> 94#include <vm/vm_page.h> 95#include <vm/vm_kern.h> 96#include <vm/pmap.h> 97#include <vm/vm_pageout.h> 98#include <vm/vm_protos.h> 99#include <vm/vm_external.h> 100#include <vm/memory_object.h> 101#include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */ 102#include <vm/vm_shared_region.h> 103 104#define VM_FAULT_CLASSIFY 0 105 106#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */ 107 108int vm_object_pagein_throttle = 16; 109 110/* 111 * We apply a hard throttle to the demand zero rate of tasks that we believe are running 
out of control which 112 * kicks in when swap space runs out. 64-bit programs have massive address spaces and can leak enormous amounts 113 * of memory if they're buggy and can run the system completely out of swap space. If this happens, we 114 * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps 115 * keep the UI active so that the user has a chance to kill the offending task before the system 116 * completely hangs. 117 * 118 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied 119 * to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold 120 * will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a 121 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again. 122 */ 123 124extern boolean_t thread_is_io_throttled(void); 125extern void throttle_lowpri_io(int); 126 127uint64_t vm_hard_throttle_threshold; 128 129extern unsigned int dp_pages_free, dp_pages_reserve; 130 131#define NEED_TO_HARD_THROTTLE_THIS_TASK() (((dp_pages_free + dp_pages_reserve < 2000) && \ 132 (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \ 133 (current_task() != kernel_task) && VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) || \ 134 (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \ 135 (get_task_resident_size(current_task()) > vm_hard_throttle_threshold))) 136 137 138#define HARD_THROTTLE_DELAY 20000 /* 20000 us == 20 ms */ 139#define SOFT_THROTTLE_DELAY 2000 /* 2000 us == 2 ms */ 140 141 142extern int cs_debug; 143 144boolean_t current_thread_aborted(void); 145 146/* Forward declarations of internal routines. 
*/ 147extern kern_return_t vm_fault_wire_fast( 148 vm_map_t map, 149 vm_map_offset_t va, 150 vm_map_entry_t entry, 151 pmap_t pmap, 152 vm_map_offset_t pmap_addr); 153 154extern void vm_fault_continue(void); 155 156extern void vm_fault_copy_cleanup( 157 vm_page_t page, 158 vm_page_t top_page); 159 160extern void vm_fault_copy_dst_cleanup( 161 vm_page_t page); 162 163#if VM_FAULT_CLASSIFY 164extern void vm_fault_classify(vm_object_t object, 165 vm_object_offset_t offset, 166 vm_prot_t fault_type); 167 168extern void vm_fault_classify_init(void); 169#endif 170 171unsigned long vm_pmap_enter_blocked = 0; 172unsigned long vm_pmap_enter_retried = 0; 173 174unsigned long vm_cs_validates = 0; 175unsigned long vm_cs_revalidates = 0; 176unsigned long vm_cs_query_modified = 0; 177unsigned long vm_cs_validated_dirtied = 0; 178unsigned long vm_cs_bitmap_validated = 0; 179#if CONFIG_ENFORCE_SIGNED_CODE 180int cs_enforcement_disable=0; 181#else 182static const int cs_enforcement_disable=1; 183#endif 184 185/* 186 * Routine: vm_fault_init 187 * Purpose: 188 * Initialize our private data structures. 189 */ 190void 191vm_fault_init(void) 192{ 193#if !SECURE_KERNEL 194#if CONFIG_ENFORCE_SIGNED_CODE 195 PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable, 196 sizeof (cs_enforcement_disable)); 197#endif 198 PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug)); 199#endif 200 201 /* 202 * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is 203 * computed as a percentage of available memory, and the percentage used is scaled inversely with 204 * the amount of memory. The pertange runs between 10% and 35%. We use 35% for small memory systems 205 * and reduce the value down to 10% for very large memory configurations. This helps give us a 206 * definition of a memory hog that makes more sense relative to the amount of ram in the machine. 
207 * The formula here simply uses the number of gigabytes of ram to adjust the percentage. 208 */ 209 210 vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100; 211} 212 213/* 214 * Routine: vm_fault_cleanup 215 * Purpose: 216 * Clean up the result of vm_fault_page. 217 * Results: 218 * The paging reference for "object" is released. 219 * "object" is unlocked. 220 * If "top_page" is not null, "top_page" is 221 * freed and the paging reference for the object 222 * containing it is released. 223 * 224 * In/out conditions: 225 * "object" must be locked. 226 */ 227void 228vm_fault_cleanup( 229 register vm_object_t object, 230 register vm_page_t top_page) 231{ 232 vm_object_paging_end(object); 233 vm_object_unlock(object); 234 235 if (top_page != VM_PAGE_NULL) { 236 object = top_page->object; 237 238 vm_object_lock(object); 239 VM_PAGE_FREE(top_page); 240 vm_object_paging_end(object); 241 vm_object_unlock(object); 242 } 243} 244 245#if MACH_CLUSTER_STATS 246#define MAXCLUSTERPAGES 16 247struct { 248 unsigned long pages_in_cluster; 249 unsigned long pages_at_higher_offsets; 250 unsigned long pages_at_lower_offsets; 251} cluster_stats_in[MAXCLUSTERPAGES]; 252#define CLUSTER_STAT(clause) clause 253#define CLUSTER_STAT_HIGHER(x) \ 254 ((cluster_stats_in[(x)].pages_at_higher_offsets)++) 255#define CLUSTER_STAT_LOWER(x) \ 256 ((cluster_stats_in[(x)].pages_at_lower_offsets)++) 257#define CLUSTER_STAT_CLUSTER(x) \ 258 ((cluster_stats_in[(x)].pages_in_cluster)++) 259#else /* MACH_CLUSTER_STATS */ 260#define CLUSTER_STAT(clause) 261#endif /* MACH_CLUSTER_STATS */ 262 263#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0) 264 265 266boolean_t vm_page_deactivate_behind = TRUE; 267/* 268 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior 269 */ 270#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128 271#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... 
*/ 272 /* we use it to size an array on the stack */ 273 274int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW; 275 276#define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024) 277 278/* 279 * vm_page_is_sequential 280 * 281 * Determine if sequential access is in progress 282 * in accordance with the behavior specified. 283 * Update state to indicate current access pattern. 284 * 285 * object must have at least the shared lock held 286 */ 287static 288void 289vm_fault_is_sequential( 290 vm_object_t object, 291 vm_object_offset_t offset, 292 vm_behavior_t behavior) 293{ 294 vm_object_offset_t last_alloc; 295 int sequential; 296 int orig_sequential; 297 298 last_alloc = object->last_alloc; 299 sequential = object->sequential; 300 orig_sequential = sequential; 301 302 switch (behavior) { 303 case VM_BEHAVIOR_RANDOM: 304 /* 305 * reset indicator of sequential behavior 306 */ 307 sequential = 0; 308 break; 309 310 case VM_BEHAVIOR_SEQUENTIAL: 311 if (offset && last_alloc == offset - PAGE_SIZE_64) { 312 /* 313 * advance indicator of sequential behavior 314 */ 315 if (sequential < MAX_SEQUENTIAL_RUN) 316 sequential += PAGE_SIZE; 317 } else { 318 /* 319 * reset indicator of sequential behavior 320 */ 321 sequential = 0; 322 } 323 break; 324 325 case VM_BEHAVIOR_RSEQNTL: 326 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) { 327 /* 328 * advance indicator of sequential behavior 329 */ 330 if (sequential > -MAX_SEQUENTIAL_RUN) 331 sequential -= PAGE_SIZE; 332 } else { 333 /* 334 * reset indicator of sequential behavior 335 */ 336 sequential = 0; 337 } 338 break; 339 340 case VM_BEHAVIOR_DEFAULT: 341 default: 342 if (offset && last_alloc == (offset - PAGE_SIZE_64)) { 343 /* 344 * advance indicator of sequential behavior 345 */ 346 if (sequential < 0) 347 sequential = 0; 348 if (sequential < MAX_SEQUENTIAL_RUN) 349 sequential += PAGE_SIZE; 350 351 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) { 352 /* 353 * advance indicator of sequential behavior 354 */ 
355 if (sequential > 0) 356 sequential = 0; 357 if (sequential > -MAX_SEQUENTIAL_RUN) 358 sequential -= PAGE_SIZE; 359 } else { 360 /* 361 * reset indicator of sequential behavior 362 */ 363 sequential = 0; 364 } 365 break; 366 } 367 if (sequential != orig_sequential) { 368 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) { 369 /* 370 * if someone else has already updated object->sequential 371 * don't bother trying to update it or object->last_alloc 372 */ 373 return; 374 } 375 } 376 /* 377 * I'd like to do this with a OSCompareAndSwap64, but that 378 * doesn't exist for PPC... however, it shouldn't matter 379 * that much... last_alloc is maintained so that we can determine 380 * if a sequential access pattern is taking place... if only 381 * one thread is banging on this object, no problem with the unprotected 382 * update... if 2 or more threads are banging away, we run the risk of 383 * someone seeing a mangled update... however, in the face of multiple 384 * accesses, no sequential access pattern can develop anyway, so we 385 * haven't lost any real info. 386 */ 387 object->last_alloc = offset; 388} 389 390 391int vm_page_deactivate_behind_count = 0; 392 393/* 394 * vm_page_deactivate_behind 395 * 396 * Determine if sequential access is in progress 397 * in accordance with the behavior specified. If 398 * so, compute a potential page to deactivate and 399 * deactivate it. 400 * 401 * object must be locked. 
402 * 403 * return TRUE if we actually deactivate a page 404 */ 405static 406boolean_t 407vm_fault_deactivate_behind( 408 vm_object_t object, 409 vm_object_offset_t offset, 410 vm_behavior_t behavior) 411{ 412 int n; 413 int pages_in_run = 0; 414 int max_pages_in_run = 0; 415 int sequential_run; 416 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; 417 vm_object_offset_t run_offset = 0; 418 vm_object_offset_t pg_offset = 0; 419 vm_page_t m; 420 vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER]; 421 422 pages_in_run = 0; 423#if TRACEFAULTPAGE 424 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */ 425#endif 426 427 if (object == kernel_object || vm_page_deactivate_behind == FALSE) { 428 /* 429 * Do not deactivate pages from the kernel object: they 430 * are not intended to become pageable. 431 * or we've disabled the deactivate behind mechanism 432 */ 433 return FALSE; 434 } 435 if ((sequential_run = object->sequential)) { 436 if (sequential_run < 0) { 437 sequential_behavior = VM_BEHAVIOR_RSEQNTL; 438 sequential_run = 0 - sequential_run; 439 } else { 440 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; 441 } 442 } 443 switch (behavior) { 444 case VM_BEHAVIOR_RANDOM: 445 break; 446 case VM_BEHAVIOR_SEQUENTIAL: 447 if (sequential_run >= (int)PAGE_SIZE) { 448 run_offset = 0 - PAGE_SIZE_64; 449 max_pages_in_run = 1; 450 } 451 break; 452 case VM_BEHAVIOR_RSEQNTL: 453 if (sequential_run >= (int)PAGE_SIZE) { 454 run_offset = PAGE_SIZE_64; 455 max_pages_in_run = 1; 456 } 457 break; 458 case VM_BEHAVIOR_DEFAULT: 459 default: 460 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64; 461 462 /* 463 * determine if the run of sequential accesss has been 464 * long enough on an object with default access behavior 465 * to consider it for deactivation 466 */ 467 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) { 468 /* 469 * the comparisons 
between offset and behind are done 470 * in this kind of odd fashion in order to prevent wrap around 471 * at the end points 472 */ 473 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) { 474 if (offset >= behind) { 475 run_offset = 0 - behind; 476 pg_offset = PAGE_SIZE_64; 477 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; 478 } 479 } else { 480 if (offset < -behind) { 481 run_offset = behind; 482 pg_offset = 0 - PAGE_SIZE_64; 483 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; 484 } 485 } 486 } 487 break; 488 } 489 } 490 for (n = 0; n < max_pages_in_run; n++) { 491 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); 492 493 if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) { 494 page_run[pages_in_run++] = m; 495 pmap_clear_reference(m->phys_page); 496 } 497 } 498 if (pages_in_run) { 499 vm_page_lockspin_queues(); 500 501 for (n = 0; n < pages_in_run; n++) { 502 503 m = page_run[n]; 504 505 vm_page_deactivate_internal(m, FALSE); 506 507 vm_page_deactivate_behind_count++; 508#if TRACEFAULTPAGE 509 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ 510#endif 511 } 512 vm_page_unlock_queues(); 513 514 return TRUE; 515 } 516 return FALSE; 517} 518 519 520static int 521vm_page_throttled(void) 522{ 523 clock_sec_t elapsed_sec; 524 clock_sec_t tv_sec; 525 clock_usec_t tv_usec; 526 527 thread_t thread = current_thread(); 528 529 if (thread->options & TH_OPT_VMPRIV) 530 return (0); 531 532 thread->t_page_creation_count++; 533 534 if (NEED_TO_HARD_THROTTLE_THIS_TASK()) 535 return (HARD_THROTTLE_DELAY); 536 537 if (vm_page_free_count < vm_page_throttle_limit && 538 thread->t_page_creation_count > vm_page_creation_throttle) { 539 540 clock_get_system_microtime(&tv_sec, &tv_usec); 541 542 elapsed_sec = tv_sec - thread->t_page_creation_time; 543 544 if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) { 545 546 
if (elapsed_sec >= 60) { 547 /* 548 * we'll reset our stats to give a well behaved app 549 * that was unlucky enough to accumulate a bunch of pages 550 * over a long period of time a chance to get out of 551 * the throttled state... we reset the counter and timestamp 552 * so that if it stays under the rate limit for the next second 553 * it will be back in our good graces... if it exceeds it, it 554 * will remain in the throttled state 555 */ 556 thread->t_page_creation_time = tv_sec; 557 thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5; 558 } 559 ++vm_page_throttle_count; 560 561 return (SOFT_THROTTLE_DELAY); 562 } 563 thread->t_page_creation_time = tv_sec; 564 thread->t_page_creation_count = 0; 565 } 566 return (0); 567} 568 569 570/* 571 * check for various conditions that would 572 * prevent us from creating a ZF page... 573 * cleanup is based on being called from vm_fault_page 574 * 575 * object must be locked 576 * object == m->object 577 */ 578static vm_fault_return_t 579vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state) 580{ 581 int throttle_delay; 582 583 if (object->shadow_severed || 584 VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) { 585 /* 586 * Either: 587 * 1. the shadow chain was severed, 588 * 2. the purgeable object is volatile or empty and is marked 589 * to fault on access while volatile. 590 * Just have to return an error at this point 591 */ 592 if (m != VM_PAGE_NULL) 593 VM_PAGE_FREE(m); 594 vm_fault_cleanup(object, first_m); 595 596 thread_interrupt_level(interruptible_state); 597 598 return (VM_FAULT_MEMORY_ERROR); 599 } 600 if (vm_backing_store_low) { 601 /* 602 * are we protecting the system from 603 * backing store exhaustion. If so 604 * sleep unless we are privileged. 
605 */ 606 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { 607 608 if (m != VM_PAGE_NULL) 609 VM_PAGE_FREE(m); 610 vm_fault_cleanup(object, first_m); 611 612 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); 613 614 thread_block(THREAD_CONTINUE_NULL); 615 thread_interrupt_level(interruptible_state); 616 617 return (VM_FAULT_RETRY); 618 } 619 } 620 if ((throttle_delay = vm_page_throttled())) { 621 /* 622 * we're throttling zero-fills... 623 * treat this as if we couldn't grab a page 624 */ 625 if (m != VM_PAGE_NULL) 626 VM_PAGE_FREE(m); 627 vm_fault_cleanup(object, first_m); 628 629 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); 630 631 delay(throttle_delay); 632 633 if (current_thread_aborted()) { 634 thread_interrupt_level(interruptible_state); 635 return VM_FAULT_INTERRUPTED; 636 } 637 thread_interrupt_level(interruptible_state); 638 639 return (VM_FAULT_MEMORY_SHORTAGE); 640 } 641 return (VM_FAULT_SUCCESS); 642} 643 644 645/* 646 * do the work to zero fill a page and 647 * inject it into the correct paging queue 648 * 649 * m->object must be locked 650 * page queue lock must NOT be held 651 */ 652static int 653vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) 654{ 655 int my_fault = DBG_ZERO_FILL_FAULT; 656 657 /* 658 * This is is a zero-fill page fault... 659 * 660 * Checking the page lock is a waste of 661 * time; this page was absent, so 662 * it can't be page locked by a pager. 663 * 664 * we also consider it undefined 665 * with respect to instruction 666 * execution. i.e. it is the responsibility 667 * of higher layers to call for an instruction 668 * sync after changing the contents and before 669 * sending a program into this area. 
We 670 * choose this approach for performance 671 */ 672 m->pmapped = TRUE; 673 674 m->cs_validated = FALSE; 675 m->cs_tainted = FALSE; 676 677 if (no_zero_fill == TRUE) { 678 my_fault = DBG_NZF_PAGE_FAULT; 679 } else { 680 vm_page_zero_fill(m); 681 682 VM_STAT_INCR(zero_fill_count); 683 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); 684 } 685 assert(!m->laundry); 686 assert(m->object != kernel_object); 687 //assert(m->pageq.next == NULL && m->pageq.prev == NULL); 688 689 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && 690 (m->object->purgable == VM_PURGABLE_DENY || 691 m->object->purgable == VM_PURGABLE_NONVOLATILE || 692 m->object->purgable == VM_PURGABLE_VOLATILE )) { 693 694 vm_page_lockspin_queues(); 695 696 assert(!VM_PAGE_WIRED(m)); 697 698 /* 699 * can't be on the pageout queue since we don't 700 * have a pager to try and clean to 701 */ 702 assert(!m->pageout_queue); 703 704 VM_PAGE_QUEUES_REMOVE(m); 705 706 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); 707 m->throttled = TRUE; 708 vm_page_throttled_count++; 709 710 vm_page_unlock_queues(); 711 } 712 return (my_fault); 713} 714 715 716/* 717 * Routine: vm_fault_page 718 * Purpose: 719 * Find the resident page for the virtual memory 720 * specified by the given virtual memory object 721 * and offset. 722 * Additional arguments: 723 * The required permissions for the page is given 724 * in "fault_type". Desired permissions are included 725 * in "protection". 726 * fault_info is passed along to determine pagein cluster 727 * limits... it contains the expected reference pattern, 728 * cluster size if available, etc... 729 * 730 * If the desired page is known to be resident (for 731 * example, because it was previously wired down), asserting 732 * the "unwiring" parameter will speed the search. 733 * 734 * If the operation can be interrupted (by thread_abort 735 * or thread_terminate), then the "interruptible" 736 * parameter should be asserted. 
737 * 738 * Results: 739 * The page containing the proper data is returned 740 * in "result_page". 741 * 742 * In/out conditions: 743 * The source object must be locked and referenced, 744 * and must donate one paging reference. The reference 745 * is not affected. The paging reference and lock are 746 * consumed. 747 * 748 * If the call succeeds, the object in which "result_page" 749 * resides is left locked and holding a paging reference. 750 * If this is not the original object, a busy page in the 751 * original object is returned in "top_page", to prevent other 752 * callers from pursuing this same data, along with a paging 753 * reference for the original object. The "top_page" should 754 * be destroyed when this guarantee is no longer required. 755 * The "result_page" is also left busy. It is not removed 756 * from the pageout queues. 757 * Special Case: 758 * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the 759 * fault succeeded but there's no VM page (i.e. the VM object 760 * does not actually hold VM pages, but device memory or 761 * large pages). The object is still locked and we still hold a 762 * paging_in_progress reference. 763 */ 764unsigned int vm_fault_page_blocked_access = 0; 765unsigned int vm_fault_page_forced_retry = 0; 766 767vm_fault_return_t 768vm_fault_page( 769 /* Arguments: */ 770 vm_object_t first_object, /* Object to begin search */ 771 vm_object_offset_t first_offset, /* Offset into object */ 772 vm_prot_t fault_type, /* What access is requested */ 773 boolean_t must_be_resident,/* Must page be resident? */ 774 /* Modifies in place: */ 775 vm_prot_t *protection, /* Protection for mapping */ 776 /* Returns: */ 777 vm_page_t *result_page, /* Page found, if successful */ 778 vm_page_t *top_page, /* Page in top object, if 779 * not result_page. */ 780 int *type_of_fault, /* if non-null, fill in with type of fault 781 * COW, zero-fill, etc... 
returned in trace point */ 782 /* More arguments: */ 783 kern_return_t *error_code, /* code if page is in error */ 784 boolean_t no_zero_fill, /* don't zero fill absent pages */ 785#if MACH_PAGEMAP 786 boolean_t data_supply, /* treat as data_supply if 787 * it is a write fault and a full 788 * page is provided */ 789#else 790 __unused boolean_t data_supply, 791#endif 792 vm_object_fault_info_t fault_info) 793{ 794 vm_page_t m; 795 vm_object_t object; 796 vm_object_offset_t offset; 797 vm_page_t first_m; 798 vm_object_t next_object; 799 vm_object_t copy_object; 800 boolean_t look_for_page; 801 boolean_t force_fault_retry = FALSE; 802 vm_prot_t access_required = fault_type; 803 vm_prot_t wants_copy_flag; 804 CLUSTER_STAT(int pages_at_higher_offsets;) 805 CLUSTER_STAT(int pages_at_lower_offsets;) 806 kern_return_t wait_result; 807 boolean_t interruptible_state; 808 boolean_t data_already_requested = FALSE; 809 vm_behavior_t orig_behavior; 810 vm_size_t orig_cluster_size; 811 vm_fault_return_t error; 812 int my_fault; 813 uint32_t try_failed_count; 814 int interruptible; /* how may fault be interrupted? */ 815 memory_object_t pager; 816 vm_fault_return_t retval; 817 818/* 819 * MACH page map - an optional optimization where a bit map is maintained 820 * by the VM subsystem for internal objects to indicate which pages of 821 * the object currently reside on backing store. This existence map 822 * duplicates information maintained by the vnode pager. It is 823 * created at the time of the first pageout against the object, i.e. 824 * at the same time pager for the object is created. The optimization 825 * is designed to eliminate pager interaction overhead, if it is 826 * 'known' that the page does not exist on backing store. 827 * 828 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is 829 * either marked as paged out in the existence map for the object or no 830 * existence map exists for the object. 
MUST_ASK_PAGER() is one of the 831 * criteria in the decision to invoke the pager. It is also used as one 832 * of the criteria to terminate the scan for adjacent pages in a clustered 833 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for 834 * permanent objects. Note also that if the pager for an internal object 835 * has not been created, the pager is not invoked regardless of the value 836 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object 837 * for which a pager has been created. 838 * 839 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset 840 * is marked as paged out in the existence map for the object. PAGED_OUT() 841 * PAGED_OUT() is used to determine if a page has already been pushed 842 * into a copy object in order to avoid a redundant page out operation. 843 */ 844#if MACH_PAGEMAP 845#define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \ 846 != VM_EXTERNAL_STATE_ABSENT) 847#define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \ 848 == VM_EXTERNAL_STATE_EXISTS) 849#else 850#define MUST_ASK_PAGER(o, f) (TRUE) 851#define PAGED_OUT(o, f) (FALSE) 852#endif 853 854/* 855 * Recovery actions 856 */ 857#define RELEASE_PAGE(m) \ 858 MACRO_BEGIN \ 859 PAGE_WAKEUP_DONE(m); \ 860 if (!m->active && !m->inactive && !m->throttled) { \ 861 vm_page_lockspin_queues(); \ 862 if (!m->active && !m->inactive && !m->throttled) \ 863 vm_page_activate(m); \ 864 vm_page_unlock_queues(); \ 865 } \ 866 MACRO_END 867 868#if TRACEFAULTPAGE 869 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */ 870#endif 871 872 interruptible = fault_info->interruptible; 873 interruptible_state = thread_interrupt_level(interruptible); 874 875 /* 876 * INVARIANTS (through entire routine): 877 * 878 * 1) At all times, we must either have the object 879 * lock or a busy page in some object to prevent 880 * some other thread from trying to 
bring in 881 * the same page. 882 * 883 * Note that we cannot hold any locks during the 884 * pager access or when waiting for memory, so 885 * we use a busy page then. 886 * 887 * 2) To prevent another thread from racing us down the 888 * shadow chain and entering a new page in the top 889 * object before we do, we must keep a busy page in 890 * the top object while following the shadow chain. 891 * 892 * 3) We must increment paging_in_progress on any object 893 * for which we have a busy page before dropping 894 * the object lock 895 * 896 * 4) We leave busy pages on the pageout queues. 897 * If the pageout daemon comes across a busy page, 898 * it will remove the page from the pageout queues. 899 */ 900 901 object = first_object; 902 offset = first_offset; 903 first_m = VM_PAGE_NULL; 904 access_required = fault_type; 905 906 907 XPR(XPR_VM_FAULT, 908 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n", 909 object, offset, fault_type, *protection, 0); 910 911 /* 912 * default type of fault 913 */ 914 my_fault = DBG_CACHE_HIT_FAULT; 915 916 while (TRUE) { 917#if TRACEFAULTPAGE 918 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ 919#endif 920 if (!object->alive) { 921 /* 922 * object is no longer valid 923 * clean up and return error 924 */ 925 vm_fault_cleanup(object, first_m); 926 thread_interrupt_level(interruptible_state); 927 928 return (VM_FAULT_MEMORY_ERROR); 929 } 930 931 if (!object->pager_created && object->phys_contiguous) { 932 /* 933 * A physically-contiguous object without a pager: 934 * must be a "large page" object. We do not deal 935 * with VM pages for this object. 936 */ 937 m = VM_PAGE_NULL; 938 goto phys_contig_object; 939 } 940 941 if (object->blocked_access) { 942 /* 943 * Access to this VM object has been blocked. 944 * Replace our "paging_in_progress" reference with 945 * a "activity_in_progress" reference and wait for 946 * access to be unblocked. 
947 */ 948 vm_object_activity_begin(object); 949 vm_object_paging_end(object); 950 while (object->blocked_access) { 951 vm_object_sleep(object, 952 VM_OBJECT_EVENT_UNBLOCKED, 953 THREAD_UNINT); 954 } 955 vm_fault_page_blocked_access++; 956 vm_object_paging_begin(object); 957 vm_object_activity_end(object); 958 } 959 960 /* 961 * See whether the page at 'offset' is resident 962 */ 963 m = vm_page_lookup(object, offset); 964#if TRACEFAULTPAGE 965 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ 966#endif 967 if (m != VM_PAGE_NULL) { 968 969 if (m->busy) { 970 /* 971 * The page is being brought in, 972 * wait for it and then retry. 973 */ 974#if TRACEFAULTPAGE 975 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ 976#endif 977 wait_result = PAGE_SLEEP(object, m, interruptible); 978 979 XPR(XPR_VM_FAULT, 980 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n", 981 object, offset, 982 m, 0, 0); 983 counter(c_vm_fault_page_block_busy_kernel++); 984 985 if (wait_result != THREAD_AWAKENED) { 986 vm_fault_cleanup(object, first_m); 987 thread_interrupt_level(interruptible_state); 988 989 if (wait_result == THREAD_RESTART) 990 return (VM_FAULT_RETRY); 991 else 992 return (VM_FAULT_INTERRUPTED); 993 } 994 continue; 995 } 996 if (m->laundry) { 997 m->pageout = FALSE; 998 999 if (!m->cleaning) 1000 vm_pageout_steal_laundry(m, FALSE); 1001 } 1002 if (m->phys_page == vm_page_guard_addr) { 1003 /* 1004 * Guard page: off limits ! 1005 */ 1006 if (fault_type == VM_PROT_NONE) { 1007 /* 1008 * The fault is not requesting any 1009 * access to the guard page, so it must 1010 * be just to wire or unwire it. 1011 * Let's pretend it succeeded... 
1012 */ 1013 m->busy = TRUE; 1014 *result_page = m; 1015 assert(first_m == VM_PAGE_NULL); 1016 *top_page = first_m; 1017 if (type_of_fault) 1018 *type_of_fault = DBG_GUARD_FAULT; 1019 thread_interrupt_level(interruptible_state); 1020 return VM_FAULT_SUCCESS; 1021 } else { 1022 /* 1023 * The fault requests access to the 1024 * guard page: let's deny that ! 1025 */ 1026 vm_fault_cleanup(object, first_m); 1027 thread_interrupt_level(interruptible_state); 1028 return VM_FAULT_MEMORY_ERROR; 1029 } 1030 } 1031 1032 if (m->error) { 1033 /* 1034 * The page is in error, give up now. 1035 */ 1036#if TRACEFAULTPAGE 1037 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */ 1038#endif 1039 if (error_code) 1040 *error_code = KERN_MEMORY_ERROR; 1041 VM_PAGE_FREE(m); 1042 1043 vm_fault_cleanup(object, first_m); 1044 thread_interrupt_level(interruptible_state); 1045 1046 return (VM_FAULT_MEMORY_ERROR); 1047 } 1048 if (m->restart) { 1049 /* 1050 * The pager wants us to restart 1051 * at the top of the chain, 1052 * typically because it has moved the 1053 * page to another pager, then do so. 1054 */ 1055#if TRACEFAULTPAGE 1056 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ 1057#endif 1058 VM_PAGE_FREE(m); 1059 1060 vm_fault_cleanup(object, first_m); 1061 thread_interrupt_level(interruptible_state); 1062 1063 return (VM_FAULT_RETRY); 1064 } 1065 if (m->absent) { 1066 /* 1067 * The page isn't busy, but is absent, 1068 * therefore it's deemed "unavailable". 1069 * 1070 * Remove the non-existent page (unless it's 1071 * in the top object) and move on down to the 1072 * next object (if there is one). 
1073 */ 1074#if TRACEFAULTPAGE 1075 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */ 1076#endif 1077 next_object = object->shadow; 1078 1079 if (next_object == VM_OBJECT_NULL) { 1080 /* 1081 * Absent page at bottom of shadow 1082 * chain; zero fill the page we left 1083 * busy in the first object, and free 1084 * the absent page. 1085 */ 1086 assert(!must_be_resident); 1087 1088 /* 1089 * check for any conditions that prevent 1090 * us from creating a new zero-fill page 1091 * vm_fault_check will do all of the 1092 * fault cleanup in the case of an error condition 1093 * including resetting the thread_interrupt_level 1094 */ 1095 error = vm_fault_check(object, m, first_m, interruptible_state); 1096 1097 if (error != VM_FAULT_SUCCESS) 1098 return (error); 1099 1100 XPR(XPR_VM_FAULT, 1101 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n", 1102 object, offset, 1103 m, 1104 first_object, 0); 1105 1106 if (object != first_object) { 1107 /* 1108 * free the absent page we just found 1109 */ 1110 VM_PAGE_FREE(m); 1111 1112 /* 1113 * drop reference and lock on current object 1114 */ 1115 vm_object_paging_end(object); 1116 vm_object_unlock(object); 1117 1118 /* 1119 * grab the original page we 1120 * 'soldered' in place and 1121 * retake lock on 'first_object' 1122 */ 1123 m = first_m; 1124 first_m = VM_PAGE_NULL; 1125 1126 object = first_object; 1127 offset = first_offset; 1128 1129 vm_object_lock(object); 1130 } else { 1131 /* 1132 * we're going to use the absent page we just found 1133 * so convert it to a 'busy' page 1134 */ 1135 m->absent = FALSE; 1136 m->busy = TRUE; 1137 } 1138 /* 1139 * zero-fill the page and put it on 1140 * the correct paging queue 1141 */ 1142 my_fault = vm_fault_zero_page(m, no_zero_fill); 1143 1144 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) 1145 m->absent = TRUE; 1146 1147 break; 1148 } else { 1149 if (must_be_resident) 1150 vm_object_paging_end(object); 1151 else if (object 
!= first_object) { 1152 vm_object_paging_end(object); 1153 VM_PAGE_FREE(m); 1154 } else { 1155 first_m = m; 1156 m->absent = FALSE; 1157 m->busy = TRUE; 1158 1159 vm_page_lockspin_queues(); 1160 1161 assert(!m->pageout_queue); 1162 VM_PAGE_QUEUES_REMOVE(m); 1163 1164 vm_page_unlock_queues(); 1165 } 1166 XPR(XPR_VM_FAULT, 1167 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n", 1168 object, offset, 1169 next_object, 1170 offset+object->vo_shadow_offset,0); 1171 1172 offset += object->vo_shadow_offset; 1173 fault_info->lo_offset += object->vo_shadow_offset; 1174 fault_info->hi_offset += object->vo_shadow_offset; 1175 access_required = VM_PROT_READ; 1176 1177 vm_object_lock(next_object); 1178 vm_object_unlock(object); 1179 object = next_object; 1180 vm_object_paging_begin(object); 1181 1182 /* 1183 * reset to default type of fault 1184 */ 1185 my_fault = DBG_CACHE_HIT_FAULT; 1186 1187 continue; 1188 } 1189 } 1190 if ((m->cleaning) 1191 && ((object != first_object) || (object->copy != VM_OBJECT_NULL)) 1192 && (fault_type & VM_PROT_WRITE)) { 1193 /* 1194 * This is a copy-on-write fault that will 1195 * cause us to revoke access to this page, but 1196 * this page is in the process of being cleaned 1197 * in a clustered pageout. We must wait until 1198 * the cleaning operation completes before 1199 * revoking access to the original page, 1200 * otherwise we might attempt to remove a 1201 * wired mapping. 
1202 */ 1203#if TRACEFAULTPAGE 1204 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */ 1205#endif 1206 XPR(XPR_VM_FAULT, 1207 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n", 1208 object, offset, 1209 m, 0, 0); 1210 /* 1211 * take an extra ref so that object won't die 1212 */ 1213 vm_object_reference_locked(object); 1214 1215 vm_fault_cleanup(object, first_m); 1216 1217 counter(c_vm_fault_page_block_backoff_kernel++); 1218 vm_object_lock(object); 1219 assert(object->ref_count > 0); 1220 1221 m = vm_page_lookup(object, offset); 1222 1223 if (m != VM_PAGE_NULL && m->cleaning) { 1224 PAGE_ASSERT_WAIT(m, interruptible); 1225 1226 vm_object_unlock(object); 1227 wait_result = thread_block(THREAD_CONTINUE_NULL); 1228 vm_object_deallocate(object); 1229 1230 goto backoff; 1231 } else { 1232 vm_object_unlock(object); 1233 1234 vm_object_deallocate(object); 1235 thread_interrupt_level(interruptible_state); 1236 1237 return (VM_FAULT_RETRY); 1238 } 1239 } 1240 if (type_of_fault == NULL && m->speculative && 1241 !(fault_info != NULL && fault_info->stealth)) { 1242 /* 1243 * If we were passed a non-NULL pointer for 1244 * "type_of_fault", than we came from 1245 * vm_fault... we'll let it deal with 1246 * this condition, since it 1247 * needs to see m->speculative to correctly 1248 * account the pageins, otherwise... 1249 * take it off the speculative queue, we'll 1250 * let the caller of vm_fault_page deal 1251 * with getting it onto the correct queue 1252 * 1253 * If the caller specified in fault_info that 1254 * it wants a "stealth" fault, we also leave 1255 * the page in the speculative queue. 1256 */ 1257 vm_page_lockspin_queues(); 1258 if (m->speculative) 1259 VM_PAGE_QUEUES_REMOVE(m); 1260 vm_page_unlock_queues(); 1261 } 1262 1263 if (m->encrypted) { 1264 /* 1265 * ENCRYPTED SWAP: 1266 * the user needs access to a page that we 1267 * encrypted before paging it out. 1268 * Decrypt the page now. 
1269 * Keep it busy to prevent anyone from 1270 * accessing it during the decryption. 1271 */ 1272 m->busy = TRUE; 1273 vm_page_decrypt(m, 0); 1274 assert(object == m->object); 1275 assert(m->busy); 1276 PAGE_WAKEUP_DONE(m); 1277 1278 /* 1279 * Retry from the top, in case 1280 * something changed while we were 1281 * decrypting. 1282 */ 1283 continue; 1284 } 1285 ASSERT_PAGE_DECRYPTED(m); 1286 1287 if (m->object->code_signed) { 1288 /* 1289 * CODE SIGNING: 1290 * We just paged in a page from a signed 1291 * memory object but we don't need to 1292 * validate it now. We'll validate it if 1293 * when it gets mapped into a user address 1294 * space for the first time or when the page 1295 * gets copied to another object as a result 1296 * of a copy-on-write. 1297 */ 1298 } 1299 1300 /* 1301 * We mark the page busy and leave it on 1302 * the pageout queues. If the pageout 1303 * deamon comes across it, then it will 1304 * remove the page from the queue, but not the object 1305 */ 1306#if TRACEFAULTPAGE 1307 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ 1308#endif 1309 XPR(XPR_VM_FAULT, 1310 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n", 1311 object, offset, m, 0, 0); 1312 assert(!m->busy); 1313 assert(!m->absent); 1314 1315 m->busy = TRUE; 1316 break; 1317 } 1318 1319 1320 /* 1321 * we get here when there is no page present in the object at 1322 * the offset we're interested in... we'll allocate a page 1323 * at this point if the pager associated with 1324 * this object can provide the data or we're the top object... 
1325 * object is locked; m == NULL 1326 */ 1327 if (must_be_resident) 1328 goto dont_look_for_page; 1329 1330 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply); 1331 1332#if TRACEFAULTPAGE 1333 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */ 1334#endif 1335 if (!look_for_page && object == first_object && !object->phys_contiguous) { 1336 /* 1337 * Allocate a new page for this object/offset pair as a placeholder 1338 */ 1339 m = vm_page_grab(); 1340#if TRACEFAULTPAGE 1341 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ 1342#endif 1343 if (m == VM_PAGE_NULL) { 1344 1345 vm_fault_cleanup(object, first_m); 1346 thread_interrupt_level(interruptible_state); 1347 1348 return (VM_FAULT_MEMORY_SHORTAGE); 1349 } 1350 1351 if (fault_info && fault_info->batch_pmap_op == TRUE) { 1352 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE); 1353 } else { 1354 vm_page_insert(m, object, offset); 1355 } 1356 } 1357 if (look_for_page) { 1358 kern_return_t rc; 1359 1360 /* 1361 * If the memory manager is not ready, we 1362 * cannot make requests. 
1363 */ 1364 if (!object->pager_ready) { 1365#if TRACEFAULTPAGE 1366 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ 1367#endif 1368 if (m != VM_PAGE_NULL) 1369 VM_PAGE_FREE(m); 1370 1371 XPR(XPR_VM_FAULT, 1372 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n", 1373 object, offset, 0, 0, 0); 1374 1375 /* 1376 * take an extra ref so object won't die 1377 */ 1378 vm_object_reference_locked(object); 1379 vm_fault_cleanup(object, first_m); 1380 counter(c_vm_fault_page_block_backoff_kernel++); 1381 1382 vm_object_lock(object); 1383 assert(object->ref_count > 0); 1384 1385 if (!object->pager_ready) { 1386 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible); 1387 1388 vm_object_unlock(object); 1389 if (wait_result == THREAD_WAITING) 1390 wait_result = thread_block(THREAD_CONTINUE_NULL); 1391 vm_object_deallocate(object); 1392 1393 goto backoff; 1394 } else { 1395 vm_object_unlock(object); 1396 vm_object_deallocate(object); 1397 thread_interrupt_level(interruptible_state); 1398 1399 return (VM_FAULT_RETRY); 1400 } 1401 } 1402 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) { 1403 /* 1404 * If there are too many outstanding page 1405 * requests pending on this external object, we 1406 * wait for them to be resolved now. 
1407 */ 1408#if TRACEFAULTPAGE 1409 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ 1410#endif 1411 if (m != VM_PAGE_NULL) 1412 VM_PAGE_FREE(m); 1413 /* 1414 * take an extra ref so object won't die 1415 */ 1416 vm_object_reference_locked(object); 1417 1418 vm_fault_cleanup(object, first_m); 1419 1420 counter(c_vm_fault_page_block_backoff_kernel++); 1421 1422 vm_object_lock(object); 1423 assert(object->ref_count > 0); 1424 1425 if (object->paging_in_progress >= vm_object_pagein_throttle) { 1426 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible); 1427 1428 vm_object_unlock(object); 1429 wait_result = thread_block(THREAD_CONTINUE_NULL); 1430 vm_object_deallocate(object); 1431 1432 goto backoff; 1433 } else { 1434 vm_object_unlock(object); 1435 vm_object_deallocate(object); 1436 thread_interrupt_level(interruptible_state); 1437 1438 return (VM_FAULT_RETRY); 1439 } 1440 } 1441 if (m != VM_PAGE_NULL) { 1442 VM_PAGE_FREE(m); 1443 m = VM_PAGE_NULL; 1444 } 1445 1446#if TRACEFAULTPAGE 1447 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */ 1448#endif 1449 1450 /* 1451 * It's possible someone called vm_object_destroy while we weren't 1452 * holding the object lock. If that has happened, then bail out 1453 * here. 1454 */ 1455 1456 pager = object->pager; 1457 1458 if (pager == MEMORY_OBJECT_NULL) { 1459 vm_fault_cleanup(object, first_m); 1460 thread_interrupt_level(interruptible_state); 1461 return VM_FAULT_MEMORY_ERROR; 1462 } 1463 1464 /* 1465 * We have an absent page in place for the faulting offset, 1466 * so we can release the object lock. 
1467 */ 1468 1469 vm_object_unlock(object); 1470 1471 /* 1472 * If this object uses a copy_call strategy, 1473 * and we are interested in a copy of this object 1474 * (having gotten here only by following a 1475 * shadow chain), then tell the memory manager 1476 * via a flag added to the desired_access 1477 * parameter, so that it can detect a race 1478 * between our walking down the shadow chain 1479 * and its pushing pages up into a copy of 1480 * the object that it manages. 1481 */ 1482 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) 1483 wants_copy_flag = VM_PROT_WANTS_COPY; 1484 else 1485 wants_copy_flag = VM_PROT_NONE; 1486 1487 XPR(XPR_VM_FAULT, 1488 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n", 1489 object, offset, m, 1490 access_required | wants_copy_flag, 0); 1491 1492 if (object->copy == first_object) { 1493 /* 1494 * if we issue the memory_object_data_request in 1495 * this state, we are subject to a deadlock with 1496 * the underlying filesystem if it is trying to 1497 * shrink the file resulting in a push of pages 1498 * into the copy object... that push will stall 1499 * on the placeholder page, and if the pushing thread 1500 * is holding a lock that is required on the pagein 1501 * path (such as a truncate lock), we'll deadlock... 1502 * to avoid this potential deadlock, we throw away 1503 * our placeholder page before calling memory_object_data_request 1504 * and force this thread to retry the vm_fault_page after 1505 * we have issued the I/O. 
the second time through this path 1506 * we will find the page already in the cache (presumably still 1507 * busy waiting for the I/O to complete) and then complete 1508 * the fault w/o having to go through memory_object_data_request again 1509 */ 1510 assert(first_m != VM_PAGE_NULL); 1511 assert(first_m->object == first_object); 1512 1513 vm_object_lock(first_object); 1514 VM_PAGE_FREE(first_m); 1515 vm_object_paging_end(first_object); 1516 vm_object_unlock(first_object); 1517 1518 first_m = VM_PAGE_NULL; 1519 force_fault_retry = TRUE; 1520 1521 vm_fault_page_forced_retry++; 1522 } 1523 1524 if (data_already_requested == TRUE) { 1525 orig_behavior = fault_info->behavior; 1526 orig_cluster_size = fault_info->cluster_size; 1527 1528 fault_info->behavior = VM_BEHAVIOR_RANDOM; 1529 fault_info->cluster_size = PAGE_SIZE; 1530 } 1531 /* 1532 * Call the memory manager to retrieve the data. 1533 */ 1534 rc = memory_object_data_request( 1535 pager, 1536 offset + object->paging_offset, 1537 PAGE_SIZE, 1538 access_required | wants_copy_flag, 1539 (memory_object_fault_info_t)fault_info); 1540 1541 if (data_already_requested == TRUE) { 1542 fault_info->behavior = orig_behavior; 1543 fault_info->cluster_size = orig_cluster_size; 1544 } else 1545 data_already_requested = TRUE; 1546 1547#if TRACEFAULTPAGE 1548 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */ 1549#endif 1550 vm_object_lock(object); 1551 1552 if (rc != KERN_SUCCESS) { 1553 1554 vm_fault_cleanup(object, first_m); 1555 thread_interrupt_level(interruptible_state); 1556 1557 return ((rc == MACH_SEND_INTERRUPTED) ? 
1558 VM_FAULT_INTERRUPTED : 1559 VM_FAULT_MEMORY_ERROR); 1560 } else { 1561 clock_sec_t tv_sec; 1562 clock_usec_t tv_usec; 1563 1564 clock_get_system_microtime(&tv_sec, &tv_usec); 1565 current_thread()->t_page_creation_time = tv_sec; 1566 current_thread()->t_page_creation_count = 0; 1567 } 1568 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) { 1569 1570 vm_fault_cleanup(object, first_m); 1571 thread_interrupt_level(interruptible_state); 1572 1573 return (VM_FAULT_INTERRUPTED); 1574 } 1575 if (force_fault_retry == TRUE) { 1576 1577 vm_fault_cleanup(object, first_m); 1578 thread_interrupt_level(interruptible_state); 1579 1580 return (VM_FAULT_RETRY); 1581 } 1582 if (m == VM_PAGE_NULL && object->phys_contiguous) { 1583 /* 1584 * No page here means that the object we 1585 * initially looked up was "physically 1586 * contiguous" (i.e. device memory). However, 1587 * with Virtual VRAM, the object might not 1588 * be backed by that device memory anymore, 1589 * so we're done here only if the object is 1590 * still "phys_contiguous". 1591 * Otherwise, if the object is no longer 1592 * "phys_contiguous", we need to retry the 1593 * page fault against the object's new backing 1594 * store (different memory object). 1595 */ 1596 phys_contig_object: 1597 goto done; 1598 } 1599 /* 1600 * potentially a pagein fault 1601 * if we make it through the state checks 1602 * above, than we'll count it as such 1603 */ 1604 my_fault = DBG_PAGEIN_FAULT; 1605 1606 /* 1607 * Retry with same object/offset, since new data may 1608 * be in a different page (i.e., m is meaningless at 1609 * this point). 1610 */ 1611 continue; 1612 } 1613dont_look_for_page: 1614 /* 1615 * We get here if the object has no pager, or an existence map 1616 * exists and indicates the page isn't present on the pager 1617 * or we're unwiring a page. 
If a pager exists, but there 1618 * is no existence map, then the m->absent case above handles 1619 * the ZF case when the pager can't provide the page 1620 */ 1621#if TRACEFAULTPAGE 1622 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ 1623#endif 1624 if (object == first_object) 1625 first_m = m; 1626 else 1627 assert(m == VM_PAGE_NULL); 1628 1629 XPR(XPR_VM_FAULT, 1630 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n", 1631 object, offset, m, 1632 object->shadow, 0); 1633 1634 next_object = object->shadow; 1635 1636 if (next_object == VM_OBJECT_NULL) { 1637 /* 1638 * we've hit the bottom of the shadown chain, 1639 * fill the page in the top object with zeros. 1640 */ 1641 assert(!must_be_resident); 1642 1643 if (object != first_object) { 1644 vm_object_paging_end(object); 1645 vm_object_unlock(object); 1646 1647 object = first_object; 1648 offset = first_offset; 1649 vm_object_lock(object); 1650 } 1651 m = first_m; 1652 assert(m->object == object); 1653 first_m = VM_PAGE_NULL; 1654 1655 /* 1656 * check for any conditions that prevent 1657 * us from creating a new zero-fill page 1658 * vm_fault_check will do all of the 1659 * fault cleanup in the case of an error condition 1660 * including resetting the thread_interrupt_level 1661 */ 1662 error = vm_fault_check(object, m, first_m, interruptible_state); 1663 1664 if (error != VM_FAULT_SUCCESS) 1665 return (error); 1666 1667 if (m == VM_PAGE_NULL) { 1668 m = vm_page_grab(); 1669 1670 if (m == VM_PAGE_NULL) { 1671 vm_fault_cleanup(object, VM_PAGE_NULL); 1672 thread_interrupt_level(interruptible_state); 1673 1674 return (VM_FAULT_MEMORY_SHORTAGE); 1675 } 1676 vm_page_insert(m, object, offset); 1677 } 1678 my_fault = vm_fault_zero_page(m, no_zero_fill); 1679 1680 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) 1681 m->absent = TRUE; 1682 break; 1683 1684 } else { 1685 /* 1686 * Move on to the next object. 
Lock the next 1687 * object before unlocking the current one. 1688 */ 1689 if ((object != first_object) || must_be_resident) 1690 vm_object_paging_end(object); 1691 1692 offset += object->vo_shadow_offset; 1693 fault_info->lo_offset += object->vo_shadow_offset; 1694 fault_info->hi_offset += object->vo_shadow_offset; 1695 access_required = VM_PROT_READ; 1696 1697 vm_object_lock(next_object); 1698 vm_object_unlock(object); 1699 1700 object = next_object; 1701 vm_object_paging_begin(object); 1702 } 1703 } 1704 1705 /* 1706 * PAGE HAS BEEN FOUND. 1707 * 1708 * This page (m) is: 1709 * busy, so that we can play with it; 1710 * not absent, so that nobody else will fill it; 1711 * possibly eligible for pageout; 1712 * 1713 * The top-level page (first_m) is: 1714 * VM_PAGE_NULL if the page was found in the 1715 * top-level object; 1716 * busy, not absent, and ineligible for pageout. 1717 * 1718 * The current object (object) is locked. A paging 1719 * reference is held for the current and top-level 1720 * objects. 1721 */ 1722 1723#if TRACEFAULTPAGE 1724 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ 1725#endif 1726#if EXTRA_ASSERTIONS 1727 assert(m->busy && !m->absent); 1728 assert((first_m == VM_PAGE_NULL) || 1729 (first_m->busy && !first_m->absent && 1730 !first_m->active && !first_m->inactive)); 1731#endif /* EXTRA_ASSERTIONS */ 1732 1733 /* 1734 * ENCRYPTED SWAP: 1735 * If we found a page, we must have decrypted it before we 1736 * get here... 1737 */ 1738 ASSERT_PAGE_DECRYPTED(m); 1739 1740 XPR(XPR_VM_FAULT, 1741 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n", 1742 object, offset, m, 1743 first_object, first_m); 1744 1745 /* 1746 * If the page is being written, but isn't 1747 * already owned by the top-level object, 1748 * we have to copy it into a new page owned 1749 * by the top-level object. 
1750 */ 1751 if (object != first_object) { 1752 1753#if TRACEFAULTPAGE 1754 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */ 1755#endif 1756 if (fault_type & VM_PROT_WRITE) { 1757 vm_page_t copy_m; 1758 1759 /* 1760 * We only really need to copy if we 1761 * want to write it. 1762 */ 1763 assert(!must_be_resident); 1764 1765 /* 1766 * are we protecting the system from 1767 * backing store exhaustion. If so 1768 * sleep unless we are privileged. 1769 */ 1770 if (vm_backing_store_low) { 1771 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { 1772 1773 RELEASE_PAGE(m); 1774 vm_fault_cleanup(object, first_m); 1775 1776 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); 1777 1778 thread_block(THREAD_CONTINUE_NULL); 1779 thread_interrupt_level(interruptible_state); 1780 1781 return (VM_FAULT_RETRY); 1782 } 1783 } 1784 /* 1785 * If we try to collapse first_object at this 1786 * point, we may deadlock when we try to get 1787 * the lock on an intermediate object (since we 1788 * have the bottom object locked). We can't 1789 * unlock the bottom object, because the page 1790 * we found may move (by collapse) if we do. 1791 * 1792 * Instead, we first copy the page. Then, when 1793 * we have no more use for the bottom object, 1794 * we unlock it and try to collapse. 1795 * 1796 * Note that we copy the page even if we didn't 1797 * need to... that's the breaks. 
1798 */ 1799 1800 /* 1801 * Allocate a page for the copy 1802 */ 1803 copy_m = vm_page_grab(); 1804 1805 if (copy_m == VM_PAGE_NULL) { 1806 RELEASE_PAGE(m); 1807 1808 vm_fault_cleanup(object, first_m); 1809 thread_interrupt_level(interruptible_state); 1810 1811 return (VM_FAULT_MEMORY_SHORTAGE); 1812 } 1813 XPR(XPR_VM_FAULT, 1814 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n", 1815 object, offset, 1816 m, copy_m, 0); 1817 1818 vm_page_copy(m, copy_m); 1819 1820 /* 1821 * If another map is truly sharing this 1822 * page with us, we have to flush all 1823 * uses of the original page, since we 1824 * can't distinguish those which want the 1825 * original from those which need the 1826 * new copy. 1827 * 1828 * XXXO If we know that only one map has 1829 * access to this page, then we could 1830 * avoid the pmap_disconnect() call. 1831 */ 1832 if (m->pmapped) 1833 pmap_disconnect(m->phys_page); 1834 1835 assert(!m->cleaning); 1836 1837 /* 1838 * We no longer need the old page or object. 1839 */ 1840 PAGE_WAKEUP_DONE(m); 1841 vm_object_paging_end(object); 1842 vm_object_unlock(object); 1843 1844 my_fault = DBG_COW_FAULT; 1845 VM_STAT_INCR(cow_faults); 1846 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); 1847 current_task()->cow_faults++; 1848 1849 object = first_object; 1850 offset = first_offset; 1851 1852 vm_object_lock(object); 1853 /* 1854 * get rid of the place holder 1855 * page that we soldered in earlier 1856 */ 1857 VM_PAGE_FREE(first_m); 1858 first_m = VM_PAGE_NULL; 1859 1860 /* 1861 * and replace it with the 1862 * page we just copied into 1863 */ 1864 assert(copy_m->busy); 1865 vm_page_insert(copy_m, object, offset); 1866 SET_PAGE_DIRTY(copy_m, TRUE); 1867 1868 m = copy_m; 1869 /* 1870 * Now that we've gotten the copy out of the 1871 * way, let's try to collapse the top object. 1872 * But we have to play ugly games with 1873 * paging_in_progress to do that... 
1874 */ 1875 vm_object_paging_end(object); 1876 vm_object_collapse(object, offset, TRUE); 1877 vm_object_paging_begin(object); 1878 1879 } else 1880 *protection &= (~VM_PROT_WRITE); 1881 } 1882 /* 1883 * Now check whether the page needs to be pushed into the 1884 * copy object. The use of asymmetric copy on write for 1885 * shared temporary objects means that we may do two copies to 1886 * satisfy the fault; one above to get the page from a 1887 * shadowed object, and one here to push it into the copy. 1888 */ 1889 try_failed_count = 0; 1890 1891 while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { 1892 vm_object_offset_t copy_offset; 1893 vm_page_t copy_m; 1894 1895#if TRACEFAULTPAGE 1896 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */ 1897#endif 1898 /* 1899 * If the page is being written, but hasn't been 1900 * copied to the copy-object, we have to copy it there. 1901 */ 1902 if ((fault_type & VM_PROT_WRITE) == 0) { 1903 *protection &= ~VM_PROT_WRITE; 1904 break; 1905 } 1906 1907 /* 1908 * If the page was guaranteed to be resident, 1909 * we must have already performed the copy. 1910 */ 1911 if (must_be_resident) 1912 break; 1913 1914 /* 1915 * Try to get the lock on the copy_object. 1916 */ 1917 if (!vm_object_lock_try(copy_object)) { 1918 1919 vm_object_unlock(object); 1920 try_failed_count++; 1921 1922 mutex_pause(try_failed_count); /* wait a bit */ 1923 vm_object_lock(object); 1924 1925 continue; 1926 } 1927 try_failed_count = 0; 1928 1929 /* 1930 * Make another reference to the copy-object, 1931 * to keep it from disappearing during the 1932 * copy. 1933 */ 1934 vm_object_reference_locked(copy_object); 1935 1936 /* 1937 * Does the page exist in the copy? 1938 */ 1939 copy_offset = first_offset - copy_object->vo_shadow_offset; 1940 1941 if (copy_object->vo_size <= copy_offset) 1942 /* 1943 * Copy object doesn't cover this page -- do nothing. 
1944 */ 1945 ; 1946 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) { 1947 /* 1948 * Page currently exists in the copy object 1949 */ 1950 if (copy_m->busy) { 1951 /* 1952 * If the page is being brought 1953 * in, wait for it and then retry. 1954 */ 1955 RELEASE_PAGE(m); 1956 1957 /* 1958 * take an extra ref so object won't die 1959 */ 1960 vm_object_reference_locked(copy_object); 1961 vm_object_unlock(copy_object); 1962 vm_fault_cleanup(object, first_m); 1963 counter(c_vm_fault_page_block_backoff_kernel++); 1964 1965 vm_object_lock(copy_object); 1966 assert(copy_object->ref_count > 0); 1967 VM_OBJ_RES_DECR(copy_object); 1968 vm_object_lock_assert_exclusive(copy_object); 1969 copy_object->ref_count--; 1970 assert(copy_object->ref_count > 0); 1971 copy_m = vm_page_lookup(copy_object, copy_offset); 1972 /* 1973 * ENCRYPTED SWAP: 1974 * it's OK if the "copy_m" page is encrypted, 1975 * because we're not moving it nor handling its 1976 * contents. 1977 */ 1978 if (copy_m != VM_PAGE_NULL && copy_m->busy) { 1979 PAGE_ASSERT_WAIT(copy_m, interruptible); 1980 1981 vm_object_unlock(copy_object); 1982 wait_result = thread_block(THREAD_CONTINUE_NULL); 1983 vm_object_deallocate(copy_object); 1984 1985 goto backoff; 1986 } else { 1987 vm_object_unlock(copy_object); 1988 vm_object_deallocate(copy_object); 1989 thread_interrupt_level(interruptible_state); 1990 1991 return (VM_FAULT_RETRY); 1992 } 1993 } 1994 } 1995 else if (!PAGED_OUT(copy_object, copy_offset)) { 1996 /* 1997 * If PAGED_OUT is TRUE, then the page used to exist 1998 * in the copy-object, and has already been paged out. 1999 * We don't need to repeat this. If PAGED_OUT is 2000 * FALSE, then either we don't know (!pager_created, 2001 * for example) or it hasn't been paged out. 2002 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT) 2003 * We must copy the page to the copy object. 
2004 */ 2005 2006 if (vm_backing_store_low) { 2007 /* 2008 * we are protecting the system from 2009 * backing store exhaustion. If so 2010 * sleep unless we are privileged. 2011 */ 2012 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { 2013 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); 2014 2015 RELEASE_PAGE(m); 2016 VM_OBJ_RES_DECR(copy_object); 2017 vm_object_lock_assert_exclusive(copy_object); 2018 copy_object->ref_count--; 2019 assert(copy_object->ref_count > 0); 2020 2021 vm_object_unlock(copy_object); 2022 vm_fault_cleanup(object, first_m); 2023 thread_block(THREAD_CONTINUE_NULL); 2024 thread_interrupt_level(interruptible_state); 2025 2026 return (VM_FAULT_RETRY); 2027 } 2028 } 2029 /* 2030 * Allocate a page for the copy 2031 */ 2032 copy_m = vm_page_alloc(copy_object, copy_offset); 2033 2034 if (copy_m == VM_PAGE_NULL) { 2035 RELEASE_PAGE(m); 2036 2037 VM_OBJ_RES_DECR(copy_object); 2038 vm_object_lock_assert_exclusive(copy_object); 2039 copy_object->ref_count--; 2040 assert(copy_object->ref_count > 0); 2041 2042 vm_object_unlock(copy_object); 2043 vm_fault_cleanup(object, first_m); 2044 thread_interrupt_level(interruptible_state); 2045 2046 return (VM_FAULT_MEMORY_SHORTAGE); 2047 } 2048 /* 2049 * Must copy page into copy-object. 2050 */ 2051 vm_page_copy(m, copy_m); 2052 2053 /* 2054 * If the old page was in use by any users 2055 * of the copy-object, it must be removed 2056 * from all pmaps. (We can't know which 2057 * pmaps use it.) 2058 */ 2059 if (m->pmapped) 2060 pmap_disconnect(m->phys_page); 2061 2062 /* 2063 * If there's a pager, then immediately 2064 * page out this page, using the "initialize" 2065 * option. Else, we use the copy. 
2066 */ 2067 if ((!copy_object->pager_created) 2068#if MACH_PAGEMAP 2069 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT 2070#endif 2071 ) { 2072 2073 vm_page_lockspin_queues(); 2074 assert(!m->cleaning); 2075 vm_page_activate(copy_m); 2076 vm_page_unlock_queues(); 2077 2078 SET_PAGE_DIRTY(copy_m, TRUE); 2079 PAGE_WAKEUP_DONE(copy_m); 2080 2081 } else if (copy_object->internal) { 2082 /* 2083 * For internal objects check with the pager to see 2084 * if the page already exists in the backing store. 2085 * If yes, then we can drop the copy page. If not, 2086 * then we'll activate it, mark it dirty and keep it 2087 * around. 2088 */ 2089 2090 kern_return_t kr = KERN_SUCCESS; 2091 2092 memory_object_t copy_pager = copy_object->pager; 2093 assert(copy_pager != MEMORY_OBJECT_NULL); 2094 vm_object_paging_begin(copy_object); 2095 2096 vm_object_unlock(copy_object); 2097 2098 kr = memory_object_data_request( 2099 copy_pager, 2100 copy_offset + copy_object->paging_offset, 2101 0, /* Only query the pager. */ 2102 VM_PROT_READ, 2103 NULL); 2104 2105 vm_object_lock(copy_object); 2106 2107 vm_object_paging_end(copy_object); 2108 2109 /* 2110 * Since we dropped the copy_object's lock, 2111 * check whether we'll have to deallocate 2112 * the hard way. 2113 */ 2114 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { 2115 vm_object_unlock(copy_object); 2116 vm_object_deallocate(copy_object); 2117 vm_object_lock(object); 2118 2119 continue; 2120 } 2121 if (kr == KERN_SUCCESS) { 2122 /* 2123 * The pager has the page. We don't want to overwrite 2124 * that page by sending this one out to the backing store. 2125 * So we drop the copy page. 2126 */ 2127 VM_PAGE_FREE(copy_m); 2128 2129 } else { 2130 /* 2131 * The pager doesn't have the page. We'll keep this one 2132 * around in the copy object. It might get sent out to 2133 * the backing store under memory pressure. 
2134 */ 2135 vm_page_lockspin_queues(); 2136 assert(!m->cleaning); 2137 vm_page_activate(copy_m); 2138 vm_page_unlock_queues(); 2139 2140 SET_PAGE_DIRTY(copy_m, TRUE); 2141 PAGE_WAKEUP_DONE(copy_m); 2142 } 2143 } else { 2144 2145 assert(copy_m->busy == TRUE); 2146 assert(!m->cleaning); 2147 2148 /* 2149 * dirty is protected by the object lock 2150 */ 2151 SET_PAGE_DIRTY(copy_m, TRUE); 2152 2153 /* 2154 * The page is already ready for pageout: 2155 * not on pageout queues and busy. 2156 * Unlock everything except the 2157 * copy_object itself. 2158 */ 2159 vm_object_unlock(object); 2160 2161 /* 2162 * Write the page to the copy-object, 2163 * flushing it from the kernel. 2164 */ 2165 vm_pageout_initialize_page(copy_m); 2166 2167 /* 2168 * Since the pageout may have 2169 * temporarily dropped the 2170 * copy_object's lock, we 2171 * check whether we'll have 2172 * to deallocate the hard way. 2173 */ 2174 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { 2175 vm_object_unlock(copy_object); 2176 vm_object_deallocate(copy_object); 2177 vm_object_lock(object); 2178 2179 continue; 2180 } 2181 /* 2182 * Pick back up the old object's 2183 * lock. [It is safe to do so, 2184 * since it must be deeper in the 2185 * object tree.] 2186 */ 2187 vm_object_lock(object); 2188 } 2189 2190 /* 2191 * Because we're pushing a page upward 2192 * in the object tree, we must restart 2193 * any faults that are waiting here. 2194 * [Note that this is an expansion of 2195 * PAGE_WAKEUP that uses the THREAD_RESTART 2196 * wait result]. Can't turn off the page's 2197 * busy bit because we're not done with it. 2198 */ 2199 if (m->wanted) { 2200 m->wanted = FALSE; 2201 thread_wakeup_with_result((event_t) m, THREAD_RESTART); 2202 } 2203 } 2204 /* 2205 * The reference count on copy_object must be 2206 * at least 2: one for our extra reference, 2207 * and at least one from the outside world 2208 * (we checked that when we last locked 2209 * copy_object). 
2210 */ 2211 vm_object_lock_assert_exclusive(copy_object); 2212 copy_object->ref_count--; 2213 assert(copy_object->ref_count > 0); 2214 2215 VM_OBJ_RES_DECR(copy_object); 2216 vm_object_unlock(copy_object); 2217 2218 break; 2219 } 2220 2221done: 2222 *result_page = m; 2223 *top_page = first_m; 2224 2225 XPR(XPR_VM_FAULT, 2226 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n", 2227 object, offset, m, first_m, 0); 2228 2229 if (m != VM_PAGE_NULL) { 2230 retval = VM_FAULT_SUCCESS; 2231 if (my_fault == DBG_PAGEIN_FAULT) { 2232 2233 VM_STAT_INCR(pageins); 2234 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL); 2235 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL); 2236 current_task()->pageins++; 2237 2238 if (m->object->internal) { 2239 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL); 2240 my_fault = DBG_PAGEIND_FAULT; 2241 } else { 2242 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL); 2243 my_fault = DBG_PAGEINV_FAULT; 2244 } 2245 2246 /* 2247 * evaluate access pattern and update state 2248 * vm_fault_deactivate_behind depends on the 2249 * state being up to date 2250 */ 2251 vm_fault_is_sequential(object, offset, fault_info->behavior); 2252 2253 vm_fault_deactivate_behind(object, offset, fault_info->behavior); 2254 } 2255 if (type_of_fault) 2256 *type_of_fault = my_fault; 2257 } else { 2258 retval = VM_FAULT_SUCCESS_NO_VM_PAGE; 2259 assert(first_m == VM_PAGE_NULL); 2260 assert(object == first_object); 2261 } 2262 2263 thread_interrupt_level(interruptible_state); 2264 2265#if TRACEFAULTPAGE 2266 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */ 2267#endif 2268 return retval; 2269 2270backoff: 2271 thread_interrupt_level(interruptible_state); 2272 2273 if (wait_result == THREAD_INTERRUPTED) 2274 return (VM_FAULT_INTERRUPTED); 2275 return (VM_FAULT_RETRY); 2276 2277#undef RELEASE_PAGE 2278} 2279 2280 2281 2282/* 2283 * CODE SIGNING: 2284 * When soft faulting a page, we have to validate the page if: 2285 * 1. 
the page is being mapped in user space
 *	2. the page hasn't already been found to be "tainted"
 *	3. the page belongs to a code-signed object
 *	4. the page has not been validated yet or has been mapped for write.
 */
/*
 * NOTE: "page" is expanded several times by this macro, so callers
 * should pass an expression without side effects.
 */
#define VM_FAULT_NEED_CS_VALIDATION(pmap, page)				\
	((pmap) != kernel_pmap /*1*/ &&					\
	 !(page)->cs_tainted /*2*/ &&					\
	 (page)->object->code_signed /*3*/ &&				\
	 (!(page)->cs_validated || (page)->wpmapped /*4*/))


/*
 *	Routine:	vm_fault_enter
 *	Purpose:
 *		Enter page "m" into "pmap" at "vaddr" on behalf of a fault,
 *		after applying the code-signing checks, and then update the
 *		page's wire count (change_wiring) or its page-queue placement.
 *	Arguments:
 *		m		page to enter; m->object must be locked
 *		pmap, vaddr	where the translation is entered
 *		prot		requested protection.  VM_PROT_WRITE is
 *				stripped for a non-write fault that is not a
 *				zero-fill fault; VM_PROT_EXECUTE is stripped
 *				when a write fault taints the page for the
 *				first time and cs_bypass is not set.
 *		fault_type	access that caused the fault
 *		wired		handed to the pmap enter; with change_wiring,
 *				selects vm_page_wire() vs. vm_page_unwire()
 *		change_wiring	TRUE: adjust wiring instead of queue placement
 *		no_cache	influences whether the page is left speculative
 *				rather than activated
 *		cs_bypass	skips the code-signing enforcement conditions
 *				(an already tainted page is still handled as
 *				tainted)
 *		need_retry	may be NULL.  If non-NULL and the non-blocking
 *				pmap enter reports KERN_RESOURCE_SHORTAGE,
 *				*need_retry is set to TRUE and no mapping is
 *				entered; if NULL, the blocking PMAP_ENTER is
 *				used with the object lock dropped around it.
 *		type_of_fault	in/out, dereferenced unconditionally.  A
 *				DBG_CACHE_HIT_FAULT on a clustered page that was
 *				never pmapped is rewritten to
 *				DBG_PAGEIND_FAULT or DBG_PAGEINV_FAULT.
 *	Returns:
 *		KERN_SUCCESS		guard page (nothing entered), mapping
 *					entered, or *need_retry was set
 *		KERN_CODESIGN_ERROR	page rejected by the code-signing checks
 *		otherwise		the error returned by vm_page_slide()
 *	Locking:
 *		page queue lock must NOT be held
 *		m->object must be locked
 *
 * NOTE: m->object could be locked "shared" only if we are called
 * from vm_fault() as part of a soft fault.  If so, we must be
 * careful not to modify the VM object in any way that is not
 * legal under a shared lock...
 */
/* number of tainted pages for which the fault was aborted / allowed to proceed */
unsigned long cs_enter_tainted_rejected = 0;
unsigned long cs_enter_tainted_accepted = 0;
kern_return_t
vm_fault_enter(vm_page_t m,
	       pmap_t pmap,
	       vm_map_offset_t vaddr,
	       vm_prot_t prot,
	       vm_prot_t fault_type,
	       boolean_t wired,
	       boolean_t change_wiring,
	       boolean_t no_cache,
	       boolean_t cs_bypass,
	       boolean_t *need_retry,
	       int *type_of_fault)
{
	kern_return_t	kr, pe_result;
	/* snapshot taken before this call sets m->pmapped; used for the no_cache decision below */
	boolean_t	previously_pmapped = m->pmapped;
	/* set when this call is the one that taints a previously untainted page */
	boolean_t	must_disconnect = 0;
	boolean_t	map_is_switched, map_is_switch_protected;

	vm_object_lock_assert_held(m->object);
#if DEBUG
	lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
#endif /* DEBUG */

	/*
	 * Guard page: asserted to be fictitious; report success
	 * without entering any translation.
	 */
	if (m->phys_page == vm_page_guard_addr) {
		assert(m->fictitious);
		return KERN_SUCCESS;
	}

	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {

		vm_object_lock_assert_exclusive(m->object);

	} else if ((fault_type & VM_PROT_WRITE) == 0) {
		/*
		 * This is not a "write" fault, so we
		 * might not have taken the object lock
		 * exclusively and we might not be able
		 * to update the "wpmapped" bit in
		 * vm_fault_enter().
		 * Let's just grant read access to
		 * the page for now and we'll
		 * soft-fault again if we need write
		 * access later...
		 */
		prot &= ~VM_PROT_WRITE;
	}
	if (m->pmapped == FALSE) {

		if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
			/*
			 * found it in the cache, but this
			 * is the first fault-in of the page (m->pmapped == FALSE)
			 * so it must have come in as part of
			 * a cluster... account 1 pagein against it
			 */
			VM_STAT_INCR(pageins);
			DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);

			if (m->object->internal) {
				DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
				*type_of_fault = DBG_PAGEIND_FAULT;
			} else {
				DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
				*type_of_fault = DBG_PAGEINV_FAULT;
			}

			current_task()->pageins++;
		}
		VM_PAGE_CONSUME_CLUSTERED(m);

	}

	if (*type_of_fault != DBG_COW_FAULT) {
		DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);

		if (pmap == kernel_pmap) {
			DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
		}
	}

	/* Validate code signature if necessary. */
	if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
		vm_object_lock_assert_exclusive(m->object);

		/* already validated once: this pass is counted as a revalidation */
		if (m->cs_validated) {
			vm_cs_revalidates++;
		}

		/* VM map is locked, so 1 ref will remain on VM object -
		 * so no harm if vm_page_validate_cs drops the object lock */
		vm_page_validate_cs(m);
	}

/*
 * "prot" is currently unused by this macro: the execute check is
 * commented out, so any validated page is treated as immutable.
 */
#define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)

	/*
	 * The map is "switched" when the target pmap belongs to the
	 * current thread's map but not to the current task's map.
	 */
	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
			   (pmap == vm_map_pmap(current_thread()->map)));
	map_is_switch_protected = current_thread()->map->switch_protect;

	/* If the map is switched, and is switch-protected, we must protect
	 * some pages from being write-faulted: immutable pages because by
	 * definition they may not be written, and executable pages because that
	 * would provide a way to inject unsigned code.
	 * If the page is immutable, we can simply return. However, we can't
	 * immediately determine whether a page is executable anywhere. But,
	 * we can disconnect it everywhere and remove the executable protection
	 * from the current map. We do that below right before we do the
	 * PMAP_ENTER.
	 */
	if(!cs_enforcement_disable && map_is_switched &&
	   map_is_switch_protected && page_immutable(m, prot) &&
	   (prot & VM_PROT_WRITE))
	{
		return KERN_CODESIGN_ERROR;
	}

	/* A page could be tainted, or pose a risk of being tainted later.
	 * Check whether the receiving process wants it, and make it feel
	 * the consequences (that happens in cs_invalid_page()).
	 * For CS Enforcement, two other conditions will
	 * cause that page to be tainted as well:
	 * - pmapping an unsigned page executable - this means unsigned code;
	 * - writeable mapping of a validated page - the content of that page
	 *   can be changed without the kernel noticing, therefore unsigned
	 *   code can be created
	 */
	if (m->cs_tainted ||
	    (( !cs_enforcement_disable && !cs_bypass ) &&
	     (/* The page is unsigned and wants to be executable */
	      (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
	      /* The page should be immutable, but is in danger of being modified
		* This is the case where we want policy from the code directory -
		* is the page immutable or not? For now we have to assume that
		* code pages will be immutable, data pages not.
		* We'll assume a page is a code page if it has a code directory
		* and we fault for execution.
		* That is good enough since if we faulted the code page for
		* writing in another map before, it is wpmapped; if we fault
		* it for writing in this map later it will also be faulted for executing
		* at the same time; and if we fault for writing in another map
		* later, we will disconnect it from this pmap so we'll notice
		* the change.
		*/
	      (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
	      ))
		)
	{
		/* We will have a tainted page. Have to handle the special case
		 * of a switched map now. If the map is not switched, standard
		 * procedure applies - call cs_invalid_page().
		 * If the map is switched, the real owner is invalid already.
		 * There is no point in invalidating the switching process since
		 * it will not be executing from the map. So we don't call
		 * cs_invalid_page() in that case. */
		boolean_t reject_page;
		if(map_is_switched) {
			assert(pmap==vm_map_pmap(current_thread()->map));
			assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
			reject_page = FALSE;
		} else {
			reject_page = cs_invalid_page((addr64_t) vaddr);
		}

		if (reject_page) {
			/* reject the tainted page: abort the page fault */
			kr = KERN_CODESIGN_ERROR;
			cs_enter_tainted_rejected++;
		} else {
			/* proceed with the tainted page */
			kr = KERN_SUCCESS;
			/* Page might have been tainted before or not; now it
			 * definitively is. If the page wasn't tainted, we must
			 * disconnect it from all pmaps later. */
			must_disconnect = !m->cs_tainted;
			m->cs_tainted = TRUE;
			cs_enter_tainted_accepted++;
		}
		/* a rejection is always logged; an accepted tainted page only when cs_debug is set */
		if (cs_debug || kr != KERN_SUCCESS) {
			printf("CODESIGNING: vm_fault_enter(0x%llx): "
			       "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
			       (long long)vaddr, m, m->object, m->offset);
		}

	} else {
		/* proceed with the valid page */
		kr = KERN_SUCCESS;
	}

	/* If we have a KERN_SUCCESS from the previous checks, we either have
	 * a good page, or a tainted page that has been accepted by the process.
	 * In both cases the page will be entered into the pmap.
	 * If the page is writeable, we need to disconnect it from other pmaps
	 * now so those processes can take note.
	 */
	if (kr == KERN_SUCCESS) {
		/*
		 * NOTE: we may only hold the vm_object lock SHARED
		 * at this point, but the update of pmapped is ok
		 * since this is the ONLY bit updated behind the SHARED
		 * lock... however, we need to figure out how to do an atomic
		 * update on a bit field to make this less fragile... right
		 * now I don't know how to coerce 'C' to give me the offset info
		 * that's needed for an AtomicCompareAndSwap
		 */
		m->pmapped = TRUE;
		if(vm_page_is_slideable(m)) {
			/*
			 * Hold the page busy across the slide; only wake
			 * waiters if we were the ones who set busy.
			 */
			boolean_t was_busy = m->busy;
			m->busy = TRUE;
			kr = vm_page_slide(m, 0);
			assert(m->busy);
			if(!was_busy) {
				PAGE_WAKEUP_DONE(m);
			}
			if (kr != KERN_SUCCESS) {
				/*
				 * This page has not been slid correctly,
				 * do not do the pmap_enter() !
				 * Let vm_fault_enter() return the error
				 * so the caller can fail the fault.
				 */
				goto after_the_pmap_enter;
			}
		}

		if (fault_type & VM_PROT_WRITE) {

			if (m->wpmapped == FALSE) {
				vm_object_lock_assert_exclusive(m->object);

				m->wpmapped = TRUE;
			}
			if (must_disconnect) {
				/*
				 * We can only get here
				 * because of the CSE logic
				 */
				assert(cs_enforcement_disable == FALSE);
				pmap_disconnect(m->phys_page);
				/*
				 * If we are faulting for a write, we can clear
				 * the execute bit - that will ensure the page is
				 * checked again before being executable, which
				 * protects against a map switch.
				 * This only happens the first time the page
				 * gets tainted, so we won't get stuck here
				 * to make an already writeable page executable.
				 */
				if (!cs_bypass){
					prot &= ~VM_PROT_EXECUTE;
				}
			}
		}

		/* Prevent a deadlock by not
		 * holding the object lock if we need to wait for a page in
		 * pmap_enter() - <rdar://problem/7138958> */
		/* first attempt is non-blocking; the outcome is stored in pe_result */
		PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
				   wired, PMAP_OPTIONS_NOWAIT, pe_result);

		if(pe_result == KERN_RESOURCE_SHORTAGE) {

			if (need_retry) {
				/*
				 * this will be non-null in the case where we hold the lock
				 * on the top-object in this chain... we can't just drop
				 * the lock on the object we're inserting the page into
				 * and recall the PMAP_ENTER since we can still cause
				 * a deadlock if one of the critical paths tries to
				 * acquire the lock on the top-object and we're blocked
				 * in PMAP_ENTER waiting for memory... our only recourse
				 * is to deal with it at a higher level where we can
				 * drop both locks.
				 */
				/* kr is still KERN_SUCCESS here: the caller learns of the retry only through *need_retry */
				*need_retry = TRUE;
				vm_pmap_enter_retried++;
				goto after_the_pmap_enter;
			}
			/* The nonblocking version of pmap_enter did not succeed.
			 * and we don't need to drop other locks and retry
			 * at the level above us, so
			 * use the blocking version instead. Requires marking
			 * the page busy and unlocking the object */
			boolean_t was_busy = m->busy;
			m->busy = TRUE;
			vm_object_unlock(m->object);

			PMAP_ENTER(pmap, vaddr, m, prot, fault_type, 0, wired);

			/* Take the object lock again. */
			vm_object_lock(m->object);

			/* If the page was busy, someone else will wake it up.
			 * Otherwise, we have to do it now. */
			assert(m->busy);
			if(!was_busy) {
				PAGE_WAKEUP_DONE(m);
			}
			vm_pmap_enter_blocked++;
		}
	}

after_the_pmap_enter:
	/*
	 * Hold queues lock to manipulate
	 * the page queues.  Change wiring
	 * case is obvious.
	 */
	if (change_wiring) {
		vm_page_lockspin_queues();

		if (wired) {
			/* only take the wire reference if the page was not rejected / the slide did not fail */
			if (kr == KERN_SUCCESS) {
				vm_page_wire(m);
			}
		} else {
			vm_page_unwire(m, TRUE);
		}
		vm_page_unlock_queues();

	} else {
		if (kr != KERN_SUCCESS) {
			/* the fault is being failed: deactivate the page */
			vm_page_lockspin_queues();
			vm_page_deactivate(m);
			vm_page_unlock_queues();
		} else {
			if (((!m->active && !m->inactive) || m->clean_queue || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) {

				if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
					struct vpl	*lq;
					uint32_t	lid;

					/*
					 * we got a local queue to stuff this new page on...
					 * it's safe to manipulate local and local_id at this point
					 * since we're behind an exclusive object lock and the
					 * page is not on any global queue.
					 *
					 * we'll use the current cpu number to select the queue
					 * note that we don't need to disable preemption... we're
					 * going to be behind the local queue's lock to do the real
					 * work
					 */
					lid = cpu_number();

					lq = &vm_page_local_q[lid].vpl_un.vpl;

					VPL_LOCK(&lq->vpl_lock);

					queue_enter(&lq->vpl_queue, m, vm_page_t, pageq);
					m->local = TRUE;
					m->local_id = lid;
					lq->vpl_count++;

					VPL_UNLOCK(&lq->vpl_lock);

					/*
					 * NOTE(review): vpl_count is read here after
					 * VPL_UNLOCK; presumably an intentional unlocked
					 * heuristic check - confirm.
					 */
					if (lq->vpl_count > vm_page_local_q_soft_limit) {
						/*
						 * we're beyond the soft limit for the local queue
						 * vm_page_reactivate_local will 'try' to take
						 * the global page queue lock... if it can't that's
						 * ok... we'll let the queue continue to grow up
						 * to the hard limit... at that point we'll wait
						 * for the lock... once we've got the lock, we'll
						 * transfer all of the pages from the local queue
						 * to the global active queue
						 */
						vm_page_reactivate_local(lid, FALSE, FALSE);
					}
					/* page now sits on the local queue: skip the global queue handling below */
					return kr;
				}

				vm_page_lockspin_queues();
				/*
				 * test again now that we hold the page queue lock
				 */
				if (!VM_PAGE_WIRED(m)) {
					if (m->clean_queue) {
						VM_PAGE_QUEUES_REMOVE(m);

						vm_pageout_cleaned_reactivated++;
						vm_pageout_cleaned_fault_reactivated++;
					}

					if ((!m->active && !m->inactive) || no_cache) {
						/*
						 * If this is a no_cache mapping and the page has never been
						 * mapped before or was previously a no_cache page, then we
						 * want to leave pages in the speculative state so that they
						 * can be readily recycled if free memory runs low.  Otherwise
						 * the page is activated as normal.
						 */

						if (no_cache && (!previously_pmapped || m->no_cache)) {
							m->no_cache = TRUE;

							if (!m->speculative)
								vm_page_speculate(m, FALSE);

						} else if (!m->active && !m->inactive) {

							vm_page_activate(m);
						}
					}
				}
				vm_page_unlock_queues();
			}
		}
	}
	return kr;
}


/*
 *	Routine:	vm_fault
 *	Purpose:
 *		Handle page faults, including pseudo-faults
 *		used to change the wiring status of pages.
 *	Returns:
 *		Explicit continuations have been removed.
 *	Implementation:
 *		vm_fault and vm_fault_page save mucho state
 *		in the moral equivalent of a closure.  The state
 *		structure is allocated when first entering vm_fault
 *		and deallocated when leaving vm_fault.
2732 */ 2733 2734extern int _map_enter_debug; 2735 2736unsigned long vm_fault_collapse_total = 0; 2737unsigned long vm_fault_collapse_skipped = 0; 2738 2739kern_return_t 2740vm_fault( 2741 vm_map_t map, 2742 vm_map_offset_t vaddr, 2743 vm_prot_t fault_type, 2744 boolean_t change_wiring, 2745 int interruptible, 2746 pmap_t caller_pmap, 2747 vm_map_offset_t caller_pmap_addr) 2748{ 2749 vm_map_version_t version; /* Map version for verificiation */ 2750 boolean_t wired; /* Should mapping be wired down? */ 2751 vm_object_t object; /* Top-level object */ 2752 vm_object_offset_t offset; /* Top-level offset */ 2753 vm_prot_t prot; /* Protection for mapping */ 2754 vm_object_t old_copy_object; /* Saved copy object */ 2755 vm_page_t result_page; /* Result of vm_fault_page */ 2756 vm_page_t top_page; /* Placeholder page */ 2757 kern_return_t kr; 2758 2759 vm_page_t m; /* Fast access to result_page */ 2760 kern_return_t error_code; 2761 vm_object_t cur_object; 2762 vm_object_offset_t cur_offset; 2763 vm_page_t cur_m; 2764 vm_object_t new_object; 2765 int type_of_fault; 2766 pmap_t pmap; 2767 boolean_t interruptible_state; 2768 vm_map_t real_map = map; 2769 vm_map_t original_map = map; 2770 vm_prot_t original_fault_type; 2771 struct vm_object_fault_info fault_info; 2772 boolean_t need_collapse = FALSE; 2773 boolean_t need_retry = FALSE; 2774 int object_lock_type = 0; 2775 int cur_object_lock_type; 2776 vm_object_t top_object = VM_OBJECT_NULL; 2777 int throttle_delay; 2778 2779 2780 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 2781 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, 2782 (int)((uint64_t)vaddr >> 32), 2783 (int)vaddr, 2784 (map == kernel_map), 2785 0, 2786 0); 2787 2788 if (get_preemption_level() != 0) { 2789 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 2790 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, 2791 (int)((uint64_t)vaddr >> 32), 2792 (int)vaddr, 2793 KERN_FAILURE, 2794 0, 2795 0); 2796 2797 return (KERN_FAILURE); 2798 } 2799 2800 interruptible_state = 
thread_interrupt_level(interruptible); 2801 2802 VM_STAT_INCR(faults); 2803 current_task()->faults++; 2804 original_fault_type = fault_type; 2805 2806 if (fault_type & VM_PROT_WRITE) 2807 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2808 else 2809 object_lock_type = OBJECT_LOCK_SHARED; 2810 2811 cur_object_lock_type = OBJECT_LOCK_SHARED; 2812 2813RetryFault: 2814 /* 2815 * assume we will hit a page in the cache 2816 * otherwise, explicitly override with 2817 * the real fault type once we determine it 2818 */ 2819 type_of_fault = DBG_CACHE_HIT_FAULT; 2820 2821 /* 2822 * Find the backing store object and offset into 2823 * it to begin the search. 2824 */ 2825 fault_type = original_fault_type; 2826 map = original_map; 2827 vm_map_lock_read(map); 2828 2829 kr = vm_map_lookup_locked(&map, vaddr, fault_type, 2830 object_lock_type, &version, 2831 &object, &offset, &prot, &wired, 2832 &fault_info, 2833 &real_map); 2834 2835 if (kr != KERN_SUCCESS) { 2836 vm_map_unlock_read(map); 2837 goto done; 2838 } 2839 pmap = real_map->pmap; 2840 fault_info.interruptible = interruptible; 2841 fault_info.stealth = FALSE; 2842 fault_info.io_sync = FALSE; 2843 fault_info.mark_zf_absent = FALSE; 2844 fault_info.batch_pmap_op = FALSE; 2845 2846 /* 2847 * If the page is wired, we must fault for the current protection 2848 * value, to avoid further faults. 
2849 */ 2850 if (wired) { 2851 fault_type = prot | VM_PROT_WRITE; 2852 /* 2853 * since we're treating this fault as a 'write' 2854 * we must hold the top object lock exclusively 2855 */ 2856 if (object_lock_type == OBJECT_LOCK_SHARED) { 2857 2858 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2859 2860 if (vm_object_lock_upgrade(object) == FALSE) { 2861 /* 2862 * couldn't upgrade, so explictly 2863 * take the lock exclusively 2864 */ 2865 vm_object_lock(object); 2866 } 2867 } 2868 } 2869 2870#if VM_FAULT_CLASSIFY 2871 /* 2872 * Temporary data gathering code 2873 */ 2874 vm_fault_classify(object, offset, fault_type); 2875#endif 2876 /* 2877 * Fast fault code. The basic idea is to do as much as 2878 * possible while holding the map lock and object locks. 2879 * Busy pages are not used until the object lock has to 2880 * be dropped to do something (copy, zero fill, pmap enter). 2881 * Similarly, paging references aren't acquired until that 2882 * point, and object references aren't used. 2883 * 2884 * If we can figure out what to do 2885 * (zero fill, copy on write, pmap enter) while holding 2886 * the locks, then it gets done. Otherwise, we give up, 2887 * and use the original fault path (which doesn't hold 2888 * the map lock, and relies on busy pages). 2889 * The give up cases include: 2890 * - Have to talk to pager. 2891 * - Page is busy, absent or in error. 2892 * - Pager has locked out desired access. 2893 * - Fault needs to be restarted. 2894 * - Have to push page into copy object. 2895 * 2896 * The code is an infinite loop that moves one level down 2897 * the shadow chain each time. cur_object and cur_offset 2898 * refer to the current object being examined. object and offset 2899 * are the original object from the map. The loop is at the 2900 * top level if and only if object and cur_object are the same. 2901 * 2902 * Invariants: Map lock is held throughout. 
Lock is held on 2903 * original object and cur_object (if different) when 2904 * continuing or exiting loop. 2905 * 2906 */ 2907 2908 2909 /* 2910 * If this page is to be inserted in a copy delay object 2911 * for writing, and if the object has a copy, then the 2912 * copy delay strategy is implemented in the slow fault page. 2913 */ 2914 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY && 2915 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) 2916 goto handle_copy_delay; 2917 2918 cur_object = object; 2919 cur_offset = offset; 2920 2921 while (TRUE) { 2922 if (!cur_object->pager_created && 2923 cur_object->phys_contiguous) /* superpage */ 2924 break; 2925 2926 if (cur_object->blocked_access) { 2927 /* 2928 * Access to this VM object has been blocked. 2929 * Let the slow path handle it. 2930 */ 2931 break; 2932 } 2933 2934 m = vm_page_lookup(cur_object, cur_offset); 2935 2936 if (m != VM_PAGE_NULL) { 2937 if (m->busy) { 2938 wait_result_t result; 2939 2940 /* 2941 * in order to do the PAGE_ASSERT_WAIT, we must 2942 * have object that 'm' belongs to locked exclusively 2943 */ 2944 if (object != cur_object) { 2945 vm_object_unlock(object); 2946 2947 if (cur_object_lock_type == OBJECT_LOCK_SHARED) { 2948 2949 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2950 2951 if (vm_object_lock_upgrade(cur_object) == FALSE) { 2952 /* 2953 * couldn't upgrade so go do a full retry 2954 * immediately since we've already dropped 2955 * the top object lock associated with this page 2956 * and the current one got dropped due to the 2957 * failed upgrade... 
the state is no longer valid 2958 */ 2959 vm_map_unlock_read(map); 2960 if (real_map != map) 2961 vm_map_unlock(real_map); 2962 2963 goto RetryFault; 2964 } 2965 } 2966 } else if (object_lock_type == OBJECT_LOCK_SHARED) { 2967 2968 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 2969 2970 if (vm_object_lock_upgrade(object) == FALSE) { 2971 /* 2972 * couldn't upgrade, so explictly take the lock 2973 * exclusively and go relookup the page since we 2974 * will have dropped the object lock and 2975 * a different thread could have inserted 2976 * a page at this offset 2977 * no need for a full retry since we're 2978 * at the top level of the object chain 2979 */ 2980 vm_object_lock(object); 2981 2982 continue; 2983 } 2984 } 2985 vm_map_unlock_read(map); 2986 if (real_map != map) 2987 vm_map_unlock(real_map); 2988 2989 result = PAGE_ASSERT_WAIT(m, interruptible); 2990 2991 vm_object_unlock(cur_object); 2992 2993 if (result == THREAD_WAITING) { 2994 result = thread_block(THREAD_CONTINUE_NULL); 2995 2996 counter(c_vm_fault_page_block_busy_kernel++); 2997 } 2998 if (result == THREAD_AWAKENED || result == THREAD_RESTART) 2999 goto RetryFault; 3000 3001 kr = KERN_ABORTED; 3002 goto done; 3003 } 3004 if (m->laundry) { 3005 if (object != cur_object) { 3006 if (cur_object_lock_type == OBJECT_LOCK_SHARED) { 3007 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3008 3009 vm_object_unlock(object); 3010 vm_object_unlock(cur_object); 3011 3012 vm_map_unlock_read(map); 3013 if (real_map != map) 3014 vm_map_unlock(real_map); 3015 3016 goto RetryFault; 3017 } 3018 3019 } else if (object_lock_type == OBJECT_LOCK_SHARED) { 3020 3021 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3022 3023 if (vm_object_lock_upgrade(object) == FALSE) { 3024 /* 3025 * couldn't upgrade, so explictly take the lock 3026 * exclusively and go relookup the page since we 3027 * will have dropped the object lock and 3028 * a different thread could have inserted 3029 * a page at this offset 3030 * no need for a full retry since 
we're 3031 * at the top level of the object chain 3032 */ 3033 vm_object_lock(object); 3034 3035 continue; 3036 } 3037 } 3038 m->pageout = FALSE; 3039 3040 vm_pageout_steal_laundry(m, FALSE); 3041 } 3042 3043 if (m->phys_page == vm_page_guard_addr) { 3044 /* 3045 * Guard page: let the slow path deal with it 3046 */ 3047 break; 3048 } 3049 if (m->unusual && (m->error || m->restart || m->private || m->absent)) { 3050 /* 3051 * Unusual case... let the slow path deal with it 3052 */ 3053 break; 3054 } 3055 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) { 3056 if (object != cur_object) 3057 vm_object_unlock(object); 3058 vm_map_unlock_read(map); 3059 if (real_map != map) 3060 vm_map_unlock(real_map); 3061 vm_object_unlock(cur_object); 3062 kr = KERN_MEMORY_ERROR; 3063 goto done; 3064 } 3065 3066 if (m->encrypted) { 3067 /* 3068 * ENCRYPTED SWAP: 3069 * We've soft-faulted (because it's not in the page 3070 * table) on an encrypted page. 3071 * Keep the page "busy" so that no one messes with 3072 * it during the decryption. 3073 * Release the extra locks we're holding, keep only 3074 * the page's VM object lock. 3075 * 3076 * in order to set 'busy' on 'm', we must 3077 * have object that 'm' belongs to locked exclusively 3078 */ 3079 if (object != cur_object) { 3080 vm_object_unlock(object); 3081 3082 if (cur_object_lock_type == OBJECT_LOCK_SHARED) { 3083 3084 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3085 3086 if (vm_object_lock_upgrade(cur_object) == FALSE) { 3087 /* 3088 * couldn't upgrade so go do a full retry 3089 * immediately since we've already dropped 3090 * the top object lock associated with this page 3091 * and the current one got dropped due to the 3092 * failed upgrade... 
the state is no longer valid 3093 */ 3094 vm_map_unlock_read(map); 3095 if (real_map != map) 3096 vm_map_unlock(real_map); 3097 3098 goto RetryFault; 3099 } 3100 } 3101 } else if (object_lock_type == OBJECT_LOCK_SHARED) { 3102 3103 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3104 3105 if (vm_object_lock_upgrade(object) == FALSE) { 3106 /* 3107 * couldn't upgrade, so explictly take the lock 3108 * exclusively and go relookup the page since we 3109 * will have dropped the object lock and 3110 * a different thread could have inserted 3111 * a page at this offset 3112 * no need for a full retry since we're 3113 * at the top level of the object chain 3114 */ 3115 vm_object_lock(object); 3116 3117 continue; 3118 } 3119 } 3120 m->busy = TRUE; 3121 3122 vm_map_unlock_read(map); 3123 if (real_map != map) 3124 vm_map_unlock(real_map); 3125 3126 vm_page_decrypt(m, 0); 3127 3128 assert(m->busy); 3129 PAGE_WAKEUP_DONE(m); 3130 3131 vm_object_unlock(cur_object); 3132 /* 3133 * Retry from the top, in case anything 3134 * changed while we were decrypting... 3135 */ 3136 goto RetryFault; 3137 } 3138 ASSERT_PAGE_DECRYPTED(m); 3139 3140 if(vm_page_is_slideable(m)) { 3141 /* 3142 * We might need to slide this page, and so, 3143 * we want to hold the VM object exclusively. 
3144 */ 3145 if (object != cur_object) { 3146 if (cur_object_lock_type == OBJECT_LOCK_SHARED) { 3147 vm_object_unlock(object); 3148 vm_object_unlock(cur_object); 3149 3150 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3151 3152 vm_map_unlock_read(map); 3153 if (real_map != map) 3154 vm_map_unlock(real_map); 3155 3156 goto RetryFault; 3157 } 3158 } else if (object_lock_type == OBJECT_LOCK_SHARED) { 3159 3160 vm_object_unlock(object); 3161 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3162 vm_map_unlock_read(map); 3163 goto RetryFault; 3164 } 3165 } 3166 3167 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) { 3168upgrade_for_validation: 3169 /* 3170 * We might need to validate this page 3171 * against its code signature, so we 3172 * want to hold the VM object exclusively. 3173 */ 3174 if (object != cur_object) { 3175 if (cur_object_lock_type == OBJECT_LOCK_SHARED) { 3176 vm_object_unlock(object); 3177 vm_object_unlock(cur_object); 3178 3179 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3180 3181 vm_map_unlock_read(map); 3182 if (real_map != map) 3183 vm_map_unlock(real_map); 3184 3185 goto RetryFault; 3186 } 3187 3188 } else if (object_lock_type == OBJECT_LOCK_SHARED) { 3189 3190 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3191 3192 if (vm_object_lock_upgrade(object) == FALSE) { 3193 /* 3194 * couldn't upgrade, so explictly take the lock 3195 * exclusively and go relookup the page since we 3196 * will have dropped the object lock and 3197 * a different thread could have inserted 3198 * a page at this offset 3199 * no need for a full retry since we're 3200 * at the top level of the object chain 3201 */ 3202 vm_object_lock(object); 3203 3204 continue; 3205 } 3206 } 3207 } 3208 /* 3209 * Two cases of map in faults: 3210 * - At top level w/o copy object. 3211 * - Read fault anywhere. 3212 * --> must disallow write. 
3213 */ 3214 3215 if (object == cur_object && object->copy == VM_OBJECT_NULL) { 3216 3217 goto FastPmapEnter; 3218 } 3219 3220 if ((fault_type & VM_PROT_WRITE) == 0) { 3221 3222 if (object != cur_object) { 3223 /* 3224 * We still need to hold the top object 3225 * lock here to prevent a race between 3226 * a read fault (taking only "shared" 3227 * locks) and a write fault (taking 3228 * an "exclusive" lock on the top 3229 * object. 3230 * Otherwise, as soon as we release the 3231 * top lock, the write fault could 3232 * proceed and actually complete before 3233 * the read fault, and the copied page's 3234 * translation could then be overwritten 3235 * by the read fault's translation for 3236 * the original page. 3237 * 3238 * Let's just record what the top object 3239 * is and we'll release it later. 3240 */ 3241 top_object = object; 3242 3243 /* 3244 * switch to the object that has the new page 3245 */ 3246 object = cur_object; 3247 object_lock_type = cur_object_lock_type; 3248 } 3249FastPmapEnter: 3250 /* 3251 * prepare for the pmap_enter... 3252 * object and map are both locked 3253 * m contains valid data 3254 * object == m->object 3255 * cur_object == NULL or it's been unlocked 3256 * no paging references on either object or cur_object 3257 */ 3258 if (caller_pmap) { 3259 kr = vm_fault_enter(m, 3260 caller_pmap, 3261 caller_pmap_addr, 3262 prot, 3263 fault_type, 3264 wired, 3265 change_wiring, 3266 fault_info.no_cache, 3267 fault_info.cs_bypass, 3268 (top_object != VM_OBJECT_NULL ? &need_retry : NULL), 3269 &type_of_fault); 3270 } else { 3271 kr = vm_fault_enter(m, 3272 pmap, 3273 vaddr, 3274 prot, 3275 fault_type, 3276 wired, 3277 change_wiring, 3278 fault_info.no_cache, 3279 fault_info.cs_bypass, 3280 (top_object != VM_OBJECT_NULL ? &need_retry : NULL), 3281 &type_of_fault); 3282 } 3283 3284 if (top_object != VM_OBJECT_NULL) { 3285 /* 3286 * It's safe to drop the top object 3287 * now that we've done our 3288 * vm_fault_enter(). 
Any other fault 3289 * in progress for that virtual 3290 * address will either find our page 3291 * and translation or put in a new page 3292 * and translation. 3293 */ 3294 vm_object_unlock(top_object); 3295 top_object = VM_OBJECT_NULL; 3296 } 3297 3298 if (need_collapse == TRUE) 3299 vm_object_collapse(object, offset, TRUE); 3300 3301 if (need_retry == FALSE && 3302 (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) { 3303 /* 3304 * evaluate access pattern and update state 3305 * vm_fault_deactivate_behind depends on the 3306 * state being up to date 3307 */ 3308 vm_fault_is_sequential(object, cur_offset, fault_info.behavior); 3309 3310 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior); 3311 } 3312 /* 3313 * That's it, clean up and return. 3314 */ 3315 if (m->busy) 3316 PAGE_WAKEUP_DONE(m); 3317 3318 vm_object_unlock(object); 3319 3320 vm_map_unlock_read(map); 3321 if (real_map != map) 3322 vm_map_unlock(real_map); 3323 3324 if (need_retry == TRUE) { 3325 /* 3326 * vm_fault_enter couldn't complete the PMAP_ENTER... 3327 * at this point we don't hold any locks so it's safe 3328 * to ask the pmap layer to expand the page table to 3329 * accommodate this mapping... once expanded, we'll 3330 * re-drive the fault which should result in vm_fault_enter 3331 * being able to successfully enter the mapping this time around 3332 */ 3333 (void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, PMAP_OPTIONS_NOENTER); 3334 3335 need_retry = FALSE; 3336 goto RetryFault; 3337 } 3338 goto done; 3339 } 3340 /* 3341 * COPY ON WRITE FAULT 3342 */ 3343 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); 3344 3345 if ((throttle_delay = vm_page_throttled())) { 3346 /* 3347 * drop all of our locks... 
3348 * wait until the free queue is 3349 * pumped back up and then 3350 * redrive the fault 3351 */ 3352 if (object != cur_object) 3353 vm_object_unlock(cur_object); 3354 vm_object_unlock(object); 3355 vm_map_unlock_read(map); 3356 if (real_map != map) 3357 vm_map_unlock(real_map); 3358 3359 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); 3360 3361 delay(throttle_delay); 3362 3363 if (!current_thread_aborted() && vm_page_wait((change_wiring) ? 3364 THREAD_UNINT : 3365 THREAD_ABORTSAFE)) 3366 goto RetryFault; 3367 kr = KERN_ABORTED; 3368 goto done; 3369 } 3370 /* 3371 * If objects match, then 3372 * object->copy must not be NULL (else control 3373 * would be in previous code block), and we 3374 * have a potential push into the copy object 3375 * with which we can't cope with here. 3376 */ 3377 if (cur_object == object) { 3378 /* 3379 * must take the slow path to 3380 * deal with the copy push 3381 */ 3382 break; 3383 } 3384 3385 /* 3386 * This is now a shadow based copy on write 3387 * fault -- it requires a copy up the shadow 3388 * chain. 3389 */ 3390 3391 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) && 3392 VM_FAULT_NEED_CS_VALIDATION(NULL, m)) { 3393 goto upgrade_for_validation; 3394 } 3395 3396 /* 3397 * Allocate a page in the original top level 3398 * object. Give up if allocate fails. Also 3399 * need to remember current page, as it's the 3400 * source of the copy. 3401 * 3402 * at this point we hold locks on both 3403 * object and cur_object... no need to take 3404 * paging refs or mark pages BUSY since 3405 * we don't drop either object lock until 3406 * the page has been copied and inserted 3407 */ 3408 cur_m = m; 3409 m = vm_page_grab(); 3410 3411 if (m == VM_PAGE_NULL) { 3412 /* 3413 * no free page currently available... 3414 * must take the slow path 3415 */ 3416 break; 3417 } 3418 /* 3419 * Now do the copy. Mark the source page busy... 3420 * 3421 * NOTE: This code holds the map lock across 3422 * the page copy. 
3423 */ 3424 vm_page_copy(cur_m, m); 3425 vm_page_insert(m, object, offset); 3426 SET_PAGE_DIRTY(m, FALSE); 3427 3428 /* 3429 * Now cope with the source page and object 3430 */ 3431 if (object->ref_count > 1 && cur_m->pmapped) 3432 pmap_disconnect(cur_m->phys_page); 3433 3434 need_collapse = TRUE; 3435 3436 if (!cur_object->internal && 3437 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) { 3438 /* 3439 * The object from which we've just 3440 * copied a page is most probably backed 3441 * by a vnode. We don't want to waste too 3442 * much time trying to collapse the VM objects 3443 * and create a bottleneck when several tasks 3444 * map the same file. 3445 */ 3446 if (cur_object->copy == object) { 3447 /* 3448 * Shared mapping or no COW yet. 3449 * We can never collapse a copy 3450 * object into its backing object. 3451 */ 3452 need_collapse = FALSE; 3453 } else if (cur_object->copy == object->shadow && 3454 object->shadow->resident_page_count == 0) { 3455 /* 3456 * Shared mapping after a COW occurred. 3457 */ 3458 need_collapse = FALSE; 3459 } 3460 } 3461 vm_object_unlock(cur_object); 3462 3463 if (need_collapse == FALSE) 3464 vm_fault_collapse_skipped++; 3465 vm_fault_collapse_total++; 3466 3467 type_of_fault = DBG_COW_FAULT; 3468 VM_STAT_INCR(cow_faults); 3469 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); 3470 current_task()->cow_faults++; 3471 3472 goto FastPmapEnter; 3473 3474 } else { 3475 /* 3476 * No page at cur_object, cur_offset... m == NULL 3477 */ 3478 if (cur_object->pager_created) { 3479 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) { 3480 /* 3481 * May have to talk to a pager... 3482 * take the slow path. 3483 */ 3484 break; 3485 } 3486 /* 3487 * existence map present and indicates 3488 * that the pager doesn't have this page 3489 */ 3490 } 3491 if (cur_object->shadow == VM_OBJECT_NULL) { 3492 /* 3493 * Zero fill fault. Page gets 3494 * inserted into the original object. 
3495 */ 3496 if (cur_object->shadow_severed || 3497 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) 3498 { 3499 if (object != cur_object) 3500 vm_object_unlock(cur_object); 3501 vm_object_unlock(object); 3502 3503 vm_map_unlock_read(map); 3504 if (real_map != map) 3505 vm_map_unlock(real_map); 3506 3507 kr = KERN_MEMORY_ERROR; 3508 goto done; 3509 } 3510 if ((throttle_delay = vm_page_throttled())) { 3511 /* 3512 * drop all of our locks... 3513 * wait until the free queue is 3514 * pumped back up and then 3515 * redrive the fault 3516 */ 3517 if (object != cur_object) 3518 vm_object_unlock(cur_object); 3519 vm_object_unlock(object); 3520 vm_map_unlock_read(map); 3521 if (real_map != map) 3522 vm_map_unlock(real_map); 3523 3524 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); 3525 3526 delay(throttle_delay); 3527 3528 if (!current_thread_aborted() && vm_page_wait((change_wiring) ? 3529 THREAD_UNINT : 3530 THREAD_ABORTSAFE)) 3531 goto RetryFault; 3532 kr = KERN_ABORTED; 3533 goto done; 3534 } 3535 if (vm_backing_store_low) { 3536 /* 3537 * we are protecting the system from 3538 * backing store exhaustion... 
3539 * must take the slow path if we're 3540 * not privileged 3541 */ 3542 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) 3543 break; 3544 } 3545 if (cur_object != object) { 3546 vm_object_unlock(cur_object); 3547 3548 cur_object = object; 3549 } 3550 if (object_lock_type == OBJECT_LOCK_SHARED) { 3551 3552 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3553 3554 if (vm_object_lock_upgrade(object) == FALSE) { 3555 /* 3556 * couldn't upgrade so do a full retry on the fault 3557 * since we dropped the object lock which 3558 * could allow another thread to insert 3559 * a page at this offset 3560 */ 3561 vm_map_unlock_read(map); 3562 if (real_map != map) 3563 vm_map_unlock(real_map); 3564 3565 goto RetryFault; 3566 } 3567 } 3568 m = vm_page_alloc(object, offset); 3569 3570 if (m == VM_PAGE_NULL) { 3571 /* 3572 * no free page currently available... 3573 * must take the slow path 3574 */ 3575 break; 3576 } 3577 3578 /* 3579 * Now zero fill page... 3580 * the page is probably going to 3581 * be written soon, so don't bother 3582 * to clear the modified bit 3583 * 3584 * NOTE: This code holds the map 3585 * lock across the zero fill. 3586 */ 3587 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill); 3588 3589 goto FastPmapEnter; 3590 } 3591 /* 3592 * On to the next level in the shadow chain 3593 */ 3594 cur_offset += cur_object->vo_shadow_offset; 3595 new_object = cur_object->shadow; 3596 3597 /* 3598 * take the new_object's lock with the indicated state 3599 */ 3600 if (cur_object_lock_type == OBJECT_LOCK_SHARED) 3601 vm_object_lock_shared(new_object); 3602 else 3603 vm_object_lock(new_object); 3604 3605 if (cur_object != object) 3606 vm_object_unlock(cur_object); 3607 3608 cur_object = new_object; 3609 3610 continue; 3611 } 3612 } 3613 /* 3614 * Cleanup from fast fault failure. Drop any object 3615 * lock other than original and drop map lock. 
3616 */ 3617 if (object != cur_object) 3618 vm_object_unlock(cur_object); 3619 3620 /* 3621 * must own the object lock exclusively at this point 3622 */ 3623 if (object_lock_type == OBJECT_LOCK_SHARED) { 3624 object_lock_type = OBJECT_LOCK_EXCLUSIVE; 3625 3626 if (vm_object_lock_upgrade(object) == FALSE) { 3627 /* 3628 * couldn't upgrade, so explictly 3629 * take the lock exclusively 3630 * no need to retry the fault at this 3631 * point since "vm_fault_page" will 3632 * completely re-evaluate the state 3633 */ 3634 vm_object_lock(object); 3635 } 3636 } 3637 3638handle_copy_delay: 3639 vm_map_unlock_read(map); 3640 if (real_map != map) 3641 vm_map_unlock(real_map); 3642 3643 /* 3644 * Make a reference to this object to 3645 * prevent its disposal while we are messing with 3646 * it. Once we have the reference, the map is free 3647 * to be diddled. Since objects reference their 3648 * shadows (and copies), they will stay around as well. 3649 */ 3650 vm_object_reference_locked(object); 3651 vm_object_paging_begin(object); 3652 3653 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0); 3654 3655 error_code = 0; 3656 3657 kr = vm_fault_page(object, offset, fault_type, 3658 (change_wiring && !wired), 3659 &prot, &result_page, &top_page, 3660 &type_of_fault, 3661 &error_code, map->no_zero_fill, 3662 FALSE, &fault_info); 3663 3664 /* 3665 * if kr != VM_FAULT_SUCCESS, then the paging reference 3666 * has been dropped and the object unlocked... 
the ref_count 3667 * is still held 3668 * 3669 * if kr == VM_FAULT_SUCCESS, then the paging reference 3670 * is still held along with the ref_count on the original object 3671 * 3672 * the object is returned locked with a paging reference 3673 * 3674 * if top_page != NULL, then it's BUSY and the 3675 * object it belongs to has a paging reference 3676 * but is returned unlocked 3677 */ 3678 if (kr != VM_FAULT_SUCCESS && 3679 kr != VM_FAULT_SUCCESS_NO_VM_PAGE) { 3680 /* 3681 * we didn't succeed, lose the object reference immediately. 3682 */ 3683 vm_object_deallocate(object); 3684 3685 /* 3686 * See why we failed, and take corrective action. 3687 */ 3688 switch (kr) { 3689 case VM_FAULT_MEMORY_SHORTAGE: 3690 if (vm_page_wait((change_wiring) ? 3691 THREAD_UNINT : 3692 THREAD_ABORTSAFE)) 3693 goto RetryFault; 3694 /* 3695 * fall thru 3696 */ 3697 case VM_FAULT_INTERRUPTED: 3698 kr = KERN_ABORTED; 3699 goto done; 3700 case VM_FAULT_RETRY: 3701 goto RetryFault; 3702 case VM_FAULT_MEMORY_ERROR: 3703 if (error_code) 3704 kr = error_code; 3705 else 3706 kr = KERN_MEMORY_ERROR; 3707 goto done; 3708 default: 3709 panic("vm_fault: unexpected error 0x%x from " 3710 "vm_fault_page()\n", kr); 3711 } 3712 } 3713 m = result_page; 3714 3715 if (m != VM_PAGE_NULL) { 3716 assert((change_wiring && !wired) ? 3717 (top_page == VM_PAGE_NULL) : 3718 ((top_page == VM_PAGE_NULL) == (m->object == object))); 3719 } 3720 3721 /* 3722 * What to do with the resulting page from vm_fault_page 3723 * if it doesn't get entered into the physical map: 3724 */ 3725#define RELEASE_PAGE(m) \ 3726 MACRO_BEGIN \ 3727 PAGE_WAKEUP_DONE(m); \ 3728 if (!m->active && !m->inactive && !m->throttled) { \ 3729 vm_page_lockspin_queues(); \ 3730 if (!m->active && !m->inactive && !m->throttled) \ 3731 vm_page_activate(m); \ 3732 vm_page_unlock_queues(); \ 3733 } \ 3734 MACRO_END 3735 3736 /* 3737 * We must verify that the maps have not changed 3738 * since our last lookup. 
3739 */ 3740 if (m != VM_PAGE_NULL) { 3741 old_copy_object = m->object->copy; 3742 vm_object_unlock(m->object); 3743 } else { 3744 old_copy_object = VM_OBJECT_NULL; 3745 vm_object_unlock(object); 3746 } 3747 3748 /* 3749 * no object locks are held at this point 3750 */ 3751 if ((map != original_map) || !vm_map_verify(map, &version)) { 3752 vm_object_t retry_object; 3753 vm_object_offset_t retry_offset; 3754 vm_prot_t retry_prot; 3755 3756 /* 3757 * To avoid trying to write_lock the map while another 3758 * thread has it read_locked (in vm_map_pageable), we 3759 * do not try for write permission. If the page is 3760 * still writable, we will get write permission. If it 3761 * is not, or has been marked needs_copy, we enter the 3762 * mapping without write permission, and will merely 3763 * take another fault. 3764 */ 3765 map = original_map; 3766 vm_map_lock_read(map); 3767 3768 kr = vm_map_lookup_locked(&map, vaddr, 3769 fault_type & ~VM_PROT_WRITE, 3770 OBJECT_LOCK_EXCLUSIVE, &version, 3771 &retry_object, &retry_offset, &retry_prot, 3772 &wired, 3773 &fault_info, 3774 &real_map); 3775 pmap = real_map->pmap; 3776 3777 if (kr != KERN_SUCCESS) { 3778 vm_map_unlock_read(map); 3779 3780 if (m != VM_PAGE_NULL) { 3781 /* 3782 * retake the lock so that 3783 * we can drop the paging reference 3784 * in vm_fault_cleanup and do the 3785 * PAGE_WAKEUP_DONE in RELEASE_PAGE 3786 */ 3787 vm_object_lock(m->object); 3788 3789 RELEASE_PAGE(m); 3790 3791 vm_fault_cleanup(m->object, top_page); 3792 } else { 3793 /* 3794 * retake the lock so that 3795 * we can drop the paging reference 3796 * in vm_fault_cleanup 3797 */ 3798 vm_object_lock(object); 3799 3800 vm_fault_cleanup(object, top_page); 3801 } 3802 vm_object_deallocate(object); 3803 3804 goto done; 3805 } 3806 vm_object_unlock(retry_object); 3807 3808 if ((retry_object != object) || (retry_offset != offset)) { 3809 3810 vm_map_unlock_read(map); 3811 if (real_map != map) 3812 vm_map_unlock(real_map); 3813 3814 if (m != 
VM_PAGE_NULL) { 3815 /* 3816 * retake the lock so that 3817 * we can drop the paging reference 3818 * in vm_fault_cleanup and do the 3819 * PAGE_WAKEUP_DONE in RELEASE_PAGE 3820 */ 3821 vm_object_lock(m->object); 3822 3823 RELEASE_PAGE(m); 3824 3825 vm_fault_cleanup(m->object, top_page); 3826 } else { 3827 /* 3828 * retake the lock so that 3829 * we can drop the paging reference 3830 * in vm_fault_cleanup 3831 */ 3832 vm_object_lock(object); 3833 3834 vm_fault_cleanup(object, top_page); 3835 } 3836 vm_object_deallocate(object); 3837 3838 goto RetryFault; 3839 } 3840 /* 3841 * Check whether the protection has changed or the object 3842 * has been copied while we left the map unlocked. 3843 */ 3844 prot &= retry_prot; 3845 } 3846 if (m != VM_PAGE_NULL) { 3847 vm_object_lock(m->object); 3848 3849 if (m->object->copy != old_copy_object) { 3850 /* 3851 * The copy object changed while the top-level object 3852 * was unlocked, so take away write permission. 3853 */ 3854 prot &= ~VM_PROT_WRITE; 3855 } 3856 } else 3857 vm_object_lock(object); 3858 3859 /* 3860 * If we want to wire down this page, but no longer have 3861 * adequate permissions, we must start all over. 3862 */ 3863 if (wired && (fault_type != (prot | VM_PROT_WRITE))) { 3864 3865 vm_map_verify_done(map, &version); 3866 if (real_map != map) 3867 vm_map_unlock(real_map); 3868 3869 if (m != VM_PAGE_NULL) { 3870 RELEASE_PAGE(m); 3871 3872 vm_fault_cleanup(m->object, top_page); 3873 } else 3874 vm_fault_cleanup(object, top_page); 3875 3876 vm_object_deallocate(object); 3877 3878 goto RetryFault; 3879 } 3880 if (m != VM_PAGE_NULL) { 3881 /* 3882 * Put this page into the physical map. 3883 * We had to do the unlock above because pmap_enter 3884 * may cause other faults. The page may be on 3885 * the pageout queues. If the pageout daemon comes 3886 * across the page, it will remove it from the queues. 
3887 */ 3888 if (caller_pmap) { 3889 kr = vm_fault_enter(m, 3890 caller_pmap, 3891 caller_pmap_addr, 3892 prot, 3893 fault_type, 3894 wired, 3895 change_wiring, 3896 fault_info.no_cache, 3897 fault_info.cs_bypass, 3898 NULL, 3899 &type_of_fault); 3900 } else { 3901 kr = vm_fault_enter(m, 3902 pmap, 3903 vaddr, 3904 prot, 3905 fault_type, 3906 wired, 3907 change_wiring, 3908 fault_info.no_cache, 3909 fault_info.cs_bypass, 3910 NULL, 3911 &type_of_fault); 3912 } 3913 if (kr != KERN_SUCCESS) { 3914 /* abort this page fault */ 3915 vm_map_verify_done(map, &version); 3916 if (real_map != map) 3917 vm_map_unlock(real_map); 3918 PAGE_WAKEUP_DONE(m); 3919 vm_fault_cleanup(m->object, top_page); 3920 vm_object_deallocate(object); 3921 goto done; 3922 } 3923 } else { 3924 3925 vm_map_entry_t entry; 3926 vm_map_offset_t laddr; 3927 vm_map_offset_t ldelta, hdelta; 3928 3929 /* 3930 * do a pmap block mapping from the physical address 3931 * in the object 3932 */ 3933 3934#ifdef ppc 3935 /* While we do not worry about execution protection in */ 3936 /* general, certian pages may have instruction execution */ 3937 /* disallowed. We will check here, and if not allowed */ 3938 /* to execute, we return with a protection failure. 
*/ 3939 3940 if ((fault_type & VM_PROT_EXECUTE) && 3941 (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) { 3942 3943 vm_map_verify_done(map, &version); 3944 3945 if (real_map != map) 3946 vm_map_unlock(real_map); 3947 3948 vm_fault_cleanup(object, top_page); 3949 vm_object_deallocate(object); 3950 3951 kr = KERN_PROTECTION_FAILURE; 3952 goto done; 3953 } 3954#endif /* ppc */ 3955 3956 if (real_map != map) 3957 vm_map_unlock(real_map); 3958 3959 if (original_map != map) { 3960 vm_map_unlock_read(map); 3961 vm_map_lock_read(original_map); 3962 map = original_map; 3963 } 3964 real_map = map; 3965 3966 laddr = vaddr; 3967 hdelta = 0xFFFFF000; 3968 ldelta = 0xFFFFF000; 3969 3970 while (vm_map_lookup_entry(map, laddr, &entry)) { 3971 if (ldelta > (laddr - entry->vme_start)) 3972 ldelta = laddr - entry->vme_start; 3973 if (hdelta > (entry->vme_end - laddr)) 3974 hdelta = entry->vme_end - laddr; 3975 if (entry->is_sub_map) { 3976 3977 laddr = (laddr - entry->vme_start) 3978 + entry->offset; 3979 vm_map_lock_read(entry->object.sub_map); 3980 3981 if (map != real_map) 3982 vm_map_unlock_read(map); 3983 if (entry->use_pmap) { 3984 vm_map_unlock_read(real_map); 3985 real_map = entry->object.sub_map; 3986 } 3987 map = entry->object.sub_map; 3988 3989 } else { 3990 break; 3991 } 3992 } 3993 3994 if (vm_map_lookup_entry(map, laddr, &entry) && 3995 (entry->object.vm_object != NULL) && 3996 (entry->object.vm_object == object)) { 3997 3998 int superpage = (!object->pager_created && object->phys_contiguous)? 
VM_MEM_SUPERPAGE : 0; 3999 if (caller_pmap) { 4000 /* 4001 * Set up a block mapped area 4002 */ 4003 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12)); 4004 pmap_map_block(caller_pmap, 4005 (addr64_t)(caller_pmap_addr - ldelta), 4006 (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) + 4007 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12), 4008 (uint32_t)((ldelta + hdelta) >> 12), prot, 4009 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); 4010 } else { 4011 /* 4012 * Set up a block mapped area 4013 */ 4014 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12)); 4015 pmap_map_block(real_map->pmap, 4016 (addr64_t)(vaddr - ldelta), 4017 (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) + 4018 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12), 4019 (uint32_t)((ldelta + hdelta) >> 12), prot, 4020 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); 4021 } 4022 } 4023 } 4024 4025 /* 4026 * Unlock everything, and return 4027 */ 4028 vm_map_verify_done(map, &version); 4029 if (real_map != map) 4030 vm_map_unlock(real_map); 4031 4032 if (m != VM_PAGE_NULL) { 4033 PAGE_WAKEUP_DONE(m); 4034 4035 vm_fault_cleanup(m->object, top_page); 4036 } else 4037 vm_fault_cleanup(object, top_page); 4038 4039 vm_object_deallocate(object); 4040 4041#undef RELEASE_PAGE 4042 4043 kr = KERN_SUCCESS; 4044done: 4045 thread_interrupt_level(interruptible_state); 4046 4047 throttle_lowpri_io(TRUE); 4048 4049 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 4050 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, 4051 (int)((uint64_t)vaddr >> 32), 4052 (int)vaddr, 4053 kr, 4054 type_of_fault, 4055 0); 4056 4057 return (kr); 4058} 4059 4060/* 4061 * vm_fault_wire: 4062 * 4063 * Wire down a range of virtual addresses in a map. 
4064 */ 4065kern_return_t 4066vm_fault_wire( 4067 vm_map_t map, 4068 vm_map_entry_t entry, 4069 pmap_t pmap, 4070 vm_map_offset_t pmap_addr) 4071{ 4072 4073 register vm_map_offset_t va; 4074 register vm_map_offset_t end_addr = entry->vme_end; 4075 register kern_return_t rc; 4076 4077 assert(entry->in_transition); 4078 4079 if ((entry->object.vm_object != NULL) && 4080 !entry->is_sub_map && 4081 entry->object.vm_object->phys_contiguous) { 4082 return KERN_SUCCESS; 4083 } 4084 4085 /* 4086 * Inform the physical mapping system that the 4087 * range of addresses may not fault, so that 4088 * page tables and such can be locked down as well. 4089 */ 4090 4091 pmap_pageable(pmap, pmap_addr, 4092 pmap_addr + (end_addr - entry->vme_start), FALSE); 4093 4094 /* 4095 * We simulate a fault to get the page and enter it 4096 * in the physical map. 4097 */ 4098 4099 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { 4100 if ((rc = vm_fault_wire_fast( 4101 map, va, entry, pmap, 4102 pmap_addr + (va - entry->vme_start) 4103 )) != KERN_SUCCESS) { 4104 rc = vm_fault(map, va, VM_PROT_NONE, TRUE, 4105 (pmap == kernel_pmap) ? 4106 THREAD_UNINT : THREAD_ABORTSAFE, 4107 pmap, pmap_addr + (va - entry->vme_start)); 4108 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL); 4109 } 4110 4111 if (rc != KERN_SUCCESS) { 4112 struct vm_map_entry tmp_entry = *entry; 4113 4114 /* unwire wired pages */ 4115 tmp_entry.vme_end = va; 4116 vm_fault_unwire(map, 4117 &tmp_entry, FALSE, pmap, pmap_addr); 4118 4119 return rc; 4120 } 4121 } 4122 return KERN_SUCCESS; 4123} 4124 4125/* 4126 * vm_fault_unwire: 4127 * 4128 * Unwire a range of virtual addresses in a map. 
4129 */ 4130void 4131vm_fault_unwire( 4132 vm_map_t map, 4133 vm_map_entry_t entry, 4134 boolean_t deallocate, 4135 pmap_t pmap, 4136 vm_map_offset_t pmap_addr) 4137{ 4138 register vm_map_offset_t va; 4139 register vm_map_offset_t end_addr = entry->vme_end; 4140 vm_object_t object; 4141 struct vm_object_fault_info fault_info; 4142 4143 object = (entry->is_sub_map) 4144 ? VM_OBJECT_NULL : entry->object.vm_object; 4145 4146 /* 4147 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually 4148 * do anything since such memory is wired by default. So we don't have 4149 * anything to undo here. 4150 */ 4151 4152 if (object != VM_OBJECT_NULL && object->phys_contiguous) 4153 return; 4154 4155 fault_info.interruptible = THREAD_UNINT; 4156 fault_info.behavior = entry->behavior; 4157 fault_info.user_tag = entry->alias; 4158 fault_info.lo_offset = entry->offset; 4159 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset; 4160 fault_info.no_cache = entry->no_cache; 4161 fault_info.stealth = TRUE; 4162 fault_info.io_sync = FALSE; 4163 fault_info.cs_bypass = FALSE; 4164 fault_info.mark_zf_absent = FALSE; 4165 fault_info.batch_pmap_op = FALSE; 4166 4167 /* 4168 * Since the pages are wired down, we must be able to 4169 * get their mappings from the physical map system. 
4170 */ 4171 4172 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { 4173 4174 if (object == VM_OBJECT_NULL) { 4175 if (pmap) { 4176 pmap_change_wiring(pmap, 4177 pmap_addr + (va - entry->vme_start), FALSE); 4178 } 4179 (void) vm_fault(map, va, VM_PROT_NONE, 4180 TRUE, THREAD_UNINT, pmap, pmap_addr); 4181 } else { 4182 vm_prot_t prot; 4183 vm_page_t result_page; 4184 vm_page_t top_page; 4185 vm_object_t result_object; 4186 vm_fault_return_t result; 4187 4188 if (end_addr - va > (vm_size_t) -1) { 4189 /* 32-bit overflow */ 4190 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE); 4191 } else { 4192 fault_info.cluster_size = (vm_size_t) (end_addr - va); 4193 assert(fault_info.cluster_size == end_addr - va); 4194 } 4195 4196 do { 4197 prot = VM_PROT_NONE; 4198 4199 vm_object_lock(object); 4200 vm_object_paging_begin(object); 4201 XPR(XPR_VM_FAULT, 4202 "vm_fault_unwire -> vm_fault_page\n", 4203 0,0,0,0,0); 4204 result = vm_fault_page( 4205 object, 4206 entry->offset + (va - entry->vme_start), 4207 VM_PROT_NONE, TRUE, 4208 &prot, &result_page, &top_page, 4209 (int *)0, 4210 NULL, map->no_zero_fill, 4211 FALSE, &fault_info); 4212 } while (result == VM_FAULT_RETRY); 4213 4214 /* 4215 * If this was a mapping to a file on a device that has been forcibly 4216 * unmounted, then we won't get a page back from vm_fault_page(). Just 4217 * move on to the next one in case the remaining pages are mapped from 4218 * different objects. During a forced unmount, the object is terminated 4219 * so the alive flag will be false if this happens. A forced unmount will 4220 * will occur when an external disk is unplugged before the user does an 4221 * eject, so we don't want to panic in that situation. 
4222 */ 4223 4224 if (result == VM_FAULT_MEMORY_ERROR && !object->alive) 4225 continue; 4226 4227 if (result != VM_FAULT_SUCCESS) 4228 panic("vm_fault_unwire: failure"); 4229 4230 result_object = result_page->object; 4231 4232 if (deallocate) { 4233 assert(result_page->phys_page != 4234 vm_page_fictitious_addr); 4235 pmap_disconnect(result_page->phys_page); 4236 VM_PAGE_FREE(result_page); 4237 } else { 4238 if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) 4239 pmap_change_wiring(pmap, 4240 pmap_addr + (va - entry->vme_start), FALSE); 4241 4242 4243 if (VM_PAGE_WIRED(result_page)) { 4244 vm_page_lockspin_queues(); 4245 vm_page_unwire(result_page, TRUE); 4246 vm_page_unlock_queues(); 4247 } 4248 if(entry->zero_wired_pages) { 4249 pmap_zero_page(result_page->phys_page); 4250 entry->zero_wired_pages = FALSE; 4251 } 4252 4253 PAGE_WAKEUP_DONE(result_page); 4254 } 4255 vm_fault_cleanup(result_object, top_page); 4256 } 4257 } 4258 4259 /* 4260 * Inform the physical mapping system that the range 4261 * of addresses may fault, so that page tables and 4262 * such may be unwired themselves. 4263 */ 4264 4265 pmap_pageable(pmap, pmap_addr, 4266 pmap_addr + (end_addr - entry->vme_start), TRUE); 4267 4268} 4269 4270/* 4271 * vm_fault_wire_fast: 4272 * 4273 * Handle common case of a wire down page fault at the given address. 4274 * If successful, the page is inserted into the associated physical map. 4275 * The map entry is passed in to avoid the overhead of a map lookup. 4276 * 4277 * NOTE: the given address should be truncated to the 4278 * proper page address. 4279 * 4280 * KERN_SUCCESS is returned if the page fault is handled; otherwise, 4281 * a standard error specifying why the fault is fatal is returned. 4282 * 4283 * The map in question must be referenced, and remains so. 4284 * Caller has a read lock on the map. 4285 * 4286 * This is a stripped version of vm_fault() for wiring pages. 
Anything 4287 * other than the common case will return KERN_FAILURE, and the caller 4288 * is expected to call vm_fault(). 4289 */ 4290kern_return_t 4291vm_fault_wire_fast( 4292 __unused vm_map_t map, 4293 vm_map_offset_t va, 4294 vm_map_entry_t entry, 4295 pmap_t pmap, 4296 vm_map_offset_t pmap_addr) 4297{ 4298 vm_object_t object; 4299 vm_object_offset_t offset; 4300 register vm_page_t m; 4301 vm_prot_t prot; 4302 thread_t thread = current_thread(); 4303 int type_of_fault; 4304 kern_return_t kr; 4305 4306 VM_STAT_INCR(faults); 4307 4308 if (thread != THREAD_NULL && thread->task != TASK_NULL) 4309 thread->task->faults++; 4310 4311/* 4312 * Recovery actions 4313 */ 4314 4315#undef RELEASE_PAGE 4316#define RELEASE_PAGE(m) { \ 4317 PAGE_WAKEUP_DONE(m); \ 4318 vm_page_lockspin_queues(); \ 4319 vm_page_unwire(m, TRUE); \ 4320 vm_page_unlock_queues(); \ 4321} 4322 4323 4324#undef UNLOCK_THINGS 4325#define UNLOCK_THINGS { \ 4326 vm_object_paging_end(object); \ 4327 vm_object_unlock(object); \ 4328} 4329 4330#undef UNLOCK_AND_DEALLOCATE 4331#define UNLOCK_AND_DEALLOCATE { \ 4332 UNLOCK_THINGS; \ 4333 vm_object_deallocate(object); \ 4334} 4335/* 4336 * Give up and have caller do things the hard way. 4337 */ 4338 4339#define GIVE_UP { \ 4340 UNLOCK_AND_DEALLOCATE; \ 4341 return(KERN_FAILURE); \ 4342} 4343 4344 4345 /* 4346 * If this entry is not directly to a vm_object, bail out. 4347 */ 4348 if (entry->is_sub_map) 4349 return(KERN_FAILURE); 4350 4351 /* 4352 * Find the backing store object and offset into it. 4353 */ 4354 4355 object = entry->object.vm_object; 4356 offset = (va - entry->vme_start) + entry->offset; 4357 prot = entry->protection; 4358 4359 /* 4360 * Make a reference to this object to prevent its 4361 * disposal while we are messing with it. 
4362 */ 4363 4364 vm_object_lock(object); 4365 vm_object_reference_locked(object); 4366 vm_object_paging_begin(object); 4367 4368 /* 4369 * INVARIANTS (through entire routine): 4370 * 4371 * 1) At all times, we must either have the object 4372 * lock or a busy page in some object to prevent 4373 * some other thread from trying to bring in 4374 * the same page. 4375 * 4376 * 2) Once we have a busy page, we must remove it from 4377 * the pageout queues, so that the pageout daemon 4378 * will not grab it away. 4379 * 4380 */ 4381 4382 /* 4383 * Look for page in top-level object. If it's not there or 4384 * there's something going on, give up. 4385 * ENCRYPTED SWAP: use the slow fault path, since we'll need to 4386 * decrypt the page before wiring it down. 4387 */ 4388 m = vm_page_lookup(object, offset); 4389 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) || 4390 (m->unusual && ( m->error || m->restart || m->absent))) { 4391 4392 GIVE_UP; 4393 } 4394 ASSERT_PAGE_DECRYPTED(m); 4395 4396 if (m->fictitious && 4397 m->phys_page == vm_page_guard_addr) { 4398 /* 4399 * Guard pages are fictitious pages and are never 4400 * entered into a pmap, so let's say it's been wired... 4401 */ 4402 kr = KERN_SUCCESS; 4403 goto done; 4404 } 4405 4406 /* 4407 * Wire the page down now. All bail outs beyond this 4408 * point must unwire the page. 4409 */ 4410 4411 vm_page_lockspin_queues(); 4412 vm_page_wire(m); 4413 vm_page_unlock_queues(); 4414 4415 /* 4416 * Mark page busy for other threads. 4417 */ 4418 assert(!m->busy); 4419 m->busy = TRUE; 4420 assert(!m->absent); 4421 4422 /* 4423 * Give up if the page is being written and there's a copy object 4424 */ 4425 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { 4426 RELEASE_PAGE(m); 4427 GIVE_UP; 4428 } 4429 4430 /* 4431 * Put this page into the physical map. 
4432 */ 4433 type_of_fault = DBG_CACHE_HIT_FAULT; 4434 kr = vm_fault_enter(m, 4435 pmap, 4436 pmap_addr, 4437 prot, 4438 prot, 4439 TRUE, 4440 FALSE, 4441 FALSE, 4442 FALSE, 4443 NULL, 4444 &type_of_fault); 4445 4446done: 4447 /* 4448 * Unlock everything, and return 4449 */ 4450 4451 PAGE_WAKEUP_DONE(m); 4452 UNLOCK_AND_DEALLOCATE; 4453 4454 return kr; 4455 4456} 4457 4458/* 4459 * Routine: vm_fault_copy_cleanup 4460 * Purpose: 4461 * Release a page used by vm_fault_copy. 4462 */ 4463 4464void 4465vm_fault_copy_cleanup( 4466 vm_page_t page, 4467 vm_page_t top_page) 4468{ 4469 vm_object_t object = page->object; 4470 4471 vm_object_lock(object); 4472 PAGE_WAKEUP_DONE(page); 4473 if (!page->active && !page->inactive && !page->throttled) { 4474 vm_page_lockspin_queues(); 4475 if (!page->active && !page->inactive && !page->throttled) 4476 vm_page_activate(page); 4477 vm_page_unlock_queues(); 4478 } 4479 vm_fault_cleanup(object, top_page); 4480} 4481 4482void 4483vm_fault_copy_dst_cleanup( 4484 vm_page_t page) 4485{ 4486 vm_object_t object; 4487 4488 if (page != VM_PAGE_NULL) { 4489 object = page->object; 4490 vm_object_lock(object); 4491 vm_page_lockspin_queues(); 4492 vm_page_unwire(page, TRUE); 4493 vm_page_unlock_queues(); 4494 vm_object_paging_end(object); 4495 vm_object_unlock(object); 4496 } 4497} 4498 4499/* 4500 * Routine: vm_fault_copy 4501 * 4502 * Purpose: 4503 * Copy pages from one virtual memory object to another -- 4504 * neither the source nor destination pages need be resident. 4505 * 4506 * Before actually copying a page, the version associated with 4507 * the destination address map wil be verified. 4508 * 4509 * In/out conditions: 4510 * The caller must hold a reference, but not a lock, to 4511 * each of the source and destination objects and to the 4512 * destination map. 4513 * 4514 * Results: 4515 * Returns KERN_SUCCESS if no errors were encountered in 4516 * reading or writing the data. 
Returns KERN_INTERRUPTED if 4517 * the operation was interrupted (only possible if the 4518 * "interruptible" argument is asserted). Other return values 4519 * indicate a permanent error in copying the data. 4520 * 4521 * The actual amount of data copied will be returned in the 4522 * "copy_size" argument. In the event that the destination map 4523 * verification failed, this amount may be less than the amount 4524 * requested. 4525 */ 4526kern_return_t 4527vm_fault_copy( 4528 vm_object_t src_object, 4529 vm_object_offset_t src_offset, 4530 vm_map_size_t *copy_size, /* INOUT */ 4531 vm_object_t dst_object, 4532 vm_object_offset_t dst_offset, 4533 vm_map_t dst_map, 4534 vm_map_version_t *dst_version, 4535 int interruptible) 4536{ 4537 vm_page_t result_page; 4538 4539 vm_page_t src_page; 4540 vm_page_t src_top_page; 4541 vm_prot_t src_prot; 4542 4543 vm_page_t dst_page; 4544 vm_page_t dst_top_page; 4545 vm_prot_t dst_prot; 4546 4547 vm_map_size_t amount_left; 4548 vm_object_t old_copy_object; 4549 kern_return_t error = 0; 4550 vm_fault_return_t result; 4551 4552 vm_map_size_t part_size; 4553 struct vm_object_fault_info fault_info_src; 4554 struct vm_object_fault_info fault_info_dst; 4555 4556 /* 4557 * In order not to confuse the clustered pageins, align 4558 * the different offsets on a page boundary. 
4559 */ 4560 4561#define RETURN(x) \ 4562 MACRO_BEGIN \ 4563 *copy_size -= amount_left; \ 4564 MACRO_RETURN(x); \ 4565 MACRO_END 4566 4567 amount_left = *copy_size; 4568 4569 fault_info_src.interruptible = interruptible; 4570 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL; 4571 fault_info_src.user_tag = 0; 4572 fault_info_src.lo_offset = vm_object_trunc_page(src_offset); 4573 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left; 4574 fault_info_src.no_cache = FALSE; 4575 fault_info_src.stealth = TRUE; 4576 fault_info_src.io_sync = FALSE; 4577 fault_info_src.cs_bypass = FALSE; 4578 fault_info_src.mark_zf_absent = FALSE; 4579 fault_info_src.batch_pmap_op = FALSE; 4580 4581 fault_info_dst.interruptible = interruptible; 4582 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL; 4583 fault_info_dst.user_tag = 0; 4584 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset); 4585 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left; 4586 fault_info_dst.no_cache = FALSE; 4587 fault_info_dst.stealth = TRUE; 4588 fault_info_dst.io_sync = FALSE; 4589 fault_info_dst.cs_bypass = FALSE; 4590 fault_info_dst.mark_zf_absent = FALSE; 4591 fault_info_dst.batch_pmap_op = FALSE; 4592 4593 do { /* while (amount_left > 0) */ 4594 /* 4595 * There may be a deadlock if both source and destination 4596 * pages are the same. To avoid this deadlock, the copy must 4597 * start by getting the destination page in order to apply 4598 * COW semantics if any. 
4599 */ 4600 4601 RetryDestinationFault: ; 4602 4603 dst_prot = VM_PROT_WRITE|VM_PROT_READ; 4604 4605 vm_object_lock(dst_object); 4606 vm_object_paging_begin(dst_object); 4607 4608 if (amount_left > (vm_size_t) -1) { 4609 /* 32-bit overflow */ 4610 fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE); 4611 } else { 4612 fault_info_dst.cluster_size = (vm_size_t) amount_left; 4613 assert(fault_info_dst.cluster_size == amount_left); 4614 } 4615 4616 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0); 4617 result = vm_fault_page(dst_object, 4618 vm_object_trunc_page(dst_offset), 4619 VM_PROT_WRITE|VM_PROT_READ, 4620 FALSE, 4621 &dst_prot, &dst_page, &dst_top_page, 4622 (int *)0, 4623 &error, 4624 dst_map->no_zero_fill, 4625 FALSE, &fault_info_dst); 4626 switch (result) { 4627 case VM_FAULT_SUCCESS: 4628 break; 4629 case VM_FAULT_RETRY: 4630 goto RetryDestinationFault; 4631 case VM_FAULT_MEMORY_SHORTAGE: 4632 if (vm_page_wait(interruptible)) 4633 goto RetryDestinationFault; 4634 /* fall thru */ 4635 case VM_FAULT_INTERRUPTED: 4636 RETURN(MACH_SEND_INTERRUPTED); 4637 case VM_FAULT_SUCCESS_NO_VM_PAGE: 4638 /* success but no VM page: fail the copy */ 4639 vm_object_paging_end(dst_object); 4640 vm_object_unlock(dst_object); 4641 /*FALLTHROUGH*/ 4642 case VM_FAULT_MEMORY_ERROR: 4643 if (error) 4644 return (error); 4645 else 4646 return(KERN_MEMORY_ERROR); 4647 default: 4648 panic("vm_fault_copy: unexpected error 0x%x from " 4649 "vm_fault_page()\n", result); 4650 } 4651 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE); 4652 4653 old_copy_object = dst_page->object->copy; 4654 4655 /* 4656 * There exists the possiblity that the source and 4657 * destination page are the same. But we can't 4658 * easily determine that now. If they are the 4659 * same, the call to vm_fault_page() for the 4660 * destination page will deadlock. To prevent this we 4661 * wire the page so we can drop busy without having 4662 * the page daemon steal the page. 
We clean up the 4663 * top page but keep the paging reference on the object 4664 * holding the dest page so it doesn't go away. 4665 */ 4666 4667 vm_page_lockspin_queues(); 4668 vm_page_wire(dst_page); 4669 vm_page_unlock_queues(); 4670 PAGE_WAKEUP_DONE(dst_page); 4671 vm_object_unlock(dst_page->object); 4672 4673 if (dst_top_page != VM_PAGE_NULL) { 4674 vm_object_lock(dst_object); 4675 VM_PAGE_FREE(dst_top_page); 4676 vm_object_paging_end(dst_object); 4677 vm_object_unlock(dst_object); 4678 } 4679 4680 RetrySourceFault: ; 4681 4682 if (src_object == VM_OBJECT_NULL) { 4683 /* 4684 * No source object. We will just 4685 * zero-fill the page in dst_object. 4686 */ 4687 src_page = VM_PAGE_NULL; 4688 result_page = VM_PAGE_NULL; 4689 } else { 4690 vm_object_lock(src_object); 4691 src_page = vm_page_lookup(src_object, 4692 vm_object_trunc_page(src_offset)); 4693 if (src_page == dst_page) { 4694 src_prot = dst_prot; 4695 result_page = VM_PAGE_NULL; 4696 } else { 4697 src_prot = VM_PROT_READ; 4698 vm_object_paging_begin(src_object); 4699 4700 if (amount_left > (vm_size_t) -1) { 4701 /* 32-bit overflow */ 4702 fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE); 4703 } else { 4704 fault_info_src.cluster_size = (vm_size_t) amount_left; 4705 assert(fault_info_src.cluster_size == amount_left); 4706 } 4707 4708 XPR(XPR_VM_FAULT, 4709 "vm_fault_copy(2) -> vm_fault_page\n", 4710 0,0,0,0,0); 4711 result = vm_fault_page( 4712 src_object, 4713 vm_object_trunc_page(src_offset), 4714 VM_PROT_READ, FALSE, 4715 &src_prot, 4716 &result_page, &src_top_page, 4717 (int *)0, &error, FALSE, 4718 FALSE, &fault_info_src); 4719 4720 switch (result) { 4721 case VM_FAULT_SUCCESS: 4722 break; 4723 case VM_FAULT_RETRY: 4724 goto RetrySourceFault; 4725 case VM_FAULT_MEMORY_SHORTAGE: 4726 if (vm_page_wait(interruptible)) 4727 goto RetrySourceFault; 4728 /* fall thru */ 4729 case VM_FAULT_INTERRUPTED: 4730 vm_fault_copy_dst_cleanup(dst_page); 4731 RETURN(MACH_SEND_INTERRUPTED); 4732 case 
VM_FAULT_SUCCESS_NO_VM_PAGE: 4733 /* success but no VM page: fail */ 4734 vm_object_paging_end(src_object); 4735 vm_object_unlock(src_object); 4736 /*FALLTHROUGH*/ 4737 case VM_FAULT_MEMORY_ERROR: 4738 vm_fault_copy_dst_cleanup(dst_page); 4739 if (error) 4740 return (error); 4741 else 4742 return(KERN_MEMORY_ERROR); 4743 default: 4744 panic("vm_fault_copy(2): unexpected " 4745 "error 0x%x from " 4746 "vm_fault_page()\n", result); 4747 } 4748 4749 4750 assert((src_top_page == VM_PAGE_NULL) == 4751 (result_page->object == src_object)); 4752 } 4753 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE); 4754 vm_object_unlock(result_page->object); 4755 } 4756 4757 if (!vm_map_verify(dst_map, dst_version)) { 4758 if (result_page != VM_PAGE_NULL && src_page != dst_page) 4759 vm_fault_copy_cleanup(result_page, src_top_page); 4760 vm_fault_copy_dst_cleanup(dst_page); 4761 break; 4762 } 4763 4764 vm_object_lock(dst_page->object); 4765 4766 if (dst_page->object->copy != old_copy_object) { 4767 vm_object_unlock(dst_page->object); 4768 vm_map_verify_done(dst_map, dst_version); 4769 if (result_page != VM_PAGE_NULL && src_page != dst_page) 4770 vm_fault_copy_cleanup(result_page, src_top_page); 4771 vm_fault_copy_dst_cleanup(dst_page); 4772 break; 4773 } 4774 vm_object_unlock(dst_page->object); 4775 4776 /* 4777 * Copy the page, and note that it is dirty 4778 * immediately. 
4779 */ 4780 4781 if (!page_aligned(src_offset) || 4782 !page_aligned(dst_offset) || 4783 !page_aligned(amount_left)) { 4784 4785 vm_object_offset_t src_po, 4786 dst_po; 4787 4788 src_po = src_offset - vm_object_trunc_page(src_offset); 4789 dst_po = dst_offset - vm_object_trunc_page(dst_offset); 4790 4791 if (dst_po > src_po) { 4792 part_size = PAGE_SIZE - dst_po; 4793 } else { 4794 part_size = PAGE_SIZE - src_po; 4795 } 4796 if (part_size > (amount_left)){ 4797 part_size = amount_left; 4798 } 4799 4800 if (result_page == VM_PAGE_NULL) { 4801 assert((vm_offset_t) dst_po == dst_po); 4802 assert((vm_size_t) part_size == part_size); 4803 vm_page_part_zero_fill(dst_page, 4804 (vm_offset_t) dst_po, 4805 (vm_size_t) part_size); 4806 } else { 4807 assert((vm_offset_t) src_po == src_po); 4808 assert((vm_offset_t) dst_po == dst_po); 4809 assert((vm_size_t) part_size == part_size); 4810 vm_page_part_copy(result_page, 4811 (vm_offset_t) src_po, 4812 dst_page, 4813 (vm_offset_t) dst_po, 4814 (vm_size_t)part_size); 4815 if(!dst_page->dirty){ 4816 vm_object_lock(dst_object); 4817 SET_PAGE_DIRTY(dst_page, TRUE); 4818 vm_object_unlock(dst_page->object); 4819 } 4820 4821 } 4822 } else { 4823 part_size = PAGE_SIZE; 4824 4825 if (result_page == VM_PAGE_NULL) 4826 vm_page_zero_fill(dst_page); 4827 else{ 4828 vm_object_lock(result_page->object); 4829 vm_page_copy(result_page, dst_page); 4830 vm_object_unlock(result_page->object); 4831 4832 if(!dst_page->dirty){ 4833 vm_object_lock(dst_object); 4834 SET_PAGE_DIRTY(dst_page, TRUE); 4835 vm_object_unlock(dst_page->object); 4836 } 4837 } 4838 4839 } 4840 4841 /* 4842 * Unlock everything, and return 4843 */ 4844 4845 vm_map_verify_done(dst_map, dst_version); 4846 4847 if (result_page != VM_PAGE_NULL && src_page != dst_page) 4848 vm_fault_copy_cleanup(result_page, src_top_page); 4849 vm_fault_copy_dst_cleanup(dst_page); 4850 4851 amount_left -= part_size; 4852 src_offset += part_size; 4853 dst_offset += part_size; 4854 } while (amount_left > 
0); 4855 4856 RETURN(KERN_SUCCESS); 4857#undef RETURN 4858 4859 /*NOTREACHED*/ 4860} 4861 4862#if VM_FAULT_CLASSIFY 4863/* 4864 * Temporary statistics gathering support. 4865 */ 4866 4867/* 4868 * Statistics arrays: 4869 */ 4870#define VM_FAULT_TYPES_MAX 5 4871#define VM_FAULT_LEVEL_MAX 8 4872 4873int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX]; 4874 4875#define VM_FAULT_TYPE_ZERO_FILL 0 4876#define VM_FAULT_TYPE_MAP_IN 1 4877#define VM_FAULT_TYPE_PAGER 2 4878#define VM_FAULT_TYPE_COPY 3 4879#define VM_FAULT_TYPE_OTHER 4 4880 4881 4882void 4883vm_fault_classify(vm_object_t object, 4884 vm_object_offset_t offset, 4885 vm_prot_t fault_type) 4886{ 4887 int type, level = 0; 4888 vm_page_t m; 4889 4890 while (TRUE) { 4891 m = vm_page_lookup(object, offset); 4892 if (m != VM_PAGE_NULL) { 4893 if (m->busy || m->error || m->restart || m->absent) { 4894 type = VM_FAULT_TYPE_OTHER; 4895 break; 4896 } 4897 if (((fault_type & VM_PROT_WRITE) == 0) || 4898 ((level == 0) && object->copy == VM_OBJECT_NULL)) { 4899 type = VM_FAULT_TYPE_MAP_IN; 4900 break; 4901 } 4902 type = VM_FAULT_TYPE_COPY; 4903 break; 4904 } 4905 else { 4906 if (object->pager_created) { 4907 type = VM_FAULT_TYPE_PAGER; 4908 break; 4909 } 4910 if (object->shadow == VM_OBJECT_NULL) { 4911 type = VM_FAULT_TYPE_ZERO_FILL; 4912 break; 4913 } 4914 4915 offset += object->vo_shadow_offset; 4916 object = object->shadow; 4917 level++; 4918 continue; 4919 } 4920 } 4921 4922 if (level > VM_FAULT_LEVEL_MAX) 4923 level = VM_FAULT_LEVEL_MAX; 4924 4925 vm_fault_stats[type][level] += 1; 4926 4927 return; 4928} 4929 4930/* cleanup routine to call from debugger */ 4931 4932void 4933vm_fault_classify_init(void) 4934{ 4935 int type, level; 4936 4937 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) { 4938 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) { 4939 vm_fault_stats[type][level] = 0; 4940 } 4941 } 4942 4943 return; 4944} 4945#endif /* VM_FAULT_CLASSIFY */ 4946 4947 4948extern int cs_validation; 4949 4950void 
4951vm_page_validate_cs_mapped( 4952 vm_page_t page, 4953 const void *kaddr) 4954{ 4955 vm_object_t object; 4956 vm_object_offset_t offset; 4957 kern_return_t kr; 4958 memory_object_t pager; 4959 void *blobs; 4960 boolean_t validated, tainted; 4961 4962 assert(page->busy); 4963 vm_object_lock_assert_exclusive(page->object); 4964 4965 if (!cs_validation) { 4966 return; 4967 } 4968 4969 if (page->wpmapped && !page->cs_tainted) { 4970 /* 4971 * This page was mapped for "write" access sometime in the 4972 * past and could still be modifiable in the future. 4973 * Consider it tainted. 4974 * [ If the page was already found to be "tainted", no 4975 * need to re-validate. ] 4976 */ 4977 page->cs_validated = TRUE; 4978 page->cs_tainted = TRUE; 4979 if (cs_debug) { 4980 printf("CODESIGNING: vm_page_validate_cs: " 4981 "page %p obj %p off 0x%llx " 4982 "was modified\n", 4983 page, page->object, page->offset); 4984 } 4985 vm_cs_validated_dirtied++; 4986 } 4987 4988 if (page->cs_validated) { 4989 return; 4990 } 4991 4992 vm_cs_validates++; 4993 4994 object = page->object; 4995 assert(object->code_signed); 4996 offset = page->offset; 4997 4998 if (!object->alive || object->terminating || object->pager == NULL) { 4999 /* 5000 * The object is terminating and we don't have its pager 5001 * so we can't validate the data... 5002 */ 5003 return; 5004 } 5005 /* 5006 * Since we get here to validate a page that was brought in by 5007 * the pager, we know that this pager is all setup and ready 5008 * by now. 
5009 */ 5010 assert(!object->internal); 5011 assert(object->pager != NULL); 5012 assert(object->pager_ready); 5013 5014 pager = object->pager; 5015 assert(object->paging_in_progress); 5016 kr = vnode_pager_get_object_cs_blobs(pager, &blobs); 5017 if (kr != KERN_SUCCESS) { 5018 blobs = NULL; 5019 } 5020 5021 /* verify the SHA1 hash for this page */ 5022 validated = cs_validate_page(blobs, 5023 pager, 5024 offset + object->paging_offset, 5025 (const void *)kaddr, 5026 &tainted); 5027 5028 page->cs_validated = validated; 5029 if (validated) { 5030 page->cs_tainted = tainted; 5031 } 5032} 5033 5034void 5035vm_page_validate_cs( 5036 vm_page_t page) 5037{ 5038 vm_object_t object; 5039 vm_object_offset_t offset; 5040 vm_map_offset_t koffset; 5041 vm_map_size_t ksize; 5042 vm_offset_t kaddr; 5043 kern_return_t kr; 5044 boolean_t busy_page; 5045 5046 vm_object_lock_assert_held(page->object); 5047 5048 if (!cs_validation) { 5049 return; 5050 } 5051 5052 if (page->wpmapped && !page->cs_tainted) { 5053 vm_object_lock_assert_exclusive(page->object); 5054 5055 /* 5056 * This page was mapped for "write" access sometime in the 5057 * past and could still be modifiable in the future. 5058 * Consider it tainted. 5059 * [ If the page was already found to be "tainted", no 5060 * need to re-validate. 
] 5061 */ 5062 page->cs_validated = TRUE; 5063 page->cs_tainted = TRUE; 5064 if (cs_debug) { 5065 printf("CODESIGNING: vm_page_validate_cs: " 5066 "page %p obj %p off 0x%llx " 5067 "was modified\n", 5068 page, page->object, page->offset); 5069 } 5070 vm_cs_validated_dirtied++; 5071 } 5072 5073 if (page->cs_validated) { 5074 return; 5075 } 5076 5077#if CHECK_CS_VALIDATION_BITMAP 5078 if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) { 5079 page->cs_validated = TRUE; 5080 page->cs_tainted = FALSE; 5081 vm_cs_bitmap_validated++; 5082 return; 5083 } 5084#endif 5085 vm_object_lock_assert_exclusive(page->object); 5086 5087 object = page->object; 5088 assert(object->code_signed); 5089 offset = page->offset; 5090 5091 busy_page = page->busy; 5092 if (!busy_page) { 5093 /* keep page busy while we map (and unlock) the VM object */ 5094 page->busy = TRUE; 5095 } 5096 5097 /* 5098 * Take a paging reference on the VM object 5099 * to protect it from collapse or bypass, 5100 * and keep it from disappearing too. 5101 */ 5102 vm_object_paging_begin(object); 5103 5104 /* map the page in the kernel address space */ 5105 koffset = 0; 5106 ksize = PAGE_SIZE_64; 5107 kr = vm_paging_map_object(&koffset, 5108 page, 5109 object, 5110 offset, 5111 &ksize, 5112 VM_PROT_READ, 5113 FALSE); /* can't unlock object ! 
*/ 5114 if (kr != KERN_SUCCESS) { 5115 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr); 5116 } 5117 kaddr = CAST_DOWN(vm_offset_t, koffset); 5118 5119 /* validate the mapped page */ 5120 vm_page_validate_cs_mapped(page, (const void *) kaddr); 5121 5122#if CHECK_CS_VALIDATION_BITMAP 5123 if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) { 5124 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET ); 5125 } 5126#endif 5127 assert(page->busy); 5128 assert(object == page->object); 5129 vm_object_lock_assert_exclusive(object); 5130 5131 if (!busy_page) { 5132 PAGE_WAKEUP_DONE(page); 5133 } 5134 if (koffset != 0) { 5135 /* unmap the map from the kernel address space */ 5136 vm_paging_unmap_object(object, koffset, koffset + ksize); 5137 koffset = 0; 5138 ksize = 0; 5139 kaddr = 0; 5140 } 5141 vm_object_paging_end(object); 5142} 5143