1/*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 *	File:	vm_fault.c
60 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 *	Page fault handling module.
63 */
64
65#include <mach_cluster_stats.h>
66#include <mach_pagemap.h>
67#include <libkern/OSAtomic.h>
68
69#include <mach/mach_types.h>
70#include <mach/kern_return.h>
71#include <mach/message.h>	/* for error codes */
72#include <mach/vm_param.h>
73#include <mach/vm_behavior.h>
74#include <mach/memory_object.h>
75				/* For memory_object_data_{request,unlock} */
76#include <mach/sdt.h>
77
78#include <kern/kern_types.h>
79#include <kern/host_statistics.h>
80#include <kern/counters.h>
81#include <kern/task.h>
82#include <kern/thread.h>
83#include <kern/sched_prim.h>
84#include <kern/host.h>
85#include <kern/xpr.h>
86#include <kern/mach_param.h>
87#include <kern/macro_help.h>
88#include <kern/zalloc.h>
89#include <kern/misc_protos.h>
90
91#include <vm/vm_fault.h>
92#include <vm/vm_map.h>
93#include <vm/vm_object.h>
94#include <vm/vm_page.h>
95#include <vm/vm_kern.h>
96#include <vm/pmap.h>
97#include <vm/vm_pageout.h>
98#include <vm/vm_protos.h>
99#include <vm/vm_external.h>
100#include <vm/memory_object.h>
101#include <vm/vm_purgeable_internal.h>	/* Needed by some vm_page.h macros */
102#include <vm/vm_shared_region.h>
103
104#define VM_FAULT_CLASSIFY	0
105
106#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
107
108int	vm_object_pagein_throttle = 16;
109
110/*
111 * We apply a hard throttle to the demand-zero rate of tasks that we believe are running out of control;
112 * it kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
113 * of memory if they're buggy, and can run the system completely out of swap space.  If this happens, we
114 * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
115 * keep the UI active so that the user has a chance to kill the offending task before the system
116 * completely hangs.
117 *
118 * The hard throttle is only applied when the system is nearly completely out of swap space, and only
119 * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
120 * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
121 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
122 */
123
124extern boolean_t thread_is_io_throttled(void);
125extern void throttle_lowpri_io(int);
126
127uint64_t vm_hard_throttle_threshold;
128
129extern unsigned int dp_pages_free, dp_pages_reserve;
130
131#define NEED_TO_HARD_THROTTLE_THIS_TASK() 	(((dp_pages_free + dp_pages_reserve < 2000) && \
132						 (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \
133						 (current_task() != kernel_task) && VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) || \
134						 (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \
135						  (get_task_resident_size(current_task()) > vm_hard_throttle_threshold)))
136
137
138#define HARD_THROTTLE_DELAY	20000	/* 20000 us == 20 ms */
139#define SOFT_THROTTLE_DELAY	2000	/* 2000 us == 2 ms */
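
/*
 * Illustrative sketch (not compiled in): how the fault path is expected to
 * consume the delays defined above.  vm_page_throttled() below returns 0,
 * SOFT_THROTTLE_DELAY or HARD_THROTTLE_DELAY, and the caller simply sleeps
 * for that many microseconds before retrying the fault.  The helper name
 * "example_throttle_if_needed" is hypothetical and exists only for this sketch.
 */
#if 0
static void
example_throttle_if_needed(void)
{
	int throttle_delay;

	if ((throttle_delay = vm_page_throttled())) {
		/* back off for 2ms (soft) or 20ms (hard) before retrying */
		delay(throttle_delay);
	}
}
#endif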
140
141
142extern int cs_debug;
143
144boolean_t current_thread_aborted(void);
145
146/* Forward declarations of internal routines. */
147extern kern_return_t vm_fault_wire_fast(
148				vm_map_t	map,
149				vm_map_offset_t	va,
150				vm_map_entry_t	entry,
151				pmap_t		pmap,
152				vm_map_offset_t	pmap_addr);
153
154extern void vm_fault_continue(void);
155
156extern void vm_fault_copy_cleanup(
157				vm_page_t	page,
158				vm_page_t	top_page);
159
160extern void vm_fault_copy_dst_cleanup(
161				vm_page_t	page);
162
163#if	VM_FAULT_CLASSIFY
164extern void vm_fault_classify(vm_object_t	object,
165			  vm_object_offset_t	offset,
166			  vm_prot_t		fault_type);
167
168extern void vm_fault_classify_init(void);
169#endif
170
171unsigned long vm_pmap_enter_blocked = 0;
172unsigned long vm_pmap_enter_retried = 0;
173
174unsigned long vm_cs_validates = 0;
175unsigned long vm_cs_revalidates = 0;
176unsigned long vm_cs_query_modified = 0;
177unsigned long vm_cs_validated_dirtied = 0;
178unsigned long vm_cs_bitmap_validated = 0;
179#if CONFIG_ENFORCE_SIGNED_CODE
180int cs_enforcement_disable=0;
181#else
182static const int cs_enforcement_disable=1;
183#endif
184
185/*
186 *	Routine:	vm_fault_init
187 *	Purpose:
188 *		Initialize our private data structures.
189 */
190void
191vm_fault_init(void)
192{
193#if !SECURE_KERNEL
194#if CONFIG_ENFORCE_SIGNED_CODE
195	PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable,
196			   sizeof (cs_enforcement_disable));
197#endif
198	PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
199#endif
200
201	/*
202	 * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
203	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
204	 * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
205	 * and reduce the value down to 10% for very large memory configurations.  This helps give us a
206	 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
207	 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
208	 */
209
210	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
211}
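
/*
 * Illustrative sketch (not compiled in): sample values for the
 * vm_hard_throttle_threshold formula above, assuming sane_size is an exact
 * number of gigabytes.  The percentage is (35 - MIN(GB, 25)), so it runs
 * from 35% on tiny configurations down to a floor of 10% at 25GB and above.
 */
#if 0
static uint64_t
example_hard_throttle_threshold(uint64_t ram_bytes)
{
	int gb  = (int)(ram_bytes / (1024ULL * 1024 * 1024));
	int pct = 35 - ((gb < 25) ? gb : 25);	/* 35% down to 10% */

	/*
	 *  1 GB -> 34% ~= 0.34 GB
	 *  8 GB -> 27% ~= 2.16 GB
	 * 32 GB -> 10% ~= 3.2  GB
	 */
	return (ram_bytes * pct) / 100;
}
#endif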
212
213/*
214 *	Routine:	vm_fault_cleanup
215 *	Purpose:
216 *		Clean up the result of vm_fault_page.
217 *	Results:
218 *		The paging reference for "object" is released.
219 *		"object" is unlocked.
220 *		If "top_page" is not null,  "top_page" is
221 *		freed and the paging reference for the object
222 *		containing it is released.
223 *
224 *	In/out conditions:
225 *		"object" must be locked.
226 */
227void
228vm_fault_cleanup(
229	register vm_object_t	object,
230	register vm_page_t	top_page)
231{
232	vm_object_paging_end(object);
233 	vm_object_unlock(object);
234
235	if (top_page != VM_PAGE_NULL) {
236	        object = top_page->object;
237
238		vm_object_lock(object);
239		VM_PAGE_FREE(top_page);
240		vm_object_paging_end(object);
241		vm_object_unlock(object);
242	}
243}
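
/*
 * Illustrative sketch (not compiled in): the calling convention of
 * vm_fault_cleanup().  The caller holds the object lock and a paging
 * reference; both are consumed here.  If vm_fault_page() left a busy
 * placeholder in the top-level object, it is passed as "top_page" and is
 * freed along with that object's paging reference.  The wrapper name
 * "example_abort_fault" is hypothetical.
 */
#if 0
static void
example_abort_fault(vm_object_t object, vm_page_t top_page)
{
	/* "object" is locked and holds a paging reference at this point */
	vm_fault_cleanup(object, top_page);
	/* "object" is now unlocked and its paging reference is released */
}
#endif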
244
245#if	MACH_CLUSTER_STATS
246#define MAXCLUSTERPAGES 16
247struct {
248	unsigned long pages_in_cluster;
249	unsigned long pages_at_higher_offsets;
250	unsigned long pages_at_lower_offsets;
251} cluster_stats_in[MAXCLUSTERPAGES];
252#define CLUSTER_STAT(clause)	clause
253#define CLUSTER_STAT_HIGHER(x)	\
254	((cluster_stats_in[(x)].pages_at_higher_offsets)++)
255#define CLUSTER_STAT_LOWER(x)	\
256	 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
257#define CLUSTER_STAT_CLUSTER(x)	\
258	((cluster_stats_in[(x)].pages_in_cluster)++)
259#else	/* MACH_CLUSTER_STATS */
260#define CLUSTER_STAT(clause)
261#endif	/* MACH_CLUSTER_STATS */
262
263#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
264
265
266boolean_t	vm_page_deactivate_behind = TRUE;
267/*
268 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
269 */
270#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW	128
271#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER	16		/* don't make this too big... */
272                                                                /* we use it to size an array on the stack */
273
274int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
275
276#define MAX_SEQUENTIAL_RUN	(1024 * 1024 * 1024)
277
278/*
279 * vm_fault_is_sequential
280 *
281 * Determine if sequential access is in progress
282 * in accordance with the behavior specified.
283 * Update state to indicate current access pattern.
284 *
285 * object must have at least the shared lock held
286 */
287static
288void
289vm_fault_is_sequential(
290	vm_object_t		object,
291	vm_object_offset_t	offset,
292	vm_behavior_t		behavior)
293{
294        vm_object_offset_t	last_alloc;
295	int			sequential;
296	int			orig_sequential;
297
298        last_alloc = object->last_alloc;
299	sequential = object->sequential;
300	orig_sequential = sequential;
301
302	switch (behavior) {
303	case VM_BEHAVIOR_RANDOM:
304	        /*
305		 * reset indicator of sequential behavior
306		 */
307	        sequential = 0;
308	        break;
309
310	case VM_BEHAVIOR_SEQUENTIAL:
311	        if (offset && last_alloc == offset - PAGE_SIZE_64) {
312		        /*
313			 * advance indicator of sequential behavior
314			 */
315		        if (sequential < MAX_SEQUENTIAL_RUN)
316			        sequential += PAGE_SIZE;
317		} else {
318		        /*
319			 * reset indicator of sequential behavior
320			 */
321		        sequential = 0;
322		}
323	        break;
324
325	case VM_BEHAVIOR_RSEQNTL:
326	        if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
327		        /*
328			 * advance indicator of sequential behavior
329			 */
330		        if (sequential > -MAX_SEQUENTIAL_RUN)
331			        sequential -= PAGE_SIZE;
332		} else {
333		        /*
334			 * reset indicator of sequential behavior
335			 */
336		        sequential = 0;
337		}
338	        break;
339
340	case VM_BEHAVIOR_DEFAULT:
341	default:
342	        if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
343		        /*
344			 * advance indicator of sequential behavior
345			 */
346		        if (sequential < 0)
347			        sequential = 0;
348		        if (sequential < MAX_SEQUENTIAL_RUN)
349			        sequential += PAGE_SIZE;
350
351		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
352		        /*
353			 * advance indicator of sequential behavior
354			 */
355		        if (sequential > 0)
356			        sequential = 0;
357		        if (sequential > -MAX_SEQUENTIAL_RUN)
358			        sequential -= PAGE_SIZE;
359		} else {
360		        /*
361			 * reset indicator of sequential behavior
362			 */
363		        sequential = 0;
364		}
365	        break;
366	}
367	if (sequential != orig_sequential) {
368	        if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
369		        /*
370			 * if someone else has already updated object->sequential
371			 * don't bother trying to update it or object->last_alloc
372			 */
373		        return;
374		}
375	}
376	/*
377	 * I'd like to do this with an OSCompareAndSwap64, but that
378	 * doesn't exist for PPC...  however, it shouldn't matter
379	 * that much... last_alloc is maintained so that we can determine
380	 * if a sequential access pattern is taking place... if only
381	 * one thread is banging on this object, no problem with the unprotected
382	 * update... if 2 or more threads are banging away, we run the risk of
383	 * someone seeing a mangled update... however, in the face of multiple
384	 * accesses, no sequential access pattern can develop anyway, so we
385	 * haven't lost any real info.
386	 */
387	object->last_alloc = offset;
388}
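
/*
 * Illustrative sketch (not compiled in): how object->sequential evolves under
 * VM_BEHAVIOR_DEFAULT as vm_fault_is_sequential() is called for successive
 * faulting offsets.  Ascending page-sized strides grow the counter by
 * PAGE_SIZE per fault (capped at MAX_SEQUENTIAL_RUN), a descending stride
 * first snaps a positive counter back to zero and then grows it negative,
 * and a non-adjacent offset resets it to zero.  The object is assumed to be
 * held at least shared-locked, per the routine's contract.
 */
#if 0
static void
example_sequential_counter(vm_object_t object)
{
	vm_fault_is_sequential(object, 0 * PAGE_SIZE_64, VM_BEHAVIOR_DEFAULT);	/* sequential -> 0 (reset)       */
	vm_fault_is_sequential(object, 1 * PAGE_SIZE_64, VM_BEHAVIOR_DEFAULT);	/* sequential -> PAGE_SIZE       */
	vm_fault_is_sequential(object, 2 * PAGE_SIZE_64, VM_BEHAVIOR_DEFAULT);	/* sequential -> 2 * PAGE_SIZE   */
	vm_fault_is_sequential(object, 9 * PAGE_SIZE_64, VM_BEHAVIOR_DEFAULT);	/* non-adjacent: sequential -> 0 */
	vm_fault_is_sequential(object, 8 * PAGE_SIZE_64, VM_BEHAVIOR_DEFAULT);	/* descending: sequential -> -PAGE_SIZE */
}
#endif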
389
390
391int vm_page_deactivate_behind_count = 0;
392
393/*
394 * vm_fault_deactivate_behind
395 *
396 * Determine if sequential access is in progress
397 * in accordance with the behavior specified.  If
398 * so, compute a potential page to deactivate and
399 * deactivate it.
400 *
401 * object must be locked.
402 *
403 * return TRUE if we actually deactivate a page
404 */
405static
406boolean_t
407vm_fault_deactivate_behind(
408	vm_object_t		object,
409	vm_object_offset_t	offset,
410	vm_behavior_t		behavior)
411{
412	int		n;
413	int		pages_in_run = 0;
414	int		max_pages_in_run = 0;
415	int		sequential_run;
416	int		sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
417	vm_object_offset_t	run_offset = 0;
418	vm_object_offset_t	pg_offset = 0;
419	vm_page_t	m;
420	vm_page_t	page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
421
422	pages_in_run = 0;
423#if TRACEFAULTPAGE
424	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind);	/* (TEST/DEBUG) */
425#endif
426
427	if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
428		/*
429		 * Do not deactivate pages from the kernel object: they
430		 * are not intended to become pageable.  Also bail out
431		 * if the deactivate-behind mechanism has been disabled.
432		 */
433		return FALSE;
434	}
435	if ((sequential_run = object->sequential)) {
436		  if (sequential_run < 0) {
437		          sequential_behavior = VM_BEHAVIOR_RSEQNTL;
438			  sequential_run = 0 - sequential_run;
439		  } else {
440		          sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
441		  }
442	}
443	switch (behavior) {
444	case VM_BEHAVIOR_RANDOM:
445		break;
446	case VM_BEHAVIOR_SEQUENTIAL:
447	        if (sequential_run >= (int)PAGE_SIZE) {
448			run_offset = 0 - PAGE_SIZE_64;
449			max_pages_in_run = 1;
450		}
451		break;
452	case VM_BEHAVIOR_RSEQNTL:
453	        if (sequential_run >= (int)PAGE_SIZE) {
454			run_offset = PAGE_SIZE_64;
455			max_pages_in_run = 1;
456		}
457		break;
458	case VM_BEHAVIOR_DEFAULT:
459	default:
460	{	vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
461
462	        /*
463		 * determine if the run of sequential access has been
464		 * long enough on an object with default access behavior
465		 * to consider it for deactivation
466		 */
467		if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
468			/*
469			 * the comparisons between offset and behind are done
470			 * in this kind of odd fashion in order to prevent wrap around
471			 * at the end points
472			 */
473		        if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
474			        if (offset >= behind) {
475					run_offset = 0 - behind;
476					pg_offset = PAGE_SIZE_64;
477					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
478				}
479			} else {
480			        if (offset < -behind) {
481					run_offset = behind;
482					pg_offset = 0 - PAGE_SIZE_64;
483					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
484				}
485			}
486		}
487		break;
488	}
489	}
490        for (n = 0; n < max_pages_in_run; n++) {
491		m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
492
493		if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
494			page_run[pages_in_run++] = m;
495			pmap_clear_reference(m->phys_page);
496		}
497	}
498	if (pages_in_run) {
499		vm_page_lockspin_queues();
500
501		for (n = 0; n < pages_in_run; n++) {
502
503			m = page_run[n];
504
505			vm_page_deactivate_internal(m, FALSE);
506
507			vm_page_deactivate_behind_count++;
508#if TRACEFAULTPAGE
509			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
510#endif
511		}
512		vm_page_unlock_queues();
513
514		return TRUE;
515	}
516	return FALSE;
517}
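
/*
 * Illustrative sketch (not compiled in): the deactivate-behind window under
 * VM_BEHAVIOR_DEFAULT.  Once a sequential run reaches vm_default_behind
 * (128) pages, each time the run crosses another 16-page
 * (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER) boundary the 16 pages that trail
 * the current faulting offset by 128 pages are looked up and, if resident
 * and idle, deactivated.  For a forward run faulting at page N, that is
 * pages [N - 128, N - 113].  The wrapper name is hypothetical.
 */
#if 0
static void
example_deactivate_behind_window(vm_object_t object, vm_object_offset_t offset)
{
	/* "object" is assumed locked, as vm_fault_deactivate_behind() requires */
	if (vm_fault_deactivate_behind(object, offset, VM_BEHAVIOR_DEFAULT)) {
		/* at least one trailing page was deactivated */
	}
}
#endif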
518
519
520static int
521vm_page_throttled(void)
522{
523        clock_sec_t     elapsed_sec;
524        clock_sec_t     tv_sec;
525        clock_usec_t    tv_usec;
526
527	thread_t thread = current_thread();
528
529	if (thread->options & TH_OPT_VMPRIV)
530		return (0);
531
532	thread->t_page_creation_count++;
533
534	if (NEED_TO_HARD_THROTTLE_THIS_TASK())
535		return (HARD_THROTTLE_DELAY);
536
537	if (vm_page_free_count < vm_page_throttle_limit &&
538	    thread->t_page_creation_count > vm_page_creation_throttle) {
539
540		clock_get_system_microtime(&tv_sec, &tv_usec);
541
542		elapsed_sec = tv_sec - thread->t_page_creation_time;
543
544		if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
545
546			if (elapsed_sec >= 60) {
547				/*
548				 * we'll reset our stats to give a well behaved app
549				 * that was unlucky enough to accumulate a bunch of pages
550				 * over a long period of time a chance to get out of
551				 * the throttled state... we reset the counter and timestamp
552				 * so that if it stays under the rate limit for the next second
553				 * it will be back in our good graces... if it exceeds it, it
554				 * will remain in the throttled state
555				 */
556				thread->t_page_creation_time = tv_sec;
557				thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
558			}
559			++vm_page_throttle_count;
560
561			return (SOFT_THROTTLE_DELAY);
562		}
563		thread->t_page_creation_time = tv_sec;
564		thread->t_page_creation_count = 0;
565	}
566	return (0);
567}
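
/*
 * Illustrative sketch (not compiled in): the soft-throttle rate test above,
 * rewritten as a standalone predicate.  In vm_page_throttled() a thread is
 * delayed only when free pages are below vm_page_throttle_limit, it has
 * created more than vm_page_creation_throttle pages, and it is either early
 * in its measurement window (<= 6 seconds) or still averaging at least one
 * sixth of that limit per second.  The helper below is hypothetical and
 * only captures the per-thread part of the decision.
 */
#if 0
static boolean_t
example_should_soft_throttle(clock_sec_t elapsed_sec,
			     uint32_t    creation_count,
			     uint32_t    creation_limit)
{
	if (creation_count <= creation_limit)
		return (FALSE);
	if (elapsed_sec <= 6)
		return (TRUE);
	return ((creation_count / elapsed_sec) >= (creation_limit / 6));
}
#endif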
568
569
570/*
571 * check for various conditions that would
572 * prevent us from creating a ZF page...
573 * the cleanup on error assumes we were called from vm_fault_page
574 *
575 * object must be locked
576 * object == m->object
577 */
578static vm_fault_return_t
579vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
580{
581	int throttle_delay;
582
583        if (object->shadow_severed ||
584	    VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
585	        /*
586		 * Either:
587		 * 1. the shadow chain was severed,
588		 * 2. the purgeable object is volatile or empty and is marked
589		 *    to fault on access while volatile.
590		 * Just have to return an error at this point
591		 */
592	        if (m != VM_PAGE_NULL)
593		        VM_PAGE_FREE(m);
594		vm_fault_cleanup(object, first_m);
595
596		thread_interrupt_level(interruptible_state);
597
598		return (VM_FAULT_MEMORY_ERROR);
599	}
600	if (vm_backing_store_low) {
601	        /*
602		 * are we protecting the system from
603		 * backing store exhaustion?  If so,
604		 * sleep unless we are privileged.
605		 */
606	        if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
607
608			if (m != VM_PAGE_NULL)
609			        VM_PAGE_FREE(m);
610			vm_fault_cleanup(object, first_m);
611
612		        assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
613
614			thread_block(THREAD_CONTINUE_NULL);
615			thread_interrupt_level(interruptible_state);
616
617			return (VM_FAULT_RETRY);
618		}
619	}
620	if ((throttle_delay = vm_page_throttled())) {
621	        /*
622		 * we're throttling zero-fills...
623		 * treat this as if we couldn't grab a page
624		 */
625	        if (m != VM_PAGE_NULL)
626		        VM_PAGE_FREE(m);
627		vm_fault_cleanup(object, first_m);
628
629		VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
630
631		delay(throttle_delay);
632
633		if (current_thread_aborted()) {
634			thread_interrupt_level(interruptible_state);
635			return VM_FAULT_INTERRUPTED;
636		}
637		thread_interrupt_level(interruptible_state);
638
639		return (VM_FAULT_MEMORY_SHORTAGE);
640	}
641	return (VM_FAULT_SUCCESS);
642}
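
/*
 * Illustrative sketch (not compiled in): how vm_fault_page() consumes
 * vm_fault_check().  On any result other than VM_FAULT_SUCCESS the helper
 * has already freed "m", run vm_fault_cleanup() and restored the interrupt
 * level, so the caller only has to propagate the error.  The wrapper name
 * is hypothetical.
 */
#if 0
static vm_fault_return_t
example_check_before_zero_fill(vm_object_t object, vm_page_t m,
			       vm_page_t first_m, boolean_t interruptible_state)
{
	vm_fault_return_t error;

	error = vm_fault_check(object, m, first_m, interruptible_state);

	if (error != VM_FAULT_SUCCESS) {
		/* all cleanup was done by vm_fault_check(); just bail */
		return (error);
	}
	/* safe to go ahead and zero-fill a page for this object */
	return (VM_FAULT_SUCCESS);
}
#endif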
643
644
645/*
646 * do the work to zero fill a page and
647 * inject it into the correct paging queue
648 *
649 * m->object must be locked
650 * page queue lock must NOT be held
651 */
652static int
653vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
654{
655        int my_fault = DBG_ZERO_FILL_FAULT;
656
657	/*
658	 * This is a zero-fill page fault...
659	 *
660	 * Checking the page lock is a waste of
661	 * time;  this page was absent, so
662	 * it can't be page locked by a pager.
663	 *
664	 * we also consider it undefined
665	 * with respect to instruction
666	 * execution.  i.e. it is the responsibility
667	 * of higher layers to call for an instruction
668	 * sync after changing the contents and before
669	 * sending a program into this area.  We
670	 * choose this approach for performance
671	 */
672	m->pmapped = TRUE;
673
674	m->cs_validated = FALSE;
675	m->cs_tainted = FALSE;
676
677	if (no_zero_fill == TRUE) {
678		my_fault = DBG_NZF_PAGE_FAULT;
679	} else {
680		vm_page_zero_fill(m);
681
682		VM_STAT_INCR(zero_fill_count);
683		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
684	}
685	assert(!m->laundry);
686	assert(m->object != kernel_object);
687	//assert(m->pageq.next == NULL && m->pageq.prev == NULL);
688
689	if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
690		(m->object->purgable == VM_PURGABLE_DENY ||
691		 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
692		 m->object->purgable == VM_PURGABLE_VOLATILE )) {
693
694		vm_page_lockspin_queues();
695
696		assert(!VM_PAGE_WIRED(m));
697
698		/*
699		 * can't be on the pageout queue since we don't
700		 * have a pager to try and clean to
701		 */
702		assert(!m->pageout_queue);
703
704		VM_PAGE_QUEUES_REMOVE(m);
705
706                queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
707                m->throttled = TRUE;
708                vm_page_throttled_count++;
709
710		vm_page_unlock_queues();
711	}
712	return (my_fault);
713}
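
/*
 * Illustrative sketch (not compiled in): typical use of vm_fault_zero_page()
 * as seen later in vm_fault_page().  The caller owns a busy page that is
 * already inserted in a locked object; the helper marks it pmapped, clears
 * its code-signing state, zeroes it unless no_zero_fill was requested, and
 * returns which fault type to charge (DBG_ZERO_FILL_FAULT or
 * DBG_NZF_PAGE_FAULT).  The wrapper name is hypothetical.
 */
#if 0
static int
example_zero_fill(vm_page_t m, boolean_t no_zero_fill)
{
	/* m is busy, m->object is locked, and the page queues lock is NOT held */
	return (vm_fault_zero_page(m, no_zero_fill));
}
#endif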
714
715
716/*
717 *	Routine:	vm_fault_page
718 *	Purpose:
719 *		Find the resident page for the virtual memory
720 *		specified by the given virtual memory object
721 *		and offset.
722 *	Additional arguments:
723 *		The required permissions for the page are given
724 *		in "fault_type".  Desired permissions are included
725 *		in "protection".
726 *		fault_info is passed along to determine pagein cluster
727 *		limits... it contains the expected reference pattern,
728 *		cluster size if available, etc...
729 *
730 *		If the desired page is known to be resident (for
731 *		example, because it was previously wired down), asserting
732 *		the "unwiring" parameter will speed the search.
733 *
734 *		If the operation can be interrupted (by thread_abort
735 *		or thread_terminate), then the "interruptible"
736 *		parameter should be asserted.
737 *
738 *	Results:
739 *		The page containing the proper data is returned
740 *		in "result_page".
741 *
742 *	In/out conditions:
743 *		The source object must be locked and referenced,
744 *		and must donate one paging reference.  The reference
745 *		is not affected.  The paging reference and lock are
746 *		consumed.
747 *
748 *		If the call succeeds, the object in which "result_page"
749 *		resides is left locked and holding a paging reference.
750 *		If this is not the original object, a busy page in the
751 *		original object is returned in "top_page", to prevent other
752 *		callers from pursuing this same data, along with a paging
753 *		reference for the original object.  The "top_page" should
754 *		be destroyed when this guarantee is no longer required.
755 *		The "result_page" is also left busy.  It is not removed
756 *		from the pageout queues.
757 *	Special Case:
758 *		A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
759 *		fault succeeded but there's no VM page (i.e. the VM object
760 * 		does not actually hold VM pages, but device memory or
761 *		large pages).  The object is still locked and we still hold a
762 *		paging_in_progress reference.
763 */
764unsigned int vm_fault_page_blocked_access = 0;
765unsigned int vm_fault_page_forced_retry = 0;
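
/*
 * Illustrative sketch (not compiled in): the caller-side contract of
 * vm_fault_page() as described above, assuming the caller already holds a
 * reference on "object".  The caller locks the object and donates one
 * paging reference; on VM_FAULT_SUCCESS the returned page is busy and its
 * object is locked with a paging reference held, so the caller clears the
 * busy bit when done and hands the object and any "top_page" placeholder
 * back to vm_fault_cleanup().  The wrapper name "example_lookup_page" is
 * hypothetical.
 */
#if 0
static kern_return_t
example_lookup_page(vm_object_t object, vm_object_offset_t offset,
		    vm_object_fault_info_t fault_info)
{
	vm_page_t		result_page, top_page;
	vm_prot_t		prot = VM_PROT_READ;
	kern_return_t		error_code;
	vm_fault_return_t	result;

	vm_object_lock(object);
	vm_object_paging_begin(object);		/* donate the paging reference */

	result = vm_fault_page(object, offset, VM_PROT_READ, FALSE,
			       &prot, &result_page, &top_page,
			       NULL, &error_code, FALSE, FALSE, fault_info);

	if (result != VM_FAULT_SUCCESS) {
		/* the object lock and paging reference were consumed for us */
		return (KERN_FAILURE);
	}
	/* ... use result_page; it stays busy so nobody else can touch it ... */

	PAGE_WAKEUP_DONE(result_page);		/* clear the busy bit */
	vm_fault_cleanup(result_page->object, top_page);

	return (KERN_SUCCESS);
}
#endif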
766
767vm_fault_return_t
768vm_fault_page(
769	/* Arguments: */
770	vm_object_t	first_object,	/* Object to begin search */
771	vm_object_offset_t first_offset,	/* Offset into object */
772	vm_prot_t	fault_type,	/* What access is requested */
773	boolean_t	must_be_resident,/* Must page be resident? */
774	/* Modifies in place: */
775	vm_prot_t	*protection,	/* Protection for mapping */
776	/* Returns: */
777	vm_page_t	*result_page,	/* Page found, if successful */
778	vm_page_t	*top_page,	/* Page in top object, if
779					 * not result_page.  */
780	int             *type_of_fault, /* if non-null, fill in with type of fault
781					 * COW, zero-fill, etc... returned in trace point */
782	/* More arguments: */
783	kern_return_t	*error_code,	/* code if page is in error */
784	boolean_t	no_zero_fill,	/* don't zero fill absent pages */
785#if MACH_PAGEMAP
786	boolean_t	data_supply,	/* treat as data_supply if
787					 * it is a write fault and a full
788					 * page is provided */
789#else
790	__unused boolean_t data_supply,
791#endif
792	vm_object_fault_info_t fault_info)
793{
794	vm_page_t		m;
795	vm_object_t		object;
796	vm_object_offset_t	offset;
797	vm_page_t		first_m;
798	vm_object_t		next_object;
799	vm_object_t		copy_object;
800	boolean_t		look_for_page;
801	boolean_t		force_fault_retry = FALSE;
802	vm_prot_t		access_required = fault_type;
803	vm_prot_t		wants_copy_flag;
804	CLUSTER_STAT(int pages_at_higher_offsets;)
805	CLUSTER_STAT(int pages_at_lower_offsets;)
806	kern_return_t		wait_result;
807	boolean_t		interruptible_state;
808	boolean_t		data_already_requested = FALSE;
809	vm_behavior_t		orig_behavior;
810	vm_size_t		orig_cluster_size;
811	vm_fault_return_t	error;
812	int			my_fault;
813	uint32_t		try_failed_count;
814	int			interruptible; /* how may the fault be interrupted? */
815	memory_object_t		pager;
816	vm_fault_return_t	retval;
817
818/*
819 * MACH page map - an optional optimization where a bit map is maintained
820 * by the VM subsystem for internal objects to indicate which pages of
821 * the object currently reside on backing store.  This existence map
822 * duplicates information maintained by the vnode pager.  It is
823 * created at the time of the first pageout against the object, i.e.
824 * at the same time the pager for the object is created.  The optimization
825 * is designed to eliminate pager interaction overhead, if it is
826 * 'known' that the page does not exist on backing store.
827 *
828 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
829 * either marked as paged out in the existence map for the object or no
830 * existence map exists for the object.  MUST_ASK_PAGER() is one of the
831 * criteria in the decision to invoke the pager.   It is also used as one
832 * of the criteria to terminate the scan for adjacent pages in a clustered
833 * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
834 * permanent objects.  Note also that if the pager for an internal object
835 * has not been created, the pager is not invoked regardless of the value
836 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
837 * for which a pager has been created.
838 *
839 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
840 * is marked as paged out in the existence map for the object.
841 * PAGED_OUT() is used to determine if a page has already been pushed
842 * into a copy object in order to avoid a redundant page out operation.
843 */
844#if MACH_PAGEMAP
845#define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
846			!= VM_EXTERNAL_STATE_ABSENT)
847#define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
848			== VM_EXTERNAL_STATE_EXISTS)
849#else
850#define MUST_ASK_PAGER(o, f) (TRUE)
851#define PAGED_OUT(o, f) (FALSE)
852#endif
853
854/*
855 *	Recovery actions
856 */
857#define RELEASE_PAGE(m)					\
858	MACRO_BEGIN					\
859	PAGE_WAKEUP_DONE(m);				\
860	if (!m->active && !m->inactive && !m->throttled) {		\
861		vm_page_lockspin_queues();				\
862		if (!m->active && !m->inactive && !m->throttled)	\
863			vm_page_activate(m);				\
864		vm_page_unlock_queues();				\
865	}								\
866	MACRO_END
867
868#if TRACEFAULTPAGE
869	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset);	/* (TEST/DEBUG) */
870#endif
871
872	interruptible = fault_info->interruptible;
873	interruptible_state = thread_interrupt_level(interruptible);
874
875	/*
876	 *	INVARIANTS (through entire routine):
877	 *
878	 *	1)	At all times, we must either have the object
879	 *		lock or a busy page in some object to prevent
880	 *		some other thread from trying to bring in
881	 *		the same page.
882	 *
883	 *		Note that we cannot hold any locks during the
884	 *		pager access or when waiting for memory, so
885	 *		we use a busy page then.
886	 *
887	 *	2)	To prevent another thread from racing us down the
888	 *		shadow chain and entering a new page in the top
889	 *		object before we do, we must keep a busy page in
890	 *		the top object while following the shadow chain.
891	 *
892	 *	3)	We must increment paging_in_progress on any object
893	 *		for which we have a busy page before dropping
894	 *		the object lock
895	 *
896	 *	4)	We leave busy pages on the pageout queues.
897	 *		If the pageout daemon comes across a busy page,
898	 *		it will remove the page from the pageout queues.
899	 */
900
901	object = first_object;
902	offset = first_offset;
903	first_m = VM_PAGE_NULL;
904	access_required = fault_type;
905
906
907	XPR(XPR_VM_FAULT,
908		"vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
909		object, offset, fault_type, *protection, 0);
910
911	/*
912	 * default type of fault
913	 */
914	my_fault = DBG_CACHE_HIT_FAULT;
915
916	while (TRUE) {
917#if TRACEFAULTPAGE
918		dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
919#endif
920		if (!object->alive) {
921		        /*
922			 * object is no longer valid
923			 * clean up and return error
924			 */
925			vm_fault_cleanup(object, first_m);
926			thread_interrupt_level(interruptible_state);
927
928			return (VM_FAULT_MEMORY_ERROR);
929		}
930
931		if (!object->pager_created && object->phys_contiguous) {
932			/*
933			 * A physically-contiguous object without a pager:
934			 * must be a "large page" object.  We do not deal
935			 * with VM pages for this object.
936			 */
937			m = VM_PAGE_NULL;
938			goto phys_contig_object;
939		}
940
941		if (object->blocked_access) {
942			/*
943			 * Access to this VM object has been blocked.
944			 * Replace our "paging_in_progress" reference with
945			 * an "activity_in_progress" reference and wait for
946			 * access to be unblocked.
947			 */
948			vm_object_activity_begin(object);
949			vm_object_paging_end(object);
950			while (object->blocked_access) {
951				vm_object_sleep(object,
952						VM_OBJECT_EVENT_UNBLOCKED,
953						THREAD_UNINT);
954			}
955			vm_fault_page_blocked_access++;
956			vm_object_paging_begin(object);
957			vm_object_activity_end(object);
958		}
959
960		/*
961		 * See whether the page at 'offset' is resident
962		 */
963		m = vm_page_lookup(object, offset);
964#if TRACEFAULTPAGE
965		dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
966#endif
967		if (m != VM_PAGE_NULL) {
968
969			if (m->busy) {
970			        /*
971				 * The page is being brought in,
972				 * wait for it and then retry.
973				 */
974#if TRACEFAULTPAGE
975				dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
976#endif
977				wait_result = PAGE_SLEEP(object, m, interruptible);
978
979				XPR(XPR_VM_FAULT,
980				    "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
981				    object, offset,
982				    m, 0, 0);
983				counter(c_vm_fault_page_block_busy_kernel++);
984
985				if (wait_result != THREAD_AWAKENED) {
986					vm_fault_cleanup(object, first_m);
987					thread_interrupt_level(interruptible_state);
988
989					if (wait_result == THREAD_RESTART)
990						return (VM_FAULT_RETRY);
991					else
992						return (VM_FAULT_INTERRUPTED);
993				}
994				continue;
995			}
996			if (m->laundry) {
997				m->pageout = FALSE;
998
999				if (!m->cleaning)
1000					vm_pageout_steal_laundry(m, FALSE);
1001			}
1002			if (m->phys_page == vm_page_guard_addr) {
1003				/*
1004				 * Guard page: off limits !
1005				 */
1006				if (fault_type == VM_PROT_NONE) {
1007					/*
1008					 * The fault is not requesting any
1009					 * access to the guard page, so it must
1010					 * be just to wire or unwire it.
1011					 * Let's pretend it succeeded...
1012					 */
1013					m->busy = TRUE;
1014					*result_page = m;
1015					assert(first_m == VM_PAGE_NULL);
1016					*top_page = first_m;
1017					if (type_of_fault)
1018						*type_of_fault = DBG_GUARD_FAULT;
1019					thread_interrupt_level(interruptible_state);
1020					return VM_FAULT_SUCCESS;
1021				} else {
1022					/*
1023					 * The fault requests access to the
1024					 * guard page: let's deny that !
1025					 */
1026					vm_fault_cleanup(object, first_m);
1027					thread_interrupt_level(interruptible_state);
1028					return VM_FAULT_MEMORY_ERROR;
1029				}
1030			}
1031
1032			if (m->error) {
1033			        /*
1034				 * The page is in error, give up now.
1035				 */
1036#if TRACEFAULTPAGE
1037				dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);	/* (TEST/DEBUG) */
1038#endif
1039				if (error_code)
1040				        *error_code = KERN_MEMORY_ERROR;
1041				VM_PAGE_FREE(m);
1042
1043				vm_fault_cleanup(object, first_m);
1044				thread_interrupt_level(interruptible_state);
1045
1046				return (VM_FAULT_MEMORY_ERROR);
1047			}
1048			if (m->restart) {
1049			        /*
1050				 * The pager wants us to restart
1051				 * at the top of the chain,
1052				 * typically because it has moved the
1053				 * page to another pager, then do so.
1054				 */
1055#if TRACEFAULTPAGE
1056				dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1057#endif
1058				VM_PAGE_FREE(m);
1059
1060				vm_fault_cleanup(object, first_m);
1061				thread_interrupt_level(interruptible_state);
1062
1063				return (VM_FAULT_RETRY);
1064			}
1065			if (m->absent) {
1066			        /*
1067				 * The page isn't busy, but is absent,
1068				 * therefore it's deemed "unavailable".
1069				 *
1070				 * Remove the non-existent page (unless it's
1071				 * in the top object) and move on down to the
1072				 * next object (if there is one).
1073				 */
1074#if TRACEFAULTPAGE
1075				dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);	/* (TEST/DEBUG) */
1076#endif
1077				next_object = object->shadow;
1078
1079				if (next_object == VM_OBJECT_NULL) {
1080					/*
1081					 * Absent page at bottom of shadow
1082					 * chain; zero fill the page we left
1083					 * busy in the first object, and free
1084					 * the absent page.
1085					 */
1086					assert(!must_be_resident);
1087
1088					/*
1089					 * check for any conditions that prevent
1090					 * us from creating a new zero-fill page
1091					 * vm_fault_check will do all of the
1092					 * fault cleanup in the case of an error condition
1093					 * including resetting the thread_interrupt_level
1094					 */
1095					error = vm_fault_check(object, m, first_m, interruptible_state);
1096
1097					if (error != VM_FAULT_SUCCESS)
1098					        return (error);
1099
1100					XPR(XPR_VM_FAULT,
1101					    "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1102						object, offset,
1103						m,
1104						first_object, 0);
1105
1106					if (object != first_object) {
1107					        /*
1108						 * free the absent page we just found
1109						 */
1110						VM_PAGE_FREE(m);
1111
1112						/*
1113						 * drop reference and lock on current object
1114						 */
1115						vm_object_paging_end(object);
1116						vm_object_unlock(object);
1117
1118						/*
1119						 * grab the original page we
1120						 * 'soldered' in place and
1121						 * retake lock on 'first_object'
1122						 */
1123						m = first_m;
1124						first_m = VM_PAGE_NULL;
1125
1126						object = first_object;
1127						offset = first_offset;
1128
1129						vm_object_lock(object);
1130					} else {
1131					        /*
1132						 * we're going to use the absent page we just found
1133						 * so convert it to a 'busy' page
1134						 */
1135					        m->absent = FALSE;
1136						m->busy = TRUE;
1137					}
1138					/*
1139					 * zero-fill the page and put it on
1140					 * the correct paging queue
1141					 */
1142					my_fault = vm_fault_zero_page(m, no_zero_fill);
1143
1144					if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1145						m->absent = TRUE;
1146
1147					break;
1148				} else {
1149					if (must_be_resident)
1150						vm_object_paging_end(object);
1151					else if (object != first_object) {
1152						vm_object_paging_end(object);
1153						VM_PAGE_FREE(m);
1154					} else {
1155						first_m = m;
1156						m->absent = FALSE;
1157						m->busy = TRUE;
1158
1159						vm_page_lockspin_queues();
1160
1161						assert(!m->pageout_queue);
1162						VM_PAGE_QUEUES_REMOVE(m);
1163
1164						vm_page_unlock_queues();
1165					}
1166					XPR(XPR_VM_FAULT,
1167					    "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1168						object, offset,
1169						next_object,
1170						offset+object->vo_shadow_offset,0);
1171
1172					offset += object->vo_shadow_offset;
1173					fault_info->lo_offset += object->vo_shadow_offset;
1174					fault_info->hi_offset += object->vo_shadow_offset;
1175					access_required = VM_PROT_READ;
1176
1177					vm_object_lock(next_object);
1178					vm_object_unlock(object);
1179					object = next_object;
1180					vm_object_paging_begin(object);
1181
1182					/*
1183					 * reset to default type of fault
1184					 */
1185					my_fault = DBG_CACHE_HIT_FAULT;
1186
1187					continue;
1188				}
1189			}
1190			if ((m->cleaning)
1191			    && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1192			    && (fault_type & VM_PROT_WRITE)) {
1193				/*
1194				 * This is a copy-on-write fault that will
1195				 * cause us to revoke access to this page, but
1196				 * this page is in the process of being cleaned
1197				 * in a clustered pageout. We must wait until
1198				 * the cleaning operation completes before
1199				 * revoking access to the original page,
1200				 * otherwise we might attempt to remove a
1201				 * wired mapping.
1202				 */
1203#if TRACEFAULTPAGE
1204				dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);	/* (TEST/DEBUG) */
1205#endif
1206				XPR(XPR_VM_FAULT,
1207				    "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1208					object, offset,
1209					m, 0, 0);
1210				/*
1211				 * take an extra ref so that object won't die
1212				 */
1213				vm_object_reference_locked(object);
1214
1215				vm_fault_cleanup(object, first_m);
1216
1217				counter(c_vm_fault_page_block_backoff_kernel++);
1218				vm_object_lock(object);
1219				assert(object->ref_count > 0);
1220
1221				m = vm_page_lookup(object, offset);
1222
1223				if (m != VM_PAGE_NULL && m->cleaning) {
1224					PAGE_ASSERT_WAIT(m, interruptible);
1225
1226					vm_object_unlock(object);
1227					wait_result = thread_block(THREAD_CONTINUE_NULL);
1228					vm_object_deallocate(object);
1229
1230					goto backoff;
1231				} else {
1232					vm_object_unlock(object);
1233
1234					vm_object_deallocate(object);
1235					thread_interrupt_level(interruptible_state);
1236
1237					return (VM_FAULT_RETRY);
1238				}
1239			}
1240			if (type_of_fault == NULL && m->speculative &&
1241			    !(fault_info != NULL && fault_info->stealth)) {
1242			        /*
1243				 * If we were passed a non-NULL pointer for
1244				 * "type_of_fault", then we came from
1245				 * vm_fault... we'll let it deal with
1246				 * this condition, since it
1247				 * needs to see m->speculative to correctly
1248				 * account the pageins, otherwise...
1249				 * take it off the speculative queue, we'll
1250				 * let the caller of vm_fault_page deal
1251				 * with getting it onto the correct queue
1252				 *
1253				 * If the caller specified in fault_info that
1254				 * it wants a "stealth" fault, we also leave
1255				 * the page in the speculative queue.
1256				 */
1257			        vm_page_lockspin_queues();
1258				if (m->speculative)
1259					VM_PAGE_QUEUES_REMOVE(m);
1260			        vm_page_unlock_queues();
1261			}
1262
1263			if (m->encrypted) {
1264				/*
1265				 * ENCRYPTED SWAP:
1266				 * the user needs access to a page that we
1267				 * encrypted before paging it out.
1268				 * Decrypt the page now.
1269				 * Keep it busy to prevent anyone from
1270				 * accessing it during the decryption.
1271				 */
1272				m->busy = TRUE;
1273				vm_page_decrypt(m, 0);
1274				assert(object == m->object);
1275				assert(m->busy);
1276				PAGE_WAKEUP_DONE(m);
1277
1278				/*
1279				 * Retry from the top, in case
1280				 * something changed while we were
1281				 * decrypting.
1282				 */
1283				continue;
1284			}
1285			ASSERT_PAGE_DECRYPTED(m);
1286
1287			if (m->object->code_signed) {
1288				/*
1289				 * CODE SIGNING:
1290				 * We just paged in a page from a signed
1291				 * memory object but we don't need to
1292				 * validate it now.  We'll validate it if
1293				 * when it gets mapped into a user address
1294				 * space for the first time or when the page
1295				 * gets copied to another object as a result
1296				 * of a copy-on-write.
1297				 */
1298			}
1299
1300			/*
1301			 * We mark the page busy and leave it on
1302			 * the pageout queues.  If the pageout
1303			 * daemon comes across it, then it will
1304			 * remove the page from the queue, but not the object
1305			 */
1306#if TRACEFAULTPAGE
1307			dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1308#endif
1309			XPR(XPR_VM_FAULT,
1310			    "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1311				object, offset, m, 0, 0);
1312			assert(!m->busy);
1313			assert(!m->absent);
1314
1315			m->busy = TRUE;
1316			break;
1317		}
1318
1319
1320		/*
1321		 * we get here when there is no page present in the object at
1322		 * the offset we're interested in... we'll allocate a page
1323		 * at this point if the pager associated with
1324		 * this object can provide the data or we're the top object...
1325		 * object is locked;  m == NULL
1326		 */
1327		if (must_be_resident)
1328			goto dont_look_for_page;
1329
1330		look_for_page =	(object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1331
1332#if TRACEFAULTPAGE
1333		dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);	/* (TEST/DEBUG) */
1334#endif
1335		if (!look_for_page && object == first_object && !object->phys_contiguous) {
1336			/*
1337			 * Allocate a new page for this object/offset pair as a placeholder
1338			 */
1339			m = vm_page_grab();
1340#if TRACEFAULTPAGE
1341			dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
1342#endif
1343			if (m == VM_PAGE_NULL) {
1344
1345				vm_fault_cleanup(object, first_m);
1346				thread_interrupt_level(interruptible_state);
1347
1348				return (VM_FAULT_MEMORY_SHORTAGE);
1349			}
1350
1351			if (fault_info && fault_info->batch_pmap_op == TRUE) {
1352				vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1353			} else {
1354				vm_page_insert(m, object, offset);
1355			}
1356		}
1357		if (look_for_page) {
1358			kern_return_t	rc;
1359
1360			/*
1361			 *	If the memory manager is not ready, we
1362			 *	cannot make requests.
1363			 */
1364			if (!object->pager_ready) {
1365#if TRACEFAULTPAGE
1366				dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
1367#endif
1368				if (m != VM_PAGE_NULL)
1369				        VM_PAGE_FREE(m);
1370
1371				XPR(XPR_VM_FAULT,
1372				"vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1373					object, offset, 0, 0, 0);
1374
1375				/*
1376				 * take an extra ref so object won't die
1377				 */
1378				vm_object_reference_locked(object);
1379				vm_fault_cleanup(object, first_m);
1380				counter(c_vm_fault_page_block_backoff_kernel++);
1381
1382				vm_object_lock(object);
1383				assert(object->ref_count > 0);
1384
1385				if (!object->pager_ready) {
1386					wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1387
1388					vm_object_unlock(object);
1389					if (wait_result == THREAD_WAITING)
1390						wait_result = thread_block(THREAD_CONTINUE_NULL);
1391					vm_object_deallocate(object);
1392
1393					goto backoff;
1394				} else {
1395					vm_object_unlock(object);
1396					vm_object_deallocate(object);
1397					thread_interrupt_level(interruptible_state);
1398
1399					return (VM_FAULT_RETRY);
1400				}
1401			}
1402			if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1403				/*
1404				 * If there are too many outstanding page
1405				 * requests pending on this external object, we
1406				 * wait for them to be resolved now.
1407				 */
1408#if TRACEFAULTPAGE
1409				dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1410#endif
1411				if (m != VM_PAGE_NULL)
1412					VM_PAGE_FREE(m);
1413				/*
1414				 * take an extra ref so object won't die
1415				 */
1416				vm_object_reference_locked(object);
1417
1418				vm_fault_cleanup(object, first_m);
1419
1420				counter(c_vm_fault_page_block_backoff_kernel++);
1421
1422				vm_object_lock(object);
1423				assert(object->ref_count > 0);
1424
1425				if (object->paging_in_progress >= vm_object_pagein_throttle) {
1426				        vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1427
1428					vm_object_unlock(object);
1429					wait_result = thread_block(THREAD_CONTINUE_NULL);
1430					vm_object_deallocate(object);
1431
1432					goto backoff;
1433				} else {
1434					vm_object_unlock(object);
1435					vm_object_deallocate(object);
1436					thread_interrupt_level(interruptible_state);
1437
1438					return (VM_FAULT_RETRY);
1439				}
1440			}
1441			if (m != VM_PAGE_NULL) {
1442				VM_PAGE_FREE(m);
1443				m = VM_PAGE_NULL;
1444			}
1445
1446#if TRACEFAULTPAGE
1447			dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);	/* (TEST/DEBUG) */
1448#endif
1449
1450			/*
1451			 * It's possible someone called vm_object_destroy while we weren't
1452			 * holding the object lock.  If that has happened, then bail out
1453			 * here.
1454			 */
1455
1456			pager = object->pager;
1457
1458			if (pager == MEMORY_OBJECT_NULL) {
1459				vm_fault_cleanup(object, first_m);
1460				thread_interrupt_level(interruptible_state);
1461				return VM_FAULT_MEMORY_ERROR;
1462			}
1463
1464			/*
1465			 * We have an absent page in place for the faulting offset,
1466			 * so we can release the object lock.
1467			 */
1468
1469			vm_object_unlock(object);
1470
1471			/*
1472			 * If this object uses a copy_call strategy,
1473			 * and we are interested in a copy of this object
1474			 * (having gotten here only by following a
1475			 * shadow chain), then tell the memory manager
1476			 * via a flag added to the desired_access
1477			 * parameter, so that it can detect a race
1478			 * between our walking down the shadow chain
1479			 * and its pushing pages up into a copy of
1480			 * the object that it manages.
1481			 */
1482			if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1483				wants_copy_flag = VM_PROT_WANTS_COPY;
1484			else
1485				wants_copy_flag = VM_PROT_NONE;
1486
1487			XPR(XPR_VM_FAULT,
1488			    "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1489				object, offset, m,
1490				access_required | wants_copy_flag, 0);
1491
1492			if (object->copy == first_object) {
1493				/*
1494				 * if we issue the memory_object_data_request in
1495				 * this state, we are subject to a deadlock with
1496				 * the underlying filesystem if it is trying to
1497				 * shrink the file resulting in a push of pages
1498				 * into the copy object...  that push will stall
1499				 * on the placeholder page, and if the pushing thread
1500				 * is holding a lock that is required on the pagein
1501				 * path (such as a truncate lock), we'll deadlock...
1502				 * to avoid this potential deadlock, we throw away
1503				 * our placeholder page before calling memory_object_data_request
1504				 * and force this thread to retry the vm_fault_page after
1505				 * we have issued the I/O.  the second time through this path
1506				 * we will find the page already in the cache (presumably still
1507				 * busy waiting for the I/O to complete) and then complete
1508				 * the fault w/o having to go through memory_object_data_request again
1509				 */
1510				assert(first_m != VM_PAGE_NULL);
1511				assert(first_m->object == first_object);
1512
1513				vm_object_lock(first_object);
1514				VM_PAGE_FREE(first_m);
1515				vm_object_paging_end(first_object);
1516				vm_object_unlock(first_object);
1517
1518				first_m = VM_PAGE_NULL;
1519				force_fault_retry = TRUE;
1520
1521				vm_fault_page_forced_retry++;
1522			}
1523
1524			if (data_already_requested == TRUE) {
1525				orig_behavior = fault_info->behavior;
1526				orig_cluster_size = fault_info->cluster_size;
1527
1528				fault_info->behavior = VM_BEHAVIOR_RANDOM;
1529				fault_info->cluster_size = PAGE_SIZE;
1530			}
1531			/*
1532			 * Call the memory manager to retrieve the data.
1533			 */
1534			rc = memory_object_data_request(
1535				pager,
1536				offset + object->paging_offset,
1537				PAGE_SIZE,
1538				access_required | wants_copy_flag,
1539				(memory_object_fault_info_t)fault_info);
1540
1541			if (data_already_requested == TRUE) {
1542				fault_info->behavior = orig_behavior;
1543				fault_info->cluster_size = orig_cluster_size;
1544			} else
1545				data_already_requested = TRUE;
1546
1547#if TRACEFAULTPAGE
1548			dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc);	/* (TEST/DEBUG) */
1549#endif
1550			vm_object_lock(object);
1551
1552			if (rc != KERN_SUCCESS) {
1553
1554				vm_fault_cleanup(object, first_m);
1555				thread_interrupt_level(interruptible_state);
1556
1557				return ((rc == MACH_SEND_INTERRUPTED) ?
1558					VM_FAULT_INTERRUPTED :
1559					VM_FAULT_MEMORY_ERROR);
1560			} else {
1561				clock_sec_t     tv_sec;
1562				clock_usec_t    tv_usec;
1563
1564				clock_get_system_microtime(&tv_sec, &tv_usec);
1565				current_thread()->t_page_creation_time = tv_sec;
1566				current_thread()->t_page_creation_count = 0;
1567			}
1568			if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1569
1570				vm_fault_cleanup(object, first_m);
1571				thread_interrupt_level(interruptible_state);
1572
1573				return (VM_FAULT_INTERRUPTED);
1574			}
1575			if (force_fault_retry == TRUE) {
1576
1577				vm_fault_cleanup(object, first_m);
1578				thread_interrupt_level(interruptible_state);
1579
1580				return (VM_FAULT_RETRY);
1581			}
1582			if (m == VM_PAGE_NULL && object->phys_contiguous) {
1583				/*
1584				 * No page here means that the object we
1585				 * initially looked up was "physically
1586				 * contiguous" (i.e. device memory).  However,
1587				 * with Virtual VRAM, the object might not
1588				 * be backed by that device memory anymore,
1589				 * so we're done here only if the object is
1590				 * still "phys_contiguous".
1591				 * Otherwise, if the object is no longer
1592				 * "phys_contiguous", we need to retry the
1593				 * page fault against the object's new backing
1594				 * store (different memory object).
1595				 */
1596			phys_contig_object:
1597				goto done;
1598			}
1599			/*
1600			 * potentially a pagein fault
1601			 * if we make it through the state checks
1602			 * above, then we'll count it as such
1603			 */
1604			my_fault = DBG_PAGEIN_FAULT;
1605
1606			/*
1607			 * Retry with same object/offset, since new data may
1608			 * be in a different page (i.e., m is meaningless at
1609			 * this point).
1610			 */
1611			continue;
1612		}
1613dont_look_for_page:
1614		/*
1615		 * We get here if the object has no pager, or if an existence map
1616		 * exists and indicates the page isn't present in the pager,
1617		 * or if we're unwiring a page.  If a pager exists but there
1618		 * is no existence map, then the m->absent case above handles
1619		 * the zero-fill case when the pager can't provide the page.
1620		 */
1621#if TRACEFAULTPAGE
1622		dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
1623#endif
1624		if (object == first_object)
1625			first_m = m;
1626		else
1627			assert(m == VM_PAGE_NULL);
1628
1629		XPR(XPR_VM_FAULT,
1630		    "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1631			object, offset, m,
1632			object->shadow, 0);
1633
1634		next_object = object->shadow;
1635
1636		if (next_object == VM_OBJECT_NULL) {
1637			/*
1638			 * we've hit the bottom of the shadow chain,
1639			 * fill the page in the top object with zeros.
1640			 */
1641			assert(!must_be_resident);
1642
1643			if (object != first_object) {
1644				vm_object_paging_end(object);
1645				vm_object_unlock(object);
1646
1647				object = first_object;
1648				offset = first_offset;
1649				vm_object_lock(object);
1650			}
1651			m = first_m;
1652			assert(m->object == object);
1653			first_m = VM_PAGE_NULL;
1654
1655			/*
1656			 * check for any conditions that prevent
1657			 * us from creating a new zero-fill page
1658			 * vm_fault_check will do all of the
1659			 * fault cleanup in the case of an error condition
1660			 * including resetting the thread_interrupt_level
1661			 */
1662			error = vm_fault_check(object, m, first_m, interruptible_state);
1663
1664			if (error != VM_FAULT_SUCCESS)
1665			        return (error);
1666
1667			if (m == VM_PAGE_NULL) {
1668				m = vm_page_grab();
1669
1670				if (m == VM_PAGE_NULL) {
1671					vm_fault_cleanup(object, VM_PAGE_NULL);
1672					thread_interrupt_level(interruptible_state);
1673
1674					return (VM_FAULT_MEMORY_SHORTAGE);
1675				}
1676				vm_page_insert(m, object, offset);
1677			}
1678			my_fault = vm_fault_zero_page(m, no_zero_fill);
1679
1680			if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1681				m->absent = TRUE;
1682			break;
1683
1684		} else {
1685		        /*
1686			 * Move on to the next object.  Lock the next
1687			 * object before unlocking the current one.
1688			 */
1689			if ((object != first_object) || must_be_resident)
1690				vm_object_paging_end(object);
1691
1692			offset += object->vo_shadow_offset;
1693			fault_info->lo_offset += object->vo_shadow_offset;
1694			fault_info->hi_offset += object->vo_shadow_offset;
1695			access_required = VM_PROT_READ;
1696
1697			vm_object_lock(next_object);
1698			vm_object_unlock(object);
1699
1700			object = next_object;
1701			vm_object_paging_begin(object);
1702		}
1703	}
1704
1705	/*
1706	 *	PAGE HAS BEEN FOUND.
1707	 *
1708	 *	This page (m) is:
1709	 *		busy, so that we can play with it;
1710	 *		not absent, so that nobody else will fill it;
1711	 *		possibly eligible for pageout;
1712	 *
1713	 *	The top-level page (first_m) is:
1714	 *		VM_PAGE_NULL if the page was found in the
1715	 *		 top-level object;
1716	 *		busy, not absent, and ineligible for pageout.
1717	 *
1718	 *	The current object (object) is locked.  A paging
1719	 *	reference is held for the current and top-level
1720	 *	objects.
1721	 */
1722
1723#if TRACEFAULTPAGE
1724	dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
1725#endif
1726#if	EXTRA_ASSERTIONS
1727	assert(m->busy && !m->absent);
1728	assert((first_m == VM_PAGE_NULL) ||
1729	       (first_m->busy && !first_m->absent &&
1730		!first_m->active && !first_m->inactive));
1731#endif	/* EXTRA_ASSERTIONS */
1732
1733	/*
1734	 * ENCRYPTED SWAP:
1735	 * If we found a page, we must have decrypted it before we
1736	 * get here...
1737	 */
1738	ASSERT_PAGE_DECRYPTED(m);
1739
1740	XPR(XPR_VM_FAULT,
1741	    "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1742		object, offset, m,
1743		first_object, first_m);
1744
1745	/*
1746	 * If the page is being written, but isn't
1747	 * already owned by the top-level object,
1748	 * we have to copy it into a new page owned
1749	 * by the top-level object.
1750	 */
1751	if (object != first_object) {
1752
1753#if TRACEFAULTPAGE
1754		dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type);	/* (TEST/DEBUG) */
1755#endif
1756	    	if (fault_type & VM_PROT_WRITE) {
1757			vm_page_t copy_m;
1758
1759			/*
1760			 * We only really need to copy if we
1761			 * want to write it.
1762			 */
1763			assert(!must_be_resident);
1764
1765			/*
			 * are we protecting the system from
			 * backing store exhaustion?  If so,
			 * sleep unless we are privileged.
1769			 */
1770			if (vm_backing_store_low) {
1771				if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1772
1773					RELEASE_PAGE(m);
1774					vm_fault_cleanup(object, first_m);
1775
1776					assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1777
1778					thread_block(THREAD_CONTINUE_NULL);
1779					thread_interrupt_level(interruptible_state);
1780
1781					return (VM_FAULT_RETRY);
1782				}
1783			}
1784			/*
1785			 * If we try to collapse first_object at this
1786			 * point, we may deadlock when we try to get
1787			 * the lock on an intermediate object (since we
1788			 * have the bottom object locked).  We can't
1789			 * unlock the bottom object, because the page
1790			 * we found may move (by collapse) if we do.
1791			 *
1792			 * Instead, we first copy the page.  Then, when
1793			 * we have no more use for the bottom object,
1794			 * we unlock it and try to collapse.
1795			 *
1796			 * Note that we copy the page even if we didn't
1797			 * need to... that's the breaks.
1798			 */
1799
1800			/*
1801			 * Allocate a page for the copy
1802			 */
1803			copy_m = vm_page_grab();
1804
1805			if (copy_m == VM_PAGE_NULL) {
1806				RELEASE_PAGE(m);
1807
1808				vm_fault_cleanup(object, first_m);
1809				thread_interrupt_level(interruptible_state);
1810
1811				return (VM_FAULT_MEMORY_SHORTAGE);
1812			}
1813			XPR(XPR_VM_FAULT,
1814			    "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1815				object, offset,
1816				m, copy_m, 0);
1817
1818			vm_page_copy(m, copy_m);
1819
1820			/*
1821			 * If another map is truly sharing this
1822			 * page with us, we have to flush all
1823			 * uses of the original page, since we
1824			 * can't distinguish those which want the
1825			 * original from those which need the
1826			 * new copy.
1827			 *
1828			 * XXXO If we know that only one map has
1829			 * access to this page, then we could
1830			 * avoid the pmap_disconnect() call.
1831			 */
1832			if (m->pmapped)
1833			        pmap_disconnect(m->phys_page);
1834
1835			assert(!m->cleaning);
1836
1837			/*
1838			 * We no longer need the old page or object.
1839			 */
1840			PAGE_WAKEUP_DONE(m);
1841			vm_object_paging_end(object);
1842			vm_object_unlock(object);
1843
1844			my_fault = DBG_COW_FAULT;
1845			VM_STAT_INCR(cow_faults);
1846			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1847			current_task()->cow_faults++;
1848
1849			object = first_object;
1850			offset = first_offset;
1851
1852			vm_object_lock(object);
1853			/*
1854			 * get rid of the place holder
1855			 * page that we soldered in earlier
1856			 */
1857			VM_PAGE_FREE(first_m);
1858			first_m = VM_PAGE_NULL;
1859
1860			/*
1861			 * and replace it with the
1862			 * page we just copied into
1863			 */
1864			assert(copy_m->busy);
1865			vm_page_insert(copy_m, object, offset);
1866			SET_PAGE_DIRTY(copy_m, TRUE);
1867
1868			m = copy_m;
1869			/*
1870			 * Now that we've gotten the copy out of the
1871			 * way, let's try to collapse the top object.
1872			 * But we have to play ugly games with
1873			 * paging_in_progress to do that...
1874			 */
1875			vm_object_paging_end(object);
1876			vm_object_collapse(object, offset, TRUE);
1877			vm_object_paging_begin(object);
1878
1879		} else
1880		    	*protection &= (~VM_PROT_WRITE);
1881	}
1882	/*
1883	 * Now check whether the page needs to be pushed into the
1884	 * copy object.  The use of asymmetric copy on write for
1885	 * shared temporary objects means that we may do two copies to
1886	 * satisfy the fault; one above to get the page from a
1887	 * shadowed object, and one here to push it into the copy.
1888	 */
1889	try_failed_count = 0;
1890
1891	while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
1892		vm_object_offset_t	copy_offset;
1893		vm_page_t		copy_m;
1894
1895#if TRACEFAULTPAGE
1896		dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);	/* (TEST/DEBUG) */
1897#endif
1898		/*
1899		 * If the page is being written, but hasn't been
1900		 * copied to the copy-object, we have to copy it there.
1901		 */
1902		if ((fault_type & VM_PROT_WRITE) == 0) {
1903			*protection &= ~VM_PROT_WRITE;
1904			break;
1905		}
1906
1907		/*
1908		 * If the page was guaranteed to be resident,
1909		 * we must have already performed the copy.
1910		 */
1911		if (must_be_resident)
1912			break;
1913
1914		/*
1915		 * Try to get the lock on the copy_object.
1916		 */
1917		if (!vm_object_lock_try(copy_object)) {
1918
1919			vm_object_unlock(object);
1920			try_failed_count++;
1921
1922			mutex_pause(try_failed_count);	/* wait a bit */
1923			vm_object_lock(object);
1924
1925			continue;
1926		}
1927		try_failed_count = 0;
1928
1929		/*
1930		 * Make another reference to the copy-object,
1931		 * to keep it from disappearing during the
1932		 * copy.
1933		 */
1934		vm_object_reference_locked(copy_object);
1935
1936		/*
1937		 * Does the page exist in the copy?
1938		 */
1939		copy_offset = first_offset - copy_object->vo_shadow_offset;
1940
1941		if (copy_object->vo_size <= copy_offset)
1942			/*
1943			 * Copy object doesn't cover this page -- do nothing.
1944			 */
1945			;
1946		else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1947			/*
1948			 * Page currently exists in the copy object
1949			 */
1950			if (copy_m->busy) {
1951				/*
1952				 * If the page is being brought
1953				 * in, wait for it and then retry.
1954				 */
1955				RELEASE_PAGE(m);
1956
1957				/*
1958				 * take an extra ref so object won't die
1959				 */
1960				vm_object_reference_locked(copy_object);
1961				vm_object_unlock(copy_object);
1962				vm_fault_cleanup(object, first_m);
1963				counter(c_vm_fault_page_block_backoff_kernel++);
1964
1965				vm_object_lock(copy_object);
1966				assert(copy_object->ref_count > 0);
1967				VM_OBJ_RES_DECR(copy_object);
1968				vm_object_lock_assert_exclusive(copy_object);
1969				copy_object->ref_count--;
1970				assert(copy_object->ref_count > 0);
1971				copy_m = vm_page_lookup(copy_object, copy_offset);
1972				/*
1973				 * ENCRYPTED SWAP:
1974				 * it's OK if the "copy_m" page is encrypted,
1975				 * because we're not moving it nor handling its
1976				 * contents.
1977				 */
1978				if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1979					PAGE_ASSERT_WAIT(copy_m, interruptible);
1980
1981					vm_object_unlock(copy_object);
1982					wait_result = thread_block(THREAD_CONTINUE_NULL);
1983					vm_object_deallocate(copy_object);
1984
1985					goto backoff;
1986				} else {
1987					vm_object_unlock(copy_object);
1988					vm_object_deallocate(copy_object);
1989					thread_interrupt_level(interruptible_state);
1990
1991					return (VM_FAULT_RETRY);
1992				}
1993			}
1994		}
1995		else if (!PAGED_OUT(copy_object, copy_offset)) {
1996			/*
1997			 * If PAGED_OUT is TRUE, then the page used to exist
1998			 * in the copy-object, and has already been paged out.
1999			 * We don't need to repeat this. If PAGED_OUT is
2000			 * FALSE, then either we don't know (!pager_created,
2001			 * for example) or it hasn't been paged out.
2002			 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2003			 * We must copy the page to the copy object.
2004			 */
2005
2006			if (vm_backing_store_low) {
2007			        /*
				 * we are protecting the system from
				 * backing store exhaustion:
				 * sleep unless we are privileged.
2011				 */
2012				if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2013					assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2014
2015					RELEASE_PAGE(m);
2016					VM_OBJ_RES_DECR(copy_object);
2017					vm_object_lock_assert_exclusive(copy_object);
2018					copy_object->ref_count--;
2019					assert(copy_object->ref_count > 0);
2020
2021					vm_object_unlock(copy_object);
2022					vm_fault_cleanup(object, first_m);
2023					thread_block(THREAD_CONTINUE_NULL);
2024					thread_interrupt_level(interruptible_state);
2025
2026					return (VM_FAULT_RETRY);
2027				}
2028			}
2029			/*
2030			 * Allocate a page for the copy
2031			 */
2032			copy_m = vm_page_alloc(copy_object, copy_offset);
2033
2034			if (copy_m == VM_PAGE_NULL) {
2035				RELEASE_PAGE(m);
2036
2037				VM_OBJ_RES_DECR(copy_object);
2038				vm_object_lock_assert_exclusive(copy_object);
2039				copy_object->ref_count--;
2040				assert(copy_object->ref_count > 0);
2041
2042				vm_object_unlock(copy_object);
2043				vm_fault_cleanup(object, first_m);
2044				thread_interrupt_level(interruptible_state);
2045
2046				return (VM_FAULT_MEMORY_SHORTAGE);
2047			}
2048			/*
2049			 * Must copy page into copy-object.
2050			 */
2051			vm_page_copy(m, copy_m);
2052
2053			/*
2054			 * If the old page was in use by any users
2055			 * of the copy-object, it must be removed
2056			 * from all pmaps.  (We can't know which
2057			 * pmaps use it.)
2058			 */
2059			if (m->pmapped)
2060			        pmap_disconnect(m->phys_page);
2061
2062			/*
			 * If there's no pager (or the existence map
			 * says the page is absent), just keep the copy
			 * resident and dirty.  Otherwise page it out
			 * immediately with the "initialize" option,
			 * except that for internal objects we first ask
			 * the pager whether it already has the page.
2066			 */
2067		 	if ((!copy_object->pager_created)
2068#if MACH_PAGEMAP
2069			    || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2070#endif
2071			    ) {
2072
2073				vm_page_lockspin_queues();
2074				assert(!m->cleaning);
2075				vm_page_activate(copy_m);
2076				vm_page_unlock_queues();
2077
2078				SET_PAGE_DIRTY(copy_m, TRUE);
2079				PAGE_WAKEUP_DONE(copy_m);
2080
2081			} else if (copy_object->internal) {
2082				/*
2083				 * For internal objects check with the pager to see
2084				 * if the page already exists in the backing store.
2085				 * If yes, then we can drop the copy page. If not,
2086				 * then we'll activate it, mark it dirty and keep it
2087				 * around.
2088				 */
2089
2090				kern_return_t kr = KERN_SUCCESS;
2091
2092				memory_object_t	copy_pager = copy_object->pager;
2093				assert(copy_pager != MEMORY_OBJECT_NULL);
2094				vm_object_paging_begin(copy_object);
2095
2096				vm_object_unlock(copy_object);
2097
2098				kr = memory_object_data_request(
2099					copy_pager,
2100					copy_offset + copy_object->paging_offset,
2101					0, /* Only query the pager. */
2102					VM_PROT_READ,
2103					NULL);
2104
2105				vm_object_lock(copy_object);
2106
2107				vm_object_paging_end(copy_object);
2108
2109				/*
2110				 * Since we dropped the copy_object's lock,
2111				 * check whether we'll have to deallocate
2112				 * the hard way.
2113				 */
2114				if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2115					vm_object_unlock(copy_object);
2116					vm_object_deallocate(copy_object);
2117					vm_object_lock(object);
2118
2119					continue;
2120				}
2121				if (kr == KERN_SUCCESS) {
2122					/*
2123					 * The pager has the page. We don't want to overwrite
2124					 * that page by sending this one out to the backing store.
2125					 * So we drop the copy page.
2126					 */
2127					VM_PAGE_FREE(copy_m);
2128
2129				} else {
2130					/*
2131					 * The pager doesn't have the page. We'll keep this one
2132					 * around in the copy object. It might get sent out to
2133					 * the backing store under memory pressure.
2134					 */
2135					vm_page_lockspin_queues();
2136					assert(!m->cleaning);
2137					vm_page_activate(copy_m);
2138					vm_page_unlock_queues();
2139
2140					SET_PAGE_DIRTY(copy_m, TRUE);
2141					PAGE_WAKEUP_DONE(copy_m);
2142				}
2143			} else {
2144
2145				assert(copy_m->busy == TRUE);
2146				assert(!m->cleaning);
2147
2148				/*
2149				 * dirty is protected by the object lock
2150				 */
2151				SET_PAGE_DIRTY(copy_m, TRUE);
2152
2153				/*
2154				 * The page is already ready for pageout:
2155				 * not on pageout queues and busy.
2156				 * Unlock everything except the
2157				 * copy_object itself.
2158				 */
2159				vm_object_unlock(object);
2160
2161				/*
2162				 * Write the page to the copy-object,
2163				 * flushing it from the kernel.
2164				 */
2165				vm_pageout_initialize_page(copy_m);
2166
2167				/*
2168				 * Since the pageout may have
2169				 * temporarily dropped the
2170				 * copy_object's lock, we
2171				 * check whether we'll have
2172				 * to deallocate the hard way.
2173				 */
2174				if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2175					vm_object_unlock(copy_object);
2176					vm_object_deallocate(copy_object);
2177					vm_object_lock(object);
2178
2179					continue;
2180				}
2181				/*
2182				 * Pick back up the old object's
2183				 * lock.  [It is safe to do so,
2184				 * since it must be deeper in the
2185				 * object tree.]
2186				 */
2187				vm_object_lock(object);
2188			}
2189
2190			/*
2191			 * Because we're pushing a page upward
2192			 * in the object tree, we must restart
2193			 * any faults that are waiting here.
2194			 * [Note that this is an expansion of
2195			 * PAGE_WAKEUP that uses the THREAD_RESTART
2196			 * wait result].  Can't turn off the page's
2197			 * busy bit because we're not done with it.
2198			 */
2199			if (m->wanted) {
2200				m->wanted = FALSE;
2201				thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2202			}
2203		}
2204		/*
2205		 * The reference count on copy_object must be
2206		 * at least 2: one for our extra reference,
2207		 * and at least one from the outside world
2208		 * (we checked that when we last locked
2209		 * copy_object).
2210		 */
2211		vm_object_lock_assert_exclusive(copy_object);
2212		copy_object->ref_count--;
2213		assert(copy_object->ref_count > 0);
2214
2215		VM_OBJ_RES_DECR(copy_object);
2216		vm_object_unlock(copy_object);
2217
2218		break;
2219	}
2220
2221done:
2222	*result_page = m;
2223	*top_page = first_m;
2224
2225	XPR(XPR_VM_FAULT,
2226		"vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2227		object, offset, m, first_m, 0);
2228
2229	if (m != VM_PAGE_NULL) {
2230		retval = VM_FAULT_SUCCESS;
2231		if (my_fault == DBG_PAGEIN_FAULT) {
2232
2233			VM_STAT_INCR(pageins);
2234			DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2235			DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2236			current_task()->pageins++;
2237
2238			if (m->object->internal) {
2239				DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2240				my_fault = DBG_PAGEIND_FAULT;
2241			} else {
2242				DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2243				my_fault = DBG_PAGEINV_FAULT;
2244			}
2245
2246		        /*
			 * evaluate access pattern and update state...
2248			 * vm_fault_deactivate_behind depends on the
2249			 * state being up to date
2250			 */
2251		        vm_fault_is_sequential(object, offset, fault_info->behavior);
2252
2253			vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2254		}
2255		if (type_of_fault)
2256		        *type_of_fault = my_fault;
2257	} else {
2258		retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2259		assert(first_m == VM_PAGE_NULL);
2260		assert(object == first_object);
2261	}
2262
2263	thread_interrupt_level(interruptible_state);
2264
2265#if TRACEFAULTPAGE
2266	dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);	/* (TEST/DEBUG) */
2267#endif
2268	return retval;
2269
2270backoff:
2271	thread_interrupt_level(interruptible_state);
2272
2273	if (wait_result == THREAD_INTERRUPTED)
2274		return (VM_FAULT_INTERRUPTED);
2275	return (VM_FAULT_RETRY);
2276
2277#undef	RELEASE_PAGE
2278}
2279
2280
2281
2282/*
2283 * CODE SIGNING:
2284 * When soft faulting a page, we have to validate the page if:
2285 * 1. the page is being mapped in user space
2286 * 2. the page hasn't already been found to be "tainted"
2287 * 3. the page belongs to a code-signed object
2288 * 4. the page has not been validated yet or has been mapped for write.
2289 */
2290#define VM_FAULT_NEED_CS_VALIDATION(pmap, page)				\
2291	((pmap) != kernel_pmap /*1*/ &&					\
2292	 !(page)->cs_tainted /*2*/ &&					\
2293	 (page)->object->code_signed /*3*/ &&				\
2294	 (!(page)->cs_validated || (page)->wpmapped /*4*/))
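/*
 * This predicate is used both by vm_fault_enter() below and by the fast
 * path in vm_fault(): when it is true, the page's VM object must be held
 * exclusively so that vm_page_validate_cs() can update the page's
 * cs_validated / cs_tainted state.
 */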
2295
2296
2297/*
2298 * page queue lock must NOT be held
2299 * m->object must be locked
2300 *
2301 * NOTE: m->object could be locked "shared" only if we are called
2302 * from vm_fault() as part of a soft fault.  If so, we must be
2303 * careful not to modify the VM object in any way that is not
2304 * legal under a shared lock...
2305 */
2306unsigned long cs_enter_tainted_rejected = 0;
2307unsigned long cs_enter_tainted_accepted = 0;
2308kern_return_t
2309vm_fault_enter(vm_page_t m,
2310	       pmap_t pmap,
2311	       vm_map_offset_t vaddr,
2312	       vm_prot_t prot,
2313	       vm_prot_t fault_type,
2314	       boolean_t wired,
2315	       boolean_t change_wiring,
2316	       boolean_t no_cache,
2317	       boolean_t cs_bypass,
2318	       boolean_t *need_retry,
2319	       int *type_of_fault)
2320{
2321	kern_return_t	kr, pe_result;
2322	boolean_t	previously_pmapped = m->pmapped;
2323	boolean_t	must_disconnect = 0;
2324	boolean_t	map_is_switched, map_is_switch_protected;
2325
2326	vm_object_lock_assert_held(m->object);
2327#if DEBUG
2328	lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2329#endif /* DEBUG */
2330
2331	if (m->phys_page == vm_page_guard_addr) {
2332		assert(m->fictitious);
2333		return KERN_SUCCESS;
2334	}
2335
2336	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2337
2338		vm_object_lock_assert_exclusive(m->object);
2339
2340	} else if ((fault_type & VM_PROT_WRITE) == 0) {
2341		/*
2342		 * This is not a "write" fault, so we
2343		 * might not have taken the object lock
2344		 * exclusively and we might not be able
2345		 * to update the "wpmapped" bit in
2346		 * vm_fault_enter().
2347		 * Let's just grant read access to
2348		 * the page for now and we'll
2349		 * soft-fault again if we need write
2350		 * access later...
2351		 */
2352		prot &= ~VM_PROT_WRITE;
2353	}
2354	if (m->pmapped == FALSE) {
2355
2356		if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2357		        /*
2358			 * found it in the cache, but this
2359			 * is the first fault-in of the page (m->pmapped == FALSE)
2360			 * so it must have come in as part of
2361			 * a cluster... account 1 pagein against it
2362			 */
2363		        VM_STAT_INCR(pageins);
2364			DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2365
2366			if (m->object->internal) {
2367				DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2368				*type_of_fault = DBG_PAGEIND_FAULT;
2369			} else {
2370				DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2371				*type_of_fault = DBG_PAGEINV_FAULT;
2372			}
2373
2374			current_task()->pageins++;
2375		}
2376		VM_PAGE_CONSUME_CLUSTERED(m);
2377
2378	}
2379
2380	if (*type_of_fault != DBG_COW_FAULT) {
2381		DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2382
2383		if (pmap == kernel_pmap) {
2384			DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2385		}
2386	}
2387
2388	/* Validate code signature if necessary. */
2389	if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2390		vm_object_lock_assert_exclusive(m->object);
2391
2392		if (m->cs_validated) {
2393			vm_cs_revalidates++;
2394		}
2395
2396		/* VM map is locked, so 1 ref will remain on VM object -
2397		 * so no harm if vm_page_validate_cs drops the object lock */
2398		vm_page_validate_cs(m);
2399	}
2400
2401#define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2402
2403	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2404			   (pmap == vm_map_pmap(current_thread()->map)));
2405	map_is_switch_protected = current_thread()->map->switch_protect;
2406
2407	/* If the map is switched, and is switch-protected, we must protect
2408	 * some pages from being write-faulted: immutable pages because by
2409	 * definition they may not be written, and executable pages because that
2410	 * would provide a way to inject unsigned code.
2411	 * If the page is immutable, we can simply return. However, we can't
2412	 * immediately determine whether a page is executable anywhere. But,
2413	 * we can disconnect it everywhere and remove the executable protection
2414	 * from the current map. We do that below right before we do the
2415	 * PMAP_ENTER.
2416	 */
2417	if(!cs_enforcement_disable && map_is_switched &&
2418	   map_is_switch_protected && page_immutable(m, prot) &&
2419	   (prot & VM_PROT_WRITE))
2420	{
2421		return KERN_CODESIGN_ERROR;
2422	}
2423
2424	/* A page could be tainted, or pose a risk of being tainted later.
2425	 * Check whether the receiving process wants it, and make it feel
	 * the consequences (that happens in cs_invalid_page()).
2427	 * For CS Enforcement, two other conditions will
2428	 * cause that page to be tainted as well:
2429	 * - pmapping an unsigned page executable - this means unsigned code;
2430	 * - writeable mapping of a validated page - the content of that page
2431	 *   can be changed without the kernel noticing, therefore unsigned
2432	 *   code can be created
2433	 */
2434	if (m->cs_tainted ||
2435	    (( !cs_enforcement_disable && !cs_bypass ) &&
2436	     (/* The page is unsigned and wants to be executable */
2437	      (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2438	      /* The page should be immutable, but is in danger of being modified
2439		* This is the case where we want policy from the code directory -
2440		* is the page immutable or not? For now we have to assume that
2441		* code pages will be immutable, data pages not.
2442		* We'll assume a page is a code page if it has a code directory
2443		* and we fault for execution.
2444		* That is good enough since if we faulted the code page for
2445		* writing in another map before, it is wpmapped; if we fault
2446		* it for writing in this map later it will also be faulted for executing
2447		* at the same time; and if we fault for writing in another map
2448		* later, we will disconnect it from this pmap so we'll notice
2449		* the change.
2450		*/
2451	      (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2452	      ))
2453		)
2454	{
2455		/* We will have a tainted page. Have to handle the special case
2456		 * of a switched map now. If the map is not switched, standard
2457		 * procedure applies - call cs_invalid_page().
2458		 * If the map is switched, the real owner is invalid already.
2459		 * There is no point in invalidating the switching process since
2460		 * it will not be executing from the map. So we don't call
2461		 * cs_invalid_page() in that case. */
2462		boolean_t reject_page;
2463		if(map_is_switched) {
2464			assert(pmap==vm_map_pmap(current_thread()->map));
2465			assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2466			reject_page = FALSE;
2467		} else {
2468			reject_page = cs_invalid_page((addr64_t) vaddr);
2469		}
2470
2471		if (reject_page) {
2472			/* reject the tainted page: abort the page fault */
2473			kr = KERN_CODESIGN_ERROR;
2474			cs_enter_tainted_rejected++;
2475		} else {
2476			/* proceed with the tainted page */
2477			kr = KERN_SUCCESS;
2478			/* Page might have been tainted before or not; now it
2479			 * definitively is. If the page wasn't tainted, we must
2480			 * disconnect it from all pmaps later. */
2481			must_disconnect = !m->cs_tainted;
2482			m->cs_tainted = TRUE;
2483			cs_enter_tainted_accepted++;
2484		}
2485		if (cs_debug || kr != KERN_SUCCESS) {
2486			printf("CODESIGNING: vm_fault_enter(0x%llx): "
2487			       "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2488			       (long long)vaddr, m, m->object, m->offset);
2489		}
2490
2491	} else {
2492		/* proceed with the valid page */
2493		kr = KERN_SUCCESS;
2494	}
2495
2496	/* If we have a KERN_SUCCESS from the previous checks, we either have
2497	 * a good page, or a tainted page that has been accepted by the process.
2498	 * In both cases the page will be entered into the pmap.
2499	 * If the page is writeable, we need to disconnect it from other pmaps
2500	 * now so those processes can take note.
2501	 */
2502	if (kr == KERN_SUCCESS) {
2503	        /*
2504		 * NOTE: we may only hold the vm_object lock SHARED
2505		 * at this point, but the update of pmapped is ok
2506		 * since this is the ONLY bit updated behind the SHARED
2507		 * lock... however, we need to figure out how to do an atomic
2508		 * update on a bit field to make this less fragile... right
2509		 * now I don't know how to coerce 'C' to give me the offset info
2510		 * that's needed for an AtomicCompareAndSwap
2511		 */
2512		m->pmapped = TRUE;
2513		if(vm_page_is_slideable(m)) {
2514			boolean_t was_busy = m->busy;
2515			m->busy = TRUE;
2516			kr = vm_page_slide(m, 0);
2517			assert(m->busy);
2518			if(!was_busy) {
2519				PAGE_WAKEUP_DONE(m);
2520			}
2521			if (kr != KERN_SUCCESS) {
2522				/*
2523				 * This page has not been slid correctly,
2524				 * do not do the pmap_enter() !
2525				 * Let vm_fault_enter() return the error
2526				 * so the caller can fail the fault.
2527				 */
2528				goto after_the_pmap_enter;
2529			}
2530		}
2531
2532		if (fault_type & VM_PROT_WRITE) {
2533
2534			if (m->wpmapped == FALSE) {
2535				vm_object_lock_assert_exclusive(m->object);
2536
2537				m->wpmapped = TRUE;
2538			}
2539			if (must_disconnect) {
2540				/*
2541				 * We can only get here
2542				 * because of the CSE logic
2543				 */
2544				assert(cs_enforcement_disable == FALSE);
2545				pmap_disconnect(m->phys_page);
2546				/*
2547				 * If we are faulting for a write, we can clear
2548				 * the execute bit - that will ensure the page is
2549				 * checked again before being executable, which
2550				 * protects against a map switch.
2551				 * This only happens the first time the page
2552				 * gets tainted, so we won't get stuck here
2553				 * to make an already writeable page executable.
2554				 */
2555				if (!cs_bypass){
2556					prot &= ~VM_PROT_EXECUTE;
2557				}
2558			}
2559		}
2560
2561		/* Prevent a deadlock by not
2562		 * holding the object lock if we need to wait for a page in
2563		 * pmap_enter() - <rdar://problem/7138958> */
2564		PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
2565				  wired, PMAP_OPTIONS_NOWAIT, pe_result);
2566
2567		if(pe_result == KERN_RESOURCE_SHORTAGE) {
2568
2569			if (need_retry) {
2570				/*
2571				 * this will be non-null in the case where we hold the lock
2572				 * on the top-object in this chain... we can't just drop
2573				 * the lock on the object we're inserting the page into
2574				 * and recall the PMAP_ENTER since we can still cause
2575				 * a deadlock if one of the critical paths tries to
2576				 * acquire the lock on the top-object and we're blocked
2577				 * in PMAP_ENTER waiting for memory... our only recourse
2578				 * is to deal with it at a higher level where we can
2579				 * drop both locks.
2580				 */
2581				*need_retry = TRUE;
2582				vm_pmap_enter_retried++;
2583				goto after_the_pmap_enter;
2584			}
			/* The nonblocking version of pmap_enter did not succeed,
			 * and we don't need to drop other locks and retry
			 * at the level above us, so
			 * use the blocking version instead.  This requires marking
			 * the page busy and unlocking the object. */
2590			boolean_t was_busy = m->busy;
2591			m->busy = TRUE;
2592			vm_object_unlock(m->object);
2593
2594			PMAP_ENTER(pmap, vaddr, m, prot, fault_type, 0, wired);
2595
2596			/* Take the object lock again. */
2597			vm_object_lock(m->object);
2598
2599			/* If the page was busy, someone else will wake it up.
2600			 * Otherwise, we have to do it now. */
2601			assert(m->busy);
2602			if(!was_busy) {
2603				PAGE_WAKEUP_DONE(m);
2604			}
2605			vm_pmap_enter_blocked++;
2606		}
2607	}
2608
2609after_the_pmap_enter:
2610	/*
2611	 * Hold queues lock to manipulate
2612	 * the page queues.  Change wiring
2613	 * case is obvious.
2614	 */
2615	if (change_wiring) {
2616	        vm_page_lockspin_queues();
2617
2618		if (wired) {
2619			if (kr == KERN_SUCCESS) {
2620				vm_page_wire(m);
2621			}
2622		} else {
2623		        vm_page_unwire(m, TRUE);
2624		}
2625		vm_page_unlock_queues();
2626
2627	} else {
2628	        if (kr != KERN_SUCCESS) {
2629		        vm_page_lockspin_queues();
2630		        vm_page_deactivate(m);
2631		        vm_page_unlock_queues();
2632		} else {
2633		        if (((!m->active && !m->inactive) || m->clean_queue || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) {
2634
2635				if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2636					struct vpl	*lq;
2637					uint32_t	lid;
2638
2639					/*
					 * we got a local queue to stuff this new page on...
					 * it's safe to manipulate local and local_id at this point
					 * since we're behind an exclusive object lock and the
					 * page is not on any global queue.
					 *
					 * we'll use the current cpu number to select the queue...
					 * note that we don't need to disable preemption since we're
					 * going to be behind the local queue's lock to do the real
					 * work
2649					 */
2650					lid = cpu_number();
2651
2652					lq = &vm_page_local_q[lid].vpl_un.vpl;
2653
2654					VPL_LOCK(&lq->vpl_lock);
2655
2656					queue_enter(&lq->vpl_queue, m, vm_page_t, pageq);
2657					m->local = TRUE;
2658					m->local_id = lid;
2659					lq->vpl_count++;
2660
2661					VPL_UNLOCK(&lq->vpl_lock);
2662
2663					if (lq->vpl_count > vm_page_local_q_soft_limit) {
2664						/*
2665						 * we're beyond the soft limit for the local queue
2666						 * vm_page_reactivate_local will 'try' to take
2667						 * the global page queue lock... if it can't that's
2668						 * ok... we'll let the queue continue to grow up
2669						 * to the hard limit... at that point we'll wait
2670						 * for the lock... once we've got the lock, we'll
2671						 * transfer all of the pages from the local queue
2672						 * to the global active queue
2673						 */
2674						vm_page_reactivate_local(lid, FALSE, FALSE);
2675					}
2676					return kr;
2677				}
2678
2679			        vm_page_lockspin_queues();
2680				/*
2681				 * test again now that we hold the page queue lock
2682				 */
2683				if (!VM_PAGE_WIRED(m)) {
2684					if (m->clean_queue) {
2685						VM_PAGE_QUEUES_REMOVE(m);
2686
2687						vm_pageout_cleaned_reactivated++;
2688						vm_pageout_cleaned_fault_reactivated++;
2689					}
2690
2691					if ((!m->active && !m->inactive) || no_cache) {
2692						/*
2693						 * If this is a no_cache mapping and the page has never been
2694						 * mapped before or was previously a no_cache page, then we
2695						 * want to leave pages in the speculative state so that they
2696						 * can be readily recycled if free memory runs low.  Otherwise
2697						 * the page is activated as normal.
2698						 */
2699
2700						if (no_cache && (!previously_pmapped || m->no_cache)) {
2701							m->no_cache = TRUE;
2702
2703							if (!m->speculative)
2704								vm_page_speculate(m, FALSE);
2705
2706						} else if (!m->active && !m->inactive) {
2707
2708							vm_page_activate(m);
2709						}
2710					}
2711				}
2712				vm_page_unlock_queues();
2713			}
2714		}
2715	}
2716	return kr;
2717}
2718
2719
2720/*
2721 *	Routine:	vm_fault
2722 *	Purpose:
2723 *		Handle page faults, including pseudo-faults
2724 *		used to change the wiring status of pages.
2725 *	Returns:
2726 *		Explicit continuations have been removed.
2727 *	Implementation:
2728 *		vm_fault and vm_fault_page save mucho state
2729 *		in the moral equivalent of a closure.  The state
2730 *		structure is allocated when first entering vm_fault
2731 *		and deallocated when leaving vm_fault.
2732 */
2733
2734extern int _map_enter_debug;
2735
2736unsigned long vm_fault_collapse_total = 0;
2737unsigned long vm_fault_collapse_skipped = 0;
2738
2739kern_return_t
2740vm_fault(
2741	vm_map_t	map,
2742	vm_map_offset_t	vaddr,
2743	vm_prot_t	fault_type,
2744	boolean_t	change_wiring,
2745	int		interruptible,
2746	pmap_t		caller_pmap,
2747	vm_map_offset_t	caller_pmap_addr)
2748{
	vm_map_version_t	version;	/* Map version for verification */
2750	boolean_t		wired;		/* Should mapping be wired down? */
2751	vm_object_t		object;		/* Top-level object */
2752	vm_object_offset_t	offset;		/* Top-level offset */
2753	vm_prot_t		prot;		/* Protection for mapping */
2754	vm_object_t		old_copy_object; /* Saved copy object */
2755	vm_page_t		result_page;	/* Result of vm_fault_page */
2756	vm_page_t		top_page;	/* Placeholder page */
2757	kern_return_t		kr;
2758
2759	vm_page_t		m;	/* Fast access to result_page */
2760	kern_return_t		error_code;
2761	vm_object_t		cur_object;
2762	vm_object_offset_t	cur_offset;
2763	vm_page_t		cur_m;
2764	vm_object_t		new_object;
2765	int                     type_of_fault;
2766	pmap_t			pmap;
2767	boolean_t		interruptible_state;
2768	vm_map_t		real_map = map;
2769	vm_map_t		original_map = map;
2770	vm_prot_t		original_fault_type;
2771	struct vm_object_fault_info fault_info;
2772	boolean_t		need_collapse = FALSE;
2773	boolean_t		need_retry = FALSE;
2774	int			object_lock_type = 0;
2775	int			cur_object_lock_type;
2776	vm_object_t		top_object = VM_OBJECT_NULL;
2777	int			throttle_delay;
2778
2779
2780	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2781	              (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2782			      (int)((uint64_t)vaddr >> 32),
2783			      (int)vaddr,
2784			      (map == kernel_map),
2785			      0,
2786			      0);
2787
2788	if (get_preemption_level() != 0) {
2789	        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2790				      (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2791				      (int)((uint64_t)vaddr >> 32),
2792				      (int)vaddr,
2793				      KERN_FAILURE,
2794				      0,
2795				      0);
2796
2797		return (KERN_FAILURE);
2798	}
2799
2800	interruptible_state = thread_interrupt_level(interruptible);
2801
2802	VM_STAT_INCR(faults);
2803	current_task()->faults++;
2804	original_fault_type = fault_type;
2805
2806	if (fault_type & VM_PROT_WRITE)
2807	        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2808	else
2809	        object_lock_type = OBJECT_LOCK_SHARED;
2810
2811	cur_object_lock_type = OBJECT_LOCK_SHARED;
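	/*
	 * Write faults take the top-level object lock exclusive up front,
	 * since they will be inserting or dirtying pages; read faults start
	 * out with a shared lock and upgrade only if it turns out the
	 * object has to be modified.
	 */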
2812
2813RetryFault:
2814	/*
	 * assume we will hit a page in the cache;
2816	 * otherwise, explicitly override with
2817	 * the real fault type once we determine it
2818	 */
2819	type_of_fault = DBG_CACHE_HIT_FAULT;
2820
2821	/*
2822	 *	Find the backing store object and offset into
2823	 *	it to begin the search.
2824	 */
2825	fault_type = original_fault_type;
2826	map = original_map;
2827	vm_map_lock_read(map);
2828
2829	kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2830				  object_lock_type, &version,
2831				  &object, &offset, &prot, &wired,
2832				  &fault_info,
2833				  &real_map);
2834
2835	if (kr != KERN_SUCCESS) {
2836		vm_map_unlock_read(map);
2837		goto done;
2838	}
2839	pmap = real_map->pmap;
2840	fault_info.interruptible = interruptible;
2841	fault_info.stealth = FALSE;
2842	fault_info.io_sync = FALSE;
2843	fault_info.mark_zf_absent = FALSE;
2844	fault_info.batch_pmap_op = FALSE;
2845
2846	/*
2847	 * If the page is wired, we must fault for the current protection
2848	 * value, to avoid further faults.
2849	 */
2850	if (wired) {
2851		fault_type = prot | VM_PROT_WRITE;
2852		/*
2853		 * since we're treating this fault as a 'write'
2854		 * we must hold the top object lock exclusively
2855		 */
2856		if (object_lock_type == OBJECT_LOCK_SHARED) {
2857
2858		        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2859
2860			if (vm_object_lock_upgrade(object) == FALSE) {
2861			        /*
				 * couldn't upgrade, so explicitly
2863				 * take the lock exclusively
2864				 */
2865			        vm_object_lock(object);
2866			}
2867		}
2868	}
2869
2870#if	VM_FAULT_CLASSIFY
2871	/*
2872	 *	Temporary data gathering code
2873	 */
2874	vm_fault_classify(object, offset, fault_type);
2875#endif
2876	/*
2877	 *	Fast fault code.  The basic idea is to do as much as
2878	 *	possible while holding the map lock and object locks.
2879	 *      Busy pages are not used until the object lock has to
2880	 *	be dropped to do something (copy, zero fill, pmap enter).
2881	 *	Similarly, paging references aren't acquired until that
2882	 *	point, and object references aren't used.
2883	 *
2884	 *	If we can figure out what to do
2885	 *	(zero fill, copy on write, pmap enter) while holding
2886	 *	the locks, then it gets done.  Otherwise, we give up,
2887	 *	and use the original fault path (which doesn't hold
2888	 *	the map lock, and relies on busy pages).
2889	 *	The give up cases include:
2890	 * 		- Have to talk to pager.
2891	 *		- Page is busy, absent or in error.
2892	 *		- Pager has locked out desired access.
2893	 *		- Fault needs to be restarted.
2894	 *		- Have to push page into copy object.
2895	 *
2896	 *	The code is an infinite loop that moves one level down
2897	 *	the shadow chain each time.  cur_object and cur_offset
2898	 * 	refer to the current object being examined. object and offset
2899	 *	are the original object from the map.  The loop is at the
2900	 *	top level if and only if object and cur_object are the same.
2901	 *
2902	 *	Invariants:  Map lock is held throughout.  Lock is held on
2903	 *		original object and cur_object (if different) when
2904	 *		continuing or exiting loop.
2905	 *
2906	 */
2907
2908
2909	/*
2910	 * If this page is to be inserted in a copy delay object
2911	 * for writing, and if the object has a copy, then the
	 * copy delay strategy is implemented in the slow fault path.
2913	 */
2914	if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2915	    object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2916	        goto handle_copy_delay;
2917
2918	cur_object = object;
2919	cur_offset = offset;
2920
2921	while (TRUE) {
2922		if (!cur_object->pager_created &&
2923		    cur_object->phys_contiguous) /* superpage */
2924			break;
2925
2926		if (cur_object->blocked_access) {
2927			/*
2928			 * Access to this VM object has been blocked.
2929			 * Let the slow path handle it.
2930			 */
2931			break;
2932		}
2933
2934		m = vm_page_lookup(cur_object, cur_offset);
2935
2936		if (m != VM_PAGE_NULL) {
2937			if (m->busy) {
2938			        wait_result_t	result;
2939
2940				/*
2941				 * in order to do the PAGE_ASSERT_WAIT, we must
				 * have the object that 'm' belongs to locked exclusively
2943				 */
2944				if (object != cur_object) {
2945				        vm_object_unlock(object);
2946
2947					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2948
2949					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2950
2951						if (vm_object_lock_upgrade(cur_object) == FALSE) {
2952						        /*
2953							 * couldn't upgrade so go do a full retry
2954							 * immediately since we've already dropped
2955							 * the top object lock associated with this page
2956							 * and the current one got dropped due to the
2957							 * failed upgrade... the state is no longer valid
2958							 */
2959						        vm_map_unlock_read(map);
2960							if (real_map != map)
2961							        vm_map_unlock(real_map);
2962
2963							goto RetryFault;
2964						}
2965					}
2966				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
2967
2968				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2969
2970					if (vm_object_lock_upgrade(object) == FALSE) {
2971					        /*
						 * couldn't upgrade, so explicitly take the lock
2973						 * exclusively and go relookup the page since we
2974						 * will have dropped the object lock and
2975						 * a different thread could have inserted
2976						 * a page at this offset
2977						 * no need for a full retry since we're
2978						 * at the top level of the object chain
2979						 */
2980					        vm_object_lock(object);
2981
2982						continue;
2983					}
2984				}
2985				vm_map_unlock_read(map);
2986				if (real_map != map)
2987				        vm_map_unlock(real_map);
2988
2989				result = PAGE_ASSERT_WAIT(m, interruptible);
2990
2991				vm_object_unlock(cur_object);
2992
2993				if (result == THREAD_WAITING) {
2994				        result = thread_block(THREAD_CONTINUE_NULL);
2995
2996					counter(c_vm_fault_page_block_busy_kernel++);
2997				}
2998				if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2999				        goto RetryFault;
3000
3001				kr = KERN_ABORTED;
3002				goto done;
3003			}
3004			if (m->laundry) {
3005				if (object != cur_object) {
3006					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3007						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3008
3009						vm_object_unlock(object);
3010						vm_object_unlock(cur_object);
3011
3012						vm_map_unlock_read(map);
3013						if (real_map != map)
3014							vm_map_unlock(real_map);
3015
3016						goto RetryFault;
3017					}
3018
3019				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3020
3021					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3022
3023					if (vm_object_lock_upgrade(object) == FALSE) {
3024						/*
						 * couldn't upgrade, so explicitly take the lock
3026						 * exclusively and go relookup the page since we
3027						 * will have dropped the object lock and
3028						 * a different thread could have inserted
3029						 * a page at this offset
3030						 * no need for a full retry since we're
3031						 * at the top level of the object chain
3032						 */
3033						vm_object_lock(object);
3034
3035						continue;
3036					}
3037				}
3038				m->pageout = FALSE;
3039
3040				vm_pageout_steal_laundry(m, FALSE);
3041			}
3042
3043			if (m->phys_page == vm_page_guard_addr) {
3044				/*
3045				 * Guard page: let the slow path deal with it
3046				 */
3047				break;
3048			}
3049			if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3050			        /*
3051				 * Unusual case... let the slow path deal with it
3052				 */
3053				break;
3054			}
3055			if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3056				if (object != cur_object)
3057					vm_object_unlock(object);
3058				vm_map_unlock_read(map);
3059				if (real_map != map)
3060				        vm_map_unlock(real_map);
3061				vm_object_unlock(cur_object);
3062				kr = KERN_MEMORY_ERROR;
3063				goto done;
3064			}
3065
3066			if (m->encrypted) {
3067				/*
3068				 * ENCRYPTED SWAP:
3069				 * We've soft-faulted (because it's not in the page
3070				 * table) on an encrypted page.
3071				 * Keep the page "busy" so that no one messes with
3072				 * it during the decryption.
3073				 * Release the extra locks we're holding, keep only
3074				 * the page's VM object lock.
3075				 *
3076				 * in order to set 'busy' on 'm', we must
				 * have the object that 'm' belongs to locked exclusively
3078				 */
3079			        if (object != cur_object) {
3080					vm_object_unlock(object);
3081
3082					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3083
3084					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3085
3086						if (vm_object_lock_upgrade(cur_object) == FALSE) {
3087						        /*
3088							 * couldn't upgrade so go do a full retry
3089							 * immediately since we've already dropped
3090							 * the top object lock associated with this page
3091							 * and the current one got dropped due to the
3092							 * failed upgrade... the state is no longer valid
3093							 */
3094						        vm_map_unlock_read(map);
3095							if (real_map != map)
3096							        vm_map_unlock(real_map);
3097
3098							goto RetryFault;
3099						}
3100					}
3101				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3102
3103				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3104
3105					if (vm_object_lock_upgrade(object) == FALSE) {
3106					        /*
						 * couldn't upgrade, so explicitly take the lock
3108						 * exclusively and go relookup the page since we
3109						 * will have dropped the object lock and
3110						 * a different thread could have inserted
3111						 * a page at this offset
3112						 * no need for a full retry since we're
3113						 * at the top level of the object chain
3114						 */
3115					        vm_object_lock(object);
3116
3117						continue;
3118					}
3119				}
3120				m->busy = TRUE;
3121
3122				vm_map_unlock_read(map);
3123				if (real_map != map)
3124					vm_map_unlock(real_map);
3125
3126				vm_page_decrypt(m, 0);
3127
3128				assert(m->busy);
3129				PAGE_WAKEUP_DONE(m);
3130
3131				vm_object_unlock(cur_object);
3132				/*
3133				 * Retry from the top, in case anything
3134				 * changed while we were decrypting...
3135				 */
3136				goto RetryFault;
3137			}
3138			ASSERT_PAGE_DECRYPTED(m);
3139
3140			if(vm_page_is_slideable(m)) {
3141				/*
3142				 * We might need to slide this page, and so,
3143				 * we want to hold the VM object exclusively.
3144				 */
3145			        if (object != cur_object) {
3146					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3147						vm_object_unlock(object);
3148						vm_object_unlock(cur_object);
3149
3150					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3151
3152						vm_map_unlock_read(map);
3153						if (real_map != map)
3154							vm_map_unlock(real_map);
3155
3156						goto RetryFault;
3157					}
3158				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3159
3160					vm_object_unlock(object);
3161				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3162					vm_map_unlock_read(map);
3163					goto RetryFault;
3164				}
3165			}
3166
3167			if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
3168upgrade_for_validation:
3169				/*
3170				 * We might need to validate this page
3171				 * against its code signature, so we
3172				 * want to hold the VM object exclusively.
3173				 */
3174			        if (object != cur_object) {
3175					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3176						vm_object_unlock(object);
3177						vm_object_unlock(cur_object);
3178
3179					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3180
3181						vm_map_unlock_read(map);
3182						if (real_map != map)
3183							vm_map_unlock(real_map);
3184
3185						goto RetryFault;
3186					}
3187
3188				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3189
3190				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3191
3192					if (vm_object_lock_upgrade(object) == FALSE) {
3193					        /*
						 * couldn't upgrade, so explicitly take the lock
3195						 * exclusively and go relookup the page since we
3196						 * will have dropped the object lock and
3197						 * a different thread could have inserted
3198						 * a page at this offset
3199						 * no need for a full retry since we're
3200						 * at the top level of the object chain
3201						 */
3202					        vm_object_lock(object);
3203
3204						continue;
3205					}
3206				}
3207			}
3208			/*
3209			 *	Two cases of map in faults:
3210			 *	    - At top level w/o copy object.
3211			 *	    - Read fault anywhere.
3212			 *		--> must disallow write.
3213			 */
3214
3215			if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3216
3217				goto FastPmapEnter;
3218			}
3219
3220			if ((fault_type & VM_PROT_WRITE) == 0) {
3221
3222			  	if (object != cur_object) {
3223				        /*
3224					 * We still need to hold the top object
3225					 * lock here to prevent a race between
3226					 * a read fault (taking only "shared"
3227					 * locks) and a write fault (taking
3228					 * an "exclusive" lock on the top
					 * object).
3230					 * Otherwise, as soon as we release the
3231					 * top lock, the write fault could
3232					 * proceed and actually complete before
3233					 * the read fault, and the copied page's
3234					 * translation could then be overwritten
3235					 * by the read fault's translation for
3236					 * the original page.
3237					 *
3238					 * Let's just record what the top object
3239					 * is and we'll release it later.
3240					 */
3241					top_object = object;
3242
3243					/*
3244					 * switch to the object that has the new page
3245					 */
3246					object = cur_object;
3247					object_lock_type = cur_object_lock_type;
3248				}
3249FastPmapEnter:
3250				/*
3251				 * prepare for the pmap_enter...
3252				 * object and map are both locked
3253				 * m contains valid data
3254				 * object == m->object
3255				 * cur_object == NULL or it's been unlocked
3256				 * no paging references on either object or cur_object
3257				 */
3258				if (caller_pmap) {
3259				        kr = vm_fault_enter(m,
3260							    caller_pmap,
3261							    caller_pmap_addr,
3262							    prot,
3263							    fault_type,
3264							    wired,
3265							    change_wiring,
3266							    fault_info.no_cache,
3267							    fault_info.cs_bypass,
3268							    (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
3269							    &type_of_fault);
3270				} else {
3271				        kr = vm_fault_enter(m,
3272							    pmap,
3273							    vaddr,
3274							    prot,
3275							    fault_type,
3276							    wired,
3277							    change_wiring,
3278							    fault_info.no_cache,
3279							    fault_info.cs_bypass,
3280							    (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
3281							    &type_of_fault);
3282				}
3283
3284				if (top_object != VM_OBJECT_NULL) {
3285					/*
3286					 * It's safe to drop the top object
3287					 * now that we've done our
3288					 * vm_fault_enter().  Any other fault
3289					 * in progress for that virtual
3290					 * address will either find our page
3291					 * and translation or put in a new page
3292					 * and translation.
3293					 */
3294					vm_object_unlock(top_object);
3295					top_object = VM_OBJECT_NULL;
3296				}
3297
3298				if (need_collapse == TRUE)
3299				        vm_object_collapse(object, offset, TRUE);
3300
3301				if (need_retry == FALSE &&
3302				    (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3303				        /*
					 * evaluate access pattern and update state...
3305					 * vm_fault_deactivate_behind depends on the
3306					 * state being up to date
3307					 */
3308				        vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3309
3310					vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3311				}
3312				/*
3313				 * That's it, clean up and return.
3314				 */
3315				if (m->busy)
3316				        PAGE_WAKEUP_DONE(m);
3317
3318				vm_object_unlock(object);
3319
3320				vm_map_unlock_read(map);
3321				if (real_map != map)
3322					vm_map_unlock(real_map);
3323
3324				if (need_retry == TRUE) {
3325					/*
3326					 * vm_fault_enter couldn't complete the PMAP_ENTER...
3327					 * at this point we don't hold any locks so it's safe
3328					 * to ask the pmap layer to expand the page table to
3329					 * accommodate this mapping... once expanded, we'll
3330					 * re-drive the fault which should result in vm_fault_enter
3331					 * being able to successfully enter the mapping this time around
3332					 */
3333					(void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, PMAP_OPTIONS_NOENTER);
3334
3335					need_retry = FALSE;
3336					goto RetryFault;
3337				}
3338				goto done;
3339			}
3340			/*
3341			 * COPY ON WRITE FAULT
3342			 */
3343			assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3344
3345			if ((throttle_delay = vm_page_throttled())) {
3346				/*
3347				 * drop all of our locks...
3348				 * wait until the free queue is
3349				 * pumped back up and then
3350				 * redrive the fault
3351				 */
3352				if (object != cur_object)
3353					vm_object_unlock(cur_object);
3354				vm_object_unlock(object);
3355				vm_map_unlock_read(map);
3356				if (real_map != map)
3357					vm_map_unlock(real_map);
3358
3359				VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3360
3361				delay(throttle_delay);
3362
3363				if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3364						 THREAD_UNINT :
3365						 THREAD_ABORTSAFE))
3366					goto RetryFault;
3367				kr = KERN_ABORTED;
3368				goto done;
3369			}
3370                        /*
3371			 * If objects match, then
3372			 * object->copy must not be NULL (else control
3373			 * would be in previous code block), and we
3374			 * have a potential push into the copy object
			 * which we can't cope with here.
3376			 */
3377			if (cur_object == object) {
3378			        /*
3379				 * must take the slow path to
3380				 * deal with the copy push
3381				 */
3382				break;
3383			}
3384
3385			/*
3386			 * This is now a shadow based copy on write
3387			 * fault -- it requires a copy up the shadow
3388			 * chain.
3389			 */
3390
3391			if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3392			    VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3393				goto upgrade_for_validation;
3394			}
3395
3396			/*
3397			 * Allocate a page in the original top level
3398			 * object. Give up if allocate fails.  Also
3399			 * need to remember current page, as it's the
3400			 * source of the copy.
3401			 *
3402			 * at this point we hold locks on both
3403			 * object and cur_object... no need to take
3404			 * paging refs or mark pages BUSY since
3405			 * we don't drop either object lock until
3406			 * the page has been copied and inserted
3407			 */
3408			cur_m = m;
3409			m = vm_page_grab();
3410
3411			if (m == VM_PAGE_NULL) {
3412			        /*
3413				 * no free page currently available...
3414				 * must take the slow path
3415				 */
3416				break;
3417			}
3418			/*
3419			 * Now do the copy.  Mark the source page busy...
3420			 *
3421			 *	NOTE: This code holds the map lock across
3422			 *	the page copy.
3423			 */
3424			vm_page_copy(cur_m, m);
3425			vm_page_insert(m, object, offset);
3426			SET_PAGE_DIRTY(m, FALSE);
3427
3428			/*
3429			 * Now cope with the source page and object
3430			 */
3431			if (object->ref_count > 1 && cur_m->pmapped)
3432			        pmap_disconnect(cur_m->phys_page);
3433
3434			need_collapse = TRUE;
3435
3436			if (!cur_object->internal &&
3437			    cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3438			        /*
3439				 * The object from which we've just
3440				 * copied a page is most probably backed
3441				 * by a vnode.  We don't want to waste too
3442				 * much time trying to collapse the VM objects
3443				 * and create a bottleneck when several tasks
3444				 * map the same file.
3445				 */
3446			        if (cur_object->copy == object) {
3447				        /*
3448					 * Shared mapping or no COW yet.
3449					 * We can never collapse a copy
3450					 * object into its backing object.
3451					 */
3452				        need_collapse = FALSE;
3453				} else if (cur_object->copy == object->shadow &&
3454					   object->shadow->resident_page_count == 0) {
3455				        /*
3456					 * Shared mapping after a COW occurred.
3457					 */
3458				        need_collapse = FALSE;
3459				}
3460			}
3461			vm_object_unlock(cur_object);
3462
3463			if (need_collapse == FALSE)
3464			        vm_fault_collapse_skipped++;
3465			vm_fault_collapse_total++;
3466
3467			type_of_fault = DBG_COW_FAULT;
3468			VM_STAT_INCR(cow_faults);
3469			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
3470			current_task()->cow_faults++;
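			/*
			 * The copy now sits dirty in the top-level object
			 * and the accounting above mirrors the slow path's
			 * COW handling in vm_fault_page(); jump back to
			 * FastPmapEnter to map it.
			 */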
3471
3472			goto FastPmapEnter;
3473
3474		} else {
3475			/*
3476			 * No page at cur_object, cur_offset... m == NULL
3477			 */
3478			if (cur_object->pager_created) {
3479			        if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
3480				        /*
3481					 * May have to talk to a pager...
3482					 * take the slow path.
3483					 */
3484				        break;
3485				}
3486				/*
3487				 * existence map present and indicates
3488				 * that the pager doesn't have this page
3489				 */
3490			}
3491			if (cur_object->shadow == VM_OBJECT_NULL) {
3492				/*
3493				 * Zero fill fault.  Page gets
3494				 * inserted into the original object.
3495				 */
3496				if (cur_object->shadow_severed ||
3497				    VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
3498				{
3499					if (object != cur_object)
3500					        vm_object_unlock(cur_object);
3501					vm_object_unlock(object);
3502
3503					vm_map_unlock_read(map);
3504					if (real_map != map)
3505						vm_map_unlock(real_map);
3506
3507					kr = KERN_MEMORY_ERROR;
3508					goto done;
3509				}
3510				if ((throttle_delay = vm_page_throttled())) {
3511					/*
3512					 * drop all of our locks...
3513					 * wait until the free queue is
3514					 * pumped back up and then
3515					 * redrive the fault
3516					 */
3517					if (object != cur_object)
3518						vm_object_unlock(cur_object);
3519					vm_object_unlock(object);
3520					vm_map_unlock_read(map);
3521					if (real_map != map)
3522						vm_map_unlock(real_map);
3523
3524					VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3525
3526					delay(throttle_delay);
3527
3528					if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3529							 THREAD_UNINT :
3530							 THREAD_ABORTSAFE))
3531						goto RetryFault;
3532					kr = KERN_ABORTED;
3533					goto done;
3534				}
3535				if (vm_backing_store_low) {
3536				        /*
3537					 * we are protecting the system from
3538					 * backing store exhaustion...
3539					 * must take the slow path if we're
3540					 * not privileged
3541					 */
3542					if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
3543					        break;
3544				}
3545			  	if (cur_object != object) {
3546					vm_object_unlock(cur_object);
3547
3548					cur_object = object;
3549				}
3550				if (object_lock_type == OBJECT_LOCK_SHARED) {
3551
3552				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3553
3554					if (vm_object_lock_upgrade(object) == FALSE) {
3555					        /*
3556						 * couldn't upgrade so do a full retry on the fault
3557						 * since we dropped the object lock which
3558						 * could allow another thread to insert
3559						 * a page at this offset
3560						 */
3561					        vm_map_unlock_read(map);
3562						if (real_map != map)
3563						        vm_map_unlock(real_map);
3564
3565						goto RetryFault;
3566					}
3567				}
3568				m = vm_page_alloc(object, offset);
3569
3570				if (m == VM_PAGE_NULL) {
3571				        /*
3572					 * no free page currently available...
3573					 * must take the slow path
3574					 */
3575					break;
3576				}
3577
3578				/*
3579				 * Now zero fill page...
3580				 * the page is probably going to
3581				 * be written soon, so don't bother
3582				 * to clear the modified bit
3583				 *
3584				 *   NOTE: This code holds the map
3585				 *   lock across the zero fill.
3586				 */
3587				type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
3588
3589				goto FastPmapEnter;
3590		        }
3591			/*
3592			 * On to the next level in the shadow chain
3593			 */
3594			cur_offset += cur_object->vo_shadow_offset;
3595			new_object = cur_object->shadow;
3596
3597			/*
3598			 * take the new_object's lock with the indicated state
3599			 */
3600			if (cur_object_lock_type == OBJECT_LOCK_SHARED)
3601			        vm_object_lock_shared(new_object);
3602			else
3603			        vm_object_lock(new_object);
3604
3605			if (cur_object != object)
3606				vm_object_unlock(cur_object);
3607
3608			cur_object = new_object;
3609
3610			continue;
3611		}
3612	}
3613	/*
3614	 * Cleanup from fast fault failure.  Drop any object
3615	 * lock other than original and drop map lock.
3616	 */
3617	if (object != cur_object)
3618		vm_object_unlock(cur_object);
3619
3620	/*
3621	 * must own the object lock exclusively at this point
3622	 */
3623	if (object_lock_type == OBJECT_LOCK_SHARED) {
3624	        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3625
3626		if (vm_object_lock_upgrade(object) == FALSE) {
3627		        /*
3628			 * couldn't upgrade, so explicitly
3629			 * take the lock exclusively...
3630			 * no need to retry the fault at this
3631			 * point since "vm_fault_page" will
3632			 * completely re-evaluate the state
3633			 */
3634		        vm_object_lock(object);
3635		}
3636	}
3637
3638handle_copy_delay:
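	/*
	 * Slow path: give up the map locks and let vm_fault_page()
	 * resolve the fault, talking to the pager if necessary.
	 */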
3639	vm_map_unlock_read(map);
3640	if (real_map != map)
3641		vm_map_unlock(real_map);
3642
3643   	/*
3644	 * Make a reference to this object to
3645	 * prevent its disposal while we are messing with
3646	 * it.  Once we have the reference, the map is free
3647	 * to be diddled.  Since objects reference their
3648	 * shadows (and copies), they will stay around as well.
3649	 */
3650	vm_object_reference_locked(object);
3651	vm_object_paging_begin(object);
3652
3653	XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
3654
3655	error_code = 0;
3656
3657	kr = vm_fault_page(object, offset, fault_type,
3658			   (change_wiring && !wired),
3659			   &prot, &result_page, &top_page,
3660			   &type_of_fault,
3661			   &error_code, map->no_zero_fill,
3662			   FALSE, &fault_info);
3663
3664	/*
3665	 * if kr != VM_FAULT_SUCCESS, then the paging reference
3666	 * has been dropped and the object unlocked... the ref_count
3667	 * is still held
3668	 *
3669	 * if kr == VM_FAULT_SUCCESS, then the paging reference
3670	 * is still held along with the ref_count on the original object
3671	 *
3672	 *	the object is returned locked with a paging reference
3673	 *
3674	 *	if top_page != NULL, then it's BUSY and the
3675	 *	object it belongs to has a paging reference
3676	 *	but is returned unlocked
3677	 */
3678	if (kr != VM_FAULT_SUCCESS &&
3679	    kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
3680	        /*
3681		 * we didn't succeed, lose the object reference immediately.
3682		 */
3683		vm_object_deallocate(object);
3684
3685		/*
3686		 * See why we failed, and take corrective action.
3687		 */
3688		switch (kr) {
3689		case VM_FAULT_MEMORY_SHORTAGE:
3690			if (vm_page_wait((change_wiring) ?
3691					 THREAD_UNINT :
3692					 THREAD_ABORTSAFE))
3693				goto RetryFault;
3694			/*
3695			 * fall thru
3696			 */
3697		case VM_FAULT_INTERRUPTED:
3698			kr = KERN_ABORTED;
3699			goto done;
3700		case VM_FAULT_RETRY:
3701			goto RetryFault;
3702		case VM_FAULT_MEMORY_ERROR:
3703			if (error_code)
3704				kr = error_code;
3705			else
3706				kr = KERN_MEMORY_ERROR;
3707			goto done;
3708		default:
3709			panic("vm_fault: unexpected error 0x%x from "
3710			      "vm_fault_page()\n", kr);
3711		}
3712	}
3713	m = result_page;
3714
3715	if (m != VM_PAGE_NULL) {
3716		assert((change_wiring && !wired) ?
3717	   	    (top_page == VM_PAGE_NULL) :
3718	   	    ((top_page == VM_PAGE_NULL) == (m->object == object)));
3719	}
3720
3721	/*
3722	 * What to do with the resulting page from vm_fault_page
3723	 * if it doesn't get entered into the physical map:
3724	 */
3725#define RELEASE_PAGE(m)					\
3726	MACRO_BEGIN					\
3727	PAGE_WAKEUP_DONE(m);				\
3728	if (!m->active && !m->inactive && !m->throttled) {		\
3729		vm_page_lockspin_queues();				\
3730		if (!m->active && !m->inactive && !m->throttled)	\
3731			vm_page_activate(m);				\
3732		vm_page_unlock_queues();				\
3733	}								\
3734	MACRO_END
3735
3736	/*
3737	 * We must verify that the maps have not changed
3738	 * since our last lookup.
3739	 */
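	/*
	 * Remember the current copy object so that, after re-locking
	 * below, we can detect whether a copy object was pushed in
	 * front of this object while no object locks were held.
	 */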
3740	if (m != VM_PAGE_NULL) {
3741		old_copy_object = m->object->copy;
3742		vm_object_unlock(m->object);
3743	} else {
3744		old_copy_object = VM_OBJECT_NULL;
3745		vm_object_unlock(object);
3746	}
3747
3748	/*
3749	 * no object locks are held at this point
3750	 */
3751	if ((map != original_map) || !vm_map_verify(map, &version)) {
3752		vm_object_t		retry_object;
3753		vm_object_offset_t	retry_offset;
3754		vm_prot_t		retry_prot;
3755
3756		/*
3757		 * To avoid trying to write_lock the map while another
3758		 * thread has it read_locked (in vm_map_pageable), we
3759		 * do not try for write permission.  If the page is
3760		 * still writable, we will get write permission.  If it
3761		 * is not, or has been marked needs_copy, we enter the
3762		 * mapping without write permission, and will merely
3763		 * take another fault.
3764		 */
3765		map = original_map;
3766		vm_map_lock_read(map);
3767
3768		kr = vm_map_lookup_locked(&map, vaddr,
3769					  fault_type & ~VM_PROT_WRITE,
3770					  OBJECT_LOCK_EXCLUSIVE, &version,
3771					  &retry_object, &retry_offset, &retry_prot,
3772					  &wired,
3773					  &fault_info,
3774					  &real_map);
3775		pmap = real_map->pmap;
3776
3777		if (kr != KERN_SUCCESS) {
3778			vm_map_unlock_read(map);
3779
3780			if (m != VM_PAGE_NULL) {
3781			        /*
3782				 * retake the lock so that
3783				 * we can drop the paging reference
3784				 * in vm_fault_cleanup and do the
3785				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3786				 */
3787				vm_object_lock(m->object);
3788
3789				RELEASE_PAGE(m);
3790
3791				vm_fault_cleanup(m->object, top_page);
3792			} else {
3793			        /*
3794				 * retake the lock so that
3795				 * we can drop the paging reference
3796				 * in vm_fault_cleanup
3797				 */
3798			        vm_object_lock(object);
3799
3800			        vm_fault_cleanup(object, top_page);
3801			}
3802			vm_object_deallocate(object);
3803
3804			goto done;
3805		}
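		/*
		 * The new lookup returned retry_object locked; we only
		 * need it to check that we are still faulting on the
		 * same object/offset, so unlock it right away.
		 */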
3806		vm_object_unlock(retry_object);
3807
3808		if ((retry_object != object) || (retry_offset != offset)) {
3809
3810			vm_map_unlock_read(map);
3811			if (real_map != map)
3812				vm_map_unlock(real_map);
3813
3814			if (m != VM_PAGE_NULL) {
3815			        /*
3816				 * retake the lock so that
3817				 * we can drop the paging reference
3818				 * in vm_fault_cleanup and do the
3819				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3820				 */
3821			        vm_object_lock(m->object);
3822
3823				RELEASE_PAGE(m);
3824
3825				vm_fault_cleanup(m->object, top_page);
3826			} else {
3827			        /*
3828				 * retake the lock so that
3829				 * we can drop the paging reference
3830				 * in vm_fault_cleanup
3831				 */
3832			        vm_object_lock(object);
3833
3834			        vm_fault_cleanup(object, top_page);
3835			}
3836			vm_object_deallocate(object);
3837
3838			goto RetryFault;
3839		}
3840		/*
3841		 * Check whether the protection has changed or the object
3842		 * has been copied while we left the map unlocked.
3843		 */
3844		prot &= retry_prot;
3845	}
3846	if (m != VM_PAGE_NULL) {
3847		vm_object_lock(m->object);
3848
3849		if (m->object->copy != old_copy_object) {
3850		        /*
3851			 * The copy object changed while the top-level object
3852			 * was unlocked, so take away write permission.
3853			 */
3854			prot &= ~VM_PROT_WRITE;
3855		}
3856	} else
3857		vm_object_lock(object);
3858
3859	/*
3860	 * If we want to wire down this page, but no longer have
3861	 * adequate permissions, we must start all over.
3862	 */
3863	if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3864
3865		vm_map_verify_done(map, &version);
3866		if (real_map != map)
3867			vm_map_unlock(real_map);
3868
3869		if (m != VM_PAGE_NULL) {
3870			RELEASE_PAGE(m);
3871
3872			vm_fault_cleanup(m->object, top_page);
3873		} else
3874		        vm_fault_cleanup(object, top_page);
3875
3876		vm_object_deallocate(object);
3877
3878		goto RetryFault;
3879	}
3880	if (m != VM_PAGE_NULL) {
3881		/*
3882		 * Put this page into the physical map.
3883		 * We had to do the unlock above because pmap_enter
3884		 * may cause other faults.  The page may be on
3885		 * the pageout queues.  If the pageout daemon comes
3886		 * across the page, it will remove it from the queues.
3887		 */
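		/*
		 * Enter the page in the caller-supplied pmap if one was
		 * given (e.g. for a wiring request), otherwise in the
		 * pmap of the map we faulted against.
		 */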
3888		if (caller_pmap) {
3889			kr = vm_fault_enter(m,
3890					    caller_pmap,
3891					    caller_pmap_addr,
3892					    prot,
3893					    fault_type,
3894					    wired,
3895					    change_wiring,
3896					    fault_info.no_cache,
3897					    fault_info.cs_bypass,
3898					    NULL,
3899					    &type_of_fault);
3900		} else {
3901			kr = vm_fault_enter(m,
3902					    pmap,
3903					    vaddr,
3904					    prot,
3905					    fault_type,
3906					    wired,
3907					    change_wiring,
3908					    fault_info.no_cache,
3909					    fault_info.cs_bypass,
3910					    NULL,
3911					    &type_of_fault);
3912		}
3913		if (kr != KERN_SUCCESS) {
3914			/* abort this page fault */
3915			vm_map_verify_done(map, &version);
3916			if (real_map != map)
3917				vm_map_unlock(real_map);
3918			PAGE_WAKEUP_DONE(m);
3919			vm_fault_cleanup(m->object, top_page);
3920			vm_object_deallocate(object);
3921			goto done;
3922		}
3923	} else {
3924
3925		vm_map_entry_t		entry;
3926		vm_map_offset_t		laddr;
3927		vm_map_offset_t		ldelta, hdelta;
3928
3929		/*
3930		 * do a pmap block mapping from the physical address
3931		 * in the object
3932		 */
3933
3934#ifdef ppc
3935		/* While we do not worry about execution protection in   */
3936		/* general, certain pages may have instruction execution */
3937		/* disallowed.  We will check here, and if not allowed   */
3938		/* to execute, we return with a protection failure.      */
3939
3940		if ((fault_type & VM_PROT_EXECUTE) &&
3941			(!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
3942
3943			vm_map_verify_done(map, &version);
3944
3945			if (real_map != map)
3946				vm_map_unlock(real_map);
3947
3948			vm_fault_cleanup(object, top_page);
3949			vm_object_deallocate(object);
3950
3951			kr = KERN_PROTECTION_FAILURE;
3952			goto done;
3953		}
3954#endif	/* ppc */
3955
3956		if (real_map != map)
3957			vm_map_unlock(real_map);
3958
3959		if (original_map != map) {
3960			vm_map_unlock_read(map);
3961			vm_map_lock_read(original_map);
3962			map = original_map;
3963		}
3964		real_map = map;
3965
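		/*
		 * Work out how far the block mapping may extend on either
		 * side of the faulting address: ldelta/hdelta start out
		 * large and are clipped to the map entry found below,
		 * walking down through any submaps along the way.
		 */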
3966		laddr = vaddr;
3967		hdelta = 0xFFFFF000;
3968		ldelta = 0xFFFFF000;
3969
3970		while (vm_map_lookup_entry(map, laddr, &entry)) {
3971			if (ldelta > (laddr - entry->vme_start))
3972				ldelta = laddr - entry->vme_start;
3973			if (hdelta > (entry->vme_end - laddr))
3974				hdelta = entry->vme_end - laddr;
3975			if (entry->is_sub_map) {
3976
3977				laddr = (laddr - entry->vme_start)
3978							+ entry->offset;
3979				vm_map_lock_read(entry->object.sub_map);
3980
3981				if (map != real_map)
3982					vm_map_unlock_read(map);
3983				if (entry->use_pmap) {
3984					vm_map_unlock_read(real_map);
3985					real_map = entry->object.sub_map;
3986				}
3987				map = entry->object.sub_map;
3988
3989			} else {
3990				break;
3991			}
3992		}
3993
3994		if (vm_map_lookup_entry(map, laddr, &entry) &&
3995					(entry->object.vm_object != NULL) &&
3996					(entry->object.vm_object == object)) {
3997
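			/*
			 * The entry maps this object directly; enter a block
			 * mapping, flagged as a superpage when the object is
			 * physically contiguous and not pager-backed.
			 */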
3998			int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
3999			if (caller_pmap) {
4000				/*
4001				 * Set up a block mapped area
4002				 */
4003				assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4004				pmap_map_block(caller_pmap,
4005					       (addr64_t)(caller_pmap_addr - ldelta),
4006					       (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4007							  entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4008					       (uint32_t)((ldelta + hdelta) >> 12), prot,
4009					       (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4010			} else {
4011				/*
4012				 * Set up a block mapped area
4013				 */
4014				assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4015				pmap_map_block(real_map->pmap,
4016					       (addr64_t)(vaddr - ldelta),
4017					       (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4018							  entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4019					       (uint32_t)((ldelta + hdelta) >> 12), prot,
4020					       (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4021			}
4022		}
4023	}
4024
4025	/*
4026	 * Unlock everything, and return
4027	 */
4028	vm_map_verify_done(map, &version);
4029	if (real_map != map)
4030		vm_map_unlock(real_map);
4031
4032	if (m != VM_PAGE_NULL) {
4033		PAGE_WAKEUP_DONE(m);
4034
4035		vm_fault_cleanup(m->object, top_page);
4036	} else
4037	        vm_fault_cleanup(object, top_page);
4038
4039	vm_object_deallocate(object);
4040
4041#undef	RELEASE_PAGE
4042
4043	kr = KERN_SUCCESS;
4044done:
4045	thread_interrupt_level(interruptible_state);
4046
4047	throttle_lowpri_io(TRUE);
4048
4049	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4050			      (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4051			      (int)((uint64_t)vaddr >> 32),
4052			      (int)vaddr,
4053			      kr,
4054			      type_of_fault,
4055			      0);
4056
4057	return (kr);
4058}
4059
4060/*
4061 *	vm_fault_wire:
4062 *
4063 *	Wire down a range of virtual addresses in a map.
4064 */
4065kern_return_t
4066vm_fault_wire(
4067	vm_map_t	map,
4068	vm_map_entry_t	entry,
4069	pmap_t		pmap,
4070	vm_map_offset_t	pmap_addr)
4071{
4072
4073	register vm_map_offset_t	va;
4074	register vm_map_offset_t	end_addr = entry->vme_end;
4075	register kern_return_t	rc;
4076
4077	assert(entry->in_transition);
4078
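	/*
	 * Physically contiguous memory is wired by default, so there
	 * is nothing to do here (vm_fault_unwire() skips it as well).
	 */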
4079	if ((entry->object.vm_object != NULL) &&
4080			!entry->is_sub_map &&
4081			entry->object.vm_object->phys_contiguous) {
4082		return KERN_SUCCESS;
4083	}
4084
4085	/*
4086	 *	Inform the physical mapping system that the
4087	 *	range of addresses may not fault, so that
4088	 *	page tables and such can be locked down as well.
4089	 */
4090
4091	pmap_pageable(pmap, pmap_addr,
4092		pmap_addr + (end_addr - entry->vme_start), FALSE);
4093
4094	/*
4095	 *	We simulate a fault to get the page and enter it
4096	 *	in the physical map.
4097	 */
4098
4099	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4100		if ((rc = vm_fault_wire_fast(
4101			map, va, entry, pmap,
4102			pmap_addr + (va - entry->vme_start)
4103			)) != KERN_SUCCESS) {
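			/*
			 * The fast path couldn't handle it; fall back to the
			 * full fault handler to wire this page, using the
			 * caller-supplied pmap.
			 */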
4104			rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
4105			  	(pmap == kernel_pmap) ?
4106					THREAD_UNINT : THREAD_ABORTSAFE,
4107				pmap, pmap_addr + (va - entry->vme_start));
4108			DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4109		}
4110
4111		if (rc != KERN_SUCCESS) {
4112			struct vm_map_entry	tmp_entry = *entry;
4113
4114			/* unwire wired pages */
4115			tmp_entry.vme_end = va;
4116			vm_fault_unwire(map,
4117				&tmp_entry, FALSE, pmap, pmap_addr);
4118
4119			return rc;
4120		}
4121	}
4122	return KERN_SUCCESS;
4123}
4124
4125/*
4126 *	vm_fault_unwire:
4127 *
4128 *	Unwire a range of virtual addresses in a map.
4129 */
4130void
4131vm_fault_unwire(
4132	vm_map_t	map,
4133	vm_map_entry_t	entry,
4134	boolean_t	deallocate,
4135	pmap_t		pmap,
4136	vm_map_offset_t	pmap_addr)
4137{
4138	register vm_map_offset_t	va;
4139	register vm_map_offset_t	end_addr = entry->vme_end;
4140	vm_object_t		object;
4141	struct vm_object_fault_info fault_info;
4142
4143	object = (entry->is_sub_map)
4144			? VM_OBJECT_NULL : entry->object.vm_object;
4145
4146	/*
4147	 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4148	 * do anything since such memory is wired by default.  So we don't have
4149	 * anything to undo here.
4150	 */
4151
4152	if (object != VM_OBJECT_NULL && object->phys_contiguous)
4153		return;
4154
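	/*
	 * Set up fault info for looking up the wired pages below;
	 * "stealth" mode keeps vm_fault_page() from disturbing the
	 * page queues while we do so.
	 */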
4155	fault_info.interruptible = THREAD_UNINT;
4156	fault_info.behavior = entry->behavior;
4157	fault_info.user_tag = entry->alias;
4158	fault_info.lo_offset = entry->offset;
4159	fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4160	fault_info.no_cache = entry->no_cache;
4161	fault_info.stealth = TRUE;
4162	fault_info.io_sync = FALSE;
4163	fault_info.cs_bypass = FALSE;
4164	fault_info.mark_zf_absent = FALSE;
4165	fault_info.batch_pmap_op = FALSE;
4166
4167	/*
4168	 *	Since the pages are wired down, we must be able to
4169	 *	get their mappings from the physical map system.
4170	 */
4171
4172	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4173
4174		if (object == VM_OBJECT_NULL) {
4175			if (pmap) {
4176				pmap_change_wiring(pmap,
4177						   pmap_addr + (va - entry->vme_start), FALSE);
4178			}
4179			(void) vm_fault(map, va, VM_PROT_NONE,
4180					TRUE, THREAD_UNINT, pmap, pmap_addr);
4181		} else {
4182		 	vm_prot_t	prot;
4183			vm_page_t	result_page;
4184			vm_page_t	top_page;
4185			vm_object_t	result_object;
4186			vm_fault_return_t result;
4187
4188			if (end_addr - va > (vm_size_t) -1) {
4189				/* 32-bit overflow */
4190				fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4191			} else {
4192				fault_info.cluster_size = (vm_size_t) (end_addr - va);
4193				assert(fault_info.cluster_size == end_addr - va);
4194			}
4195
4196			do {
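				/*
				 * Look up the wired page: since it is wired it
				 * must already be resident, which is what the
				 * TRUE (must_be_resident) argument to
				 * vm_fault_page() below asserts.
				 */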
4197				prot = VM_PROT_NONE;
4198
4199				vm_object_lock(object);
4200				vm_object_paging_begin(object);
4201				XPR(XPR_VM_FAULT,
4202					"vm_fault_unwire -> vm_fault_page\n",
4203					0,0,0,0,0);
4204			 	result = vm_fault_page(
4205					object,
4206					entry->offset + (va - entry->vme_start),
4207					VM_PROT_NONE, TRUE,
4208					&prot, &result_page, &top_page,
4209					(int *)0,
4210					NULL, map->no_zero_fill,
4211					FALSE, &fault_info);
4212			} while (result == VM_FAULT_RETRY);
4213
4214			/*
4215			 * If this was a mapping to a file on a device that has been forcibly
4216			 * unmounted, then we won't get a page back from vm_fault_page().  Just
4217			 * move on to the next one in case the remaining pages are mapped from
4218			 * different objects.  During a forced unmount, the object is terminated
4219			 * so the alive flag will be false if this happens.  A forced unmount
4220			 * will occur when an external disk is unplugged before the user does an
4221			 * eject, so we don't want to panic in that situation.
4222			 */
4223
4224			if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
4225				continue;
4226
4227			if (result != VM_FAULT_SUCCESS)
4228				panic("vm_fault_unwire: failure");
4229
4230			result_object = result_page->object;
4231
4232			if (deallocate) {
4233				assert(result_page->phys_page !=
4234				       vm_page_fictitious_addr);
4235				pmap_disconnect(result_page->phys_page);
4236				VM_PAGE_FREE(result_page);
4237			} else {
4238				if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
4239					pmap_change_wiring(pmap,
4240					    pmap_addr + (va - entry->vme_start), FALSE);
4241
4242
4243				if (VM_PAGE_WIRED(result_page)) {
4244					vm_page_lockspin_queues();
4245					vm_page_unwire(result_page, TRUE);
4246					vm_page_unlock_queues();
4247				}
4248				if(entry->zero_wired_pages) {
4249					pmap_zero_page(result_page->phys_page);
4250					entry->zero_wired_pages = FALSE;
4251				}
4252
4253				PAGE_WAKEUP_DONE(result_page);
4254			}
4255			vm_fault_cleanup(result_object, top_page);
4256		}
4257	}
4258
4259	/*
4260	 *	Inform the physical mapping system that the range
4261	 *	of addresses may fault, so that page tables and
4262	 *	such may be unwired themselves.
4263	 */
4264
4265	pmap_pageable(pmap, pmap_addr,
4266		pmap_addr + (end_addr - entry->vme_start), TRUE);
4267
4268}
4269
4270/*
4271 *	vm_fault_wire_fast:
4272 *
4273 *	Handle common case of a wire down page fault at the given address.
4274 *	If successful, the page is inserted into the associated physical map.
4275 *	The map entry is passed in to avoid the overhead of a map lookup.
4276 *
4277 *	NOTE: the given address should be truncated to the
4278 *	proper page address.
4279 *
4280 *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
4281 *	a standard error specifying why the fault is fatal is returned.
4282 *
4283 *	The map in question must be referenced, and remains so.
4284 *	Caller has a read lock on the map.
4285 *
4286 *	This is a stripped version of vm_fault() for wiring pages.  Anything
4287 *	other than the common case will return KERN_FAILURE, and the caller
4288 *	is expected to call vm_fault().
4289 */
4290kern_return_t
4291vm_fault_wire_fast(
4292	__unused vm_map_t	map,
4293	vm_map_offset_t	va,
4294	vm_map_entry_t	entry,
4295	pmap_t			pmap,
4296	vm_map_offset_t	pmap_addr)
4297{
4298	vm_object_t		object;
4299	vm_object_offset_t	offset;
4300	register vm_page_t	m;
4301	vm_prot_t		prot;
4302	thread_t           	thread = current_thread();
4303	int			type_of_fault;
4304	kern_return_t		kr;
4305
4306	VM_STAT_INCR(faults);
4307
4308	if (thread != THREAD_NULL && thread->task != TASK_NULL)
4309	  thread->task->faults++;
4310
4311/*
4312 *	Recovery actions
4313 */
4314
4315#undef	RELEASE_PAGE
4316#define RELEASE_PAGE(m)	{				\
4317	PAGE_WAKEUP_DONE(m);				\
4318	vm_page_lockspin_queues();			\
4319	vm_page_unwire(m, TRUE);			\
4320	vm_page_unlock_queues();			\
4321}
4322
4323
4324#undef	UNLOCK_THINGS
4325#define UNLOCK_THINGS	{				\
4326	vm_object_paging_end(object);			   \
4327	vm_object_unlock(object);			   \
4328}
4329
4330#undef	UNLOCK_AND_DEALLOCATE
4331#define UNLOCK_AND_DEALLOCATE	{			\
4332	UNLOCK_THINGS;					\
4333	vm_object_deallocate(object);			\
4334}
4335/*
4336 *	Give up and have caller do things the hard way.
4337 */
4338
4339#define GIVE_UP {					\
4340	UNLOCK_AND_DEALLOCATE;				\
4341	return(KERN_FAILURE);				\
4342}
4343
4344
4345	/*
4346	 *	If this entry is not directly to a vm_object, bail out.
4347	 */
4348	if (entry->is_sub_map)
4349		return(KERN_FAILURE);
4350
4351	/*
4352	 *	Find the backing store object and offset into it.
4353	 */
4354
4355	object = entry->object.vm_object;
4356	offset = (va - entry->vme_start) + entry->offset;
4357	prot = entry->protection;
4358
4359   	/*
4360	 *	Make a reference to this object to prevent its
4361	 *	disposal while we are messing with it.
4362	 */
4363
4364	vm_object_lock(object);
4365	vm_object_reference_locked(object);
4366	vm_object_paging_begin(object);
4367
4368	/*
4369	 *	INVARIANTS (through entire routine):
4370	 *
4371	 *	1)	At all times, we must either have the object
4372	 *		lock or a busy page in some object to prevent
4373	 *		some other thread from trying to bring in
4374	 *		the same page.
4375	 *
4376	 *	2)	Once we have a busy page, we must remove it from
4377	 *		the pageout queues, so that the pageout daemon
4378	 *		will not grab it away.
4379	 *
4380	 */
4381
4382	/*
4383	 *	Look for page in top-level object.  If it's not there or
4384	 *	there's something going on, give up.
4385	 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
4386	 * decrypt the page before wiring it down.
4387	 */
4388	m = vm_page_lookup(object, offset);
4389	if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
4390	    (m->unusual && ( m->error || m->restart || m->absent))) {
4391
4392		GIVE_UP;
4393	}
4394	ASSERT_PAGE_DECRYPTED(m);
4395
4396	if (m->fictitious &&
4397	    m->phys_page == vm_page_guard_addr) {
4398		/*
4399		 * Guard pages are fictitious pages and are never
4400		 * entered into a pmap, so let's say it's been wired...
4401		 */
4402		kr = KERN_SUCCESS;
4403		goto done;
4404	}
4405
4406	/*
4407	 *	Wire the page down now.  All bail outs beyond this
4408	 *	point must unwire the page.
4409	 */
4410
4411	vm_page_lockspin_queues();
4412	vm_page_wire(m);
4413	vm_page_unlock_queues();
4414
4415	/*
4416	 *	Mark page busy for other threads.
4417	 */
4418	assert(!m->busy);
4419	m->busy = TRUE;
4420	assert(!m->absent);
4421
4422	/*
4423	 *	Give up if the page is being written and there's a copy object
4424	 */
4425	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
4426		RELEASE_PAGE(m);
4427		GIVE_UP;
4428	}
4429
4430	/*
4431	 *	Put this page into the physical map.
4432	 */
4433	type_of_fault = DBG_CACHE_HIT_FAULT;
4434	kr = vm_fault_enter(m,
4435			    pmap,
4436			    pmap_addr,
4437			    prot,
4438			    prot,
4439			    TRUE,
4440			    FALSE,
4441			    FALSE,
4442			    FALSE,
4443			    NULL,
4444			    &type_of_fault);
4445
4446done:
4447	/*
4448	 *	Unlock everything, and return
4449	 */
4450
4451	PAGE_WAKEUP_DONE(m);
4452	UNLOCK_AND_DEALLOCATE;
4453
4454	return kr;
4455
4456}
4457
4458/*
4459 *	Routine:	vm_fault_copy_cleanup
4460 *	Purpose:
4461 *		Release a page used by vm_fault_copy.
4462 */
4463
4464void
4465vm_fault_copy_cleanup(
4466	vm_page_t	page,
4467	vm_page_t	top_page)
4468{
4469	vm_object_t	object = page->object;
4470
4471	vm_object_lock(object);
4472	PAGE_WAKEUP_DONE(page);
4473	if (!page->active && !page->inactive && !page->throttled) {
4474		vm_page_lockspin_queues();
4475		if (!page->active && !page->inactive && !page->throttled)
4476			vm_page_activate(page);
4477		vm_page_unlock_queues();
4478	}
4479	vm_fault_cleanup(object, top_page);
4480}
4481
4482void
4483vm_fault_copy_dst_cleanup(
4484	vm_page_t	page)
4485{
4486	vm_object_t	object;
4487
4488	if (page != VM_PAGE_NULL) {
4489		object = page->object;
4490		vm_object_lock(object);
4491		vm_page_lockspin_queues();
4492		vm_page_unwire(page, TRUE);
4493		vm_page_unlock_queues();
4494		vm_object_paging_end(object);
4495		vm_object_unlock(object);
4496	}
4497}
4498
4499/*
4500 *	Routine:	vm_fault_copy
4501 *
4502 *	Purpose:
4503 *		Copy pages from one virtual memory object to another --
4504 *		neither the source nor destination pages need be resident.
4505 *
4506 *		Before actually copying a page, the version associated with
4507 *		the destination address map will be verified.
4508 *
4509 *	In/out conditions:
4510 *		The caller must hold a reference, but not a lock, to
4511 *		each of the source and destination objects and to the
4512 *		destination map.
4513 *
4514 *	Results:
4515 *		Returns KERN_SUCCESS if no errors were encountered in
4516 *		reading or writing the data.  Returns KERN_INTERRUPTED if
4517 *		the operation was interrupted (only possible if the
4518 *		"interruptible" argument is asserted).  Other return values
4519 *		indicate a permanent error in copying the data.
4520 *
4521 *		The actual amount of data copied will be returned in the
4522 *		"copy_size" argument.  In the event that the destination map
4523 *		verification failed, this amount may be less than the amount
4524 *		requested.
4525 */
4526kern_return_t
4527vm_fault_copy(
4528	vm_object_t		src_object,
4529	vm_object_offset_t	src_offset,
4530	vm_map_size_t		*copy_size,		/* INOUT */
4531	vm_object_t		dst_object,
4532	vm_object_offset_t	dst_offset,
4533	vm_map_t		dst_map,
4534	vm_map_version_t	 *dst_version,
4535	int			interruptible)
4536{
4537	vm_page_t		result_page;
4538
4539	vm_page_t		src_page;
4540	vm_page_t		src_top_page;
4541	vm_prot_t		src_prot;
4542
4543	vm_page_t		dst_page;
4544	vm_page_t		dst_top_page;
4545	vm_prot_t		dst_prot;
4546
4547	vm_map_size_t		amount_left;
4548	vm_object_t		old_copy_object;
4549	kern_return_t		error = 0;
4550	vm_fault_return_t	result;
4551
4552	vm_map_size_t		part_size;
4553	struct vm_object_fault_info fault_info_src;
4554	struct vm_object_fault_info fault_info_dst;
4555
4556	/*
4557	 * In order not to confuse the clustered pageins, align
4558	 * the different offsets on a page boundary.
4559	 */
4560
4561#define	RETURN(x)					\
4562	MACRO_BEGIN					\
4563	*copy_size -= amount_left;			\
4564	MACRO_RETURN(x);				\
4565	MACRO_END
4566
4567	amount_left = *copy_size;
4568
4569	fault_info_src.interruptible = interruptible;
4570	fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
4571	fault_info_src.user_tag  = 0;
4572	fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
4573	fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
4574	fault_info_src.no_cache   = FALSE;
4575	fault_info_src.stealth = TRUE;
4576	fault_info_src.io_sync = FALSE;
4577	fault_info_src.cs_bypass = FALSE;
4578	fault_info_src.mark_zf_absent = FALSE;
4579	fault_info_src.batch_pmap_op = FALSE;
4580
4581	fault_info_dst.interruptible = interruptible;
4582	fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
4583	fault_info_dst.user_tag  = 0;
4584	fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
4585	fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
4586	fault_info_dst.no_cache   = FALSE;
4587	fault_info_dst.stealth = TRUE;
4588	fault_info_dst.io_sync = FALSE;
4589	fault_info_dst.cs_bypass = FALSE;
4590	fault_info_dst.mark_zf_absent = FALSE;
4591	fault_info_dst.batch_pmap_op = FALSE;
4592
4593	do { /* while (amount_left > 0) */
4594		/*
4595		 * There may be a deadlock if both source and destination
4596		 * pages are the same. To avoid this deadlock, the copy must
4597		 * start by getting the destination page in order to apply
4598		 * COW semantics if any.
4599		 */
4600
4601	RetryDestinationFault: ;
4602
4603		dst_prot = VM_PROT_WRITE|VM_PROT_READ;
4604
4605		vm_object_lock(dst_object);
4606		vm_object_paging_begin(dst_object);
4607
4608		if (amount_left > (vm_size_t) -1) {
4609			/* 32-bit overflow */
4610			fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4611		} else {
4612			fault_info_dst.cluster_size = (vm_size_t) amount_left;
4613			assert(fault_info_dst.cluster_size == amount_left);
4614		}
4615
4616		XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
4617		result = vm_fault_page(dst_object,
4618				       vm_object_trunc_page(dst_offset),
4619				       VM_PROT_WRITE|VM_PROT_READ,
4620				       FALSE,
4621				       &dst_prot, &dst_page, &dst_top_page,
4622				       (int *)0,
4623				       &error,
4624				       dst_map->no_zero_fill,
4625				       FALSE, &fault_info_dst);
4626		switch (result) {
4627		case VM_FAULT_SUCCESS:
4628			break;
4629		case VM_FAULT_RETRY:
4630			goto RetryDestinationFault;
4631		case VM_FAULT_MEMORY_SHORTAGE:
4632			if (vm_page_wait(interruptible))
4633				goto RetryDestinationFault;
4634			/* fall thru */
4635		case VM_FAULT_INTERRUPTED:
4636			RETURN(MACH_SEND_INTERRUPTED);
4637		case VM_FAULT_SUCCESS_NO_VM_PAGE:
4638			/* success but no VM page: fail the copy */
4639			vm_object_paging_end(dst_object);
4640			vm_object_unlock(dst_object);
4641			/*FALLTHROUGH*/
4642		case VM_FAULT_MEMORY_ERROR:
4643			if (error)
4644				return (error);
4645			else
4646				return(KERN_MEMORY_ERROR);
4647		default:
4648			panic("vm_fault_copy: unexpected error 0x%x from "
4649			      "vm_fault_page()\n", result);
4650		}
4651		assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
4652
4653		old_copy_object = dst_page->object->copy;
4654
4655		/*
4656		 * There exists the possibility that the source and
4657		 * destination page are the same.  But we can't
4658		 * easily determine that now.  If they are the
4659		 * same, the call to vm_fault_page() for the
4660		 * destination page will deadlock.  To prevent this we
4661		 * wire the page so we can drop busy without having
4662		 * the page daemon steal the page.  We clean up the
4663		 * top page but keep the paging reference on the object
4664		 * holding the dest page so it doesn't go away.
4665		 */
4666
4667		vm_page_lockspin_queues();
4668		vm_page_wire(dst_page);
4669		vm_page_unlock_queues();
4670		PAGE_WAKEUP_DONE(dst_page);
4671		vm_object_unlock(dst_page->object);
4672
4673		if (dst_top_page != VM_PAGE_NULL) {
4674			vm_object_lock(dst_object);
4675			VM_PAGE_FREE(dst_top_page);
4676			vm_object_paging_end(dst_object);
4677			vm_object_unlock(dst_object);
4678		}
4679
4680	RetrySourceFault: ;
4681
4682		if (src_object == VM_OBJECT_NULL) {
4683			/*
4684			 *	No source object.  We will just
4685			 *	zero-fill the page in dst_object.
4686			 */
4687			src_page = VM_PAGE_NULL;
4688			result_page = VM_PAGE_NULL;
4689		} else {
4690			vm_object_lock(src_object);
4691			src_page = vm_page_lookup(src_object,
4692						  vm_object_trunc_page(src_offset));
4693			if (src_page == dst_page) {
4694				src_prot = dst_prot;
4695				result_page = VM_PAGE_NULL;
4696			} else {
4697				src_prot = VM_PROT_READ;
4698				vm_object_paging_begin(src_object);
4699
4700				if (amount_left > (vm_size_t) -1) {
4701					/* 32-bit overflow */
4702					fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4703				} else {
4704					fault_info_src.cluster_size = (vm_size_t) amount_left;
4705					assert(fault_info_src.cluster_size == amount_left);
4706				}
4707
4708				XPR(XPR_VM_FAULT,
4709					"vm_fault_copy(2) -> vm_fault_page\n",
4710					0,0,0,0,0);
4711				result = vm_fault_page(
4712					src_object,
4713					vm_object_trunc_page(src_offset),
4714					VM_PROT_READ, FALSE,
4715					&src_prot,
4716					&result_page, &src_top_page,
4717					(int *)0, &error, FALSE,
4718					FALSE, &fault_info_src);
4719
4720				switch (result) {
4721				case VM_FAULT_SUCCESS:
4722					break;
4723				case VM_FAULT_RETRY:
4724					goto RetrySourceFault;
4725				case VM_FAULT_MEMORY_SHORTAGE:
4726					if (vm_page_wait(interruptible))
4727						goto RetrySourceFault;
4728					/* fall thru */
4729				case VM_FAULT_INTERRUPTED:
4730					vm_fault_copy_dst_cleanup(dst_page);
4731					RETURN(MACH_SEND_INTERRUPTED);
4732				case VM_FAULT_SUCCESS_NO_VM_PAGE:
4733					/* success but no VM page: fail */
4734					vm_object_paging_end(src_object);
4735					vm_object_unlock(src_object);
4736					/*FALLTHROUGH*/
4737				case VM_FAULT_MEMORY_ERROR:
4738					vm_fault_copy_dst_cleanup(dst_page);
4739					if (error)
4740						return (error);
4741					else
4742						return(KERN_MEMORY_ERROR);
4743				default:
4744					panic("vm_fault_copy(2): unexpected "
4745					      "error 0x%x from "
4746					      "vm_fault_page()\n", result);
4747				}
4748
4749
4750				assert((src_top_page == VM_PAGE_NULL) ==
4751				       (result_page->object == src_object));
4752			}
4753			assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
4754			vm_object_unlock(result_page->object);
4755		}
4756
4757		if (!vm_map_verify(dst_map, dst_version)) {
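			/*
			 * The destination map changed while it was unlocked;
			 * clean up and return a short copy (the caller sees
			 * how much was done via *copy_size).
			 */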
4758			if (result_page != VM_PAGE_NULL && src_page != dst_page)
4759				vm_fault_copy_cleanup(result_page, src_top_page);
4760			vm_fault_copy_dst_cleanup(dst_page);
4761			break;
4762		}
4763
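		/*
		 * Re-lock the destination object and make sure no copy
		 * object was pushed in front of it while it was unlocked;
		 * if one was, give up and return a short copy.
		 */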
4764		vm_object_lock(dst_page->object);
4765
4766		if (dst_page->object->copy != old_copy_object) {
4767			vm_object_unlock(dst_page->object);
4768			vm_map_verify_done(dst_map, dst_version);
4769			if (result_page != VM_PAGE_NULL && src_page != dst_page)
4770				vm_fault_copy_cleanup(result_page, src_top_page);
4771			vm_fault_copy_dst_cleanup(dst_page);
4772			break;
4773		}
4774		vm_object_unlock(dst_page->object);
4775
4776		/*
4777		 *	Copy the page, and note that it is dirty
4778		 *	immediately.
4779		 */
4780
4781		if (!page_aligned(src_offset) ||
4782			!page_aligned(dst_offset) ||
4783			!page_aligned(amount_left)) {
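			/*
			 * Unaligned copy: only the overlapping portion of the
			 * two pages (from the larger intra-page offset to the
			 * end of the page, capped by amount_left) is copied
			 * in this pass.
			 */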
4784
4785			vm_object_offset_t	src_po,
4786						dst_po;
4787
4788			src_po = src_offset - vm_object_trunc_page(src_offset);
4789			dst_po = dst_offset - vm_object_trunc_page(dst_offset);
4790
4791			if (dst_po > src_po) {
4792				part_size = PAGE_SIZE - dst_po;
4793			} else {
4794				part_size = PAGE_SIZE - src_po;
4795			}
4796			if (part_size > (amount_left)){
4797				part_size = amount_left;
4798			}
4799
4800			if (result_page == VM_PAGE_NULL) {
4801				assert((vm_offset_t) dst_po == dst_po);
4802				assert((vm_size_t) part_size == part_size);
4803				vm_page_part_zero_fill(dst_page,
4804						       (vm_offset_t) dst_po,
4805						       (vm_size_t) part_size);
4806			} else {
4807				assert((vm_offset_t) src_po == src_po);
4808				assert((vm_offset_t) dst_po == dst_po);
4809				assert((vm_size_t) part_size == part_size);
4810				vm_page_part_copy(result_page,
4811						  (vm_offset_t) src_po,
4812						  dst_page,
4813						  (vm_offset_t) dst_po,
4814						  (vm_size_t)part_size);
4815				if(!dst_page->dirty){
4816					vm_object_lock(dst_object);
4817					SET_PAGE_DIRTY(dst_page, TRUE);
4818					vm_object_unlock(dst_page->object);
4819				}
4820
4821			}
4822		} else {
4823			part_size = PAGE_SIZE;
4824
4825			if (result_page == VM_PAGE_NULL)
4826				vm_page_zero_fill(dst_page);
4827			else{
4828				vm_object_lock(result_page->object);
4829				vm_page_copy(result_page, dst_page);
4830				vm_object_unlock(result_page->object);
4831
4832				if(!dst_page->dirty){
4833					vm_object_lock(dst_object);
4834					SET_PAGE_DIRTY(dst_page, TRUE);
4835					vm_object_unlock(dst_page->object);
4836				}
4837			}
4838
4839		}
4840
4841		/*
4842		 *	Unlock everything, and return
4843		 */
4844
4845		vm_map_verify_done(dst_map, dst_version);
4846
4847		if (result_page != VM_PAGE_NULL && src_page != dst_page)
4848			vm_fault_copy_cleanup(result_page, src_top_page);
4849		vm_fault_copy_dst_cleanup(dst_page);
4850
4851		amount_left -= part_size;
4852		src_offset += part_size;
4853		dst_offset += part_size;
4854	} while (amount_left > 0);
4855
4856	RETURN(KERN_SUCCESS);
4857#undef	RETURN
4858
4859	/*NOTREACHED*/
4860}
4861
4862#if	VM_FAULT_CLASSIFY
4863/*
4864 *	Temporary statistics gathering support.
4865 */
4866
4867/*
4868 *	Statistics arrays:
4869 */
4870#define VM_FAULT_TYPES_MAX	5
4871#define	VM_FAULT_LEVEL_MAX	8
4872
4873int	vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4874
4875#define	VM_FAULT_TYPE_ZERO_FILL	0
4876#define	VM_FAULT_TYPE_MAP_IN	1
4877#define	VM_FAULT_TYPE_PAGER	2
4878#define	VM_FAULT_TYPE_COPY	3
4879#define	VM_FAULT_TYPE_OTHER	4
4880
4881
4882void
4883vm_fault_classify(vm_object_t		object,
4884		  vm_object_offset_t	offset,
4885		  vm_prot_t		fault_type)
4886{
4887	int		type, level = 0;
4888	vm_page_t	m;
4889
4890	while (TRUE) {
4891		m = vm_page_lookup(object, offset);
4892		if (m != VM_PAGE_NULL) {
4893		        if (m->busy || m->error || m->restart || m->absent) {
4894				type = VM_FAULT_TYPE_OTHER;
4895				break;
4896			}
4897			if (((fault_type & VM_PROT_WRITE) == 0) ||
4898			    ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4899				type = VM_FAULT_TYPE_MAP_IN;
4900				break;
4901			}
4902			type = VM_FAULT_TYPE_COPY;
4903			break;
4904		}
4905		else {
4906			if (object->pager_created) {
4907				type = VM_FAULT_TYPE_PAGER;
4908				break;
4909			}
4910			if (object->shadow == VM_OBJECT_NULL) {
4911				type = VM_FAULT_TYPE_ZERO_FILL;
4912				break;
4913		        }
4914
4915			offset += object->vo_shadow_offset;
4916			object = object->shadow;
4917			level++;
4918			continue;
4919		}
4920	}
4921
4922	if (level > VM_FAULT_LEVEL_MAX)
4923		level = VM_FAULT_LEVEL_MAX;
4924
4925	vm_fault_stats[type][level] += 1;
4926
4927	return;
4928}
4929
4930/* cleanup routine to call from debugger */
4931
4932void
4933vm_fault_classify_init(void)
4934{
4935	int type, level;
4936
4937	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4938		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4939			vm_fault_stats[type][level] = 0;
4940		}
4941	}
4942
4943	return;
4944}
4945#endif	/* VM_FAULT_CLASSIFY */
4946
4947
4948extern int cs_validation;
4949
4950void
4951vm_page_validate_cs_mapped(
4952	vm_page_t	page,
4953	const void 	*kaddr)
4954{
4955	vm_object_t		object;
4956	vm_object_offset_t	offset;
4957	kern_return_t		kr;
4958	memory_object_t		pager;
4959	void			*blobs;
4960	boolean_t		validated, tainted;
4961
4962	assert(page->busy);
4963	vm_object_lock_assert_exclusive(page->object);
4964
4965	if (!cs_validation) {
4966		return;
4967	}
4968
4969	if (page->wpmapped && !page->cs_tainted) {
4970		/*
4971		 * This page was mapped for "write" access sometime in the
4972		 * past and could still be modifiable in the future.
4973		 * Consider it tainted.
4974		 * [ If the page was already found to be "tainted", no
4975		 * need to re-validate. ]
4976		 */
4977		page->cs_validated = TRUE;
4978		page->cs_tainted = TRUE;
4979		if (cs_debug) {
4980			printf("CODESIGNING: vm_page_validate_cs: "
4981			       "page %p obj %p off 0x%llx "
4982			       "was modified\n",
4983			       page, page->object, page->offset);
4984		}
4985		vm_cs_validated_dirtied++;
4986	}
4987
4988	if (page->cs_validated) {
4989		return;
4990	}
4991
4992	vm_cs_validates++;
4993
4994	object = page->object;
4995	assert(object->code_signed);
4996	offset = page->offset;
4997
4998	if (!object->alive || object->terminating || object->pager == NULL) {
4999		/*
5000		 * The object is terminating and we don't have its pager
5001		 * so we can't validate the data...
5002		 */
5003		return;
5004	}
5005	/*
5006	 * Since we get here to validate a page that was brought in by
5007	 * the pager, we know that this pager is all set up and ready
5008	 * by now.
5009	 */
5010	assert(!object->internal);
5011	assert(object->pager != NULL);
5012	assert(object->pager_ready);
5013
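	/*
	 * Fetch the code-signing blobs that were registered with the
	 * vnode pager for this object; they are needed to check the
	 * page's hash below.
	 */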
5014	pager = object->pager;
5015	assert(object->paging_in_progress);
5016	kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5017	if (kr != KERN_SUCCESS) {
5018		blobs = NULL;
5019	}
5020
5021	/* verify the SHA1 hash for this page */
5022	validated = cs_validate_page(blobs,
5023				     pager,
5024				     offset + object->paging_offset,
5025				     (const void *)kaddr,
5026				     &tainted);
5027
5028	page->cs_validated = validated;
5029	if (validated) {
5030		page->cs_tainted = tainted;
5031	}
5032}
5033
5034void
5035vm_page_validate_cs(
5036	vm_page_t	page)
5037{
5038	vm_object_t		object;
5039	vm_object_offset_t	offset;
5040	vm_map_offset_t		koffset;
5041	vm_map_size_t		ksize;
5042	vm_offset_t		kaddr;
5043	kern_return_t		kr;
5044	boolean_t		busy_page;
5045
5046	vm_object_lock_assert_held(page->object);
5047
5048	if (!cs_validation) {
5049		return;
5050	}
5051
5052	if (page->wpmapped && !page->cs_tainted) {
5053		vm_object_lock_assert_exclusive(page->object);
5054
5055		/*
5056		 * This page was mapped for "write" access sometime in the
5057		 * past and could still be modifiable in the future.
5058		 * Consider it tainted.
5059		 * [ If the page was already found to be "tainted", no
5060		 * need to re-validate. ]
5061		 */
5062		page->cs_validated = TRUE;
5063		page->cs_tainted = TRUE;
5064		if (cs_debug) {
5065			printf("CODESIGNING: vm_page_validate_cs: "
5066			       "page %p obj %p off 0x%llx "
5067			       "was modified\n",
5068			       page, page->object, page->offset);
5069		}
5070		vm_cs_validated_dirtied++;
5071	}
5072
5073	if (page->cs_validated) {
5074		return;
5075	}
5076
5077#if CHECK_CS_VALIDATION_BITMAP
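	/*
	 * Fast path: if the pager's validation bitmap already records
	 * this page as validated, accept that and skip re-hashing it.
	 */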
5078	if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5079		page->cs_validated = TRUE;
5080		page->cs_tainted = FALSE;
5081		vm_cs_bitmap_validated++;
5082		return;
5083	}
5084#endif
5085	vm_object_lock_assert_exclusive(page->object);
5086
5087	object = page->object;
5088	assert(object->code_signed);
5089	offset = page->offset;
5090
5091	busy_page = page->busy;
5092	if (!busy_page) {
5093		/* keep page busy while we map (and unlock) the VM object */
5094		page->busy = TRUE;
5095	}
5096
5097	/*
5098	 * Take a paging reference on the VM object
5099	 * to protect it from collapse or bypass,
5100	 * and keep it from disappearing too.
5101	 */
5102	vm_object_paging_begin(object);
5103
5104	/* map the page in the kernel address space */
5105	koffset = 0;
5106	ksize = PAGE_SIZE_64;
5107	kr = vm_paging_map_object(&koffset,
5108				  page,
5109				  object,
5110				  offset,
5111				  &ksize,
5112				  VM_PROT_READ,
5113				  FALSE); /* can't unlock object ! */
5114	if (kr != KERN_SUCCESS) {
5115		panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5116	}
5117	kaddr = CAST_DOWN(vm_offset_t, koffset);
5118
5119	/* validate the mapped page */
5120	vm_page_validate_cs_mapped(page, (const void *) kaddr);
5121
5122#if CHECK_CS_VALIDATION_BITMAP
5123	if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5124		vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5125	}
5126#endif
5127	assert(page->busy);
5128	assert(object == page->object);
5129	vm_object_lock_assert_exclusive(object);
5130
5131	if (!busy_page) {
5132		PAGE_WAKEUP_DONE(page);
5133	}
5134	if (koffset != 0) {
5135		/* unmap the map from the kernel address space */
5136		vm_paging_unmap_object(object, koffset, koffset + ksize);
5137		koffset = 0;
5138		ksize = 0;
5139		kaddr = 0;
5140	}
5141	vm_object_paging_end(object);
5142}
5143