1/*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 *	File:	vm_fault.c
60 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 *	Page fault handling module.
63 */
64
65#include <mach_cluster_stats.h>
66#include <mach_pagemap.h>
67#include <libkern/OSAtomic.h>
68
69#include <mach/mach_types.h>
70#include <mach/kern_return.h>
71#include <mach/message.h>	/* for error codes */
72#include <mach/vm_param.h>
73#include <mach/vm_behavior.h>
74#include <mach/memory_object.h>
75				/* For memory_object_data_{request,unlock} */
76#include <mach/sdt.h>
77
78#include <kern/kern_types.h>
79#include <kern/host_statistics.h>
80#include <kern/counters.h>
81#include <kern/task.h>
82#include <kern/thread.h>
83#include <kern/sched_prim.h>
84#include <kern/host.h>
85#include <kern/xpr.h>
86#include <kern/mach_param.h>
87#include <kern/macro_help.h>
88#include <kern/zalloc.h>
89#include <kern/misc_protos.h>
90
91#include <vm/vm_compressor.h>
92#include <vm/vm_compressor_pager.h>
93#include <vm/vm_fault.h>
94#include <vm/vm_map.h>
95#include <vm/vm_object.h>
96#include <vm/vm_page.h>
97#include <vm/vm_kern.h>
98#include <vm/pmap.h>
99#include <vm/vm_pageout.h>
100#include <vm/vm_protos.h>
101#include <vm/vm_external.h>
102#include <vm/memory_object.h>
103#include <vm/vm_purgeable_internal.h>	/* Needed by some vm_page.h macros */
104#include <vm/vm_shared_region.h>
105
106#include <sys/codesign.h>
107
108#include <libsa/sys/timers.h>	/* for struct timespec */
109
110#define VM_FAULT_CLASSIFY	0
111
112#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
113
114unsigned int	vm_object_pagein_throttle = 16;
115
116/*
117 * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
118 * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
119 * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
120 * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
121 * keep the UI active so that the user has a chance to kill the offending task before the system
122 * completely hangs.
123 *
124 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
125 * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
126 * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
127 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
128 */
129
130extern void throttle_lowpri_io(int);
131
132uint64_t vm_hard_throttle_threshold;
133
134
135
136#define NEED_TO_HARD_THROTTLE_THIS_TASK()	(vm_wants_task_throttled(current_task()) ||	\
137						 (vm_page_free_count < vm_page_throttle_limit && \
138						  proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
139
140
141#define HARD_THROTTLE_DELAY	20000	/* 20000 us == 20 ms */
142#define SOFT_THROTTLE_DELAY	2000	/* 2000 us == 2 ms */
143
144boolean_t current_thread_aborted(void);
145
146/* Forward declarations of internal routines. */
147extern kern_return_t vm_fault_wire_fast(
148				vm_map_t	map,
149				vm_map_offset_t	va,
150				vm_map_entry_t	entry,
151				pmap_t		pmap,
152				vm_map_offset_t	pmap_addr,
153				ppnum_t		*physpage_p);
154
155extern void vm_fault_continue(void);
156
157extern void vm_fault_copy_cleanup(
158				vm_page_t	page,
159				vm_page_t	top_page);
160
161extern void vm_fault_copy_dst_cleanup(
162				vm_page_t	page);
163
164#if	VM_FAULT_CLASSIFY
165extern void vm_fault_classify(vm_object_t	object,
166			  vm_object_offset_t	offset,
167			  vm_prot_t		fault_type);
168
169extern void vm_fault_classify_init(void);
170#endif
171
172unsigned long vm_pmap_enter_blocked = 0;
173unsigned long vm_pmap_enter_retried = 0;
174
175unsigned long vm_cs_validates = 0;
176unsigned long vm_cs_revalidates = 0;
177unsigned long vm_cs_query_modified = 0;
178unsigned long vm_cs_validated_dirtied = 0;
179unsigned long vm_cs_bitmap_validated = 0;
180
181void vm_pre_fault(vm_map_offset_t);
182
183/*
184 *	Routine:	vm_fault_init
185 *	Purpose:
186 *		Initialize our private data structures.
187 */
188void
189vm_fault_init(void)
190{
191	int i, vm_compressor_temp;
192	boolean_t need_default_val = TRUE;
193	/*
194	 * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
195	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
196	 * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
197	 * and reduce the value down to 10% for very large memory configurations.  This helps give us a
198	 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
199	 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
200	 */
201
202	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
203
204	/*
205	 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
206	 */
207
208	if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof (vm_compressor_temp))) {
209		for ( i = 0; i < VM_PAGER_MAX_MODES; i++) {
210			if (vm_compressor_temp > 0 &&
211			    ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) {
212				need_default_val = FALSE;
213				vm_compressor_mode = vm_compressor_temp;
214				break;
215			}
216		}
217		if (need_default_val)
218			printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
219	}
220	if (need_default_val) {
221		/* If no boot arg or incorrect boot arg, try device tree. */
222		PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
223	}
224	PE_parse_boot_argn("vm_compressor_threads", &vm_compressor_thread_count, sizeof (vm_compressor_thread_count));
225	printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
226}
227
228/*
229 *	Routine:	vm_fault_cleanup
230 *	Purpose:
231 *		Clean up the result of vm_fault_page.
232 *	Results:
233 *		The paging reference for "object" is released.
234 *		"object" is unlocked.
235 *		If "top_page" is not null,  "top_page" is
236 *		freed and the paging reference for the object
237 *		containing it is released.
238 *
239 *	In/out conditions:
240 *		"object" must be locked.
241 */
242void
243vm_fault_cleanup(
244	register vm_object_t	object,
245	register vm_page_t	top_page)
246{
247	vm_object_paging_end(object);
248 	vm_object_unlock(object);
249
250	if (top_page != VM_PAGE_NULL) {
251	        object = top_page->object;
252
253		vm_object_lock(object);
254		VM_PAGE_FREE(top_page);
255		vm_object_paging_end(object);
256		vm_object_unlock(object);
257	}
258}
259
260#if	MACH_CLUSTER_STATS
261#define MAXCLUSTERPAGES 16
262struct {
263	unsigned long pages_in_cluster;
264	unsigned long pages_at_higher_offsets;
265	unsigned long pages_at_lower_offsets;
266} cluster_stats_in[MAXCLUSTERPAGES];
267#define CLUSTER_STAT(clause)	clause
268#define CLUSTER_STAT_HIGHER(x)	\
269	((cluster_stats_in[(x)].pages_at_higher_offsets)++)
270#define CLUSTER_STAT_LOWER(x)	\
271	 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
272#define CLUSTER_STAT_CLUSTER(x)	\
273	((cluster_stats_in[(x)].pages_in_cluster)++)
274#else	/* MACH_CLUSTER_STATS */
275#define CLUSTER_STAT(clause)
276#endif	/* MACH_CLUSTER_STATS */
277
278#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
279
280
281boolean_t	vm_page_deactivate_behind = TRUE;
282/*
283 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
284 */
285#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW	128
286#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER	16		/* don't make this too big... */
287                                                                /* we use it to size an array on the stack */
288
289int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
290
291#define MAX_SEQUENTIAL_RUN	(1024 * 1024 * 1024)
292
293/*
294 * vm_page_is_sequential
295 *
296 * Determine if sequential access is in progress
297 * in accordance with the behavior specified.
298 * Update state to indicate current access pattern.
299 *
300 * object must have at least the shared lock held
301 */
302static
303void
304vm_fault_is_sequential(
305	vm_object_t		object,
306	vm_object_offset_t	offset,
307	vm_behavior_t		behavior)
308{
309        vm_object_offset_t	last_alloc;
310	int			sequential;
311	int			orig_sequential;
312
313        last_alloc = object->last_alloc;
314	sequential = object->sequential;
315	orig_sequential = sequential;
316
317	switch (behavior) {
318	case VM_BEHAVIOR_RANDOM:
319	        /*
320		 * reset indicator of sequential behavior
321		 */
322	        sequential = 0;
323	        break;
324
325	case VM_BEHAVIOR_SEQUENTIAL:
326	        if (offset && last_alloc == offset - PAGE_SIZE_64) {
327		        /*
328			 * advance indicator of sequential behavior
329			 */
330		        if (sequential < MAX_SEQUENTIAL_RUN)
331			        sequential += PAGE_SIZE;
332		} else {
333		        /*
334			 * reset indicator of sequential behavior
335			 */
336		        sequential = 0;
337		}
338	        break;
339
340	case VM_BEHAVIOR_RSEQNTL:
341	        if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
342		        /*
343			 * advance indicator of sequential behavior
344			 */
345		        if (sequential > -MAX_SEQUENTIAL_RUN)
346			        sequential -= PAGE_SIZE;
347		} else {
348		        /*
349			 * reset indicator of sequential behavior
350			 */
351		        sequential = 0;
352		}
353	        break;
354
355	case VM_BEHAVIOR_DEFAULT:
356	default:
357	        if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
358		        /*
359			 * advance indicator of sequential behavior
360			 */
361		        if (sequential < 0)
362			        sequential = 0;
363		        if (sequential < MAX_SEQUENTIAL_RUN)
364			        sequential += PAGE_SIZE;
365
366		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
367		        /*
368			 * advance indicator of sequential behavior
369			 */
370		        if (sequential > 0)
371			        sequential = 0;
372		        if (sequential > -MAX_SEQUENTIAL_RUN)
373			        sequential -= PAGE_SIZE;
374		} else {
375		        /*
376			 * reset indicator of sequential behavior
377			 */
378		        sequential = 0;
379		}
380	        break;
381	}
382	if (sequential != orig_sequential) {
383	        if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
384		        /*
385			 * if someone else has already updated object->sequential
386			 * don't bother trying to update it or object->last_alloc
387			 */
388		        return;
389		}
390	}
391	/*
392	 * I'd like to do this with a OSCompareAndSwap64, but that
393	 * doesn't exist for PPC...  however, it shouldn't matter
394	 * that much... last_alloc is maintained so that we can determine
395	 * if a sequential access pattern is taking place... if only
396	 * one thread is banging on this object, no problem with the unprotected
397	 * update... if 2 or more threads are banging away, we run the risk of
398	 * someone seeing a mangled update... however, in the face of multiple
399	 * accesses, no sequential access pattern can develop anyway, so we
400	 * haven't lost any real info.
401	 */
402	object->last_alloc = offset;
403}
404
405
406int vm_page_deactivate_behind_count = 0;
407
408/*
409 * vm_page_deactivate_behind
410 *
411 * Determine if sequential access is in progress
412 * in accordance with the behavior specified.  If
413 * so, compute a potential page to deactivate and
414 * deactivate it.
415 *
416 * object must be locked.
417 *
418 * return TRUE if we actually deactivate a page
419 */
420static
421boolean_t
422vm_fault_deactivate_behind(
423	vm_object_t		object,
424	vm_object_offset_t	offset,
425	vm_behavior_t		behavior)
426{
427	int		n;
428	int		pages_in_run = 0;
429	int		max_pages_in_run = 0;
430	int		sequential_run;
431	int		sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
432	vm_object_offset_t	run_offset = 0;
433	vm_object_offset_t	pg_offset = 0;
434	vm_page_t	m;
435	vm_page_t	page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
436
437	pages_in_run = 0;
438#if TRACEFAULTPAGE
439	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind);	/* (TEST/DEBUG) */
440#endif
441
442	if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
443		/*
444		 * Do not deactivate pages from the kernel object: they
445		 * are not intended to become pageable.
446		 * or we've disabled the deactivate behind mechanism
447		 */
448		return FALSE;
449	}
450	if ((sequential_run = object->sequential)) {
451		  if (sequential_run < 0) {
452		          sequential_behavior = VM_BEHAVIOR_RSEQNTL;
453			  sequential_run = 0 - sequential_run;
454		  } else {
455		          sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
456		  }
457	}
458	switch (behavior) {
459	case VM_BEHAVIOR_RANDOM:
460		break;
461	case VM_BEHAVIOR_SEQUENTIAL:
462	        if (sequential_run >= (int)PAGE_SIZE) {
463			run_offset = 0 - PAGE_SIZE_64;
464			max_pages_in_run = 1;
465		}
466		break;
467	case VM_BEHAVIOR_RSEQNTL:
468	        if (sequential_run >= (int)PAGE_SIZE) {
469			run_offset = PAGE_SIZE_64;
470			max_pages_in_run = 1;
471		}
472		break;
473	case VM_BEHAVIOR_DEFAULT:
474	default:
475	{	vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
476
477	        /*
478		 * determine if the run of sequential accesss has been
479		 * long enough on an object with default access behavior
480		 * to consider it for deactivation
481		 */
482		if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
483			/*
484			 * the comparisons between offset and behind are done
485			 * in this kind of odd fashion in order to prevent wrap around
486			 * at the end points
487			 */
488		        if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
489			        if (offset >= behind) {
490					run_offset = 0 - behind;
491					pg_offset = PAGE_SIZE_64;
492					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
493				}
494			} else {
495			        if (offset < -behind) {
496					run_offset = behind;
497					pg_offset = 0 - PAGE_SIZE_64;
498					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
499				}
500			}
501		}
502		break;
503	}
504	}
505        for (n = 0; n < max_pages_in_run; n++) {
506		m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
507
508		if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
509			page_run[pages_in_run++] = m;
510
511			/*
512			 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
513			 *
514			 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
515			 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
516			 * new reference happens. If no futher references happen on the page after that remote TLB flushes
517			 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
518			 * by pageout_scan, which is just fine since the last reference would have happened quite far
519			 * in the past (TLB caches don't hang around for very long), and of course could just as easily
520			 * have happened before we did the deactivate_behind.
521			 */
522			pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
523		}
524	}
525	if (pages_in_run) {
526		vm_page_lockspin_queues();
527
528		for (n = 0; n < pages_in_run; n++) {
529
530			m = page_run[n];
531
532			vm_page_deactivate_internal(m, FALSE);
533
534			vm_page_deactivate_behind_count++;
535#if TRACEFAULTPAGE
536			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
537#endif
538		}
539		vm_page_unlock_queues();
540
541		return TRUE;
542	}
543	return FALSE;
544}
545
546
547static int
548vm_page_throttled(void)
549{
550        clock_sec_t     elapsed_sec;
551        clock_sec_t     tv_sec;
552        clock_usec_t    tv_usec;
553
554	thread_t thread = current_thread();
555
556	if (thread->options & TH_OPT_VMPRIV)
557		return (0);
558
559	thread->t_page_creation_count++;
560
561	if (NEED_TO_HARD_THROTTLE_THIS_TASK())
562		return (HARD_THROTTLE_DELAY);
563
564	if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
565	    thread->t_page_creation_count > vm_page_creation_throttle) {
566
567		clock_get_system_microtime(&tv_sec, &tv_usec);
568
569		elapsed_sec = tv_sec - thread->t_page_creation_time;
570
571		if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
572
573			if (elapsed_sec >= 60) {
574				/*
575				 * we'll reset our stats to give a well behaved app
576				 * that was unlucky enough to accumulate a bunch of pages
577				 * over a long period of time a chance to get out of
578				 * the throttled state... we reset the counter and timestamp
579				 * so that if it stays under the rate limit for the next second
580				 * it will be back in our good graces... if it exceeds it, it
581				 * will remain in the throttled state
582				 */
583				thread->t_page_creation_time = tv_sec;
584				thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
585			}
586			++vm_page_throttle_count;
587
588			if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED())
589				return (HARD_THROTTLE_DELAY);
590			else
591				return (SOFT_THROTTLE_DELAY);
592		}
593		thread->t_page_creation_time = tv_sec;
594		thread->t_page_creation_count = 0;
595	}
596	return (0);
597}
598
599
600/*
601 * check for various conditions that would
602 * prevent us from creating a ZF page...
603 * cleanup is based on being called from vm_fault_page
604 *
605 * object must be locked
606 * object == m->object
607 */
608static vm_fault_return_t
609vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
610{
611	int throttle_delay;
612
613        if (object->shadow_severed ||
614	    VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
615	        /*
616		 * Either:
617		 * 1. the shadow chain was severed,
618		 * 2. the purgeable object is volatile or empty and is marked
619		 *    to fault on access while volatile.
620		 * Just have to return an error at this point
621		 */
622	        if (m != VM_PAGE_NULL)
623		        VM_PAGE_FREE(m);
624		vm_fault_cleanup(object, first_m);
625
626		thread_interrupt_level(interruptible_state);
627
628		return (VM_FAULT_MEMORY_ERROR);
629	}
630	if (vm_backing_store_low) {
631	        /*
632		 * are we protecting the system from
633		 * backing store exhaustion.  If so
634		 * sleep unless we are privileged.
635		 */
636	        if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
637
638			if (m != VM_PAGE_NULL)
639			        VM_PAGE_FREE(m);
640			vm_fault_cleanup(object, first_m);
641
642		        assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
643
644			thread_block(THREAD_CONTINUE_NULL);
645			thread_interrupt_level(interruptible_state);
646
647			return (VM_FAULT_RETRY);
648		}
649	}
650	if ((throttle_delay = vm_page_throttled())) {
651	        /*
652		 * we're throttling zero-fills...
653		 * treat this as if we couldn't grab a page
654		 */
655	        if (m != VM_PAGE_NULL)
656		        VM_PAGE_FREE(m);
657		vm_fault_cleanup(object, first_m);
658
659		VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
660
661		delay(throttle_delay);
662
663		if (current_thread_aborted()) {
664			thread_interrupt_level(interruptible_state);
665			return VM_FAULT_INTERRUPTED;
666		}
667		thread_interrupt_level(interruptible_state);
668
669		return (VM_FAULT_MEMORY_SHORTAGE);
670	}
671	return (VM_FAULT_SUCCESS);
672}
673
674
675/*
676 * do the work to zero fill a page and
677 * inject it into the correct paging queue
678 *
679 * m->object must be locked
680 * page queue lock must NOT be held
681 */
682static int
683vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
684{
685        int my_fault = DBG_ZERO_FILL_FAULT;
686
687	/*
688	 * This is is a zero-fill page fault...
689	 *
690	 * Checking the page lock is a waste of
691	 * time;  this page was absent, so
692	 * it can't be page locked by a pager.
693	 *
694	 * we also consider it undefined
695	 * with respect to instruction
696	 * execution.  i.e. it is the responsibility
697	 * of higher layers to call for an instruction
698	 * sync after changing the contents and before
699	 * sending a program into this area.  We
700	 * choose this approach for performance
701	 */
702	m->pmapped = TRUE;
703
704	m->cs_validated = FALSE;
705	m->cs_tainted = FALSE;
706
707	if (no_zero_fill == TRUE) {
708		my_fault = DBG_NZF_PAGE_FAULT;
709
710		if (m->absent && m->busy)
711			return (my_fault);
712	} else {
713		vm_page_zero_fill(m);
714
715		VM_STAT_INCR(zero_fill_count);
716		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
717	}
718	assert(!m->laundry);
719	assert(m->object != kernel_object);
720	//assert(m->pageq.next == NULL && m->pageq.prev == NULL);
721
722	if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
723		(m->object->purgable == VM_PURGABLE_DENY ||
724		 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
725		 m->object->purgable == VM_PURGABLE_VOLATILE )) {
726
727		vm_page_lockspin_queues();
728
729		if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
730			assert(!VM_PAGE_WIRED(m));
731
732			/*
733			 * can't be on the pageout queue since we don't
734			 * have a pager to try and clean to
735			 */
736			assert(!m->pageout_queue);
737
738			VM_PAGE_QUEUES_REMOVE(m);
739
740			queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
741			m->throttled = TRUE;
742			vm_page_throttled_count++;
743		}
744		vm_page_unlock_queues();
745	}
746	return (my_fault);
747}
748
749
750/*
751 *	Routine:	vm_fault_page
752 *	Purpose:
753 *		Find the resident page for the virtual memory
754 *		specified by the given virtual memory object
755 *		and offset.
756 *	Additional arguments:
757 *		The required permissions for the page is given
758 *		in "fault_type".  Desired permissions are included
759 *		in "protection".
760 *		fault_info is passed along to determine pagein cluster
761 *		limits... it contains the expected reference pattern,
762 *		cluster size if available, etc...
763 *
764 *		If the desired page is known to be resident (for
765 *		example, because it was previously wired down), asserting
766 *		the "unwiring" parameter will speed the search.
767 *
768 *		If the operation can be interrupted (by thread_abort
769 *		or thread_terminate), then the "interruptible"
770 *		parameter should be asserted.
771 *
772 *	Results:
773 *		The page containing the proper data is returned
774 *		in "result_page".
775 *
776 *	In/out conditions:
777 *		The source object must be locked and referenced,
778 *		and must donate one paging reference.  The reference
779 *		is not affected.  The paging reference and lock are
780 *		consumed.
781 *
782 *		If the call succeeds, the object in which "result_page"
783 *		resides is left locked and holding a paging reference.
784 *		If this is not the original object, a busy page in the
785 *		original object is returned in "top_page", to prevent other
786 *		callers from pursuing this same data, along with a paging
787 *		reference for the original object.  The "top_page" should
788 *		be destroyed when this guarantee is no longer required.
789 *		The "result_page" is also left busy.  It is not removed
790 *		from the pageout queues.
791 *	Special Case:
792 *		A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
793 *		fault succeeded but there's no VM page (i.e. the VM object
794 * 		does not actually hold VM pages, but device memory or
795 *		large pages).  The object is still locked and we still hold a
796 *		paging_in_progress reference.
797 */
798unsigned int vm_fault_page_blocked_access = 0;
799unsigned int vm_fault_page_forced_retry = 0;
800
801vm_fault_return_t
802vm_fault_page(
803	/* Arguments: */
804	vm_object_t	first_object,	/* Object to begin search */
805	vm_object_offset_t first_offset,	/* Offset into object */
806	vm_prot_t	fault_type,	/* What access is requested */
807	boolean_t	must_be_resident,/* Must page be resident? */
808	boolean_t	caller_lookup,	/* caller looked up page */
809	/* Modifies in place: */
810	vm_prot_t	*protection,	/* Protection for mapping */
811	vm_page_t	*result_page,	/* Page found, if successful */
812	/* Returns: */
813	vm_page_t	*top_page,	/* Page in top object, if
814					 * not result_page.  */
815	int             *type_of_fault, /* if non-null, fill in with type of fault
816					 * COW, zero-fill, etc... returned in trace point */
817	/* More arguments: */
818	kern_return_t	*error_code,	/* code if page is in error */
819	boolean_t	no_zero_fill,	/* don't zero fill absent pages */
820	boolean_t	data_supply,	/* treat as data_supply if
821					 * it is a write fault and a full
822					 * page is provided */
823	vm_object_fault_info_t fault_info)
824{
825	vm_page_t		m;
826	vm_object_t		object;
827	vm_object_offset_t	offset;
828	vm_page_t		first_m;
829	vm_object_t		next_object;
830	vm_object_t		copy_object;
831	boolean_t		look_for_page;
832	boolean_t		force_fault_retry = FALSE;
833	vm_prot_t		access_required = fault_type;
834	vm_prot_t		wants_copy_flag;
835	CLUSTER_STAT(int pages_at_higher_offsets;)
836	CLUSTER_STAT(int pages_at_lower_offsets;)
837	kern_return_t		wait_result;
838	boolean_t		interruptible_state;
839	boolean_t		data_already_requested = FALSE;
840	vm_behavior_t		orig_behavior;
841	vm_size_t		orig_cluster_size;
842	vm_fault_return_t	error;
843	int			my_fault;
844	uint32_t		try_failed_count;
845	int			interruptible; /* how may fault be interrupted? */
846	int			external_state = VM_EXTERNAL_STATE_UNKNOWN;
847	memory_object_t		pager;
848	vm_fault_return_t	retval;
849
850/*
851 * MACH page map - an optional optimization where a bit map is maintained
852 * by the VM subsystem for internal objects to indicate which pages of
853 * the object currently reside on backing store.  This existence map
854 * duplicates information maintained by the vnode pager.  It is
855 * created at the time of the first pageout against the object, i.e.
856 * at the same time pager for the object is created.  The optimization
857 * is designed to eliminate pager interaction overhead, if it is
858 * 'known' that the page does not exist on backing store.
859 *
860 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
861 * either marked as paged out in the existence map for the object or no
862 * existence map exists for the object.  MUST_ASK_PAGER() is one of the
863 * criteria in the decision to invoke the pager.   It is also used as one
864 * of the criteria to terminate the scan for adjacent pages in a clustered
865 * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
866 * permanent objects.  Note also that if the pager for an internal object
867 * has not been created, the pager is not invoked regardless of the value
868 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
869 * for which a pager has been created.
870 *
871 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
872 * is marked as paged out in the existence map for the object.  PAGED_OUT()
873 * PAGED_OUT() is used to determine if a page has already been pushed
874 * into a copy object in order to avoid a redundant page out operation.
875 */
876#if MACH_PAGEMAP
877#define MUST_ASK_PAGER(o, f, s)					\
878	((vm_external_state_get((o)->existence_map, (f))	\
879	  != VM_EXTERNAL_STATE_ABSENT) &&			\
880	 (s = (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))))	\
881	 != VM_EXTERNAL_STATE_ABSENT)
882#define PAGED_OUT(o, f)						\
883	((vm_external_state_get((o)->existence_map, (f))	\
884	  == VM_EXTERNAL_STATE_EXISTS) ||			\
885	 (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))		\
886	  == VM_EXTERNAL_STATE_EXISTS))
887#else /* MACH_PAGEMAP */
888#define MUST_ASK_PAGER(o, f, s)					\
889	((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
890#define PAGED_OUT(o, f) \
891	(VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
892#endif /* MACH_PAGEMAP */
893
894/*
895 *	Recovery actions
896 */
897#define RELEASE_PAGE(m)					\
898	MACRO_BEGIN					\
899	PAGE_WAKEUP_DONE(m);				\
900	if (!m->active && !m->inactive && !m->throttled) {		\
901		vm_page_lockspin_queues();				\
902		if (!m->active && !m->inactive && !m->throttled) {	\
903			if (COMPRESSED_PAGER_IS_ACTIVE)	\
904                                vm_page_deactivate(m);                  \
905                        else						\
906				vm_page_activate(m);			\
907		}							\
908		vm_page_unlock_queues();				\
909	}								\
910	MACRO_END
911
912#if TRACEFAULTPAGE
913	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset);	/* (TEST/DEBUG) */
914#endif
915
916	interruptible = fault_info->interruptible;
917	interruptible_state = thread_interrupt_level(interruptible);
918
919	/*
920	 *	INVARIANTS (through entire routine):
921	 *
922	 *	1)	At all times, we must either have the object
923	 *		lock or a busy page in some object to prevent
924	 *		some other thread from trying to bring in
925	 *		the same page.
926	 *
927	 *		Note that we cannot hold any locks during the
928	 *		pager access or when waiting for memory, so
929	 *		we use a busy page then.
930	 *
931	 *	2)	To prevent another thread from racing us down the
932	 *		shadow chain and entering a new page in the top
933	 *		object before we do, we must keep a busy page in
934	 *		the top object while following the shadow chain.
935	 *
936	 *	3)	We must increment paging_in_progress on any object
937	 *		for which we have a busy page before dropping
938	 *		the object lock
939	 *
940	 *	4)	We leave busy pages on the pageout queues.
941	 *		If the pageout daemon comes across a busy page,
942	 *		it will remove the page from the pageout queues.
943	 */
944
945	object = first_object;
946	offset = first_offset;
947	first_m = VM_PAGE_NULL;
948	access_required = fault_type;
949
950
951	XPR(XPR_VM_FAULT,
952		"vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
953		object, offset, fault_type, *protection, 0);
954
955	/*
956	 * default type of fault
957	 */
958	my_fault = DBG_CACHE_HIT_FAULT;
959
960	while (TRUE) {
961#if TRACEFAULTPAGE
962		dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
963#endif
964		if (!object->alive) {
965		        /*
966			 * object is no longer valid
967			 * clean up and return error
968			 */
969			vm_fault_cleanup(object, first_m);
970			thread_interrupt_level(interruptible_state);
971
972			return (VM_FAULT_MEMORY_ERROR);
973		}
974
975		if (!object->pager_created && object->phys_contiguous) {
976			/*
977			 * A physically-contiguous object without a pager:
978			 * must be a "large page" object.  We do not deal
979			 * with VM pages for this object.
980			 */
981			caller_lookup = FALSE;
982			m = VM_PAGE_NULL;
983			goto phys_contig_object;
984		}
985
986		if (object->blocked_access) {
987			/*
988			 * Access to this VM object has been blocked.
989			 * Replace our "paging_in_progress" reference with
990			 * a "activity_in_progress" reference and wait for
991			 * access to be unblocked.
992			 */
993			caller_lookup = FALSE; /* no longer valid after sleep */
994			vm_object_activity_begin(object);
995			vm_object_paging_end(object);
996			while (object->blocked_access) {
997				vm_object_sleep(object,
998						VM_OBJECT_EVENT_UNBLOCKED,
999						THREAD_UNINT);
1000			}
1001			vm_fault_page_blocked_access++;
1002			vm_object_paging_begin(object);
1003			vm_object_activity_end(object);
1004		}
1005
1006		/*
1007		 * See whether the page at 'offset' is resident
1008		 */
1009		if (caller_lookup == TRUE) {
1010			/*
1011			 * The caller has already looked up the page
1012			 * and gave us the result in "result_page".
1013			 * We can use this for the first lookup but
1014			 * it loses its validity as soon as we unlock
1015			 * the object.
1016			 */
1017			m = *result_page;
1018			caller_lookup = FALSE; /* no longer valid after that */
1019		} else {
1020			m = vm_page_lookup(object, offset);
1021		}
1022#if TRACEFAULTPAGE
1023		dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
1024#endif
1025		if (m != VM_PAGE_NULL) {
1026
1027			if (m->busy) {
1028			        /*
1029				 * The page is being brought in,
1030				 * wait for it and then retry.
1031				 */
1032#if TRACEFAULTPAGE
1033				dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1034#endif
1035				wait_result = PAGE_SLEEP(object, m, interruptible);
1036
1037				XPR(XPR_VM_FAULT,
1038				    "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1039				    object, offset,
1040				    m, 0, 0);
1041				counter(c_vm_fault_page_block_busy_kernel++);
1042
1043				if (wait_result != THREAD_AWAKENED) {
1044					vm_fault_cleanup(object, first_m);
1045					thread_interrupt_level(interruptible_state);
1046
1047					if (wait_result == THREAD_RESTART)
1048						return (VM_FAULT_RETRY);
1049					else
1050						return (VM_FAULT_INTERRUPTED);
1051				}
1052				continue;
1053			}
1054			if (m->laundry) {
1055				m->pageout = FALSE;
1056
1057				if (!m->cleaning)
1058					vm_pageout_steal_laundry(m, FALSE);
1059			}
1060			if (m->phys_page == vm_page_guard_addr) {
1061				/*
1062				 * Guard page: off limits !
1063				 */
1064				if (fault_type == VM_PROT_NONE) {
1065					/*
1066					 * The fault is not requesting any
1067					 * access to the guard page, so it must
1068					 * be just to wire or unwire it.
1069					 * Let's pretend it succeeded...
1070					 */
1071					m->busy = TRUE;
1072					*result_page = m;
1073					assert(first_m == VM_PAGE_NULL);
1074					*top_page = first_m;
1075					if (type_of_fault)
1076						*type_of_fault = DBG_GUARD_FAULT;
1077					thread_interrupt_level(interruptible_state);
1078					return VM_FAULT_SUCCESS;
1079				} else {
1080					/*
1081					 * The fault requests access to the
1082					 * guard page: let's deny that !
1083					 */
1084					vm_fault_cleanup(object, first_m);
1085					thread_interrupt_level(interruptible_state);
1086					return VM_FAULT_MEMORY_ERROR;
1087				}
1088			}
1089
1090			if (m->error) {
1091			        /*
1092				 * The page is in error, give up now.
1093				 */
1094#if TRACEFAULTPAGE
1095				dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);	/* (TEST/DEBUG) */
1096#endif
1097				if (error_code)
1098				        *error_code = KERN_MEMORY_ERROR;
1099				VM_PAGE_FREE(m);
1100
1101				vm_fault_cleanup(object, first_m);
1102				thread_interrupt_level(interruptible_state);
1103
1104				return (VM_FAULT_MEMORY_ERROR);
1105			}
1106			if (m->restart) {
1107			        /*
1108				 * The pager wants us to restart
1109				 * at the top of the chain,
1110				 * typically because it has moved the
1111				 * page to another pager, then do so.
1112				 */
1113#if TRACEFAULTPAGE
1114				dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1115#endif
1116				VM_PAGE_FREE(m);
1117
1118				vm_fault_cleanup(object, first_m);
1119				thread_interrupt_level(interruptible_state);
1120
1121				return (VM_FAULT_RETRY);
1122			}
1123			if (m->absent) {
1124			        /*
1125				 * The page isn't busy, but is absent,
1126				 * therefore it's deemed "unavailable".
1127				 *
1128				 * Remove the non-existent page (unless it's
1129				 * in the top object) and move on down to the
1130				 * next object (if there is one).
1131				 */
1132#if TRACEFAULTPAGE
1133				dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);	/* (TEST/DEBUG) */
1134#endif
1135				next_object = object->shadow;
1136
1137				if (next_object == VM_OBJECT_NULL) {
1138					/*
1139					 * Absent page at bottom of shadow
1140					 * chain; zero fill the page we left
1141					 * busy in the first object, and free
1142					 * the absent page.
1143					 */
1144					assert(!must_be_resident);
1145
1146					/*
1147					 * check for any conditions that prevent
1148					 * us from creating a new zero-fill page
1149					 * vm_fault_check will do all of the
1150					 * fault cleanup in the case of an error condition
1151					 * including resetting the thread_interrupt_level
1152					 */
1153					error = vm_fault_check(object, m, first_m, interruptible_state);
1154
1155					if (error != VM_FAULT_SUCCESS)
1156					        return (error);
1157
1158					XPR(XPR_VM_FAULT,
1159					    "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1160						object, offset,
1161						m,
1162						first_object, 0);
1163
1164					if (object != first_object) {
1165					        /*
1166						 * free the absent page we just found
1167						 */
1168						VM_PAGE_FREE(m);
1169
1170						/*
1171						 * drop reference and lock on current object
1172						 */
1173						vm_object_paging_end(object);
1174						vm_object_unlock(object);
1175
1176						/*
1177						 * grab the original page we
1178						 * 'soldered' in place and
1179						 * retake lock on 'first_object'
1180						 */
1181						m = first_m;
1182						first_m = VM_PAGE_NULL;
1183
1184						object = first_object;
1185						offset = first_offset;
1186
1187						vm_object_lock(object);
1188					} else {
1189					        /*
1190						 * we're going to use the absent page we just found
1191						 * so convert it to a 'busy' page
1192						 */
1193					        m->absent = FALSE;
1194						m->busy = TRUE;
1195					}
1196					if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1197						m->absent = TRUE;
1198					/*
1199					 * zero-fill the page and put it on
1200					 * the correct paging queue
1201					 */
1202					my_fault = vm_fault_zero_page(m, no_zero_fill);
1203
1204					break;
1205				} else {
1206					if (must_be_resident)
1207						vm_object_paging_end(object);
1208					else if (object != first_object) {
1209						vm_object_paging_end(object);
1210						VM_PAGE_FREE(m);
1211					} else {
1212						first_m = m;
1213						m->absent = FALSE;
1214						m->busy = TRUE;
1215
1216						vm_page_lockspin_queues();
1217
1218						assert(!m->pageout_queue);
1219						VM_PAGE_QUEUES_REMOVE(m);
1220
1221						vm_page_unlock_queues();
1222					}
1223					XPR(XPR_VM_FAULT,
1224					    "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1225						object, offset,
1226						next_object,
1227						offset+object->vo_shadow_offset,0);
1228
1229					offset += object->vo_shadow_offset;
1230					fault_info->lo_offset += object->vo_shadow_offset;
1231					fault_info->hi_offset += object->vo_shadow_offset;
1232					access_required = VM_PROT_READ;
1233
1234					vm_object_lock(next_object);
1235					vm_object_unlock(object);
1236					object = next_object;
1237					vm_object_paging_begin(object);
1238
1239					/*
1240					 * reset to default type of fault
1241					 */
1242					my_fault = DBG_CACHE_HIT_FAULT;
1243
1244					continue;
1245				}
1246			}
1247			if ((m->cleaning)
1248			    && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1249			    && (fault_type & VM_PROT_WRITE)) {
1250				/*
1251				 * This is a copy-on-write fault that will
1252				 * cause us to revoke access to this page, but
1253				 * this page is in the process of being cleaned
1254				 * in a clustered pageout. We must wait until
1255				 * the cleaning operation completes before
1256				 * revoking access to the original page,
1257				 * otherwise we might attempt to remove a
1258				 * wired mapping.
1259				 */
1260#if TRACEFAULTPAGE
1261				dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);	/* (TEST/DEBUG) */
1262#endif
1263				XPR(XPR_VM_FAULT,
1264				    "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1265					object, offset,
1266					m, 0, 0);
1267				/*
1268				 * take an extra ref so that object won't die
1269				 */
1270				vm_object_reference_locked(object);
1271
1272				vm_fault_cleanup(object, first_m);
1273
1274				counter(c_vm_fault_page_block_backoff_kernel++);
1275				vm_object_lock(object);
1276				assert(object->ref_count > 0);
1277
1278				m = vm_page_lookup(object, offset);
1279
1280				if (m != VM_PAGE_NULL && m->cleaning) {
1281					PAGE_ASSERT_WAIT(m, interruptible);
1282
1283					vm_object_unlock(object);
1284					wait_result = thread_block(THREAD_CONTINUE_NULL);
1285					vm_object_deallocate(object);
1286
1287					goto backoff;
1288				} else {
1289					vm_object_unlock(object);
1290
1291					vm_object_deallocate(object);
1292					thread_interrupt_level(interruptible_state);
1293
1294					return (VM_FAULT_RETRY);
1295				}
1296			}
1297			if (type_of_fault == NULL && m->speculative &&
1298			    !(fault_info != NULL && fault_info->stealth)) {
1299			        /*
1300				 * If we were passed a non-NULL pointer for
1301				 * "type_of_fault", than we came from
1302				 * vm_fault... we'll let it deal with
1303				 * this condition, since it
1304				 * needs to see m->speculative to correctly
1305				 * account the pageins, otherwise...
1306				 * take it off the speculative queue, we'll
1307				 * let the caller of vm_fault_page deal
1308				 * with getting it onto the correct queue
1309				 *
1310				 * If the caller specified in fault_info that
1311				 * it wants a "stealth" fault, we also leave
1312				 * the page in the speculative queue.
1313				 */
1314			        vm_page_lockspin_queues();
1315				if (m->speculative)
1316					VM_PAGE_QUEUES_REMOVE(m);
1317			        vm_page_unlock_queues();
1318			}
1319
1320			if (m->encrypted) {
1321				/*
1322				 * ENCRYPTED SWAP:
1323				 * the user needs access to a page that we
1324				 * encrypted before paging it out.
1325				 * Decrypt the page now.
1326				 * Keep it busy to prevent anyone from
1327				 * accessing it during the decryption.
1328				 */
1329				m->busy = TRUE;
1330				vm_page_decrypt(m, 0);
1331				assert(object == m->object);
1332				assert(m->busy);
1333				PAGE_WAKEUP_DONE(m);
1334
1335				/*
1336				 * Retry from the top, in case
1337				 * something changed while we were
1338				 * decrypting.
1339				 */
1340				continue;
1341			}
1342			ASSERT_PAGE_DECRYPTED(m);
1343
1344			if (m->object->code_signed) {
1345				/*
1346				 * CODE SIGNING:
1347				 * We just paged in a page from a signed
1348				 * memory object but we don't need to
1349				 * validate it now.  We'll validate it if
1350				 * when it gets mapped into a user address
1351				 * space for the first time or when the page
1352				 * gets copied to another object as a result
1353				 * of a copy-on-write.
1354				 */
1355			}
1356
1357			/*
1358			 * We mark the page busy and leave it on
1359			 * the pageout queues.  If the pageout
1360			 * deamon comes across it, then it will
1361			 * remove the page from the queue, but not the object
1362			 */
1363#if TRACEFAULTPAGE
1364			dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1365#endif
1366			XPR(XPR_VM_FAULT,
1367			    "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1368				object, offset, m, 0, 0);
1369			assert(!m->busy);
1370			assert(!m->absent);
1371
1372			m->busy = TRUE;
1373			break;
1374		}
1375
1376
1377		/*
1378		 * we get here when there is no page present in the object at
1379		 * the offset we're interested in... we'll allocate a page
1380		 * at this point if the pager associated with
1381		 * this object can provide the data or we're the top object...
1382		 * object is locked;  m == NULL
1383		 */
1384		if (must_be_resident) {
1385			if (fault_type == VM_PROT_NONE &&
1386			    object == kernel_object) {
1387				/*
1388				 * We've been called from vm_fault_unwire()
1389				 * while removing a map entry that was allocated
1390				 * with KMA_KOBJECT and KMA_VAONLY.  This page
1391				 * is not present and there's nothing more to
1392				 * do here (nothing to unwire).
1393				 */
1394				vm_fault_cleanup(object, first_m);
1395				thread_interrupt_level(interruptible_state);
1396
1397				return VM_FAULT_MEMORY_ERROR;
1398			}
1399
1400			goto dont_look_for_page;
1401		}
1402
1403#if !MACH_PAGEMAP
1404		data_supply = FALSE;
1405#endif /* !MACH_PAGEMAP */
1406
1407		look_for_page =	(object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1408
1409#if TRACEFAULTPAGE
1410		dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);	/* (TEST/DEBUG) */
1411#endif
1412		if (!look_for_page && object == first_object && !object->phys_contiguous) {
1413			/*
1414			 * Allocate a new page for this object/offset pair as a placeholder
1415			 */
1416			m = vm_page_grab();
1417#if TRACEFAULTPAGE
1418			dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
1419#endif
1420			if (m == VM_PAGE_NULL) {
1421
1422				vm_fault_cleanup(object, first_m);
1423				thread_interrupt_level(interruptible_state);
1424
1425				return (VM_FAULT_MEMORY_SHORTAGE);
1426			}
1427
1428			if (fault_info && fault_info->batch_pmap_op == TRUE) {
1429				vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1430			} else {
1431				vm_page_insert(m, object, offset);
1432			}
1433		}
1434		if (look_for_page) {
1435			kern_return_t	rc;
1436			int		my_fault_type;
1437
1438			/*
1439			 *	If the memory manager is not ready, we
1440			 *	cannot make requests.
1441			 */
1442			if (!object->pager_ready) {
1443#if TRACEFAULTPAGE
1444				dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
1445#endif
1446				if (m != VM_PAGE_NULL)
1447				        VM_PAGE_FREE(m);
1448
1449				XPR(XPR_VM_FAULT,
1450				"vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1451					object, offset, 0, 0, 0);
1452
1453				/*
1454				 * take an extra ref so object won't die
1455				 */
1456				vm_object_reference_locked(object);
1457				vm_fault_cleanup(object, first_m);
1458				counter(c_vm_fault_page_block_backoff_kernel++);
1459
1460				vm_object_lock(object);
1461				assert(object->ref_count > 0);
1462
1463				if (!object->pager_ready) {
1464					wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1465
1466					vm_object_unlock(object);
1467					if (wait_result == THREAD_WAITING)
1468						wait_result = thread_block(THREAD_CONTINUE_NULL);
1469					vm_object_deallocate(object);
1470
1471					goto backoff;
1472				} else {
1473					vm_object_unlock(object);
1474					vm_object_deallocate(object);
1475					thread_interrupt_level(interruptible_state);
1476
1477					return (VM_FAULT_RETRY);
1478				}
1479			}
1480			if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1481				/*
1482				 * If there are too many outstanding page
1483				 * requests pending on this external object, we
1484				 * wait for them to be resolved now.
1485				 */
1486#if TRACEFAULTPAGE
1487				dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1488#endif
1489				if (m != VM_PAGE_NULL)
1490					VM_PAGE_FREE(m);
1491				/*
1492				 * take an extra ref so object won't die
1493				 */
1494				vm_object_reference_locked(object);
1495
1496				vm_fault_cleanup(object, first_m);
1497
1498				counter(c_vm_fault_page_block_backoff_kernel++);
1499
1500				vm_object_lock(object);
1501				assert(object->ref_count > 0);
1502
1503				if (object->paging_in_progress >= vm_object_pagein_throttle) {
1504				        vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1505
1506					vm_object_unlock(object);
1507					wait_result = thread_block(THREAD_CONTINUE_NULL);
1508					vm_object_deallocate(object);
1509
1510					goto backoff;
1511				} else {
1512					vm_object_unlock(object);
1513					vm_object_deallocate(object);
1514					thread_interrupt_level(interruptible_state);
1515
1516					return (VM_FAULT_RETRY);
1517				}
1518			}
1519			if (object->internal &&
1520			    (COMPRESSED_PAGER_IS_ACTIVE
1521			     || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)) {
1522				int compressed_count_delta;
1523
1524				if (m == VM_PAGE_NULL) {
1525					/*
1526					 * Allocate a new page for this object/offset pair as a placeholder
1527					 */
1528					m = vm_page_grab();
1529#if TRACEFAULTPAGE
1530					dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
1531#endif
1532					if (m == VM_PAGE_NULL) {
1533
1534						vm_fault_cleanup(object, first_m);
1535						thread_interrupt_level(interruptible_state);
1536
1537						return (VM_FAULT_MEMORY_SHORTAGE);
1538					}
1539
1540					m->absent = TRUE;
1541					if (fault_info && fault_info->batch_pmap_op == TRUE) {
1542						vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1543					} else {
1544						vm_page_insert(m, object, offset);
1545					}
1546				}
1547				assert(m->busy);
1548
1549				m->absent = TRUE;
1550				pager = object->pager;
1551
1552				assert(object->paging_in_progress > 0);
1553				vm_object_unlock(object);
1554
1555				rc = vm_compressor_pager_get(
1556					pager,
1557					offset + object->paging_offset,
1558					m->phys_page,
1559					&my_fault_type,
1560					0,
1561					&compressed_count_delta);
1562
1563				vm_object_lock(object);
1564				assert(object->paging_in_progress > 0);
1565
1566				vm_compressor_pager_count(
1567					pager,
1568					compressed_count_delta,
1569					FALSE, /* shared_lock */
1570					object);
1571
1572				switch (rc) {
1573				case KERN_SUCCESS:
1574					m->absent = FALSE;
1575					m->dirty = TRUE;
1576					if ((m->object->wimg_bits &
1577					     VM_WIMG_MASK) !=
1578					    VM_WIMG_USE_DEFAULT) {
1579						/*
1580						 * If the page is not cacheable,
1581						 * we can't let its contents
1582						 * linger in the data cache
1583						 * after the decompression.
1584						 */
1585						pmap_sync_page_attributes_phys(
1586							m->phys_page);
1587					} else {
1588						m->written_by_kernel = TRUE;
1589					}
1590
1591					/*
1592					 * If the object is purgeable, its
1593					 * owner's purgeable ledgers have been
1594					 * updated in vm_page_insert() but the
1595					 * page was also accounted for in a
1596					 * "compressed purgeable" ledger, so
1597					 * update that now.
1598					 */
1599					if ((object->purgable !=
1600					     VM_PURGABLE_DENY) &&
1601					    (object->vo_purgeable_owner !=
1602					     NULL)) {
1603						/*
1604						 * One less compressed
1605						 * purgeable page.
1606						 */
1607						vm_purgeable_compressed_update(
1608							object,
1609							-1);
1610					}
1611
1612					break;
1613				case KERN_MEMORY_FAILURE:
1614					m->unusual = TRUE;
1615					m->error = TRUE;
1616					m->absent = FALSE;
1617					break;
1618				case KERN_MEMORY_ERROR:
1619					assert(m->absent);
1620					break;
1621				default:
1622					panic("vm_fault_page(): unexpected "
1623					      "error %d from "
1624					      "vm_compressor_pager_get()\n",
1625					      rc);
1626				}
1627				PAGE_WAKEUP_DONE(m);
1628
1629				rc = KERN_SUCCESS;
1630				goto data_requested;
1631			}
1632			my_fault_type = DBG_PAGEIN_FAULT;
1633
1634			if (m != VM_PAGE_NULL) {
1635				VM_PAGE_FREE(m);
1636				m = VM_PAGE_NULL;
1637			}
1638
1639#if TRACEFAULTPAGE
1640			dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);	/* (TEST/DEBUG) */
1641#endif
1642
1643			/*
1644			 * It's possible someone called vm_object_destroy while we weren't
1645			 * holding the object lock.  If that has happened, then bail out
1646			 * here.
1647			 */
1648
1649			pager = object->pager;
1650
1651			if (pager == MEMORY_OBJECT_NULL) {
1652				vm_fault_cleanup(object, first_m);
1653				thread_interrupt_level(interruptible_state);
1654				return VM_FAULT_MEMORY_ERROR;
1655			}
1656
1657			/*
1658			 * We have an absent page in place for the faulting offset,
1659			 * so we can release the object lock.
1660			 */
1661
1662			vm_object_unlock(object);
1663
1664			/*
1665			 * If this object uses a copy_call strategy,
1666			 * and we are interested in a copy of this object
1667			 * (having gotten here only by following a
1668			 * shadow chain), then tell the memory manager
1669			 * via a flag added to the desired_access
1670			 * parameter, so that it can detect a race
1671			 * between our walking down the shadow chain
1672			 * and its pushing pages up into a copy of
1673			 * the object that it manages.
1674			 */
1675			if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1676				wants_copy_flag = VM_PROT_WANTS_COPY;
1677			else
1678				wants_copy_flag = VM_PROT_NONE;
1679
1680			XPR(XPR_VM_FAULT,
1681			    "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1682				object, offset, m,
1683				access_required | wants_copy_flag, 0);
1684
1685			if (object->copy == first_object) {
1686				/*
1687				 * if we issue the memory_object_data_request in
1688				 * this state, we are subject to a deadlock with
1689				 * the underlying filesystem if it is trying to
1690				 * shrink the file resulting in a push of pages
1691				 * into the copy object...  that push will stall
1692				 * on the placeholder page, and if the pushing thread
1693				 * is holding a lock that is required on the pagein
1694				 * path (such as a truncate lock), we'll deadlock...
1695				 * to avoid this potential deadlock, we throw away
1696				 * our placeholder page before calling memory_object_data_request
1697				 * and force this thread to retry the vm_fault_page after
1698				 * we have issued the I/O.  the second time through this path
1699				 * we will find the page already in the cache (presumably still
1700				 * busy waiting for the I/O to complete) and then complete
1701				 * the fault w/o having to go through memory_object_data_request again
1702				 */
1703				assert(first_m != VM_PAGE_NULL);
1704				assert(first_m->object == first_object);
1705
1706				vm_object_lock(first_object);
1707				VM_PAGE_FREE(first_m);
1708				vm_object_paging_end(first_object);
1709				vm_object_unlock(first_object);
1710
1711				first_m = VM_PAGE_NULL;
1712				force_fault_retry = TRUE;
1713
1714				vm_fault_page_forced_retry++;
1715			}
1716
1717			if (data_already_requested == TRUE) {
1718				orig_behavior = fault_info->behavior;
1719				orig_cluster_size = fault_info->cluster_size;
1720
1721				fault_info->behavior = VM_BEHAVIOR_RANDOM;
1722				fault_info->cluster_size = PAGE_SIZE;
1723			}
1724			/*
1725			 * Call the memory manager to retrieve the data.
1726			 */
1727			rc = memory_object_data_request(
1728				pager,
1729				offset + object->paging_offset,
1730				PAGE_SIZE,
1731				access_required | wants_copy_flag,
1732				(memory_object_fault_info_t)fault_info);
1733
1734			if (data_already_requested == TRUE) {
1735				fault_info->behavior = orig_behavior;
1736				fault_info->cluster_size = orig_cluster_size;
1737			} else
1738				data_already_requested = TRUE;
1739
1740			DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1741#if TRACEFAULTPAGE
1742			dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc);	/* (TEST/DEBUG) */
1743#endif
1744			vm_object_lock(object);
1745
1746		data_requested:
1747			if (rc != KERN_SUCCESS) {
1748
1749				vm_fault_cleanup(object, first_m);
1750				thread_interrupt_level(interruptible_state);
1751
1752				return ((rc == MACH_SEND_INTERRUPTED) ?
1753					VM_FAULT_INTERRUPTED :
1754					VM_FAULT_MEMORY_ERROR);
1755			} else {
1756				clock_sec_t     tv_sec;
1757				clock_usec_t    tv_usec;
1758
1759				if (my_fault_type == DBG_PAGEIN_FAULT) {
1760					clock_get_system_microtime(&tv_sec, &tv_usec);
1761					current_thread()->t_page_creation_time = tv_sec;
1762					current_thread()->t_page_creation_count = 0;
1763				}
1764			}
1765			if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1766
1767				vm_fault_cleanup(object, first_m);
1768				thread_interrupt_level(interruptible_state);
1769
1770				return (VM_FAULT_INTERRUPTED);
1771			}
1772			if (force_fault_retry == TRUE) {
1773
1774				vm_fault_cleanup(object, first_m);
1775				thread_interrupt_level(interruptible_state);
1776
1777				return (VM_FAULT_RETRY);
1778			}
1779			if (m == VM_PAGE_NULL && object->phys_contiguous) {
1780				/*
1781				 * No page here means that the object we
1782				 * initially looked up was "physically
1783				 * contiguous" (i.e. device memory).  However,
1784				 * with Virtual VRAM, the object might not
1785				 * be backed by that device memory anymore,
1786				 * so we're done here only if the object is
1787				 * still "phys_contiguous".
1788				 * Otherwise, if the object is no longer
1789				 * "phys_contiguous", we need to retry the
1790				 * page fault against the object's new backing
1791				 * store (different memory object).
1792				 */
1793			phys_contig_object:
1794				goto done;
1795			}
1796			/*
			 * potentially a pagein fault;
			 * if we make it through the state checks
			 * above, then we'll count it as such
1800			 */
1801			my_fault = my_fault_type;
1802
1803			/*
1804			 * Retry with same object/offset, since new data may
1805			 * be in a different page (i.e., m is meaningless at
1806			 * this point).
1807			 */
1808			continue;
1809		}
1810dont_look_for_page:
1811		/*
		 * We get here if the object has no pager, or an existence map
		 * exists and indicates the page isn't present on the pager,
		 * or we're unwiring a page.  If a pager exists, but there
		 * is no existence map, then the m->absent case above handles
		 * the ZF (zero-fill) case when the pager can't provide the page.
1817		 */
1818#if TRACEFAULTPAGE
1819		dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
1820#endif
1821		if (object == first_object)
1822			first_m = m;
1823		else
1824			assert(m == VM_PAGE_NULL);
1825
1826		XPR(XPR_VM_FAULT,
1827		    "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1828			object, offset, m,
1829			object->shadow, 0);
1830
1831		next_object = object->shadow;
1832
1833		if (next_object == VM_OBJECT_NULL) {
1834			/*
			 * we've hit the bottom of the shadow chain,
1836			 * fill the page in the top object with zeros.
1837			 */
1838			assert(!must_be_resident);
1839
1840			if (object != first_object) {
1841				vm_object_paging_end(object);
1842				vm_object_unlock(object);
1843
1844				object = first_object;
1845				offset = first_offset;
1846				vm_object_lock(object);
1847			}
1848			m = first_m;
1849			assert(m->object == object);
1850			first_m = VM_PAGE_NULL;
1851
1852			/*
			 * check for any conditions that prevent
			 * us from creating a new zero-fill page;
			 * vm_fault_check will do all of the
			 * fault cleanup in the case of an error condition,
			 * including resetting the thread_interrupt_level
1858			 */
1859			error = vm_fault_check(object, m, first_m, interruptible_state);
1860
1861			if (error != VM_FAULT_SUCCESS)
1862			        return (error);
1863
1864			if (m == VM_PAGE_NULL) {
1865				m = vm_page_grab();
1866
1867				if (m == VM_PAGE_NULL) {
1868					vm_fault_cleanup(object, VM_PAGE_NULL);
1869					thread_interrupt_level(interruptible_state);
1870
1871					return (VM_FAULT_MEMORY_SHORTAGE);
1872				}
1873				vm_page_insert(m, object, offset);
1874			}
1875			if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1876				m->absent = TRUE;
1877
1878			my_fault = vm_fault_zero_page(m, no_zero_fill);
1879
1880			break;
1881
1882		} else {
1883		        /*
1884			 * Move on to the next object.  Lock the next
1885			 * object before unlocking the current one.
1886			 */
1887			if ((object != first_object) || must_be_resident)
1888				vm_object_paging_end(object);
1889
1890			offset += object->vo_shadow_offset;
1891			fault_info->lo_offset += object->vo_shadow_offset;
1892			fault_info->hi_offset += object->vo_shadow_offset;
1893			access_required = VM_PROT_READ;
1894
1895			vm_object_lock(next_object);
1896			vm_object_unlock(object);
1897
1898			object = next_object;
1899			vm_object_paging_begin(object);
1900		}
1901	}
1902
1903	/*
1904	 *	PAGE HAS BEEN FOUND.
1905	 *
1906	 *	This page (m) is:
1907	 *		busy, so that we can play with it;
1908	 *		not absent, so that nobody else will fill it;
1909	 *		possibly eligible for pageout;
1910	 *
1911	 *	The top-level page (first_m) is:
1912	 *		VM_PAGE_NULL if the page was found in the
1913	 *		 top-level object;
1914	 *		busy, not absent, and ineligible for pageout.
1915	 *
1916	 *	The current object (object) is locked.  A paging
1917	 *	reference is held for the current and top-level
1918	 *	objects.
1919	 */
1920
1921#if TRACEFAULTPAGE
1922	dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
1923#endif
1924#if	EXTRA_ASSERTIONS
1925	assert(m->busy && !m->absent);
1926	assert((first_m == VM_PAGE_NULL) ||
1927	       (first_m->busy && !first_m->absent &&
1928		!first_m->active && !first_m->inactive));
1929#endif	/* EXTRA_ASSERTIONS */
1930
1931	/*
1932	 * ENCRYPTED SWAP:
1933	 * If we found a page, we must have decrypted it before we
1934	 * get here...
1935	 */
1936	ASSERT_PAGE_DECRYPTED(m);
1937
1938	XPR(XPR_VM_FAULT,
1939	    "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1940		object, offset, m,
1941		first_object, first_m);
1942
1943	/*
1944	 * If the page is being written, but isn't
1945	 * already owned by the top-level object,
1946	 * we have to copy it into a new page owned
1947	 * by the top-level object.
1948	 */
1949	if (object != first_object) {
1950
1951#if TRACEFAULTPAGE
1952		dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type);	/* (TEST/DEBUG) */
1953#endif
1954	    	if (fault_type & VM_PROT_WRITE) {
1955			vm_page_t copy_m;
1956
1957			/*
1958			 * We only really need to copy if we
1959			 * want to write it.
1960			 */
1961			assert(!must_be_resident);
1962
1963			/*
			 * are we protecting the system from
			 * backing store exhaustion?  If so,
			 * sleep unless we are privileged.
1967			 */
1968			if (vm_backing_store_low) {
1969				if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1970
1971					RELEASE_PAGE(m);
1972					vm_fault_cleanup(object, first_m);
1973
1974					assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1975
1976					thread_block(THREAD_CONTINUE_NULL);
1977					thread_interrupt_level(interruptible_state);
1978
1979					return (VM_FAULT_RETRY);
1980				}
1981			}
1982			/*
1983			 * If we try to collapse first_object at this
1984			 * point, we may deadlock when we try to get
1985			 * the lock on an intermediate object (since we
1986			 * have the bottom object locked).  We can't
1987			 * unlock the bottom object, because the page
1988			 * we found may move (by collapse) if we do.
1989			 *
1990			 * Instead, we first copy the page.  Then, when
1991			 * we have no more use for the bottom object,
1992			 * we unlock it and try to collapse.
1993			 *
1994			 * Note that we copy the page even if we didn't
1995			 * need to... that's the breaks.
1996			 */
1997
1998			/*
1999			 * Allocate a page for the copy
2000			 */
2001			copy_m = vm_page_grab();
2002
2003			if (copy_m == VM_PAGE_NULL) {
2004				RELEASE_PAGE(m);
2005
2006				vm_fault_cleanup(object, first_m);
2007				thread_interrupt_level(interruptible_state);
2008
2009				return (VM_FAULT_MEMORY_SHORTAGE);
2010			}
2011			XPR(XPR_VM_FAULT,
2012			    "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
2013				object, offset,
2014				m, copy_m, 0);
2015
2016			vm_page_copy(m, copy_m);
2017
2018			/*
2019			 * If another map is truly sharing this
2020			 * page with us, we have to flush all
2021			 * uses of the original page, since we
2022			 * can't distinguish those which want the
2023			 * original from those which need the
2024			 * new copy.
2025			 *
2026			 * XXXO If we know that only one map has
2027			 * access to this page, then we could
2028			 * avoid the pmap_disconnect() call.
2029			 */
2030			if (m->pmapped)
2031			        pmap_disconnect(m->phys_page);
2032
2033			if (m->clustered) {
2034				VM_PAGE_COUNT_AS_PAGEIN(m);
2035				VM_PAGE_CONSUME_CLUSTERED(m);
2036			}
2037			assert(!m->cleaning);
2038
2039			/*
2040			 * We no longer need the old page or object.
2041			 */
2042			RELEASE_PAGE(m);
2043
2044			vm_object_paging_end(object);
2045			vm_object_unlock(object);
2046
2047			my_fault = DBG_COW_FAULT;
2048			VM_STAT_INCR(cow_faults);
2049			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2050			current_task()->cow_faults++;
2051
2052			object = first_object;
2053			offset = first_offset;
2054
2055			vm_object_lock(object);
2056			/*
			 * get rid of the placeholder
			 * page that we soldered in earlier
2059			 */
2060			VM_PAGE_FREE(first_m);
2061			first_m = VM_PAGE_NULL;
2062
2063			/*
2064			 * and replace it with the
2065			 * page we just copied into
2066			 */
2067			assert(copy_m->busy);
2068			vm_page_insert(copy_m, object, offset);
2069			SET_PAGE_DIRTY(copy_m, TRUE);
2070
2071			m = copy_m;
2072			/*
2073			 * Now that we've gotten the copy out of the
2074			 * way, let's try to collapse the top object.
2075			 * But we have to play ugly games with
2076			 * paging_in_progress to do that...
2077			 */
2078			vm_object_paging_end(object);
2079			vm_object_collapse(object, offset, TRUE);
2080			vm_object_paging_begin(object);
2081
2082		} else
2083		    	*protection &= (~VM_PROT_WRITE);
2084	}
2085	/*
2086	 * Now check whether the page needs to be pushed into the
2087	 * copy object.  The use of asymmetric copy on write for
2088	 * shared temporary objects means that we may do two copies to
2089	 * satisfy the fault; one above to get the page from a
2090	 * shadowed object, and one here to push it into the copy.
2091	 */
2092	try_failed_count = 0;
2093
2094	while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2095		vm_object_offset_t	copy_offset;
2096		vm_page_t		copy_m;
2097
2098#if TRACEFAULTPAGE
2099		dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);	/* (TEST/DEBUG) */
2100#endif
2101		/*
2102		 * If the page is being written, but hasn't been
2103		 * copied to the copy-object, we have to copy it there.
2104		 */
2105		if ((fault_type & VM_PROT_WRITE) == 0) {
2106			*protection &= ~VM_PROT_WRITE;
2107			break;
2108		}
2109
2110		/*
2111		 * If the page was guaranteed to be resident,
2112		 * we must have already performed the copy.
2113		 */
2114		if (must_be_resident)
2115			break;
2116
2117		/*
2118		 * Try to get the lock on the copy_object.
2119		 */
2120		if (!vm_object_lock_try(copy_object)) {
2121
2122			vm_object_unlock(object);
2123			try_failed_count++;
2124
2125			mutex_pause(try_failed_count);	/* wait a bit */
2126			vm_object_lock(object);
2127
2128			continue;
2129		}
2130		try_failed_count = 0;
2131
2132		/*
2133		 * Make another reference to the copy-object,
2134		 * to keep it from disappearing during the
2135		 * copy.
2136		 */
2137		vm_object_reference_locked(copy_object);
2138
2139		/*
2140		 * Does the page exist in the copy?
2141		 */
2142		copy_offset = first_offset - copy_object->vo_shadow_offset;
2143
2144		if (copy_object->vo_size <= copy_offset)
2145			/*
2146			 * Copy object doesn't cover this page -- do nothing.
2147			 */
2148			;
2149		else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2150			/*
2151			 * Page currently exists in the copy object
2152			 */
2153			if (copy_m->busy) {
2154				/*
2155				 * If the page is being brought
2156				 * in, wait for it and then retry.
2157				 */
2158				RELEASE_PAGE(m);
2159
2160				/*
2161				 * take an extra ref so object won't die
2162				 */
2163				vm_object_reference_locked(copy_object);
2164				vm_object_unlock(copy_object);
2165				vm_fault_cleanup(object, first_m);
2166				counter(c_vm_fault_page_block_backoff_kernel++);
2167
2168				vm_object_lock(copy_object);
2169				assert(copy_object->ref_count > 0);
2170				VM_OBJ_RES_DECR(copy_object);
2171				vm_object_lock_assert_exclusive(copy_object);
2172				copy_object->ref_count--;
2173				assert(copy_object->ref_count > 0);
2174				copy_m = vm_page_lookup(copy_object, copy_offset);
2175				/*
2176				 * ENCRYPTED SWAP:
2177				 * it's OK if the "copy_m" page is encrypted,
2178				 * because we're not moving it nor handling its
2179				 * contents.
2180				 */
2181				if (copy_m != VM_PAGE_NULL && copy_m->busy) {
2182					PAGE_ASSERT_WAIT(copy_m, interruptible);
2183
2184					vm_object_unlock(copy_object);
2185					wait_result = thread_block(THREAD_CONTINUE_NULL);
2186					vm_object_deallocate(copy_object);
2187
2188					goto backoff;
2189				} else {
2190					vm_object_unlock(copy_object);
2191					vm_object_deallocate(copy_object);
2192					thread_interrupt_level(interruptible_state);
2193
2194					return (VM_FAULT_RETRY);
2195				}
2196			}
2197		}
2198		else if (!PAGED_OUT(copy_object, copy_offset)) {
2199			/*
2200			 * If PAGED_OUT is TRUE, then the page used to exist
2201			 * in the copy-object, and has already been paged out.
2202			 * We don't need to repeat this. If PAGED_OUT is
2203			 * FALSE, then either we don't know (!pager_created,
2204			 * for example) or it hasn't been paged out.
2205			 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2206			 * We must copy the page to the copy object.
2207			 */
2208
2209			if (vm_backing_store_low) {
2210			        /*
				 * we are protecting the system from
				 * backing store exhaustion;
				 * sleep unless we are privileged.
2214				 */
2215				if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2216					assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2217
2218					RELEASE_PAGE(m);
2219					VM_OBJ_RES_DECR(copy_object);
2220					vm_object_lock_assert_exclusive(copy_object);
2221					copy_object->ref_count--;
2222					assert(copy_object->ref_count > 0);
2223
2224					vm_object_unlock(copy_object);
2225					vm_fault_cleanup(object, first_m);
2226					thread_block(THREAD_CONTINUE_NULL);
2227					thread_interrupt_level(interruptible_state);
2228
2229					return (VM_FAULT_RETRY);
2230				}
2231			}
2232			/*
2233			 * Allocate a page for the copy
2234			 */
2235			copy_m = vm_page_alloc(copy_object, copy_offset);
2236
2237			if (copy_m == VM_PAGE_NULL) {
2238				RELEASE_PAGE(m);
2239
2240				VM_OBJ_RES_DECR(copy_object);
2241				vm_object_lock_assert_exclusive(copy_object);
2242				copy_object->ref_count--;
2243				assert(copy_object->ref_count > 0);
2244
2245				vm_object_unlock(copy_object);
2246				vm_fault_cleanup(object, first_m);
2247				thread_interrupt_level(interruptible_state);
2248
2249				return (VM_FAULT_MEMORY_SHORTAGE);
2250			}
2251			/*
2252			 * Must copy page into copy-object.
2253			 */
2254			vm_page_copy(m, copy_m);
2255
2256			/*
2257			 * If the old page was in use by any users
2258			 * of the copy-object, it must be removed
2259			 * from all pmaps.  (We can't know which
2260			 * pmaps use it.)
2261			 */
2262			if (m->pmapped)
2263			        pmap_disconnect(m->phys_page);
2264
2265			if (m->clustered) {
2266				VM_PAGE_COUNT_AS_PAGEIN(m);
2267				VM_PAGE_CONSUME_CLUSTERED(m);
2268			}
2269			/*
2270			 * If there's a pager, then immediately
2271			 * page out this page, using the "initialize"
2272			 * option.  Else, we use the copy.
2273			 */
2274		 	if ((!copy_object->pager_ready)
2275#if MACH_PAGEMAP
2276			    || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2277#endif
2278			    || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2279			    ) {
2280
2281				vm_page_lockspin_queues();
2282				assert(!m->cleaning);
2283				vm_page_activate(copy_m);
2284				vm_page_unlock_queues();
2285
2286				SET_PAGE_DIRTY(copy_m, TRUE);
2287				PAGE_WAKEUP_DONE(copy_m);
2288
2289			} else if (copy_object->internal &&
2290				   (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE)) {
2291				/*
				 * For internal objects, check with the pager to see
2293				 * if the page already exists in the backing store.
2294				 * If yes, then we can drop the copy page. If not,
2295				 * then we'll activate it, mark it dirty and keep it
2296				 * around.
2297				 */
2298
2299				kern_return_t kr = KERN_SUCCESS;
2300
2301				memory_object_t	copy_pager = copy_object->pager;
2302				assert(copy_pager != MEMORY_OBJECT_NULL);
2303				vm_object_paging_begin(copy_object);
2304
2305				vm_object_unlock(copy_object);
2306
2307				kr = memory_object_data_request(
2308					copy_pager,
2309					copy_offset + copy_object->paging_offset,
2310					0, /* Only query the pager. */
2311					VM_PROT_READ,
2312					NULL);
2313
2314				vm_object_lock(copy_object);
2315
2316				vm_object_paging_end(copy_object);
2317
2318				/*
2319				 * Since we dropped the copy_object's lock,
2320				 * check whether we'll have to deallocate
2321				 * the hard way.
2322				 */
2323				if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2324					vm_object_unlock(copy_object);
2325					vm_object_deallocate(copy_object);
2326					vm_object_lock(object);
2327
2328					continue;
2329				}
2330				if (kr == KERN_SUCCESS) {
2331					/*
2332					 * The pager has the page. We don't want to overwrite
2333					 * that page by sending this one out to the backing store.
2334					 * So we drop the copy page.
2335					 */
2336					VM_PAGE_FREE(copy_m);
2337
2338				} else {
2339					/*
2340					 * The pager doesn't have the page. We'll keep this one
2341					 * around in the copy object. It might get sent out to
2342					 * the backing store under memory pressure.
2343					 */
2344					vm_page_lockspin_queues();
2345					assert(!m->cleaning);
2346					vm_page_activate(copy_m);
2347					vm_page_unlock_queues();
2348
2349					SET_PAGE_DIRTY(copy_m, TRUE);
2350					PAGE_WAKEUP_DONE(copy_m);
2351				}
2352			} else {
2353
2354				assert(copy_m->busy == TRUE);
2355				assert(!m->cleaning);
2356
2357				/*
2358				 * dirty is protected by the object lock
2359				 */
2360				SET_PAGE_DIRTY(copy_m, TRUE);
2361
2362				/*
2363				 * The page is already ready for pageout:
2364				 * not on pageout queues and busy.
2365				 * Unlock everything except the
2366				 * copy_object itself.
2367				 */
2368				vm_object_unlock(object);
2369
2370				/*
2371				 * Write the page to the copy-object,
2372				 * flushing it from the kernel.
2373				 */
2374				vm_pageout_initialize_page(copy_m);
2375
2376				/*
2377				 * Since the pageout may have
2378				 * temporarily dropped the
2379				 * copy_object's lock, we
2380				 * check whether we'll have
2381				 * to deallocate the hard way.
2382				 */
2383				if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2384					vm_object_unlock(copy_object);
2385					vm_object_deallocate(copy_object);
2386					vm_object_lock(object);
2387
2388					continue;
2389				}
2390				/*
2391				 * Pick back up the old object's
2392				 * lock.  [It is safe to do so,
2393				 * since it must be deeper in the
2394				 * object tree.]
2395				 */
2396				vm_object_lock(object);
2397			}
2398
2399			/*
2400			 * Because we're pushing a page upward
2401			 * in the object tree, we must restart
2402			 * any faults that are waiting here.
2403			 * [Note that this is an expansion of
2404			 * PAGE_WAKEUP that uses the THREAD_RESTART
2405			 * wait result].  Can't turn off the page's
2406			 * busy bit because we're not done with it.
2407			 */
2408			if (m->wanted) {
2409				m->wanted = FALSE;
2410				thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2411			}
2412		}
2413		/*
2414		 * The reference count on copy_object must be
2415		 * at least 2: one for our extra reference,
2416		 * and at least one from the outside world
2417		 * (we checked that when we last locked
2418		 * copy_object).
2419		 */
2420		vm_object_lock_assert_exclusive(copy_object);
2421		copy_object->ref_count--;
2422		assert(copy_object->ref_count > 0);
2423
2424		VM_OBJ_RES_DECR(copy_object);
2425		vm_object_unlock(copy_object);
2426
2427		break;
2428	}
2429
2430done:
2431	*result_page = m;
2432	*top_page = first_m;
2433
2434	XPR(XPR_VM_FAULT,
2435		"vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2436		object, offset, m, first_m, 0);
2437
2438	if (m != VM_PAGE_NULL) {
2439		retval = VM_FAULT_SUCCESS;
2440
2441		if (my_fault == DBG_PAGEIN_FAULT) {
2442
2443			VM_PAGE_COUNT_AS_PAGEIN(m);
2444
2445			if (m->object->internal)
2446				my_fault = DBG_PAGEIND_FAULT;
2447			else
2448				my_fault = DBG_PAGEINV_FAULT;
2449
2450		        /*
			 * evaluate access pattern and update state;
			 * vm_fault_deactivate_behind depends on the
			 * state being up to date
2454			 */
2455		        vm_fault_is_sequential(object, offset, fault_info->behavior);
2456
2457			vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2458		} else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2459
2460			VM_STAT_INCR(decompressions);
2461		}
2462		if (type_of_fault)
2463		        *type_of_fault = my_fault;
2464	} else {
2465		retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2466		assert(first_m == VM_PAGE_NULL);
2467		assert(object == first_object);
2468	}
2469
2470	thread_interrupt_level(interruptible_state);
2471
2472#if TRACEFAULTPAGE
2473	dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);	/* (TEST/DEBUG) */
2474#endif
2475	return retval;
2476
2477backoff:
2478	thread_interrupt_level(interruptible_state);
2479
2480	if (wait_result == THREAD_INTERRUPTED)
2481		return (VM_FAULT_INTERRUPTED);
2482	return (VM_FAULT_RETRY);
2483
2484#undef	RELEASE_PAGE
2485}
2486
2487
2488
2489/*
2490 * CODE SIGNING:
2491 * When soft faulting a page, we have to validate the page if:
2492 * 1. the page is being mapped in user space
2493 * 2. the page hasn't already been found to be "tainted"
2494 * 3. the page belongs to a code-signed object
2495 * 4. the page has not been validated yet or has been mapped for write.
2496 */
2497#define VM_FAULT_NEED_CS_VALIDATION(pmap, page)				\
2498	((pmap) != kernel_pmap /*1*/ &&					\
2499	 !(page)->cs_tainted /*2*/ &&					\
2500	 (page)->object->code_signed /*3*/ &&				\
2501	 (!(page)->cs_validated || (page)->wpmapped /*4*/))
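/*
 * Note on condition 4: a page that has ever been mapped writable
 * ("wpmapped") is re-validated even if it was validated before, since
 * its contents could have been modified through that writable mapping.
 */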
2502
2503
2504/*
2505 * page queue lock must NOT be held
2506 * m->object must be locked
2507 *
2508 * NOTE: m->object could be locked "shared" only if we are called
2509 * from vm_fault() as part of a soft fault.  If so, we must be
2510 * careful not to modify the VM object in any way that is not
2511 * legal under a shared lock...
2512 */
2513extern int proc_selfpid(void);
2514extern char *proc_name_address(void *p);
2515unsigned long cs_enter_tainted_rejected = 0;
2516unsigned long cs_enter_tainted_accepted = 0;
2517kern_return_t
2518vm_fault_enter(vm_page_t m,
2519	       pmap_t pmap,
2520	       vm_map_offset_t vaddr,
2521	       vm_prot_t prot,
2522	       vm_prot_t fault_type,
2523	       boolean_t wired,
2524	       boolean_t change_wiring,
2525	       boolean_t no_cache,
2526	       boolean_t cs_bypass,
2527	       __unused int	 user_tag,
2528	       int	 pmap_options,
2529	       boolean_t *need_retry,
2530	       int *type_of_fault)
2531{
2532	kern_return_t	kr, pe_result;
2533	boolean_t	previously_pmapped = m->pmapped;
2534	boolean_t	must_disconnect = 0;
2535	boolean_t	map_is_switched, map_is_switch_protected;
2536	int		cs_enforcement_enabled;
2537
2538	vm_object_lock_assert_held(m->object);
2539#if DEBUG
2540	lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2541#endif /* DEBUG */
2542
2543	if (m->phys_page == vm_page_guard_addr) {
2544		assert(m->fictitious);
2545		return KERN_SUCCESS;
2546	}
2547
2548	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2549
2550		vm_object_lock_assert_exclusive(m->object);
2551
2552	} else if ((fault_type & VM_PROT_WRITE) == 0) {
2553		/*
2554		 * This is not a "write" fault, so we
2555		 * might not have taken the object lock
2556		 * exclusively and we might not be able
2557		 * to update the "wpmapped" bit in
2558		 * vm_fault_enter().
2559		 * Let's just grant read access to
2560		 * the page for now and we'll
2561		 * soft-fault again if we need write
2562		 * access later...
2563		 */
2564		prot &= ~VM_PROT_WRITE;
2565	}
2566	if (m->pmapped == FALSE) {
2567
2568		if (m->clustered) {
2569			if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2570				/*
2571				 * found it in the cache, but this
2572				 * is the first fault-in of the page (m->pmapped == FALSE)
2573				 * so it must have come in as part of
2574				 * a cluster... account 1 pagein against it
2575				 */
2576				if (m->object->internal)
2577					*type_of_fault = DBG_PAGEIND_FAULT;
2578				else
2579					*type_of_fault = DBG_PAGEINV_FAULT;
2580
2581				VM_PAGE_COUNT_AS_PAGEIN(m);
2582			}
2583			VM_PAGE_CONSUME_CLUSTERED(m);
2584		}
2585	}
2586
2587	if (*type_of_fault != DBG_COW_FAULT) {
2588		DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2589
2590		if (pmap == kernel_pmap) {
2591			DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2592		}
2593	}
2594
2595	/* Validate code signature if necessary. */
2596	if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2597		vm_object_lock_assert_exclusive(m->object);
2598
2599		if (m->cs_validated) {
2600			vm_cs_revalidates++;
2601		}
2602
2603		/* VM map is locked, so 1 ref will remain on VM object -
2604		 * so no harm if vm_page_validate_cs drops the object lock */
2605		vm_page_validate_cs(m);
2606	}
2607
2608#define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
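/*
 * Note: the VM_PROT_EXECUTE check above is currently commented out, so
 * any cs_validated page is treated as "immutable" here regardless of
 * the protection being requested.
 */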
2609
2610	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2611			   (pmap == vm_map_pmap(current_thread()->map)));
2612	map_is_switch_protected = current_thread()->map->switch_protect;
2613
2614	/* If the map is switched, and is switch-protected, we must protect
2615	 * some pages from being write-faulted: immutable pages because by
2616	 * definition they may not be written, and executable pages because that
2617	 * would provide a way to inject unsigned code.
2618	 * If the page is immutable, we can simply return. However, we can't
2619	 * immediately determine whether a page is executable anywhere. But,
2620	 * we can disconnect it everywhere and remove the executable protection
2621	 * from the current map. We do that below right before we do the
2622	 * PMAP_ENTER.
2623	 */
2624	cs_enforcement_enabled = cs_enforcement(NULL);
2625
2626	if(cs_enforcement_enabled && map_is_switched &&
2627	   map_is_switch_protected && page_immutable(m, prot) &&
2628	   (prot & VM_PROT_WRITE))
2629	{
2630		return KERN_CODESIGN_ERROR;
2631	}
2632
2633	/* A page could be tainted, or pose a risk of being tainted later.
2634	 * Check whether the receiving process wants it, and make it feel
	 * the consequences (that happens in cs_invalid_page()).
2636	 * For CS Enforcement, two other conditions will
2637	 * cause that page to be tainted as well:
2638	 * - pmapping an unsigned page executable - this means unsigned code;
2639	 * - writeable mapping of a validated page - the content of that page
2640	 *   can be changed without the kernel noticing, therefore unsigned
2641	 *   code can be created
2642	 */
2643	if (m->cs_tainted ||
2644	    ((cs_enforcement_enabled && !cs_bypass ) &&
2645	     (/* The page is unsigned and wants to be executable */
2646	      (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2647	      /* The page should be immutable, but is in danger of being modified
2648		* This is the case where we want policy from the code directory -
2649		* is the page immutable or not? For now we have to assume that
2650		* code pages will be immutable, data pages not.
2651		* We'll assume a page is a code page if it has a code directory
2652		* and we fault for execution.
2653		* That is good enough since if we faulted the code page for
2654		* writing in another map before, it is wpmapped; if we fault
2655		* it for writing in this map later it will also be faulted for executing
2656		* at the same time; and if we fault for writing in another map
2657		* later, we will disconnect it from this pmap so we'll notice
2658		* the change.
2659		*/
2660	      (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2661	      ))
2662		)
2663	{
2664		/* We will have a tainted page. Have to handle the special case
2665		 * of a switched map now. If the map is not switched, standard
2666		 * procedure applies - call cs_invalid_page().
2667		 * If the map is switched, the real owner is invalid already.
2668		 * There is no point in invalidating the switching process since
2669		 * it will not be executing from the map. So we don't call
2670		 * cs_invalid_page() in that case. */
2671		boolean_t reject_page;
2672		if(map_is_switched) {
2673			assert(pmap==vm_map_pmap(current_thread()->map));
2674			assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2675			reject_page = FALSE;
2676		} else {
2677			if (cs_debug > 5)
2678				printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n",
2679				       m->object->code_signed ? "yes" : "no",
2680				       m->cs_validated ? "yes" : "no",
2681				       m->cs_tainted ? "yes" : "no",
2682				       m->wpmapped ? "yes" : "no",
2683				       m->slid ? "yes" : "no",
2684				       (int)prot);
2685			reject_page = cs_invalid_page((addr64_t) vaddr);
2686		}
2687
2688		if (reject_page) {
2689			/* reject the invalid page: abort the page fault */
2690			int			pid;
2691			const char		*procname;
2692			task_t			task;
2693			vm_object_t		file_object, shadow;
2694			vm_object_offset_t	file_offset;
2695			char			*pathname, *filename;
2696			vm_size_t		pathname_len, filename_len;
2697			boolean_t		truncated_path;
2698#define __PATH_MAX 1024
2699			struct timespec		mtime, cs_mtime;
2700
2701			kr = KERN_CODESIGN_ERROR;
2702			cs_enter_tainted_rejected++;
2703
2704			/* get process name and pid */
2705			procname = "?";
2706			task = current_task();
2707			pid = proc_selfpid();
2708			if (task->bsd_info != NULL)
2709				procname = proc_name_address(task->bsd_info);
2710
2711			/* get file's VM object */
2712			file_object = m->object;
2713			file_offset = m->offset;
2714			for (shadow = file_object->shadow;
2715			     shadow != VM_OBJECT_NULL;
2716			     shadow = file_object->shadow) {
2717				vm_object_lock_shared(shadow);
2718				if (file_object != m->object) {
2719					vm_object_unlock(file_object);
2720				}
2721				file_offset += file_object->vo_shadow_offset;
2722				file_object = shadow;
2723			}
2724
2725			mtime.tv_sec = 0;
2726			mtime.tv_nsec = 0;
2727			cs_mtime.tv_sec = 0;
2728			cs_mtime.tv_nsec = 0;
2729
2730			/* get file's pathname and/or filename */
2731			pathname = NULL;
2732			filename = NULL;
2733			pathname_len = 0;
2734			filename_len = 0;
2735			truncated_path = FALSE;
2736			if (file_object->pager == NULL) {
2737				/* no pager -> no file -> no pathname */
2738				pathname = (char *) "<nil>";
2739			} else {
2740				pathname = (char *)kalloc(__PATH_MAX * 2);
2741				if (pathname) {
2742					pathname[0] = '\0';
2743					pathname_len = __PATH_MAX;
2744					filename = pathname + pathname_len;
2745					filename_len = __PATH_MAX;
2746				}
2747				vnode_pager_get_object_name(file_object->pager,
2748							    pathname,
2749							    pathname_len,
2750							    filename,
2751							    filename_len,
2752							    &truncated_path);
2753				vnode_pager_get_object_mtime(file_object->pager,
2754							     &mtime,
2755							     &cs_mtime);
2756			}
2757			printf("CODE SIGNING: process %d[%s]: "
2758			       "rejecting invalid page at address 0x%llx "
2759			       "from offset 0x%llx in file \"%s%s%s\" "
2760			       "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2761			       "(signed:%d validated:%d tainted:%d "
2762			       "wpmapped:%d slid:%d)\n",
2763			       pid, procname, (addr64_t) vaddr,
2764			       file_offset,
2765			       (pathname ? pathname : ""),
2766			       (truncated_path ? "/.../" : ""),
2767			       (truncated_path ? filename : ""),
2768			       cs_mtime.tv_sec, cs_mtime.tv_nsec,
2769			       ((cs_mtime.tv_sec == mtime.tv_sec &&
2770				 cs_mtime.tv_nsec == mtime.tv_nsec)
2771				? "=="
2772				: "!="),
2773			       mtime.tv_sec, mtime.tv_nsec,
2774			       m->object->code_signed,
2775			       m->cs_validated,
2776			       m->cs_tainted,
2777			       m->wpmapped,
2778			       m->slid);
2779			if (file_object != m->object) {
2780				vm_object_unlock(file_object);
2781			}
2782			if (pathname_len != 0) {
2783				kfree(pathname, __PATH_MAX * 2);
2784				pathname = NULL;
2785				filename = NULL;
2786			}
2787		} else {
2788			/* proceed with the invalid page */
2789			kr = KERN_SUCCESS;
2790			if (!m->cs_validated) {
2791				/*
2792				 * This page has not been validated, so it
2793				 * must not belong to a code-signed object
2794				 * and should not be forcefully considered
2795				 * as tainted.
2796				 * We're just concerned about it here because
2797				 * we've been asked to "execute" it but that
2798				 * does not mean that it should cause other
2799				 * accesses to fail.
2800				 * This happens when a debugger sets a
2801				 * breakpoint and we then execute code in
2802				 * that page.  Marking the page as "tainted"
2803				 * would cause any inspection tool ("leaks",
2804				 * "vmmap", "CrashReporter", ...) to get killed
2805				 * due to code-signing violation on that page,
2806				 * even though they're just reading it and not
2807				 * executing from it.
2808				 */
2809				assert(!m->object->code_signed);
2810			} else {
2811				/*
2812				 * Page might have been tainted before or not;
2813				 * now it definitively is. If the page wasn't
2814				 * tainted, we must disconnect it from all
2815				 * pmaps later, to force existing mappings
2816				 * through that code path for re-consideration
2817				 * of the validity of that page.
2818				 */
2819				must_disconnect = !m->cs_tainted;
2820				m->cs_tainted = TRUE;
2821			}
2822			cs_enter_tainted_accepted++;
2823		}
2824		if (kr != KERN_SUCCESS) {
2825			if (cs_debug) {
2826				printf("CODESIGNING: vm_fault_enter(0x%llx): "
2827				       "*** INVALID PAGE ***\n",
2828				       (long long)vaddr);
2829			}
2830#if !SECURE_KERNEL
2831			if (cs_enforcement_panic) {
2832				panic("CODESIGNING: panicking on invalid page\n");
2833			}
2834#endif
2835		}
2836
2837	} else {
2838		/* proceed with the valid page */
2839		kr = KERN_SUCCESS;
2840	}
2841
2842	boolean_t	page_queues_locked = FALSE;
2843#define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()	\
2844MACRO_BEGIN			    		\
2845	if (! page_queues_locked) {		\
2846		page_queues_locked = TRUE;	\
2847		vm_page_lockspin_queues();	\
2848	}					\
2849MACRO_END
2850#define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()	\
2851MACRO_BEGIN			    		\
2852	if (page_queues_locked) {		\
2853		page_queues_locked = FALSE;	\
2854		vm_page_unlock_queues();	\
2855	}					\
2856MACRO_END
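	/*
	 * These helpers take the page-queue spinlock lazily, the first
	 * time a path below actually needs it, and the single
	 * __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() at the end releases it only
	 * if it was taken, so the wiring/activation logic that follows
	 * doesn't have to track the lock state by hand.
	 */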
2857
2858	/*
2859	 * Hold queues lock to manipulate
2860	 * the page queues.  Change wiring
2861	 * case is obvious.
2862	 */
2863	assert(m->compressor || m->object != compressor_object);
2864	if (m->compressor) {
2865		/*
2866		 * Compressor pages are neither wired
2867		 * nor pageable and should never change.
2868		 */
2869		assert(m->object == compressor_object);
2870	} else if (change_wiring) {
2871	        __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2872
2873		if (wired) {
2874			if (kr == KERN_SUCCESS) {
2875				vm_page_wire(m);
2876			}
2877		} else {
2878		        vm_page_unwire(m, TRUE);
2879		}
2880		/* we keep the page queues lock, if we need it later */
2881
2882	} else {
2883	        if (kr != KERN_SUCCESS) {
2884		        __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2885		        vm_page_deactivate(m);
2886			/* we keep the page queues lock, if we need it later */
2887		} else if (((!m->active && !m->inactive) ||
2888			    m->clean_queue ||
2889			    no_cache) &&
2890			   !VM_PAGE_WIRED(m) && !m->throttled) {
2891
2892			if (vm_page_local_q &&
2893			    !no_cache &&
2894			    (*type_of_fault == DBG_COW_FAULT ||
2895			     *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2896				struct vpl	*lq;
2897				uint32_t	lid;
2898
2899				__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2900				vm_object_lock_assert_exclusive(m->object);
2901
2902				/*
2903				 * we got a local queue to stuff this
2904				 * new page on...
				 * it's safe to manipulate local and
				 * local_id at this point since we're
				 * behind an exclusive object lock and
				 * the page is not on any global queue.
				 *
				 * we'll use the current cpu number to
				 * select the queue; note that we don't
				 * need to disable preemption... we're
				 * going to be behind the local queue's
				 * lock to do the real work
2915				 */
2916				lid = cpu_number();
2917
2918				lq = &vm_page_local_q[lid].vpl_un.vpl;
2919
2920				VPL_LOCK(&lq->vpl_lock);
2921
2922				queue_enter(&lq->vpl_queue, m,
2923					    vm_page_t, pageq);
2924				m->local = TRUE;
2925				m->local_id = lid;
2926				lq->vpl_count++;
2927
2928				if (m->object->internal)
2929					lq->vpl_internal_count++;
2930				else
2931					lq->vpl_external_count++;
2932
2933				VPL_UNLOCK(&lq->vpl_lock);
2934
2935				if (lq->vpl_count > vm_page_local_q_soft_limit)
2936				{
2937					/*
2938					 * we're beyond the soft limit
					 * for the local queue;
2940					 * vm_page_reactivate_local will
2941					 * 'try' to take the global page
2942					 * queue lock... if it can't
2943					 * that's ok... we'll let the
2944					 * queue continue to grow up
2945					 * to the hard limit... at that
2946					 * point we'll wait for the
2947					 * lock... once we've got the
2948					 * lock, we'll transfer all of
2949					 * the pages from the local
2950					 * queue to the global active
2951					 * queue
2952					 */
2953					vm_page_reactivate_local(lid, FALSE, FALSE);
2954				}
2955			} else {
2956
2957				__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2958
2959				/*
2960				 * test again now that we hold the
2961				 * page queue lock
2962				 */
2963				if (!VM_PAGE_WIRED(m)) {
2964					if (m->clean_queue) {
2965						VM_PAGE_QUEUES_REMOVE(m);
2966
2967						vm_pageout_cleaned_reactivated++;
2968						vm_pageout_cleaned_fault_reactivated++;
2969					}
2970
2971					if ((!m->active &&
2972					     !m->inactive) ||
2973					    no_cache) {
2974						/*
2975						 * If this is a no_cache mapping
2976						 * and the page has never been
2977						 * mapped before or was
2978						 * previously a no_cache page,
2979						 * then we want to leave pages
2980						 * in the speculative state so
2981						 * that they can be readily
2982						 * recycled if free memory runs
2983						 * low.  Otherwise the page is
2984						 * activated as normal.
2985						 */
2986
2987						if (no_cache &&
2988						    (!previously_pmapped ||
2989						     m->no_cache)) {
2990							m->no_cache = TRUE;
2991
2992							if (!m->speculative)
2993								vm_page_speculate(m, FALSE);
2994
2995						} else if (!m->active &&
2996							   !m->inactive) {
2997
2998							vm_page_activate(m);
2999						}
3000					}
3001				}
3002				/* we keep the page queues lock, if we need it later */
3003			}
3004		}
3005	}
3006	/* we're done with the page queues lock, if we ever took it */
3007	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3008
3009
3010	/* If we have a KERN_SUCCESS from the previous checks, we either have
3011	 * a good page, or a tainted page that has been accepted by the process.
3012	 * In both cases the page will be entered into the pmap.
3013	 * If the page is writeable, we need to disconnect it from other pmaps
3014	 * now so those processes can take note.
3015	 */
3016	if (kr == KERN_SUCCESS) {
3017
3018	        /*
3019		 * NOTE: we may only hold the vm_object lock SHARED
3020		 * at this point, so we need the phys_page lock to
3021		 * properly serialize updating the pmapped and
3022		 * xpmapped bits
3023		 */
3024		if ((prot & VM_PROT_EXECUTE) && !m->xpmapped) {
3025
3026			pmap_lock_phys_page(m->phys_page);
3027			/*
3028			 * go ahead and take the opportunity
3029			 * to set 'pmapped' here so that we don't
3030			 * need to grab this lock a 2nd time
3031			 * just below
3032			 */
3033			m->pmapped = TRUE;
3034
3035			if (!m->xpmapped) {
3036
3037				m->xpmapped = TRUE;
3038
3039				pmap_unlock_phys_page(m->phys_page);
3040
3041				if (!m->object->internal)
3042					OSAddAtomic(1, &vm_page_xpmapped_external_count);
3043
3044				if ((COMPRESSED_PAGER_IS_ACTIVE) &&
3045				    m->object->internal &&
3046				    m->object->pager != NULL) {
3047					/*
3048					 * This page could have been
3049					 * uncompressed by the
3050					 * compressor pager and its
3051					 * contents might be only in
3052					 * the data cache.
3053					 * Since it's being mapped for
3054					 * "execute" for the fist time,
3055					 * make sure the icache is in
3056					 * sync.
3057					 */
3058					pmap_sync_page_data_phys(m->phys_page);
3059				}
3060			} else
3061				pmap_unlock_phys_page(m->phys_page);
3062		} else {
3063			if (m->pmapped == FALSE) {
3064				pmap_lock_phys_page(m->phys_page);
3065				m->pmapped = TRUE;
3066				pmap_unlock_phys_page(m->phys_page);
3067			}
3068		}
3069		if (vm_page_is_slideable(m)) {
3070			boolean_t was_busy = m->busy;
3071
3072			vm_object_lock_assert_exclusive(m->object);
3073
3074			m->busy = TRUE;
3075			kr = vm_page_slide(m, 0);
3076			assert(m->busy);
3077			if(!was_busy) {
3078				PAGE_WAKEUP_DONE(m);
3079			}
3080			if (kr != KERN_SUCCESS) {
3081				/*
3082				 * This page has not been slid correctly,
3083				 * do not do the pmap_enter() !
3084				 * Let vm_fault_enter() return the error
3085				 * so the caller can fail the fault.
3086				 */
3087				goto after_the_pmap_enter;
3088			}
3089		}
3090
3091		if (fault_type & VM_PROT_WRITE) {
3092
3093			if (m->wpmapped == FALSE) {
3094				vm_object_lock_assert_exclusive(m->object);
3095
3096				m->wpmapped = TRUE;
3097			}
3098			if (must_disconnect) {
3099				/*
3100				 * We can only get here
3101				 * because of the CSE logic
3102				 */
3103				assert(cs_enforcement_enabled);
3104				pmap_disconnect(m->phys_page);
3105				/*
3106				 * If we are faulting for a write, we can clear
3107				 * the execute bit - that will ensure the page is
3108				 * checked again before being executable, which
3109				 * protects against a map switch.
3110				 * This only happens the first time the page
3111				 * gets tainted, so we won't get stuck here
3112				 * to make an already writeable page executable.
3113				 */
3114				if (!cs_bypass){
3115					prot &= ~VM_PROT_EXECUTE;
3116				}
3117			}
3118		}
3119
3120		/* Prevent a deadlock by not
3121		 * holding the object lock if we need to wait for a page in
3122		 * pmap_enter() - <rdar://problem/7138958> */
3123		PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3124				   wired,
3125				   pmap_options | PMAP_OPTIONS_NOWAIT,
3126				   pe_result);
3127
3128		if(pe_result == KERN_RESOURCE_SHORTAGE) {
3129
3130			if (need_retry) {
3131				/*
3132				 * this will be non-null in the case where we hold the lock
3133				 * on the top-object in this chain... we can't just drop
3134				 * the lock on the object we're inserting the page into
3135				 * and recall the PMAP_ENTER since we can still cause
3136				 * a deadlock if one of the critical paths tries to
3137				 * acquire the lock on the top-object and we're blocked
3138				 * in PMAP_ENTER waiting for memory... our only recourse
3139				 * is to deal with it at a higher level where we can
3140				 * drop both locks.
3141				 */
3142				*need_retry = TRUE;
3143				vm_pmap_enter_retried++;
3144				goto after_the_pmap_enter;
3145			}
			/* The nonblocking version of pmap_enter did not succeed,
			 * and we don't need to drop other locks and retry
			 * at the level above us, so we
			 * use the blocking version instead.  This requires marking
			 * the page busy and unlocking the object. */
3151			boolean_t was_busy = m->busy;
3152
3153			vm_object_lock_assert_exclusive(m->object);
3154
3155			m->busy = TRUE;
3156			vm_object_unlock(m->object);
3157
3158			PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3159					   0, wired,
3160			                   pmap_options, pe_result);
3161
3162			/* Take the object lock again. */
3163			vm_object_lock(m->object);
3164
3165			/* If the page was busy, someone else will wake it up.
3166			 * Otherwise, we have to do it now. */
3167			assert(m->busy);
3168			if(!was_busy) {
3169				PAGE_WAKEUP_DONE(m);
3170			}
3171			vm_pmap_enter_blocked++;
3172		}
3173	}
3174
3175after_the_pmap_enter:
3176	return kr;
3177}
3178
3179void
3180vm_pre_fault(vm_map_offset_t vaddr)
3181{
3182	if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3183
3184		vm_fault(current_map(), /* map */
3185			vaddr,		/* vaddr */
3186			VM_PROT_READ, /* fault_type */
3187			FALSE, /* change_wiring */
3188			THREAD_UNINT, /* interruptible */
3189			NULL, /* caller_pmap */
3190			0 /* caller_pmap_addr */);
3191	}
3192}
3193
3194
3195/*
3196 *	Routine:	vm_fault
3197 *	Purpose:
3198 *		Handle page faults, including pseudo-faults
3199 *		used to change the wiring status of pages.
3200 *	Returns:
3201 *		Explicit continuations have been removed.
3202 *	Implementation:
3203 *		vm_fault and vm_fault_page save mucho state
3204 *		in the moral equivalent of a closure.  The state
3205 *		structure is allocated when first entering vm_fault
3206 *		and deallocated when leaving vm_fault.
3207 */
3208
3209extern int _map_enter_debug;
3210
3211unsigned long vm_fault_collapse_total = 0;
3212unsigned long vm_fault_collapse_skipped = 0;
3213
3214
3215kern_return_t
3216vm_fault(
3217	vm_map_t	map,
3218	vm_map_offset_t	vaddr,
3219	vm_prot_t	fault_type,
3220	boolean_t	change_wiring,
3221	int		interruptible,
3222	pmap_t		caller_pmap,
3223	vm_map_offset_t	caller_pmap_addr)
3224{
3225	return vm_fault_internal(map, vaddr, fault_type, change_wiring,
3226				 interruptible, caller_pmap, caller_pmap_addr,
3227				 NULL);
3228}
3229
3230kern_return_t
3231vm_fault_internal(
3232	vm_map_t	map,
3233	vm_map_offset_t	vaddr,
3234	vm_prot_t	fault_type,
3235	boolean_t	change_wiring,
3236	int		interruptible,
3237	pmap_t		caller_pmap,
3238	vm_map_offset_t	caller_pmap_addr,
3239	ppnum_t		*physpage_p)
3240{
	vm_map_version_t	version;	/* Map version for verification */
3242	boolean_t		wired;		/* Should mapping be wired down? */
3243	vm_object_t		object;		/* Top-level object */
3244	vm_object_offset_t	offset;		/* Top-level offset */
3245	vm_prot_t		prot;		/* Protection for mapping */
3246	vm_object_t		old_copy_object; /* Saved copy object */
3247	vm_page_t		result_page;	/* Result of vm_fault_page */
3248	vm_page_t		top_page;	/* Placeholder page */
3249	kern_return_t		kr;
3250
3251	vm_page_t		m;	/* Fast access to result_page */
3252	kern_return_t		error_code;
3253	vm_object_t		cur_object;
3254	vm_object_offset_t	cur_offset;
3255	vm_page_t		cur_m;
3256	vm_object_t		new_object;
3257	int                     type_of_fault;
3258	pmap_t			pmap;
3259	boolean_t		interruptible_state;
3260	vm_map_t		real_map = map;
3261	vm_map_t		original_map = map;
3262	vm_prot_t		original_fault_type;
3263	struct vm_object_fault_info fault_info;
3264	boolean_t		need_collapse = FALSE;
3265	boolean_t		need_retry = FALSE;
3266	boolean_t		*need_retry_ptr = NULL;
3267	int			object_lock_type = 0;
3268	int			cur_object_lock_type;
3269	vm_object_t		top_object = VM_OBJECT_NULL;
3270	int			throttle_delay;
3271	int			compressed_count_delta;
3272
3273
3274	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3275	              (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3276			      ((uint64_t)vaddr >> 32),
3277			      vaddr,
3278			      (map == kernel_map),
3279			      0,
3280			      0);
3281
3282	if (get_preemption_level() != 0) {
3283	        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3284				      (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3285				      ((uint64_t)vaddr >> 32),
3286				      vaddr,
3287				      KERN_FAILURE,
3288				      0,
3289				      0);
3290
3291		return (KERN_FAILURE);
3292	}
3293
3294	interruptible_state = thread_interrupt_level(interruptible);
3295
3296	VM_STAT_INCR(faults);
3297	current_task()->faults++;
3298	original_fault_type = fault_type;
3299
3300	if (fault_type & VM_PROT_WRITE)
3301	        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3302	else
3303	        object_lock_type = OBJECT_LOCK_SHARED;
3304
3305	cur_object_lock_type = OBJECT_LOCK_SHARED;
3306
3307RetryFault:
3308	/*
	 * assume we will hit a page in the cache;
3310	 * otherwise, explicitly override with
3311	 * the real fault type once we determine it
3312	 */
3313	type_of_fault = DBG_CACHE_HIT_FAULT;
3314
3315	/*
3316	 *	Find the backing store object and offset into
3317	 *	it to begin the search.
3318	 */
3319	fault_type = original_fault_type;
3320	map = original_map;
3321	vm_map_lock_read(map);
3322
3323	kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3324				  object_lock_type, &version,
3325				  &object, &offset, &prot, &wired,
3326				  &fault_info,
3327				  &real_map);
3328
3329	if (kr != KERN_SUCCESS) {
3330		vm_map_unlock_read(map);
3331		goto done;
3332	}
3333	pmap = real_map->pmap;
3334	fault_info.interruptible = interruptible;
3335	fault_info.stealth = FALSE;
3336	fault_info.io_sync = FALSE;
3337	fault_info.mark_zf_absent = FALSE;
3338	fault_info.batch_pmap_op = FALSE;
3339
3340	/*
3341	 * If the page is wired, we must fault for the current protection
3342	 * value, to avoid further faults.
3343	 */
3344	if (wired) {
3345		fault_type = prot | VM_PROT_WRITE;
3346		/*
3347		 * since we're treating this fault as a 'write'
3348		 * we must hold the top object lock exclusively
3349		 */
3350		if (object_lock_type == OBJECT_LOCK_SHARED) {
3351
3352		        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3353
3354			if (vm_object_lock_upgrade(object) == FALSE) {
3355			        /*
				 * couldn't upgrade, so explicitly
3357				 * take the lock exclusively
3358				 */
3359			        vm_object_lock(object);
3360			}
3361		}
3362	}
3363
3364#if	VM_FAULT_CLASSIFY
3365	/*
3366	 *	Temporary data gathering code
3367	 */
3368	vm_fault_classify(object, offset, fault_type);
3369#endif
3370	/*
3371	 *	Fast fault code.  The basic idea is to do as much as
3372	 *	possible while holding the map lock and object locks.
3373	 *      Busy pages are not used until the object lock has to
3374	 *	be dropped to do something (copy, zero fill, pmap enter).
3375	 *	Similarly, paging references aren't acquired until that
3376	 *	point, and object references aren't used.
3377	 *
3378	 *	If we can figure out what to do
3379	 *	(zero fill, copy on write, pmap enter) while holding
3380	 *	the locks, then it gets done.  Otherwise, we give up,
3381	 *	and use the original fault path (which doesn't hold
3382	 *	the map lock, and relies on busy pages).
3383	 *	The give up cases include:
3384	 * 		- Have to talk to pager.
3385	 *		- Page is busy, absent or in error.
3386	 *		- Pager has locked out desired access.
3387	 *		- Fault needs to be restarted.
3388	 *		- Have to push page into copy object.
3389	 *
3390	 *	The code is an infinite loop that moves one level down
3391	 *	the shadow chain each time.  cur_object and cur_offset
3392	 * 	refer to the current object being examined. object and offset
3393	 *	are the original object from the map.  The loop is at the
3394	 *	top level if and only if object and cur_object are the same.
3395	 *
3396	 *	Invariants:  Map lock is held throughout.  Lock is held on
3397	 *		original object and cur_object (if different) when
3398	 *		continuing or exiting loop.
3399	 *
3400	 */
3401
3402
3403	/*
3404	 * If this page is to be inserted in a copy delay object
3405	 * for writing, and if the object has a copy, then the
	 * copy delay strategy is implemented in the slow fault path (vm_fault_page).
3407	 */
3408	if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3409	    object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
3410	        goto handle_copy_delay;
3411
3412	cur_object = object;
3413	cur_offset = offset;
3414
3415	while (TRUE) {
3416		if (!cur_object->pager_created &&
3417		    cur_object->phys_contiguous) /* superpage */
3418			break;
3419
3420		if (cur_object->blocked_access) {
3421			/*
3422			 * Access to this VM object has been blocked.
3423			 * Let the slow path handle it.
3424			 */
3425			break;
3426		}
3427
3428		m = vm_page_lookup(cur_object, cur_offset);
3429
3430		if (m != VM_PAGE_NULL) {
3431			if (m->busy) {
3432			        wait_result_t	result;
3433
3434				/*
3435				 * in order to do the PAGE_ASSERT_WAIT, we must
				 * have the object that 'm' belongs to locked exclusively
3437				 */
3438				if (object != cur_object) {
3439
3440					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3441
3442					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3443
3444						if (vm_object_lock_upgrade(cur_object) == FALSE) {
3445						        /*
						        * couldn't upgrade, so go do a full retry
3447							 * immediately since we can no longer be
3448							 * certain about cur_object (since we
3449							 * don't hold a reference on it)...
3450							 * first drop the top object lock
3451							 */
3452							vm_object_unlock(object);
3453
3454						        vm_map_unlock_read(map);
3455							if (real_map != map)
3456							        vm_map_unlock(real_map);
3457
3458							goto RetryFault;
3459						}
3460					}
3461				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3462
3463				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3464
3465					if (vm_object_lock_upgrade(object) == FALSE) {
3466					        /*
						 * couldn't upgrade, so explicitly take the lock
						 * exclusively and go relookup the page since we
						 * will have dropped the object lock and
						 * a different thread could have inserted
						 * a page at this offset;
						 * no need for a full retry since we're
3473						 * at the top level of the object chain
3474						 */
3475					        vm_object_lock(object);
3476
3477						continue;
3478					}
3479				}
3480				if (m->pageout_queue && m->object->internal && COMPRESSED_PAGER_IS_ACTIVE) {
3481					/*
					 * m->busy == TRUE and the object is locked exclusively;
3483					 * if m->pageout_queue == TRUE after we acquire the
3484					 * queues lock, we are guaranteed that it is stable on
3485					 * the pageout queue and therefore reclaimable
3486					 *
3487					 * NOTE: this is only true for the internal pageout queue
3488					 * in the compressor world
3489					 */
3490					vm_page_lock_queues();
3491
3492					if (m->pageout_queue) {
3493						vm_pageout_throttle_up(m);
3494						vm_page_unlock_queues();
3495
3496						PAGE_WAKEUP_DONE(m);
3497						goto reclaimed_from_pageout;
3498					}
3499					vm_page_unlock_queues();
3500				}
3501				if (object != cur_object)
3502					vm_object_unlock(object);
3503
3504				vm_map_unlock_read(map);
3505				if (real_map != map)
3506				        vm_map_unlock(real_map);
3507
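				/*
				 * Assert the wait on the busy page before dropping the
				 * object lock so the wakeup from whoever clears the busy
				 * bit can't be missed.
				 */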
3508				result = PAGE_ASSERT_WAIT(m, interruptible);
3509
3510				vm_object_unlock(cur_object);
3511
3512				if (result == THREAD_WAITING) {
3513				        result = thread_block(THREAD_CONTINUE_NULL);
3514
3515					counter(c_vm_fault_page_block_busy_kernel++);
3516				}
3517				if (result == THREAD_AWAKENED || result == THREAD_RESTART)
3518				        goto RetryFault;
3519
3520				kr = KERN_ABORTED;
3521				goto done;
3522			}
3523reclaimed_from_pageout:
3524			if (m->laundry) {
3525				if (object != cur_object) {
3526					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3527						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3528
3529						vm_object_unlock(object);
3530						vm_object_unlock(cur_object);
3531
3532						vm_map_unlock_read(map);
3533						if (real_map != map)
3534							vm_map_unlock(real_map);
3535
3536						goto RetryFault;
3537					}
3538
3539				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3540
3541					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3542
3543					if (vm_object_lock_upgrade(object) == FALSE) {
3544						/*
3545						 * couldn't upgrade, so explicitly take the lock
3546						 * exclusively and go relookup the page since we
3547						 * will have dropped the object lock and
3548						 * a different thread could have inserted
3549						 * a page at this offset
3550						 * no need for a full retry since we're
3551						 * at the top level of the object chain
3552						 */
3553						vm_object_lock(object);
3554
3555						continue;
3556					}
3557				}
3558				m->pageout = FALSE;
3559
3560				vm_pageout_steal_laundry(m, FALSE);
3561			}
3562
3563			if (m->phys_page == vm_page_guard_addr) {
3564				/*
3565				 * Guard page: let the slow path deal with it
3566				 */
3567				break;
3568			}
3569			if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3570			        /*
3571				 * Unusual case... let the slow path deal with it
3572				 */
3573				break;
3574			}
3575			if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3576				if (object != cur_object)
3577					vm_object_unlock(object);
3578				vm_map_unlock_read(map);
3579				if (real_map != map)
3580				        vm_map_unlock(real_map);
3581				vm_object_unlock(cur_object);
3582				kr = KERN_MEMORY_ERROR;
3583				goto done;
3584			}
3585
3586			if (m->encrypted) {
3587				/*
3588				 * ENCRYPTED SWAP:
3589				 * We've soft-faulted (because it's not in the page
3590				 * table) on an encrypted page.
3591				 * Keep the page "busy" so that no one messes with
3592				 * it during the decryption.
3593				 * Release the extra locks we're holding, keep only
3594				 * the page's VM object lock.
3595				 *
3596				 * in order to set 'busy' on 'm', we must
3597				 * have the object that 'm' belongs to locked exclusively
3598				 */
3599			        if (object != cur_object) {
3600					vm_object_unlock(object);
3601
3602					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3603
3604					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3605
3606						if (vm_object_lock_upgrade(cur_object) == FALSE) {
3607						        /*
3608							 * couldn't upgrade so go do a full retry
3609							 * immediately since we've already dropped
3610							 * the top object lock associated with this page
3611							 * and the current one got dropped due to the
3612							 * failed upgrade... the state is no longer valid
3613							 */
3614						        vm_map_unlock_read(map);
3615							if (real_map != map)
3616							        vm_map_unlock(real_map);
3617
3618							goto RetryFault;
3619						}
3620					}
3621				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3622
3623				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3624
3625					if (vm_object_lock_upgrade(object) == FALSE) {
3626					        /*
3627						 * couldn't upgrade, so explicitly take the lock
3628						 * exclusively and go relookup the page since we
3629						 * will have dropped the object lock and
3630						 * a different thread could have inserted
3631						 * a page at this offset
3632						 * no need for a full retry since we're
3633						 * at the top level of the object chain
3634						 */
3635					        vm_object_lock(object);
3636
3637						continue;
3638					}
3639				}
3640				m->busy = TRUE;
3641
3642				vm_map_unlock_read(map);
3643				if (real_map != map)
3644					vm_map_unlock(real_map);
3645
3646				vm_page_decrypt(m, 0);
3647
3648				assert(m->busy);
3649				PAGE_WAKEUP_DONE(m);
3650
3651				vm_object_unlock(cur_object);
3652				/*
3653				 * Retry from the top, in case anything
3654				 * changed while we were decrypting...
3655				 */
3656				goto RetryFault;
3657			}
3658			ASSERT_PAGE_DECRYPTED(m);
3659
3660			if(vm_page_is_slideable(m)) {
3661				/*
3662				 * We might need to slide this page, and so,
3663				 * we want to hold the VM object exclusively.
3664				 */
3665			        if (object != cur_object) {
3666					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3667						vm_object_unlock(object);
3668						vm_object_unlock(cur_object);
3669
3670					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3671
3672						vm_map_unlock_read(map);
3673						if (real_map != map)
3674							vm_map_unlock(real_map);
3675
3676						goto RetryFault;
3677					}
3678				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3679
3680					vm_object_unlock(object);
3681				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3682					vm_map_unlock_read(map);
3683					goto RetryFault;
3684				}
3685			}
3686
3687			if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m) ||
3688			    (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3689upgrade_for_validation:
3690				/*
3691				 * We might need to validate this page
3692				 * against its code signature, so we
3693				 * want to hold the VM object exclusively.
3694				 */
3695			        if (object != cur_object) {
3696					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3697						vm_object_unlock(object);
3698						vm_object_unlock(cur_object);
3699
3700					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3701
3702						vm_map_unlock_read(map);
3703						if (real_map != map)
3704							vm_map_unlock(real_map);
3705
3706						goto RetryFault;
3707					}
3708
3709				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3710
3711				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3712
3713					if (vm_object_lock_upgrade(object) == FALSE) {
3714					        /*
3715						 * couldn't upgrade, so explicitly take the lock
3716						 * exclusively and go relookup the page since we
3717						 * will have dropped the object lock and
3718						 * a different thread could have inserted
3719						 * a page at this offset
3720						 * no need for a full retry since we're
3721						 * at the top level of the object chain
3722						 */
3723					        vm_object_lock(object);
3724
3725						continue;
3726					}
3727				}
3728			}
3729			/*
3730			 *	Two cases in which we can map the page in directly here:
3731			 *	    - At top level w/o copy object.
3732			 *	    - Read fault anywhere.
3733			 *		--> must disallow write.
3734			 */
3735
3736			if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3737
3738				goto FastPmapEnter;
3739			}
3740
3741			if ((fault_type & VM_PROT_WRITE) == 0) {
3742
3743			  	if (object != cur_object) {
3744				        /*
3745					 * We still need to hold the top object
3746					 * lock here to prevent a race between
3747					 * a read fault (taking only "shared"
3748					 * locks) and a write fault (taking
3749					 * an "exclusive" lock on the top
3750					 * object).
3751					 * Otherwise, as soon as we release the
3752					 * top lock, the write fault could
3753					 * proceed and actually complete before
3754					 * the read fault, and the copied page's
3755					 * translation could then be overwritten
3756					 * by the read fault's translation for
3757					 * the original page.
3758					 *
3759					 * Let's just record what the top object
3760					 * is and we'll release it later.
3761					 */
3762					top_object = object;
3763
3764					/*
3765					 * switch to the object that has the new page
3766					 */
3767					object = cur_object;
3768					object_lock_type = cur_object_lock_type;
3769				}
3770FastPmapEnter:
3771				/*
3772				 * prepare for the pmap_enter...
3773				 * object and map are both locked
3774				 * m contains valid data
3775				 * object == m->object
3776				 * cur_object == NULL or it's been unlocked
3777				 * no paging references on either object or cur_object
3778				 */
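				/*
				 * If we are still holding the top object's lock, or only
				 * hold the object lock shared, ask vm_fault_enter (via
				 * need_retry) not to block in the pmap layer; if the
				 * mapping couldn't be entered, we pre-expand the page
				 * table below and redrive the fault.
				 */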
3779				if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE)
3780					need_retry_ptr = &need_retry;
3781				else
3782					need_retry_ptr = NULL;
3783
3784				if (caller_pmap) {
3785				        kr = vm_fault_enter(m,
3786							    caller_pmap,
3787							    caller_pmap_addr,
3788							    prot,
3789							    fault_type,
3790							    wired,
3791							    change_wiring,
3792							    fault_info.no_cache,
3793							    fault_info.cs_bypass,
3794							    fault_info.user_tag,
3795							    fault_info.pmap_options,
3796							    need_retry_ptr,
3797							    &type_of_fault);
3798				} else {
3799				        kr = vm_fault_enter(m,
3800							    pmap,
3801							    vaddr,
3802							    prot,
3803							    fault_type,
3804							    wired,
3805							    change_wiring,
3806							    fault_info.no_cache,
3807							    fault_info.cs_bypass,
3808							    fault_info.user_tag,
3809							    fault_info.pmap_options,
3810							    need_retry_ptr,
3811							    &type_of_fault);
3812				}
3813
3814				if (kr == KERN_SUCCESS &&
3815				    physpage_p != NULL) {
3816					/* for vm_map_wire_and_extract() */
3817					*physpage_p = m->phys_page;
3818					if (prot & VM_PROT_WRITE) {
3819						vm_object_lock_assert_exclusive(
3820							m->object);
3821						m->dirty = TRUE;
3822					}
3823				}
3824
3825				if (top_object != VM_OBJECT_NULL) {
3826					/*
3827					 * It's safe to drop the top object
3828					 * now that we've done our
3829					 * vm_fault_enter().  Any other fault
3830					 * in progress for that virtual
3831					 * address will either find our page
3832					 * and translation or put in a new page
3833					 * and translation.
3834					 */
3835					vm_object_unlock(top_object);
3836					top_object = VM_OBJECT_NULL;
3837				}
3838
3839				if (need_collapse == TRUE)
3840				        vm_object_collapse(object, offset, TRUE);
3841
3842				if (need_retry == FALSE &&
3843				    (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3844				        /*
3845					 * evaluate access pattern and update state
3846					 * vm_fault_deactivate_behind depends on the
3847					 * state being up to date
3848					 */
3849				        vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3850
3851					vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3852				}
3853				/*
3854				 * That's it, clean up and return.
3855				 */
3856				if (m->busy)
3857				        PAGE_WAKEUP_DONE(m);
3858
3859				vm_object_unlock(object);
3860
3861				vm_map_unlock_read(map);
3862				if (real_map != map)
3863					vm_map_unlock(real_map);
3864
3865				if (need_retry == TRUE) {
3866					/*
3867					 * vm_fault_enter couldn't complete the PMAP_ENTER...
3868					 * at this point we don't hold any locks so it's safe
3869					 * to ask the pmap layer to expand the page table to
3870					 * accommodate this mapping... once expanded, we'll
3871					 * re-drive the fault which should result in vm_fault_enter
3872					 * being able to successfully enter the mapping this time around
3873					 */
3874					(void)pmap_enter_options(
3875						pmap, vaddr, 0, 0, 0, 0, 0,
3876						PMAP_OPTIONS_NOENTER, NULL);
3877
3878					need_retry = FALSE;
3879					goto RetryFault;
3880				}
3881				goto done;
3882			}
3883			/*
3884			 * COPY ON WRITE FAULT
3885			 */
3886			assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3887
3888			if ((throttle_delay = vm_page_throttled())) {
3889				/*
3890				 * drop all of our locks...
3891				 * wait until the free queue is
3892				 * pumped back up and then
3893				 * redrive the fault
3894				 */
3895				if (object != cur_object)
3896					vm_object_unlock(cur_object);
3897				vm_object_unlock(object);
3898				vm_map_unlock_read(map);
3899				if (real_map != map)
3900					vm_map_unlock(real_map);
3901
3902				VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3903
3904				delay(throttle_delay);
3905
3906				if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3907						 THREAD_UNINT :
3908						 THREAD_ABORTSAFE))
3909					goto RetryFault;
3910				kr = KERN_ABORTED;
3911				goto done;
3912			}
3913			/*
3914			 * If the objects match, then
3915			 * object->copy must not be NULL (else control
3916			 * would be in the previous code block), and we
3917			 * have a potential push into the copy object
3918			 * which we can't cope with here.
3919			 */
3920			if (cur_object == object) {
3921			        /*
3922				 * must take the slow path to
3923				 * deal with the copy push
3924				 */
3925				break;
3926			}
3927
3928			/*
3929			 * This is now a shadow based copy on write
3930			 * fault -- it requires a copy up the shadow
3931			 * chain.
3932			 */
3933
3934			if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3935			    VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3936				goto upgrade_for_validation;
3937			}
3938
3939			/*
3940			 * Allocate a page in the original top level
3941			 * object. Give up if the allocation fails.  Also
3942			 * need to remember current page, as it's the
3943			 * source of the copy.
3944			 *
3945			 * at this point we hold locks on both
3946			 * object and cur_object... no need to take
3947			 * paging refs or mark pages BUSY since
3948			 * we don't drop either object lock until
3949			 * the page has been copied and inserted
3950			 */
3951			cur_m = m;
3952			m = vm_page_grab();
3953
3954			if (m == VM_PAGE_NULL) {
3955			        /*
3956				 * no free page currently available...
3957				 * must take the slow path
3958				 */
3959				break;
3960			}
3961			/*
3962			 * Now do the copy.  Mark the source page busy...
3963			 *
3964			 *	NOTE: This code holds the map lock across
3965			 *	the page copy.
3966			 */
3967			vm_page_copy(cur_m, m);
3968			vm_page_insert(m, object, offset);
3969			SET_PAGE_DIRTY(m, FALSE);
3970
3971			/*
3972			 * Now cope with the source page and object
3973			 */
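			/*
			 * The source page may already be mapped (read-only) in one
			 * or more pmaps.  If its object is still referenced elsewhere,
			 * disconnect those mappings so stale translations to the
			 * original page get re-faulted rather than reused.
			 */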
3974			if (object->ref_count > 1 && cur_m->pmapped)
3975			        pmap_disconnect(cur_m->phys_page);
3976
3977			if (cur_m->clustered) {
3978				VM_PAGE_COUNT_AS_PAGEIN(cur_m);
3979				VM_PAGE_CONSUME_CLUSTERED(cur_m);
3980			}
3981			need_collapse = TRUE;
3982
3983			if (!cur_object->internal &&
3984			    cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3985			        /*
3986				 * The object from which we've just
3987				 * copied a page is most probably backed
3988				 * by a vnode.  We don't want to waste too
3989				 * much time trying to collapse the VM objects
3990				 * and create a bottleneck when several tasks
3991				 * map the same file.
3992				 */
3993			        if (cur_object->copy == object) {
3994				        /*
3995					 * Shared mapping or no COW yet.
3996					 * We can never collapse a copy
3997					 * object into its backing object.
3998					 */
3999				        need_collapse = FALSE;
4000				} else if (cur_object->copy == object->shadow &&
4001					   object->shadow->resident_page_count == 0) {
4002				        /*
4003					 * Shared mapping after a COW occurred.
4004					 */
4005				        need_collapse = FALSE;
4006				}
4007			}
4008			vm_object_unlock(cur_object);
4009
4010			if (need_collapse == FALSE)
4011			        vm_fault_collapse_skipped++;
4012			vm_fault_collapse_total++;
4013
4014			type_of_fault = DBG_COW_FAULT;
4015			VM_STAT_INCR(cow_faults);
4016			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4017			current_task()->cow_faults++;
4018
4019			goto FastPmapEnter;
4020
4021		} else {
4022			/*
4023			 * No page at cur_object, cur_offset... m == NULL
4024			 */
4025			if (cur_object->pager_created) {
4026				int	compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4027
4028			        if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4029					int		my_fault_type;
4030					int		c_flags = C_DONT_BLOCK;
4031					boolean_t	insert_cur_object = FALSE;
4032
4033				        /*
4034					 * May have to talk to a pager...
4035					 * if so, take the slow path by
4036					 * doing a 'break' from the while (TRUE) loop
4037					 *
4038					 * compressor_external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4039					 * if the compressor is active and the page exists there
4040					 */
4041					if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS)
4042						break;
4043
4044					if (map == kernel_map || real_map == kernel_map) {
4045						/*
4046						 * can't call into the compressor with the kernel_map
4047						 * lock held, since the compressor may try to operate
4048						 * on the kernel map in order to return an empty c_segment
4049						 */
4050						break;
4051					}
4052					if (object != cur_object) {
4053						if (fault_type & VM_PROT_WRITE)
4054							c_flags |= C_KEEP;
4055						else
4056							insert_cur_object = TRUE;
4057					}
4058					if (insert_cur_object == TRUE) {
4059
4060						if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4061
4062							cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4063
4064							if (vm_object_lock_upgrade(cur_object) == FALSE) {
4065								/*
4066								 * couldn't upgrade so go do a full retry
4067								 * immediately since we can no longer be
4068								 * certain about cur_object (since we
4069								 * don't hold a reference on it)...
4070								 * first drop the top object lock
4071								 */
4072								vm_object_unlock(object);
4073
4074								vm_map_unlock_read(map);
4075								if (real_map != map)
4076									vm_map_unlock(real_map);
4077
4078								goto RetryFault;
4079							}
4080						}
4081					} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4082
4083						object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4084
4085						if (object != cur_object) {
4086							/*
4087							 * we can't go for the upgrade on the top
4088							 * lock since the upgrade may block waiting
4089							 * for readers to drain... since we hold
4090							 * cur_object locked at this point, waiting
4091							 * for the readers to drain would represent
4092							 * a lock order inversion since the lock order
4093							 * for objects is the reference order in the
4094							 * shadow chain
4095							 */
4096							vm_object_unlock(object);
4097							vm_object_unlock(cur_object);
4098
4099							vm_map_unlock_read(map);
4100							if (real_map != map)
4101								vm_map_unlock(real_map);
4102
4103							goto RetryFault;
4104						}
4105						if (vm_object_lock_upgrade(object) == FALSE) {
4106							/*
4107							 * couldn't upgrade, so explicitly take the lock
4108							 * exclusively and go relookup the page since we
4109							 * will have dropped the object lock and
4110							 * a different thread could have inserted
4111							 * a page at this offset
4112							 * no need for a full retry since we're
4113							 * at the top level of the object chain
4114							 */
4115							vm_object_lock(object);
4116
4117							continue;
4118						}
4119					}
4120					m = vm_page_grab();
4121
4122					if (m == VM_PAGE_NULL) {
4123						/*
4124						 * no free page currently available...
4125						 * must take the slow path
4126						 */
4127						break;
4128					}
4129
4130					/*
4131					 * The object is and remains locked
4132					 * so no need to take a
4133					 * "paging_in_progress" reference.
4134					 */
4135					boolean_t shared_lock;
4136					if ((object == cur_object &&
4137					     object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4138					    (object != cur_object &&
4139					     cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4140						shared_lock = FALSE;
4141					} else {
4142						shared_lock = TRUE;
4143					}
4144
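					/*
					 * Decompress the page directly into the frame we just
					 * grabbed.  C_DONT_BLOCK makes the compressor return an
					 * error rather than block; in that case we release the
					 * page and fall back to the slow path.  The returned
					 * compressed_count_delta keeps the pager's compressed
					 * page accounting (and the purgeable ledgers below)
					 * accurate.
					 */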
4145					kr = vm_compressor_pager_get(
4146						cur_object->pager,
4147						(cur_offset +
4148						 cur_object->paging_offset),
4149						m->phys_page,
4150						&my_fault_type,
4151						c_flags,
4152						&compressed_count_delta);
4153
4154					vm_compressor_pager_count(
4155						cur_object->pager,
4156						compressed_count_delta,
4157						shared_lock,
4158						cur_object);
4159
4160					if (kr != KERN_SUCCESS) {
4161						vm_page_release(m);
4162						break;
4163					}
4164					m->dirty = TRUE;
4165
4166					/*
4167					 * If the object is purgeable, its
4168					 * owner's purgeable ledgers will be
4169					 * updated in vm_page_insert() but the
4170					 * page was also accounted for in a
4171					 * "compressed purgeable" ledger, so
4172					 * update that now.
4173					 */
4174					if (object != cur_object &&
4175					    !insert_cur_object) {
4176						/*
4177						 * We're not going to insert
4178						 * the decompressed page into
4179						 * the object it came from.
4180						 *
4181						 * We're dealing with a
4182						 * copy-on-write fault on
4183						 * "object".
4184						 * We're going to decompress
4185						 * the page directly into the
4186						 * target "object" while
4187						 * keeping the compressed
4188						 * page for "cur_object", so
4189						 * no ledger update in that
4190						 * case.
4191						 */
4192					} else if ((cur_object->purgable ==
4193						    VM_PURGABLE_DENY) ||
4194						   (cur_object->vo_purgeable_owner ==
4195						    NULL)) {
4196						/*
4197						 * "cur_object" is not purgeable
4198						 * or is not owned, so no
4199						 * purgeable ledgers to update.
4200						 */
4201					} else {
4202						/*
4203						 * One less compressed
4204						 * purgeable page for
4205						 * cur_object's owner.
4206						 */
4207						vm_purgeable_compressed_update(
4208							cur_object,
4209							-1);
4210					}
4211
4212					if (insert_cur_object) {
4213						vm_page_insert(m, cur_object, cur_offset);
4214					} else {
4215						vm_page_insert(m, object, offset);
4216					}
4217
4218					if ((m->object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4219						/*
4220						 * If the page is not cacheable,
4221						 * we can't let its contents
4222						 * linger in the data cache
4223						 * after the decompression.
4224						 */
4225						pmap_sync_page_attributes_phys(m->phys_page);
4226					}
4227
4228					type_of_fault = my_fault_type;
4229
4230					VM_STAT_INCR(decompressions);
4231
4232					if (cur_object != object) {
4233						if (insert_cur_object) {
4234							top_object = object;
4235							/*
4236							 * switch to the object that has the new page
4237							 */
4238							object = cur_object;
4239							object_lock_type = cur_object_lock_type;
4240						} else {
4241							vm_object_unlock(cur_object);
4242							cur_object = object;
4243						}
4244					}
4245					goto FastPmapEnter;
4246				}
4247				/*
4248				 * existence map present and indicates
4249				 * that the pager doesn't have this page
4250				 */
4251			}
4252			if (cur_object->shadow == VM_OBJECT_NULL) {
4253				/*
4254				 * Zero fill fault.  Page gets
4255				 * inserted into the original object.
4256				 */
4257				if (cur_object->shadow_severed ||
4258				    VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
4259				{
4260					if (object != cur_object)
4261					        vm_object_unlock(cur_object);
4262					vm_object_unlock(object);
4263
4264					vm_map_unlock_read(map);
4265					if (real_map != map)
4266						vm_map_unlock(real_map);
4267
4268					kr = KERN_MEMORY_ERROR;
4269					goto done;
4270				}
4271				if ((throttle_delay = vm_page_throttled())) {
4272					/*
4273					 * drop all of our locks...
4274					 * wait until the free queue is
4275					 * pumped back up and then
4276					 * redrive the fault
4277					 */
4278					if (object != cur_object)
4279						vm_object_unlock(cur_object);
4280					vm_object_unlock(object);
4281					vm_map_unlock_read(map);
4282					if (real_map != map)
4283						vm_map_unlock(real_map);
4284
4285					VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4286
4287					delay(throttle_delay);
4288
4289					if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
4290							 THREAD_UNINT :
4291							 THREAD_ABORTSAFE))
4292						goto RetryFault;
4293					kr = KERN_ABORTED;
4294					goto done;
4295				}
4296				if (vm_backing_store_low) {
4297				        /*
4298					 * we are protecting the system from
4299					 * backing store exhaustion...
4300					 * must take the slow path if we're
4301					 * not privileged
4302					 */
4303					if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
4304					        break;
4305				}
4306			  	if (cur_object != object) {
4307					vm_object_unlock(cur_object);
4308
4309					cur_object = object;
4310				}
4311				if (object_lock_type == OBJECT_LOCK_SHARED) {
4312
4313				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4314
4315					if (vm_object_lock_upgrade(object) == FALSE) {
4316					        /*
4317						 * couldn't upgrade so do a full retry on the fault
4318						 * since we dropped the object lock which
4319						 * could allow another thread to insert
4320						 * a page at this offset
4321						 */
4322					        vm_map_unlock_read(map);
4323						if (real_map != map)
4324						        vm_map_unlock(real_map);
4325
4326						goto RetryFault;
4327					}
4328				}
4329				m = vm_page_alloc(object, offset);
4330
4331				if (m == VM_PAGE_NULL) {
4332				        /*
4333					 * no free page currently available...
4334					 * must take the slow path
4335					 */
4336					break;
4337				}
4338
4339				/*
4340				 * Now zero fill page...
4341				 * the page is probably going to
4342				 * be written soon, so don't bother
4343				 * to clear the modified bit
4344				 *
4345				 *   NOTE: This code holds the map
4346				 *   lock across the zero fill.
4347				 */
4348				type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4349
4350				goto FastPmapEnter;
4351		        }
4352			/*
4353			 * On to the next level in the shadow chain
4354			 */
4355			cur_offset += cur_object->vo_shadow_offset;
4356			new_object = cur_object->shadow;
4357
4358			/*
4359			 * take the new_object's lock with the indicated state
4360			 */
4361			if (cur_object_lock_type == OBJECT_LOCK_SHARED)
4362			        vm_object_lock_shared(new_object);
4363			else
4364			        vm_object_lock(new_object);
4365
4366			if (cur_object != object)
4367				vm_object_unlock(cur_object);
4368
4369			cur_object = new_object;
4370
4371			continue;
4372		}
4373	}
4374	/*
4375	 * Cleanup from fast fault failure.  Drop any object
4376	 * lock other than original and drop map lock.
4377	 */
4378	if (object != cur_object)
4379		vm_object_unlock(cur_object);
4380
4381	/*
4382	 * must own the object lock exclusively at this point
4383	 */
4384	if (object_lock_type == OBJECT_LOCK_SHARED) {
4385	        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4386
4387		if (vm_object_lock_upgrade(object) == FALSE) {
4388		        /*
4389			 * couldn't upgrade, so explicitly
4390			 * take the lock exclusively
4391			 * no need to retry the fault at this
4392			 * point since "vm_fault_page" will
4393			 * completely re-evaluate the state
4394			 */
4395		        vm_object_lock(object);
4396		}
4397	}
4398
4399handle_copy_delay:
4400	vm_map_unlock_read(map);
4401	if (real_map != map)
4402		vm_map_unlock(real_map);
4403
4404   	/*
4405	 * Make a reference to this object to
4406	 * prevent its disposal while we are messing with
4407	 * it.  Once we have the reference, the map is free
4408	 * to be diddled.  Since objects reference their
4409	 * shadows (and copies), they will stay around as well.
4410	 */
4411	vm_object_reference_locked(object);
4412	vm_object_paging_begin(object);
4413
4414	XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
4415
4416	error_code = 0;
4417
4418	result_page = VM_PAGE_NULL;
4419	kr = vm_fault_page(object, offset, fault_type,
4420			   (change_wiring && !wired),
4421			   FALSE, /* page not looked up */
4422			   &prot, &result_page, &top_page,
4423			   &type_of_fault,
4424			   &error_code, map->no_zero_fill,
4425			   FALSE, &fault_info);
4426
4427	/*
4428	 * if kr != VM_FAULT_SUCCESS, then the paging reference
4429	 * has been dropped and the object unlocked... the ref_count
4430	 * is still held
4431	 *
4432	 * if kr == VM_FAULT_SUCCESS, then the paging reference
4433	 * is still held along with the ref_count on the original object
4434	 *
4435	 *	the object is returned locked with a paging reference
4436	 *
4437	 *	if top_page != NULL, then it's BUSY and the
4438	 *	object it belongs to has a paging reference
4439	 *	but is returned unlocked
4440	 */
4441	if (kr != VM_FAULT_SUCCESS &&
4442	    kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4443	        /*
4444		 * we didn't succeed, lose the object reference immediately.
4445		 */
4446		vm_object_deallocate(object);
4447
4448		/*
4449		 * See why we failed, and take corrective action.
4450		 */
4451		switch (kr) {
4452		case VM_FAULT_MEMORY_SHORTAGE:
4453			if (vm_page_wait((change_wiring) ?
4454					 THREAD_UNINT :
4455					 THREAD_ABORTSAFE))
4456				goto RetryFault;
4457			/*
4458			 * fall thru
4459			 */
4460		case VM_FAULT_INTERRUPTED:
4461			kr = KERN_ABORTED;
4462			goto done;
4463		case VM_FAULT_RETRY:
4464			goto RetryFault;
4465		case VM_FAULT_MEMORY_ERROR:
4466			if (error_code)
4467				kr = error_code;
4468			else
4469				kr = KERN_MEMORY_ERROR;
4470			goto done;
4471		default:
4472			panic("vm_fault: unexpected error 0x%x from "
4473			      "vm_fault_page()\n", kr);
4474		}
4475	}
4476	m = result_page;
4477
4478	if (m != VM_PAGE_NULL) {
4479		assert((change_wiring && !wired) ?
4480	   	    (top_page == VM_PAGE_NULL) :
4481	   	    ((top_page == VM_PAGE_NULL) == (m->object == object)));
4482	}
4483
4484	/*
4485	 * What to do with the resulting page from vm_fault_page
4486	 * if it doesn't get entered into the physical map:
4487	 */
4488#define RELEASE_PAGE(m)					\
4489	MACRO_BEGIN					\
4490	PAGE_WAKEUP_DONE(m);				\
4491	if (!m->active && !m->inactive && !m->throttled) {		\
4492		vm_page_lockspin_queues();				\
4493		if (!m->active && !m->inactive && !m->throttled)	\
4494			vm_page_activate(m);				\
4495		vm_page_unlock_queues();				\
4496	}								\
4497	MACRO_END
4498
4499	/*
4500	 * We must verify that the maps have not changed
4501	 * since our last lookup.
4502	 */
4503	if (m != VM_PAGE_NULL) {
4504		old_copy_object = m->object->copy;
4505		vm_object_unlock(m->object);
4506	} else {
4507		old_copy_object = VM_OBJECT_NULL;
4508		vm_object_unlock(object);
4509	}
4510
4511	/*
4512	 * no object locks are held at this point
4513	 */
4514	if ((map != original_map) || !vm_map_verify(map, &version)) {
4515		vm_object_t		retry_object;
4516		vm_object_offset_t	retry_offset;
4517		vm_prot_t		retry_prot;
4518
4519		/*
4520		 * To avoid trying to write_lock the map while another
4521		 * thread has it read_locked (in vm_map_pageable), we
4522		 * do not try for write permission.  If the page is
4523		 * still writable, we will get write permission.  If it
4524		 * is not, or has been marked needs_copy, we enter the
4525		 * mapping without write permission, and will merely
4526		 * take another fault.
4527		 */
4528		map = original_map;
4529		vm_map_lock_read(map);
4530
4531		kr = vm_map_lookup_locked(&map, vaddr,
4532					  fault_type & ~VM_PROT_WRITE,
4533					  OBJECT_LOCK_EXCLUSIVE, &version,
4534					  &retry_object, &retry_offset, &retry_prot,
4535					  &wired,
4536					  &fault_info,
4537					  &real_map);
4538		pmap = real_map->pmap;
4539
4540		if (kr != KERN_SUCCESS) {
4541			vm_map_unlock_read(map);
4542
4543			if (m != VM_PAGE_NULL) {
4544			        /*
4545				 * retake the lock so that
4546				 * we can drop the paging reference
4547				 * in vm_fault_cleanup and do the
4548				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
4549				 */
4550				vm_object_lock(m->object);
4551
4552				RELEASE_PAGE(m);
4553
4554				vm_fault_cleanup(m->object, top_page);
4555			} else {
4556			        /*
4557				 * retake the lock so that
4558				 * we can drop the paging reference
4559				 * in vm_fault_cleanup
4560				 */
4561			        vm_object_lock(object);
4562
4563			        vm_fault_cleanup(object, top_page);
4564			}
4565			vm_object_deallocate(object);
4566
4567			goto done;
4568		}
4569		vm_object_unlock(retry_object);
4570
4571		if ((retry_object != object) || (retry_offset != offset)) {
4572
4573			vm_map_unlock_read(map);
4574			if (real_map != map)
4575				vm_map_unlock(real_map);
4576
4577			if (m != VM_PAGE_NULL) {
4578			        /*
4579				 * retake the lock so that
4580				 * we can drop the paging reference
4581				 * in vm_fault_cleanup and do the
4582				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
4583				 */
4584			        vm_object_lock(m->object);
4585
4586				RELEASE_PAGE(m);
4587
4588				vm_fault_cleanup(m->object, top_page);
4589			} else {
4590			        /*
4591				 * retake the lock so that
4592				 * we can drop the paging reference
4593				 * in vm_fault_cleanup
4594				 */
4595			        vm_object_lock(object);
4596
4597			        vm_fault_cleanup(object, top_page);
4598			}
4599			vm_object_deallocate(object);
4600
4601			goto RetryFault;
4602		}
4603		/*
4604		 * Check whether the protection has changed or the object
4605		 * has been copied while we left the map unlocked.
4606		 */
4607		prot &= retry_prot;
4608	}
4609	if (m != VM_PAGE_NULL) {
4610		vm_object_lock(m->object);
4611
4612		if (m->object->copy != old_copy_object) {
4613		        /*
4614			 * The copy object changed while the top-level object
4615			 * was unlocked, so take away write permission.
4616			 */
4617			prot &= ~VM_PROT_WRITE;
4618		}
4619	} else
4620		vm_object_lock(object);
4621
4622	/*
4623	 * If we want to wire down this page, but no longer have
4624	 * adequate permissions, we must start all over.
4625	 */
4626	if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4627
4628		vm_map_verify_done(map, &version);
4629		if (real_map != map)
4630			vm_map_unlock(real_map);
4631
4632		if (m != VM_PAGE_NULL) {
4633			RELEASE_PAGE(m);
4634
4635			vm_fault_cleanup(m->object, top_page);
4636		} else
4637		        vm_fault_cleanup(object, top_page);
4638
4639		vm_object_deallocate(object);
4640
4641		goto RetryFault;
4642	}
4643	if (m != VM_PAGE_NULL) {
4644		/*
4645		 * Put this page into the physical map.
4646		 * We had to do the unlock above because pmap_enter
4647		 * may cause other faults.  The page may be on
4648		 * the pageout queues.  If the pageout daemon comes
4649		 * across the page, it will remove it from the queues.
4650		 */
4651		if (caller_pmap) {
4652			kr = vm_fault_enter(m,
4653					    caller_pmap,
4654					    caller_pmap_addr,
4655					    prot,
4656					    fault_type,
4657					    wired,
4658					    change_wiring,
4659					    fault_info.no_cache,
4660					    fault_info.cs_bypass,
4661					    fault_info.user_tag,
4662					    fault_info.pmap_options,
4663					    NULL,
4664					    &type_of_fault);
4665		} else {
4666			kr = vm_fault_enter(m,
4667					    pmap,
4668					    vaddr,
4669					    prot,
4670					    fault_type,
4671					    wired,
4672					    change_wiring,
4673					    fault_info.no_cache,
4674					    fault_info.cs_bypass,
4675					    fault_info.user_tag,
4676					    fault_info.pmap_options,
4677					    NULL,
4678					    &type_of_fault);
4679		}
4680		if (kr != KERN_SUCCESS) {
4681			/* abort this page fault */
4682			vm_map_verify_done(map, &version);
4683			if (real_map != map)
4684				vm_map_unlock(real_map);
4685			PAGE_WAKEUP_DONE(m);
4686			vm_fault_cleanup(m->object, top_page);
4687			vm_object_deallocate(object);
4688			goto done;
4689		}
4690		if (physpage_p != NULL) {
4691			/* for vm_map_wire_and_extract() */
4692			*physpage_p = m->phys_page;
4693			if (prot & VM_PROT_WRITE) {
4694				vm_object_lock_assert_exclusive(m->object);
4695				m->dirty = TRUE;
4696			}
4697		}
4698	} else {
4699
4700		vm_map_entry_t		entry;
4701		vm_map_offset_t		laddr;
4702		vm_map_offset_t		ldelta, hdelta;
4703
4704		/*
4705		 * do a pmap block mapping from the physical address
4706		 * in the object
4707		 */
4708
4709#ifdef ppc
4710		/* While we do not worry about execution protection in   */
4711		/* general, certain pages may have instruction execution */
4712		/* disallowed.  We will check here, and if not allowed   */
4713		/* to execute, we return with a protection failure.      */
4714
4715		if ((fault_type & VM_PROT_EXECUTE) &&
4716			(!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
4717
4718			vm_map_verify_done(map, &version);
4719
4720			if (real_map != map)
4721				vm_map_unlock(real_map);
4722
4723			vm_fault_cleanup(object, top_page);
4724			vm_object_deallocate(object);
4725
4726			kr = KERN_PROTECTION_FAILURE;
4727			goto done;
4728		}
4729#endif	/* ppc */
4730
4731		if (real_map != map)
4732			vm_map_unlock(real_map);
4733
4734		if (original_map != map) {
4735			vm_map_unlock_read(map);
4736			vm_map_lock_read(original_map);
4737			map = original_map;
4738		}
4739		real_map = map;
4740
4741		laddr = vaddr;
4742		hdelta = 0xFFFFF000;
4743		ldelta = 0xFFFFF000;
4744
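		/*
		 * Walk down through any submaps to the terminal map entry
		 * covering this address, tracking how far the entry extends
		 * below (ldelta) and above (hdelta) the faulting address so
		 * the block mapping set up below can cover that whole range.
		 */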
4745		while (vm_map_lookup_entry(map, laddr, &entry)) {
4746			if (ldelta > (laddr - entry->vme_start))
4747				ldelta = laddr - entry->vme_start;
4748			if (hdelta > (entry->vme_end - laddr))
4749				hdelta = entry->vme_end - laddr;
4750			if (entry->is_sub_map) {
4751
4752				laddr = (laddr - entry->vme_start)
4753							+ entry->offset;
4754				vm_map_lock_read(entry->object.sub_map);
4755
4756				if (map != real_map)
4757					vm_map_unlock_read(map);
4758				if (entry->use_pmap) {
4759					vm_map_unlock_read(real_map);
4760					real_map = entry->object.sub_map;
4761				}
4762				map = entry->object.sub_map;
4763
4764			} else {
4765				break;
4766			}
4767		}
4768
4769		if (vm_map_lookup_entry(map, laddr, &entry) &&
4770		    (entry->object.vm_object != NULL) &&
4771		    (entry->object.vm_object == object)) {
4772
4773			int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
4774
4775			if (superpage && physpage_p) {
4776				/* for vm_map_wire_and_extract() */
4777				*physpage_p = (ppnum_t) ((((vm_map_offset_t) entry->object.vm_object->vo_shadow_offset)
4778							  + entry->offset
4779							  + (laddr - entry->vme_start))
4780							 >> PAGE_SHIFT);
4781			}
4782
4783			if (caller_pmap) {
4784				/*
4785				 * Set up a block mapped area
4786				 */
4787				assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
4788				pmap_map_block(caller_pmap,
4789					       (addr64_t)(caller_pmap_addr - ldelta),
4790					       (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4791							  entry->offset + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
4792					       (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
4793					       (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4794			} else {
4795				/*
4796				 * Set up a block mapped area
4797				 */
4798				assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
4799				pmap_map_block(real_map->pmap,
4800					       (addr64_t)(vaddr - ldelta),
4801					       (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4802							  entry->offset + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
4803					       (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
4804					       (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4805			}
4806		}
4807	}
4808
4809	/*
4810	 * Unlock everything, and return
4811	 */
4812	vm_map_verify_done(map, &version);
4813	if (real_map != map)
4814		vm_map_unlock(real_map);
4815
4816	if (m != VM_PAGE_NULL) {
4817		PAGE_WAKEUP_DONE(m);
4818
4819		vm_fault_cleanup(m->object, top_page);
4820	} else
4821	        vm_fault_cleanup(object, top_page);
4822
4823	vm_object_deallocate(object);
4824
4825#undef	RELEASE_PAGE
4826
4827	kr = KERN_SUCCESS;
4828done:
4829	thread_interrupt_level(interruptible_state);
4830
4831	/*
4832	 * Only throttle on faults which cause a pagein.
4833	 */
4834	if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
4835		throttle_lowpri_io(1);
4836	}
4837
4838	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4839			      (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4840			      ((uint64_t)vaddr >> 32),
4841			      vaddr,
4842			      kr,
4843			      type_of_fault,
4844			      0);
4845
4846	return (kr);
4847}
4848
4849/*
4850 *	vm_fault_wire:
4851 *
4852 *	Wire down a range of virtual addresses in a map.
4853 */
4854kern_return_t
4855vm_fault_wire(
4856	vm_map_t	map,
4857	vm_map_entry_t	entry,
4858	pmap_t		pmap,
4859	vm_map_offset_t	pmap_addr,
4860	ppnum_t		*physpage_p)
4861{
4862
4863	register vm_map_offset_t	va;
4864	register vm_map_offset_t	end_addr = entry->vme_end;
4865	register kern_return_t	rc;
4866
4867	assert(entry->in_transition);
4868
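	/*
	 * Physically contiguous memory is wired by default, so there is
	 * nothing to simulate a fault for here (vm_fault_unwire performs
	 * the matching check).
	 */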
4869	if ((entry->object.vm_object != NULL) &&
4870	    !entry->is_sub_map &&
4871	    entry->object.vm_object->phys_contiguous) {
4872		return KERN_SUCCESS;
4873	}
4874
4875	/*
4876	 *	Inform the physical mapping system that the
4877	 *	range of addresses may not fault, so that
4878	 *	page tables and such can be locked down as well.
4879	 */
4880
4881	pmap_pageable(pmap, pmap_addr,
4882		pmap_addr + (end_addr - entry->vme_start), FALSE);
4883
4884	/*
4885	 *	We simulate a fault to get the page and enter it
4886	 *	in the physical map.
4887	 */
4888
4889	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4890		rc = vm_fault_wire_fast(map, va, entry, pmap,
4891					pmap_addr + (va - entry->vme_start),
4892					physpage_p);
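		/*
		 * The fast path couldn't handle this page; fall back to the
		 * full fault path with wiring requested.
		 */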
4893		if (rc != KERN_SUCCESS) {
4894			rc = vm_fault_internal(map, va, VM_PROT_NONE, TRUE,
4895					       ((pmap == kernel_pmap)
4896						? THREAD_UNINT
4897						: THREAD_ABORTSAFE),
4898					       pmap,
4899					       (pmap_addr +
4900						(va - entry->vme_start)),
4901					       physpage_p);
4902			DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4903		}
4904
4905		if (rc != KERN_SUCCESS) {
4906			struct vm_map_entry	tmp_entry = *entry;
4907
4908			/* unwire wired pages */
4909			tmp_entry.vme_end = va;
4910			vm_fault_unwire(map,
4911				&tmp_entry, FALSE, pmap, pmap_addr);
4912
4913			return rc;
4914		}
4915	}
4916	return KERN_SUCCESS;
4917}
4918
4919/*
4920 *	vm_fault_unwire:
4921 *
4922 *	Unwire a range of virtual addresses in a map.
4923 */
4924void
4925vm_fault_unwire(
4926	vm_map_t	map,
4927	vm_map_entry_t	entry,
4928	boolean_t	deallocate,
4929	pmap_t		pmap,
4930	vm_map_offset_t	pmap_addr)
4931{
4932	register vm_map_offset_t	va;
4933	register vm_map_offset_t	end_addr = entry->vme_end;
4934	vm_object_t		object;
4935	struct vm_object_fault_info fault_info;
4936
4937	object = (entry->is_sub_map)
4938			? VM_OBJECT_NULL : entry->object.vm_object;
4939
4940	/*
4941	 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4942	 * do anything since such memory is wired by default.  So we don't have
4943	 * anything to undo here.
4944	 */
4945
4946	if (object != VM_OBJECT_NULL && object->phys_contiguous)
4947		return;
4948
4949	fault_info.interruptible = THREAD_UNINT;
4950	fault_info.behavior = entry->behavior;
4951	fault_info.user_tag = entry->alias;
4952	fault_info.pmap_options = 0;
4953	if (entry->iokit_acct ||
4954	    (!entry->is_sub_map && !entry->use_pmap)) {
4955		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
4956	}
4957	fault_info.lo_offset = entry->offset;
4958	fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4959	fault_info.no_cache = entry->no_cache;
4960	fault_info.stealth = TRUE;
4961	fault_info.io_sync = FALSE;
4962	fault_info.cs_bypass = FALSE;
4963	fault_info.mark_zf_absent = FALSE;
4964	fault_info.batch_pmap_op = FALSE;
4965
4966	/*
4967	 *	Since the pages are wired down, we must be able to
4968	 *	get their mappings from the physical map system.
4969	 */
4970
4971	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4972
4973		if (object == VM_OBJECT_NULL) {
4974			if (pmap) {
4975				pmap_change_wiring(pmap,
4976						   pmap_addr + (va - entry->vme_start), FALSE);
4977			}
4978			(void) vm_fault(map, va, VM_PROT_NONE,
4979					TRUE, THREAD_UNINT, pmap, pmap_addr);
4980		} else {
4981		 	vm_prot_t	prot;
4982			vm_page_t	result_page;
4983			vm_page_t	top_page;
4984			vm_object_t	result_object;
4985			vm_fault_return_t result;
4986
4987			if (end_addr - va > (vm_size_t) -1) {
4988				/* 32-bit overflow */
4989				fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4990			} else {
4991				fault_info.cluster_size = (vm_size_t) (end_addr - va);
4992				assert(fault_info.cluster_size == end_addr - va);
4993			}
4994
4995			do {
4996				prot = VM_PROT_NONE;
4997
4998				vm_object_lock(object);
4999				vm_object_paging_begin(object);
5000				XPR(XPR_VM_FAULT,
5001					"vm_fault_unwire -> vm_fault_page\n",
5002					0,0,0,0,0);
5003				result_page = VM_PAGE_NULL;
5004			 	result = vm_fault_page(
5005					object,
5006					entry->offset + (va - entry->vme_start),
5007					VM_PROT_NONE, TRUE,
5008					FALSE, /* page not looked up */
5009					&prot, &result_page, &top_page,
5010					(int *)0,
5011					NULL, map->no_zero_fill,
5012					FALSE, &fault_info);
5013			} while (result == VM_FAULT_RETRY);
5014
5015			/*
5016			 * If this was a mapping to a file on a device that has been forcibly
5017			 * unmounted, then we won't get a page back from vm_fault_page().  Just
5018			 * move on to the next one in case the remaining pages are mapped from
5019			 * different objects.  During a forced unmount, the object is terminated
5020			 * so the alive flag will be false if this happens.  A forced unmount will
5021			 * occur when an external disk is unplugged before the user does an
5022			 * eject, so we don't want to panic in that situation.
5023			 */
5024
5025			if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
5026				continue;
5027
5028			if (result == VM_FAULT_MEMORY_ERROR &&
5029			    object == kernel_object) {
5030				/*
5031				 * This must have been allocated with
5032				 * KMA_KOBJECT and KMA_VAONLY and there's
5033				 * no physical page at this offset.
5034				 * We're done (no page to free).
5035				 */
5036				assert(deallocate);
5037				continue;
5038			}
5039
5040			if (result != VM_FAULT_SUCCESS)
5041				panic("vm_fault_unwire: failure");
5042
5043			result_object = result_page->object;
5044
5045			if (deallocate) {
5046				assert(result_page->phys_page !=
5047				       vm_page_fictitious_addr);
5048				pmap_disconnect(result_page->phys_page);
5049				VM_PAGE_FREE(result_page);
5050			} else {
5051				if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
5052					pmap_change_wiring(pmap,
5053					    pmap_addr + (va - entry->vme_start), FALSE);
5054
5055
5056				if (VM_PAGE_WIRED(result_page)) {
5057					vm_page_lockspin_queues();
5058					vm_page_unwire(result_page, TRUE);
5059					vm_page_unlock_queues();
5060				}
5061				if(entry->zero_wired_pages) {
5062					pmap_zero_page(result_page->phys_page);
5063					entry->zero_wired_pages = FALSE;
5064				}
5065
5066				PAGE_WAKEUP_DONE(result_page);
5067			}
5068			vm_fault_cleanup(result_object, top_page);
5069		}
5070	}
5071
5072	/*
5073	 *	Inform the physical mapping system that the range
5074	 *	of addresses may fault, so that page tables and
5075	 *	such may be unwired themselves.
5076	 */
5077
5078	pmap_pageable(pmap, pmap_addr,
5079		pmap_addr + (end_addr - entry->vme_start), TRUE);
5080
5081}
5082
5083/*
5084 *	vm_fault_wire_fast:
5085 *
5086 *	Handle common case of a wire down page fault at the given address.
5087 *	If successful, the page is inserted into the associated physical map.
5088 *	The map entry is passed in to avoid the overhead of a map lookup.
5089 *
5090 *	NOTE: the given address should be truncated to the
5091 *	proper page address.
5092 *
5093 *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
5094 *	a standard error specifying why the fault is fatal is returned.
5095 *
5096 *	The map in question must be referenced, and remains so.
5097 *	Caller has a read lock on the map.
5098 *
5099 *	This is a stripped version of vm_fault() for wiring pages.  Anything
5100 *	other than the common case will return KERN_FAILURE, and the caller
5101 *	is expected to call vm_fault().
5102 */
5103kern_return_t
5104vm_fault_wire_fast(
5105	__unused vm_map_t	map,
5106	vm_map_offset_t	va,
5107	vm_map_entry_t	entry,
5108	pmap_t		pmap,
5109	vm_map_offset_t	pmap_addr,
5110	ppnum_t		*physpage_p)
5111{
5112	vm_object_t		object;
5113	vm_object_offset_t	offset;
5114	register vm_page_t	m;
5115	vm_prot_t		prot;
5116	thread_t           	thread = current_thread();
5117	int			type_of_fault;
5118	kern_return_t		kr;
5119
5120	VM_STAT_INCR(faults);
5121
5122	if (thread != THREAD_NULL && thread->task != TASK_NULL)
5123	  thread->task->faults++;
5124
5125/*
5126 *	Recovery actions
5127 */
5128
5129#undef	RELEASE_PAGE
5130#define RELEASE_PAGE(m)	{				\
5131	PAGE_WAKEUP_DONE(m);				\
5132	vm_page_lockspin_queues();			\
5133	vm_page_unwire(m, TRUE);			\
5134	vm_page_unlock_queues();			\
5135}
5136
5137
5138#undef	UNLOCK_THINGS
5139#define UNLOCK_THINGS	{				\
5140	vm_object_paging_end(object);			   \
5141	vm_object_unlock(object);			   \
5142}
5143
5144#undef	UNLOCK_AND_DEALLOCATE
5145#define UNLOCK_AND_DEALLOCATE	{			\
5146	UNLOCK_THINGS;					\
5147	vm_object_deallocate(object);			\
5148}
5149/*
5150 *	Give up and have caller do things the hard way.
5151 */
5152
5153#define GIVE_UP {					\
5154	UNLOCK_AND_DEALLOCATE;				\
5155	return(KERN_FAILURE);				\
5156}
5157
5158
5159	/*
5160	 *	If this entry is not directly to a vm_object, bail out.
5161	 */
5162	if (entry->is_sub_map) {
5163		assert(physpage_p == NULL);
5164		return(KERN_FAILURE);
5165	}
5166
5167	/*
5168	 *	Find the backing store object and offset into it.
5169	 */
5170
5171	object = entry->object.vm_object;
5172	offset = (va - entry->vme_start) + entry->offset;
5173	prot = entry->protection;
5174
5175   	/*
5176	 *	Make a reference to this object to prevent its
5177	 *	disposal while we are messing with it.
5178	 */
5179
5180	vm_object_lock(object);
5181	vm_object_reference_locked(object);
5182	vm_object_paging_begin(object);
5183
5184	/*
5185	 *	INVARIANTS (through entire routine):
5186	 *
5187	 *	1)	At all times, we must either have the object
5188	 *		lock or a busy page in some object to prevent
5189	 *		some other thread from trying to bring in
5190	 *		the same page.
5191	 *
5192	 *	2)	Once we have a busy page, we must remove it from
5193	 *		the pageout queues, so that the pageout daemon
5194	 *		will not grab it away.
5195	 *
5196	 */
5197
5198	/*
5199	 *	Look for page in top-level object.  If it's not there or
5200	 *	there's something going on, give up.
5201	 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
5202	 * decrypt the page before wiring it down.
5203	 */
5204	m = vm_page_lookup(object, offset);
5205	if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
5206	    (m->unusual && ( m->error || m->restart || m->absent))) {
5207
5208		GIVE_UP;
5209	}
5210	ASSERT_PAGE_DECRYPTED(m);
5211
5212	if (m->fictitious &&
5213	    m->phys_page == vm_page_guard_addr) {
5214		/*
5215		 * Guard pages are fictitious pages and are never
5216		 * entered into a pmap, so let's say it's been wired...
5217		 */
5218		kr = KERN_SUCCESS;
5219		goto done;
5220	}
5221
5222	/*
5223	 *	Wire the page down now.  All bail outs beyond this
5224	 *	point must unwire the page.
5225	 */
5226
5227	vm_page_lockspin_queues();
5228	vm_page_wire(m);
5229	vm_page_unlock_queues();
5230
5231	/*
5232	 *	Mark page busy for other threads.
5233	 */
5234	assert(!m->busy);
5235	m->busy = TRUE;
5236	assert(!m->absent);
5237
5238	/*
5239	 *	Give up if the page is being written and there's a copy object
5240	 */
5241	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
5242		RELEASE_PAGE(m);
5243		GIVE_UP;
5244	}
5245
5246	/*
5247	 *	Put this page into the physical map.
5248	 */
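	/*
	 * The page is resident, so count this as a cache-hit fault and ask
	 * vm_fault_enter() to enter it wired with the entry's protection.
	 */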
5249	type_of_fault = DBG_CACHE_HIT_FAULT;
5250	kr = vm_fault_enter(m,
5251			    pmap,
5252			    pmap_addr,
5253			    prot,
5254			    prot,
5255			    TRUE,
5256			    FALSE,
5257			    FALSE,
5258			    FALSE,
5259			    entry->alias,
5260			    ((entry->iokit_acct ||
5261			      (!entry->is_sub_map && !entry->use_pmap))
5262			     ? PMAP_OPTIONS_ALT_ACCT
5263			     : 0),
5264			    NULL,
5265			    &type_of_fault);
5266
5267done:
5268	/*
5269	 *	Unlock everything, and return
5270	 */
5271
5272	if (physpage_p) {
5273		/* for vm_map_wire_and_extract() */
5274		if (kr == KERN_SUCCESS) {
5275			*physpage_p = m->phys_page;
5276			if (prot & VM_PROT_WRITE) {
5277				vm_object_lock_assert_exclusive(m->object);
5278				m->dirty = TRUE;
5279			}
5280		} else {
5281			*physpage_p = 0;
5282		}
5283	}
5284
5285	PAGE_WAKEUP_DONE(m);
5286	UNLOCK_AND_DEALLOCATE;
5287
5288	return kr;
5289
5290}
5291
5292/*
5293 *	Routine:	vm_fault_copy_cleanup
5294 *	Purpose:
5295 *		Release a page used by vm_fault_copy.
5296 */
5297
5298void
5299vm_fault_copy_cleanup(
5300	vm_page_t	page,
5301	vm_page_t	top_page)
5302{
5303	vm_object_t	object = page->object;
5304
5305	vm_object_lock(object);
5306	PAGE_WAKEUP_DONE(page);
5307	if (!page->active && !page->inactive && !page->throttled) {
5308		vm_page_lockspin_queues();
5309		if (!page->active && !page->inactive && !page->throttled)
5310			vm_page_activate(page);
5311		vm_page_unlock_queues();
5312	}
5313	vm_fault_cleanup(object, top_page);
5314}
5315
5316void
5317vm_fault_copy_dst_cleanup(
5318	vm_page_t	page)
5319{
5320	vm_object_t	object;
5321
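	/*
	 * Undo the wiring and the paging reference that vm_fault_copy()
	 * took on the destination page and its object.
	 */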
5322	if (page != VM_PAGE_NULL) {
5323		object = page->object;
5324		vm_object_lock(object);
5325		vm_page_lockspin_queues();
5326		vm_page_unwire(page, TRUE);
5327		vm_page_unlock_queues();
5328		vm_object_paging_end(object);
5329		vm_object_unlock(object);
5330	}
5331}
5332
5333/*
5334 *	Routine:	vm_fault_copy
5335 *
5336 *	Purpose:
5337 *		Copy pages from one virtual memory object to another --
5338 *		neither the source nor destination pages need be resident.
5339 *
5340 *		Before actually copying a page, the version associated with
5341 *		the destination address map will be verified.
5342 *
5343 *	In/out conditions:
5344 *		The caller must hold a reference, but not a lock, to
5345 *		each of the source and destination objects and to the
5346 *		destination map.
5347 *
5348 *	Results:
5349 *		Returns KERN_SUCCESS if no errors were encountered in
5350 *		reading or writing the data.  Returns KERN_INTERRUPTED if
5351 *		the operation was interrupted (only possible if the
5352 *		"interruptible" argument is asserted).  Other return values
5353 *		indicate a permanent error in copying the data.
5354 *
5355 *		The actual amount of data copied will be returned in the
5356 *		"copy_size" argument.  In the event that the destination map
5357 *		verification failed, this amount may be less than the amount
5358 *		requested.
5359 */
5360kern_return_t
5361vm_fault_copy(
5362	vm_object_t		src_object,
5363	vm_object_offset_t	src_offset,
5364	vm_map_size_t		*copy_size,		/* INOUT */
5365	vm_object_t		dst_object,
5366	vm_object_offset_t	dst_offset,
5367	vm_map_t		dst_map,
5368	vm_map_version_t	 *dst_version,
5369	int			interruptible)
5370{
5371	vm_page_t		result_page;
5372
5373	vm_page_t		src_page;
5374	vm_page_t		src_top_page;
5375	vm_prot_t		src_prot;
5376
5377	vm_page_t		dst_page;
5378	vm_page_t		dst_top_page;
5379	vm_prot_t		dst_prot;
5380
5381	vm_map_size_t		amount_left;
5382	vm_object_t		old_copy_object;
5383	kern_return_t		error = 0;
5384	vm_fault_return_t	result;
5385
5386	vm_map_size_t		part_size;
5387	struct vm_object_fault_info fault_info_src;
5388	struct vm_object_fault_info fault_info_dst;
5389
5390	/*
5391	 * In order not to confuse the clustered pageins, align
5392	 * the different offsets on a page boundary.
5393	 */
5394
5395#define	RETURN(x)					\
5396	MACRO_BEGIN					\
5397	*copy_size -= amount_left;			\
5398	MACRO_RETURN(x);				\
5399	MACRO_END
5400
5401	amount_left = *copy_size;
5402
5403	fault_info_src.interruptible = interruptible;
5404	fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5405	fault_info_src.user_tag  = 0;
5406	fault_info_src.pmap_options = 0;
5407	fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5408	fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5409	fault_info_src.no_cache   = FALSE;
5410	fault_info_src.stealth = TRUE;
5411	fault_info_src.io_sync = FALSE;
5412	fault_info_src.cs_bypass = FALSE;
5413	fault_info_src.mark_zf_absent = FALSE;
5414	fault_info_src.batch_pmap_op = FALSE;
5415
5416	fault_info_dst.interruptible = interruptible;
5417	fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5418	fault_info_dst.user_tag  = 0;
5419	fault_info_dst.pmap_options = 0;
5420	fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5421	fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5422	fault_info_dst.no_cache   = FALSE;
5423	fault_info_dst.stealth = TRUE;
5424	fault_info_dst.io_sync = FALSE;
5425	fault_info_dst.cs_bypass = FALSE;
5426	fault_info_dst.mark_zf_absent = FALSE;
5427	fault_info_dst.batch_pmap_op = FALSE;
5428
5429	do { /* while (amount_left > 0) */
5430		/*
5431		 * There may be a deadlock if the source and destination
5432		 * pages are the same.  To avoid this deadlock, the copy must
5433		 * start by getting the destination page, so that any
5434		 * copy-on-write (COW) semantics are applied first.
5435		 */
5436
5437	RetryDestinationFault: ;
5438
5439		dst_prot = VM_PROT_WRITE|VM_PROT_READ;
5440
5441		vm_object_lock(dst_object);
5442		vm_object_paging_begin(dst_object);
5443
5444		if (amount_left > (vm_size_t) -1) {
5445			/* 32-bit overflow */
5446			fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5447		} else {
5448			fault_info_dst.cluster_size = (vm_size_t) amount_left;
5449			assert(fault_info_dst.cluster_size == amount_left);
5450		}
5451
5452		XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
5453		dst_page = VM_PAGE_NULL;
5454		result = vm_fault_page(dst_object,
5455				       vm_object_trunc_page(dst_offset),
5456				       VM_PROT_WRITE|VM_PROT_READ,
5457				       FALSE,
5458				       FALSE, /* page not looked up */
5459				       &dst_prot, &dst_page, &dst_top_page,
5460				       (int *)0,
5461				       &error,
5462				       dst_map->no_zero_fill,
5463				       FALSE, &fault_info_dst);
5464		switch (result) {
5465		case VM_FAULT_SUCCESS:
5466			break;
5467		case VM_FAULT_RETRY:
5468			goto RetryDestinationFault;
5469		case VM_FAULT_MEMORY_SHORTAGE:
5470			if (vm_page_wait(interruptible))
5471				goto RetryDestinationFault;
5472			/* fall thru */
5473		case VM_FAULT_INTERRUPTED:
5474			RETURN(MACH_SEND_INTERRUPTED);
5475		case VM_FAULT_SUCCESS_NO_VM_PAGE:
5476			/* success but no VM page: fail the copy */
5477			vm_object_paging_end(dst_object);
5478			vm_object_unlock(dst_object);
5479			/*FALLTHROUGH*/
5480		case VM_FAULT_MEMORY_ERROR:
5481			if (error)
5482				return (error);
5483			else
5484				return(KERN_MEMORY_ERROR);
5485		default:
5486			panic("vm_fault_copy: unexpected error 0x%x from "
5487			      "vm_fault_page()\n", result);
5488		}
5489		assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5490
5491		old_copy_object = dst_page->object->copy;
5492
5493		/*
5494		 * There exists the possibility that the source and
5495		 * destination page are the same.  But we can't
5496		 * easily determine that now.  If they are the same,
5497		 * the upcoming call to vm_fault_page() for the source
5498		 * page would deadlock on the busy destination page.
5499		 * To prevent this we wire the page so we can drop busy
5500		 * without having the page daemon steal the page.  We
5501		 * clean up the top page but keep the paging reference
5502		 * on the object holding the dest page so it doesn't go away.
5503		 */
5504
5505		vm_page_lockspin_queues();
5506		vm_page_wire(dst_page);
5507		vm_page_unlock_queues();
5508		PAGE_WAKEUP_DONE(dst_page);
5509		vm_object_unlock(dst_page->object);
5510
5511		if (dst_top_page != VM_PAGE_NULL) {
5512			vm_object_lock(dst_object);
5513			VM_PAGE_FREE(dst_top_page);
5514			vm_object_paging_end(dst_object);
5515			vm_object_unlock(dst_object);
5516		}
5517
5518	RetrySourceFault: ;
5519
5520		if (src_object == VM_OBJECT_NULL) {
5521			/*
5522			 *	No source object.  We will just
5523			 *	zero-fill the page in dst_object.
5524			 */
5525			src_page = VM_PAGE_NULL;
5526			result_page = VM_PAGE_NULL;
5527		} else {
5528			vm_object_lock(src_object);
5529			src_page = vm_page_lookup(src_object,
5530						  vm_object_trunc_page(src_offset));
5531			if (src_page == dst_page) {
5532				src_prot = dst_prot;
5533				result_page = VM_PAGE_NULL;
5534			} else {
5535				src_prot = VM_PROT_READ;
5536				vm_object_paging_begin(src_object);
5537
5538				if (amount_left > (vm_size_t) -1) {
5539					/* 32-bit overflow */
5540					fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5541				} else {
5542					fault_info_src.cluster_size = (vm_size_t) amount_left;
5543					assert(fault_info_src.cluster_size == amount_left);
5544				}
5545
5546				XPR(XPR_VM_FAULT,
5547					"vm_fault_copy(2) -> vm_fault_page\n",
5548					0,0,0,0,0);
5549				result_page = VM_PAGE_NULL;
5550				result = vm_fault_page(
5551					src_object,
5552					vm_object_trunc_page(src_offset),
5553					VM_PROT_READ, FALSE,
5554					FALSE, /* page not looked up */
5555					&src_prot,
5556					&result_page, &src_top_page,
5557					(int *)0, &error, FALSE,
5558					FALSE, &fault_info_src);
5559
5560				switch (result) {
5561				case VM_FAULT_SUCCESS:
5562					break;
5563				case VM_FAULT_RETRY:
5564					goto RetrySourceFault;
5565				case VM_FAULT_MEMORY_SHORTAGE:
5566					if (vm_page_wait(interruptible))
5567						goto RetrySourceFault;
5568					/* fall thru */
5569				case VM_FAULT_INTERRUPTED:
5570					vm_fault_copy_dst_cleanup(dst_page);
5571					RETURN(MACH_SEND_INTERRUPTED);
5572				case VM_FAULT_SUCCESS_NO_VM_PAGE:
5573					/* success but no VM page: fail */
5574					vm_object_paging_end(src_object);
5575					vm_object_unlock(src_object);
5576					/*FALLTHROUGH*/
5577				case VM_FAULT_MEMORY_ERROR:
5578					vm_fault_copy_dst_cleanup(dst_page);
5579					if (error)
5580						return (error);
5581					else
5582						return(KERN_MEMORY_ERROR);
5583				default:
5584					panic("vm_fault_copy(2): unexpected "
5585					      "error 0x%x from "
5586					      "vm_fault_page()\n", result);
5587				}
5588
5589
5590				assert((src_top_page == VM_PAGE_NULL) ==
5591				       (result_page->object == src_object));
5592			}
5593			assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
5594			vm_object_unlock(result_page->object);
5595		}
5596
5597		if (!vm_map_verify(dst_map, dst_version)) {
5598			if (result_page != VM_PAGE_NULL && src_page != dst_page)
5599				vm_fault_copy_cleanup(result_page, src_top_page);
5600			vm_fault_copy_dst_cleanup(dst_page);
5601			break;
5602		}
5603
5604		vm_object_lock(dst_page->object);
5605
5606		if (dst_page->object->copy != old_copy_object) {
5607			vm_object_unlock(dst_page->object);
5608			vm_map_verify_done(dst_map, dst_version);
5609			if (result_page != VM_PAGE_NULL && src_page != dst_page)
5610				vm_fault_copy_cleanup(result_page, src_top_page);
5611			vm_fault_copy_dst_cleanup(dst_page);
5612			break;
5613		}
5614		vm_object_unlock(dst_page->object);
5615
5616		/*
5617		 *	Copy the page, and note that it is dirty
5618		 *	immediately.
5619		 */
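		/*
		 *	Worked example of the sizing below (assuming 4 KB
		 *	pages): src_offset 0x1200 gives src_po 0x200 and
		 *	dst_offset 0x3600 gives dst_po 0x600, so
		 *	part_size = PAGE_SIZE - 0x600 = 0xa00; if fewer
		 *	bytes than that remain, part_size is clamped to
		 *	amount_left.
		 */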
5620
5621		if (!page_aligned(src_offset) ||
5622			!page_aligned(dst_offset) ||
5623			!page_aligned(amount_left)) {
5624
5625			vm_object_offset_t	src_po,
5626						dst_po;
5627
5628			src_po = src_offset - vm_object_trunc_page(src_offset);
5629			dst_po = dst_offset - vm_object_trunc_page(dst_offset);
5630
5631			if (dst_po > src_po) {
5632				part_size = PAGE_SIZE - dst_po;
5633			} else {
5634				part_size = PAGE_SIZE - src_po;
5635			}
5636			if (part_size > (amount_left)){
5637				part_size = amount_left;
5638			}
5639
5640			if (result_page == VM_PAGE_NULL) {
5641				assert((vm_offset_t) dst_po == dst_po);
5642				assert((vm_size_t) part_size == part_size);
5643				vm_page_part_zero_fill(dst_page,
5644						       (vm_offset_t) dst_po,
5645						       (vm_size_t) part_size);
5646			} else {
5647				assert((vm_offset_t) src_po == src_po);
5648				assert((vm_offset_t) dst_po == dst_po);
5649				assert((vm_size_t) part_size == part_size);
5650				vm_page_part_copy(result_page,
5651						  (vm_offset_t) src_po,
5652						  dst_page,
5653						  (vm_offset_t) dst_po,
5654						  (vm_size_t)part_size);
5655				if(!dst_page->dirty){
5656					vm_object_lock(dst_object);
5657					SET_PAGE_DIRTY(dst_page, TRUE);
5658					vm_object_unlock(dst_page->object);
5659				}
5660
5661			}
5662		} else {
5663			part_size = PAGE_SIZE;
5664
5665			if (result_page == VM_PAGE_NULL)
5666				vm_page_zero_fill(dst_page);
5667			else{
5668				vm_object_lock(result_page->object);
5669				vm_page_copy(result_page, dst_page);
5670				vm_object_unlock(result_page->object);
5671
5672				if(!dst_page->dirty){
5673					vm_object_lock(dst_object);
5674					SET_PAGE_DIRTY(dst_page, TRUE);
5675					vm_object_unlock(dst_page->object);
5676				}
5677			}
5678
5679		}
5680
5681		/*
5682		 *	Unlock everything, and return
5683		 */
5684
5685		vm_map_verify_done(dst_map, dst_version);
5686
5687		if (result_page != VM_PAGE_NULL && src_page != dst_page)
5688			vm_fault_copy_cleanup(result_page, src_top_page);
5689		vm_fault_copy_dst_cleanup(dst_page);
5690
5691		amount_left -= part_size;
5692		src_offset += part_size;
5693		dst_offset += part_size;
5694	} while (amount_left > 0);
5695
5696	RETURN(KERN_SUCCESS);
5697#undef	RETURN
5698
5699	/*NOTREACHED*/
5700}
5701
5702#if	VM_FAULT_CLASSIFY
5703/*
5704 *	Temporary statistics gathering support.
5705 */
5706
5707/*
5708 *	Statistics arrays:
5709 */
5710#define VM_FAULT_TYPES_MAX	5
5711#define	VM_FAULT_LEVEL_MAX	8
5712
5713int	vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
5714
5715#define	VM_FAULT_TYPE_ZERO_FILL	0
5716#define	VM_FAULT_TYPE_MAP_IN	1
5717#define	VM_FAULT_TYPE_PAGER	2
5718#define	VM_FAULT_TYPE_COPY	3
5719#define	VM_FAULT_TYPE_OTHER	4
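/*
 *	A minimal sketch of how the table might be dumped from a debugger
 *	hook (hypothetical helper, not part of this file):
 *
 *		void
 *		vm_fault_classify_dump(void)
 *		{
 *			int type, level;
 *
 *			for (type = 0; type < VM_FAULT_TYPES_MAX; type++)
 *				for (level = 0; level < VM_FAULT_LEVEL_MAX; level++)
 *					if (vm_fault_stats[type][level])
 *						printf("type %d level %d: %d\n",
 *						       type, level,
 *						       vm_fault_stats[type][level]);
 *		}
 */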
5720
5721
5722void
5723vm_fault_classify(vm_object_t		object,
5724		  vm_object_offset_t	offset,
5725		  vm_prot_t		fault_type)
5726{
5727	int		type, level = 0;
5728	vm_page_t	m;
5729
5730	while (TRUE) {
5731		m = vm_page_lookup(object, offset);
5732		if (m != VM_PAGE_NULL) {
5733		        if (m->busy || m->error || m->restart || m->absent) {
5734				type = VM_FAULT_TYPE_OTHER;
5735				break;
5736			}
5737			if (((fault_type & VM_PROT_WRITE) == 0) ||
5738			    ((level == 0) && object->copy == VM_OBJECT_NULL)) {
5739				type = VM_FAULT_TYPE_MAP_IN;
5740				break;
5741			}
5742			type = VM_FAULT_TYPE_COPY;
5743			break;
5744		}
5745		else {
5746			if (object->pager_created) {
5747				type = VM_FAULT_TYPE_PAGER;
5748				break;
5749			}
5750			if (object->shadow == VM_OBJECT_NULL) {
5751				type = VM_FAULT_TYPE_ZERO_FILL;
5752				break;
5753		        }
5754
5755			offset += object->vo_shadow_offset;
5756			object = object->shadow;
5757			level++;
5758			continue;
5759		}
5760	}
5761
5762	if (level > VM_FAULT_LEVEL_MAX)
5763		level = VM_FAULT_LEVEL_MAX;
5764
5765	vm_fault_stats[type][level] += 1;
5766
5767	return;
5768}
5769
5770/* cleanup routine to call from debugger */
5771
5772void
5773vm_fault_classify_init(void)
5774{
5775	int type, level;
5776
5777	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
5778		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
5779			vm_fault_stats[type][level] = 0;
5780		}
5781	}
5782
5783	return;
5784}
5785#endif	/* VM_FAULT_CLASSIFY */
5786
5787
5788void
5789vm_page_validate_cs_mapped(
5790	vm_page_t	page,
5791	const void 	*kaddr)
5792{
5793	vm_object_t		object;
5794	vm_object_offset_t	offset;
5795	kern_return_t		kr;
5796	memory_object_t		pager;
5797	void			*blobs;
5798	boolean_t		validated, tainted;
5799
5800	assert(page->busy);
5801	vm_object_lock_assert_exclusive(page->object);
5802
5803	if (!cs_validation) {
5804		return;
5805	}
5806
5807	if (page->wpmapped && !page->cs_tainted) {
5808		/*
5809		 * This page was mapped for "write" access sometime in the
5810		 * past and could still be modifiable in the future.
5811		 * Consider it tainted.
5812		 * [ If the page was already found to be "tainted", no
5813		 * need to re-validate. ]
5814		 */
5815		page->cs_validated = TRUE;
5816		page->cs_tainted = TRUE;
5817		if (cs_debug) {
5818			printf("CODESIGNING: vm_page_validate_cs: "
5819			       "page %p obj %p off 0x%llx "
5820			       "was modified\n",
5821			       page, page->object, page->offset);
5822		}
5823		vm_cs_validated_dirtied++;
5824	}
5825
5826	if (page->cs_validated) {
5827		return;
5828	}
5829
5830	vm_cs_validates++;
5831
5832	object = page->object;
5833	assert(object->code_signed);
5834	offset = page->offset;
5835
5836	if (!object->alive || object->terminating || object->pager == NULL) {
5837		/*
5838		 * The object is dead or terminating, or we don't have its
5839		 * pager, so we can't validate the data...
5840		 */
5841		return;
5842	}
5843	/*
5844	 * Since we get here to validate a page that was brought in by
5845	 * the pager, we know that this pager is all set up and ready
5846	 * by now.
5847	 */
5848	assert(!object->internal);
5849	assert(object->pager != NULL);
5850	assert(object->pager_ready);
5851
5852	pager = object->pager;
5853	assert(object->paging_in_progress);
5854	kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5855	if (kr != KERN_SUCCESS) {
5856		blobs = NULL;
5857	}
5858
5859	/* verify the SHA1 hash for this page */
5860	validated = cs_validate_page(blobs,
5861				     pager,
5862				     offset + object->paging_offset,
5863				     (const void *)kaddr,
5864				     &tainted);
5865
5866	page->cs_validated = validated;
5867	if (validated) {
5868		page->cs_tainted = tainted;
5869	}
5870}
5871
5872void
5873vm_page_validate_cs(
5874	vm_page_t	page)
5875{
5876	vm_object_t		object;
5877	vm_object_offset_t	offset;
5878	vm_map_offset_t		koffset;
5879	vm_map_size_t		ksize;
5880	vm_offset_t		kaddr;
5881	kern_return_t		kr;
5882	boolean_t		busy_page;
5883	boolean_t		need_unmap;
5884
5885	vm_object_lock_assert_held(page->object);
5886
5887	if (!cs_validation) {
5888		return;
5889	}
5890
5891	if (page->wpmapped && !page->cs_tainted) {
5892		vm_object_lock_assert_exclusive(page->object);
5893
5894		/*
5895		 * This page was mapped for "write" access sometime in the
5896		 * past and could still be modifiable in the future.
5897		 * Consider it tainted.
5898		 * [ If the page was already found to be "tainted", no
5899		 * need to re-validate. ]
5900		 */
5901		page->cs_validated = TRUE;
5902		page->cs_tainted = TRUE;
5903		if (cs_debug) {
5904			printf("CODESIGNING: vm_page_validate_cs: "
5905			       "page %p obj %p off 0x%llx "
5906			       "was modified\n",
5907			       page, page->object, page->offset);
5908		}
5909		vm_cs_validated_dirtied++;
5910	}
5911
5912	if (page->cs_validated) {
5913		return;
5914	}
5915
5916	if (page->slid) {
5917		panic("vm_page_validate_cs(%p): page is slid\n", page);
5918	}
5919	assert(!page->slid);
5920
5921#if CHECK_CS_VALIDATION_BITMAP
5922	if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5923		page->cs_validated = TRUE;
5924		page->cs_tainted = FALSE;
5925		vm_cs_bitmap_validated++;
5926		return;
5927	}
5928#endif
5929	vm_object_lock_assert_exclusive(page->object);
5930
5931	object = page->object;
5932	assert(object->code_signed);
5933	offset = page->offset;
5934
5935	busy_page = page->busy;
5936	if (!busy_page) {
5937		/* keep page busy while we map (and unlock) the VM object */
5938		page->busy = TRUE;
5939	}
5940
5941	/*
5942	 * Take a paging reference on the VM object
5943	 * to protect it from collapse or bypass,
5944	 * and keep it from disappearing too.
5945	 */
5946	vm_object_paging_begin(object);
5947
5948	/* map the page in the kernel address space */
5949	ksize = PAGE_SIZE_64;
5950	koffset = 0;
5951	need_unmap = FALSE;
5952	kr = vm_paging_map_object(page,
5953				  object,
5954				  offset,
5955				  VM_PROT_READ,
5956				  FALSE, /* can't unlock object ! */
5957				  &ksize,
5958				  &koffset,
5959				  &need_unmap);
5960	if (kr != KERN_SUCCESS) {
5961		panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5962	}
5963	kaddr = CAST_DOWN(vm_offset_t, koffset);
5964
5965	/* validate the mapped page */
5966	vm_page_validate_cs_mapped(page, (const void *) kaddr);
5967
5968#if CHECK_CS_VALIDATION_BITMAP
5969	if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5970		vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5971	}
5972#endif
5973	assert(page->busy);
5974	assert(object == page->object);
5975	vm_object_lock_assert_exclusive(object);
5976
5977	if (!busy_page) {
5978		PAGE_WAKEUP_DONE(page);
5979	}
5980	if (need_unmap) {
5981		/* unmap the page from the kernel address space */
5982		vm_paging_unmap_object(object, koffset, koffset + ksize);
5983		koffset = 0;
5984		ksize = 0;
5985		kaddr = 0;
5986	}
5987	vm_object_paging_end(object);
5988}
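/*
 *	A hedged sketch of how a fault path might drive the validation
 *	above (illustrative only; real call sites apply additional
 *	conditions and enforcement policy, and "m" is just a hypothetical
 *	vm_page_t):
 *
 *		vm_object_lock(m->object);		// exclusive lock
 *		if (m->object->code_signed && !m->cs_validated) {
 *			vm_page_validate_cs(m);		// maps, hashes, unmaps
 *		}
 *		if (m->cs_tainted) {
 *			// refuse the mapping, per code-signing policy
 *		}
 *		vm_object_unlock(m->object);
 */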
5989