1/*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 *	File:	vm_fault.c
60 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 *	Page fault handling module.
63 */
64
65#include <mach_cluster_stats.h>
66#include <mach_pagemap.h>
67#include <libkern/OSAtomic.h>
68
69#include <mach/mach_types.h>
70#include <mach/kern_return.h>
71#include <mach/message.h>	/* for error codes */
72#include <mach/vm_param.h>
73#include <mach/vm_behavior.h>
74#include <mach/memory_object.h>
75				/* For memory_object_data_{request,unlock} */
76#include <mach/sdt.h>
77
78#include <kern/kern_types.h>
79#include <kern/host_statistics.h>
80#include <kern/counters.h>
81#include <kern/task.h>
82#include <kern/thread.h>
83#include <kern/sched_prim.h>
84#include <kern/host.h>
85#include <kern/xpr.h>
86#include <kern/mach_param.h>
87#include <kern/macro_help.h>
88#include <kern/zalloc.h>
89#include <kern/misc_protos.h>
90
91#include <vm/vm_compressor.h>
92#include <vm/vm_compressor_pager.h>
93#include <vm/vm_fault.h>
94#include <vm/vm_map.h>
95#include <vm/vm_object.h>
96#include <vm/vm_page.h>
97#include <vm/vm_kern.h>
98#include <vm/pmap.h>
99#include <vm/vm_pageout.h>
100#include <vm/vm_protos.h>
101#include <vm/vm_external.h>
102#include <vm/memory_object.h>
103#include <vm/vm_purgeable_internal.h>	/* Needed by some vm_page.h macros */
104#include <vm/vm_shared_region.h>
105
106#include <sys/codesign.h>
107
108#include <libsa/sys/timers.h>	/* for struct timespec */
109
110#define VM_FAULT_CLASSIFY	0
111
112#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
113
114int	vm_object_pagein_throttle = 16;
115
/*
 * We apply a hard throttle to the demand-zero rate of tasks that we believe are running out of control;
 * it kicks in when swap space runs out.  64-bit programs have massive address spaces and, if they're buggy,
 * can leak enormous amounts of memory and run the system completely out of swap space.  If this happens, we
 * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 * keep the UI active so that the user has a chance to kill the offending task before the system
 * completely hangs.
 *
 * The hard throttle is only applied when the system is nearly completely out of swap space and only applies
 * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 * will be throttled.  The throttling is done by making the thread that's trying to demand-zero a page wait
 * for HARD_THROTTLE_DELAY microseconds before being allowed to retry the page fault.
 */
129
130extern void throttle_lowpri_io(int);
131
132uint64_t vm_hard_throttle_threshold;
133
134
135
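/*
 * A task is a candidate for the hard throttle when it is not the kernel task and its resident size
 * exceeds one fifth of the available non-compressed memory, and in addition the system is either low
 * on space or this thread is already I/O-throttled while the free page count is below the throttle limit.
 */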
136#define NEED_TO_HARD_THROTTLE_THIS_TASK() 	((current_task() != kernel_task && \
137						  get_task_resident_size(current_task()) > (((AVAILABLE_NON_COMPRESSED_MEMORY) * PAGE_SIZE) / 5)) && \
138						 (vm_low_on_space() || (vm_page_free_count < vm_page_throttle_limit && \
139						                        proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED )))
140
141
142
143#define HARD_THROTTLE_DELAY	20000	/* 20000 us == 20 ms */
144#define SOFT_THROTTLE_DELAY	2000	/* 2000 us == 2 ms */
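/*
 * These delays are what vm_page_throttled() returns (in microseconds) and what vm_fault_check()
 * ultimately hands to delay() before letting the faulting thread retry.
 */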
145
146boolean_t current_thread_aborted(void);
147
148/* Forward declarations of internal routines. */
149extern kern_return_t vm_fault_wire_fast(
150				vm_map_t	map,
151				vm_map_offset_t	va,
152				vm_map_entry_t	entry,
153				pmap_t		pmap,
154				vm_map_offset_t	pmap_addr);
155
156extern void vm_fault_continue(void);
157
158extern void vm_fault_copy_cleanup(
159				vm_page_t	page,
160				vm_page_t	top_page);
161
162extern void vm_fault_copy_dst_cleanup(
163				vm_page_t	page);
164
165#if	VM_FAULT_CLASSIFY
166extern void vm_fault_classify(vm_object_t	object,
167			  vm_object_offset_t	offset,
168			  vm_prot_t		fault_type);
169
170extern void vm_fault_classify_init(void);
171#endif
172
173unsigned long vm_pmap_enter_blocked = 0;
174unsigned long vm_pmap_enter_retried = 0;
175
176unsigned long vm_cs_validates = 0;
177unsigned long vm_cs_revalidates = 0;
178unsigned long vm_cs_query_modified = 0;
179unsigned long vm_cs_validated_dirtied = 0;
180unsigned long vm_cs_bitmap_validated = 0;
181
182/*
183 *	Routine:	vm_fault_init
184 *	Purpose:
185 *		Initialize our private data structures.
186 */
187void
188vm_fault_init(void)
189{
190	int i, vm_compressor_temp;
191	boolean_t need_default_val = TRUE;
	/*
	 * Choose a value for the hard throttle threshold based on the amount of RAM.  The threshold is
	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
	 * the amount of memory: it runs between 10% and 35%.  We use 35% for small-memory systems and
	 * reduce the value down to 10% for very large memory configurations.  This gives us a definition
	 * of a memory hog that makes more sense relative to the amount of RAM in the machine.
	 * The formula below simply uses the number of gigabytes of RAM to adjust the percentage.
	 */
200
201	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
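
	/*
	 * Worked example (illustrative only): with 4 GB of RAM the percentage is 35 - MIN(4, 25) = 31%,
	 * so the threshold is roughly 1.24 GB; with 32 GB it is 35 - 25 = 10%, i.e. about 3.2 GB.
	 */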
202
203	/*
204	 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
205	 */
206
207	if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof (vm_compressor_temp))) {
208		for ( i = 0; i < VM_PAGER_MAX_MODES; i++) {
209			if (vm_compressor_temp > 0 &&
210			    ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) {
211				need_default_val = FALSE;
212				vm_compressor_mode = vm_compressor_temp;
213				break;
214			}
215		}
216		if (need_default_val)
217			printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
218	}
219	if (need_default_val) {
220		/* If no boot arg or incorrect boot arg, try device tree. */
221		PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
222	}
223	PE_parse_boot_argn("vm_compressor_threads", &vm_compressor_thread_count, sizeof (vm_compressor_thread_count));
224	printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
225}
226
227/*
228 *	Routine:	vm_fault_cleanup
229 *	Purpose:
230 *		Clean up the result of vm_fault_page.
231 *	Results:
232 *		The paging reference for "object" is released.
233 *		"object" is unlocked.
234 *		If "top_page" is not null,  "top_page" is
235 *		freed and the paging reference for the object
236 *		containing it is released.
237 *
238 *	In/out conditions:
239 *		"object" must be locked.
240 */
241void
242vm_fault_cleanup(
243	register vm_object_t	object,
244	register vm_page_t	top_page)
245{
246	vm_object_paging_end(object);
247 	vm_object_unlock(object);
248
249	if (top_page != VM_PAGE_NULL) {
250	        object = top_page->object;
251
252		vm_object_lock(object);
253		VM_PAGE_FREE(top_page);
254		vm_object_paging_end(object);
255		vm_object_unlock(object);
256	}
257}
258
259#if	MACH_CLUSTER_STATS
260#define MAXCLUSTERPAGES 16
261struct {
262	unsigned long pages_in_cluster;
263	unsigned long pages_at_higher_offsets;
264	unsigned long pages_at_lower_offsets;
265} cluster_stats_in[MAXCLUSTERPAGES];
266#define CLUSTER_STAT(clause)	clause
267#define CLUSTER_STAT_HIGHER(x)	\
268	((cluster_stats_in[(x)].pages_at_higher_offsets)++)
269#define CLUSTER_STAT_LOWER(x)	\
270	 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
271#define CLUSTER_STAT_CLUSTER(x)	\
272	((cluster_stats_in[(x)].pages_in_cluster)++)
273#else	/* MACH_CLUSTER_STATS */
274#define CLUSTER_STAT(clause)
275#endif	/* MACH_CLUSTER_STATS */
276
277#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
278
279
280boolean_t	vm_page_deactivate_behind = TRUE;
281/*
282 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
283 */
284#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW	128
285#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER	16		/* don't make this too big... */
286                                                                /* we use it to size an array on the stack */
287
288int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
289
290#define MAX_SEQUENTIAL_RUN	(1024 * 1024 * 1024)
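
/*
 * object->sequential tracks the length, in bytes, of the current run of sequential accesses:
 * it grows positive for a forward run, negative for a reverse run, and is capped at
 * +/- MAX_SEQUENTIAL_RUN by vm_fault_is_sequential() below.
 */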
291
292/*
 * vm_fault_is_sequential
294 *
295 * Determine if sequential access is in progress
296 * in accordance with the behavior specified.
297 * Update state to indicate current access pattern.
298 *
299 * object must have at least the shared lock held
300 */
301static
302void
303vm_fault_is_sequential(
304	vm_object_t		object,
305	vm_object_offset_t	offset,
306	vm_behavior_t		behavior)
307{
308        vm_object_offset_t	last_alloc;
309	int			sequential;
310	int			orig_sequential;
311
312        last_alloc = object->last_alloc;
313	sequential = object->sequential;
314	orig_sequential = sequential;
315
316	switch (behavior) {
317	case VM_BEHAVIOR_RANDOM:
318	        /*
319		 * reset indicator of sequential behavior
320		 */
321	        sequential = 0;
322	        break;
323
324	case VM_BEHAVIOR_SEQUENTIAL:
325	        if (offset && last_alloc == offset - PAGE_SIZE_64) {
326		        /*
327			 * advance indicator of sequential behavior
328			 */
329		        if (sequential < MAX_SEQUENTIAL_RUN)
330			        sequential += PAGE_SIZE;
331		} else {
332		        /*
333			 * reset indicator of sequential behavior
334			 */
335		        sequential = 0;
336		}
337	        break;
338
339	case VM_BEHAVIOR_RSEQNTL:
340	        if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
341		        /*
342			 * advance indicator of sequential behavior
343			 */
344		        if (sequential > -MAX_SEQUENTIAL_RUN)
345			        sequential -= PAGE_SIZE;
346		} else {
347		        /*
348			 * reset indicator of sequential behavior
349			 */
350		        sequential = 0;
351		}
352	        break;
353
354	case VM_BEHAVIOR_DEFAULT:
355	default:
356	        if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
357		        /*
358			 * advance indicator of sequential behavior
359			 */
360		        if (sequential < 0)
361			        sequential = 0;
362		        if (sequential < MAX_SEQUENTIAL_RUN)
363			        sequential += PAGE_SIZE;
364
365		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
366		        /*
367			 * advance indicator of sequential behavior
368			 */
369		        if (sequential > 0)
370			        sequential = 0;
371		        if (sequential > -MAX_SEQUENTIAL_RUN)
372			        sequential -= PAGE_SIZE;
373		} else {
374		        /*
375			 * reset indicator of sequential behavior
376			 */
377		        sequential = 0;
378		}
379	        break;
380	}
381	if (sequential != orig_sequential) {
382	        if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
383		        /*
384			 * if someone else has already updated object->sequential
385			 * don't bother trying to update it or object->last_alloc
386			 */
387		        return;
388		}
389	}
390	/*
	 * I'd like to do this with an OSCompareAndSwap64, but that
392	 * doesn't exist for PPC...  however, it shouldn't matter
393	 * that much... last_alloc is maintained so that we can determine
394	 * if a sequential access pattern is taking place... if only
395	 * one thread is banging on this object, no problem with the unprotected
396	 * update... if 2 or more threads are banging away, we run the risk of
397	 * someone seeing a mangled update... however, in the face of multiple
398	 * accesses, no sequential access pattern can develop anyway, so we
399	 * haven't lost any real info.
400	 */
401	object->last_alloc = offset;
402}
403
404
405int vm_page_deactivate_behind_count = 0;
406
407/*
 * vm_fault_deactivate_behind
409 *
410 * Determine if sequential access is in progress
411 * in accordance with the behavior specified.  If
412 * so, compute a potential page to deactivate and
413 * deactivate it.
414 *
415 * object must be locked.
416 *
417 * return TRUE if we actually deactivate a page
418 */
419static
420boolean_t
421vm_fault_deactivate_behind(
422	vm_object_t		object,
423	vm_object_offset_t	offset,
424	vm_behavior_t		behavior)
425{
426	int		n;
427	int		pages_in_run = 0;
428	int		max_pages_in_run = 0;
429	int		sequential_run;
430	int		sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
431	vm_object_offset_t	run_offset = 0;
432	vm_object_offset_t	pg_offset = 0;
433	vm_page_t	m;
434	vm_page_t	page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
435
436	pages_in_run = 0;
437#if TRACEFAULTPAGE
438	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind);	/* (TEST/DEBUG) */
439#endif
440
441	if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
		/*
		 * Do not deactivate pages from the kernel object: they
		 * are not intended to become pageable.  Also bail out if
		 * the deactivate-behind mechanism has been disabled.
		 */
447		return FALSE;
448	}
449	if ((sequential_run = object->sequential)) {
450		  if (sequential_run < 0) {
451		          sequential_behavior = VM_BEHAVIOR_RSEQNTL;
452			  sequential_run = 0 - sequential_run;
453		  } else {
454		          sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
455		  }
456	}
457	switch (behavior) {
458	case VM_BEHAVIOR_RANDOM:
459		break;
460	case VM_BEHAVIOR_SEQUENTIAL:
461	        if (sequential_run >= (int)PAGE_SIZE) {
462			run_offset = 0 - PAGE_SIZE_64;
463			max_pages_in_run = 1;
464		}
465		break;
466	case VM_BEHAVIOR_RSEQNTL:
467	        if (sequential_run >= (int)PAGE_SIZE) {
468			run_offset = PAGE_SIZE_64;
469			max_pages_in_run = 1;
470		}
471		break;
472	case VM_BEHAVIOR_DEFAULT:
473	default:
474	{	vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
475
476	        /*
		 * determine if the run of sequential access has been
478		 * long enough on an object with default access behavior
479		 * to consider it for deactivation
480		 */
481		if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
482			/*
483			 * the comparisons between offset and behind are done
484			 * in this kind of odd fashion in order to prevent wrap around
485			 * at the end points
486			 */
487		        if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
488			        if (offset >= behind) {
489					run_offset = 0 - behind;
490					pg_offset = PAGE_SIZE_64;
491					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
492				}
493			} else {
494			        if (offset < -behind) {
495					run_offset = behind;
496					pg_offset = 0 - PAGE_SIZE_64;
497					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
498				}
499			}
500		}
501		break;
502	}
503	}
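	/*
	 * At this point run_offset, pg_offset and max_pages_in_run describe the window of
	 * previously accessed pages, trailing the sequential run, that are candidates for
	 * deactivation.
	 */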
504        for (n = 0; n < max_pages_in_run; n++) {
505		m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
506
507		if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
508			page_run[pages_in_run++] = m;
509
510			/*
511			 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
512			 *
513			 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
514			 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
			 * new reference happens. If no further references happen on the page after that remote TLB flushes
516			 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
517			 * by pageout_scan, which is just fine since the last reference would have happened quite far
518			 * in the past (TLB caches don't hang around for very long), and of course could just as easily
519			 * have happened before we did the deactivate_behind.
520			 */
521			pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
522		}
523	}
524	if (pages_in_run) {
525		vm_page_lockspin_queues();
526
527		for (n = 0; n < pages_in_run; n++) {
528
529			m = page_run[n];
530
531			vm_page_deactivate_internal(m, FALSE);
532
533			vm_page_deactivate_behind_count++;
534#if TRACEFAULTPAGE
535			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
536#endif
537		}
538		vm_page_unlock_queues();
539
540		return TRUE;
541	}
542	return FALSE;
543}
544
545
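/*
 * vm_page_throttled:
 *
 * Decide whether the current thread's page creation rate warrants throttling.
 * Returns 0 if no throttling is needed, otherwise the delay to impose on the
 * caller (HARD_THROTTLE_DELAY or SOFT_THROTTLE_DELAY, in microseconds).
 * Threads marked TH_OPT_VMPRIV are never throttled.
 */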
546static int
547vm_page_throttled(void)
548{
549        clock_sec_t     elapsed_sec;
550        clock_sec_t     tv_sec;
551        clock_usec_t    tv_usec;
552
553	thread_t thread = current_thread();
554
555	if (thread->options & TH_OPT_VMPRIV)
556		return (0);
557
558	thread->t_page_creation_count++;
559
560	if (NEED_TO_HARD_THROTTLE_THIS_TASK())
561		return (HARD_THROTTLE_DELAY);
562
563	if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
564	    thread->t_page_creation_count > vm_page_creation_throttle) {
565
566		clock_get_system_microtime(&tv_sec, &tv_usec);
567
568		elapsed_sec = tv_sec - thread->t_page_creation_time;
569
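		/*
		 * Throttle if the pages were accumulated within the last 6 seconds, or if the
		 * thread's average creation rate (pages per second) is still at least
		 * vm_page_creation_throttle / 6.
		 */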
570		if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
571
572			if (elapsed_sec >= 60) {
573				/*
574				 * we'll reset our stats to give a well behaved app
575				 * that was unlucky enough to accumulate a bunch of pages
576				 * over a long period of time a chance to get out of
577				 * the throttled state... we reset the counter and timestamp
578				 * so that if it stays under the rate limit for the next second
579				 * it will be back in our good graces... if it exceeds it, it
580				 * will remain in the throttled state
581				 */
582				thread->t_page_creation_time = tv_sec;
583				thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
584			}
585			++vm_page_throttle_count;
586
587			if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED())
588				return (HARD_THROTTLE_DELAY);
589			else
590				return (SOFT_THROTTLE_DELAY);
591		}
592		thread->t_page_creation_time = tv_sec;
593		thread->t_page_creation_count = 0;
594	}
595	return (0);
596}
597
598
/*
 * check for various conditions that would
 * prevent us from creating a ZF (zero-fill) page...
 * the cleanup performed here assumes we were called from vm_fault_page
 *
 * object must be locked
 * object == m->object
 */
607static vm_fault_return_t
608vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
609{
610	int throttle_delay;
611
612        if (object->shadow_severed ||
613	    VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
614	        /*
615		 * Either:
616		 * 1. the shadow chain was severed,
617		 * 2. the purgeable object is volatile or empty and is marked
618		 *    to fault on access while volatile.
619		 * Just have to return an error at this point
620		 */
621	        if (m != VM_PAGE_NULL)
622		        VM_PAGE_FREE(m);
623		vm_fault_cleanup(object, first_m);
624
625		thread_interrupt_level(interruptible_state);
626
627		return (VM_FAULT_MEMORY_ERROR);
628	}
629	if (vm_backing_store_low) {
	        /*
		 * If we are protecting the system from
		 * backing store exhaustion, sleep unless
		 * we are privileged.
		 */
635	        if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
636
637			if (m != VM_PAGE_NULL)
638			        VM_PAGE_FREE(m);
639			vm_fault_cleanup(object, first_m);
640
641		        assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
642
643			thread_block(THREAD_CONTINUE_NULL);
644			thread_interrupt_level(interruptible_state);
645
646			return (VM_FAULT_RETRY);
647		}
648	}
649	if ((throttle_delay = vm_page_throttled())) {
650	        /*
651		 * we're throttling zero-fills...
652		 * treat this as if we couldn't grab a page
653		 */
654	        if (m != VM_PAGE_NULL)
655		        VM_PAGE_FREE(m);
656		vm_fault_cleanup(object, first_m);
657
658		VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
659
660		delay(throttle_delay);
661
662		if (current_thread_aborted()) {
663			thread_interrupt_level(interruptible_state);
664			return VM_FAULT_INTERRUPTED;
665		}
666		thread_interrupt_level(interruptible_state);
667
668		return (VM_FAULT_MEMORY_SHORTAGE);
669	}
670	return (VM_FAULT_SUCCESS);
671}
672
673
674/*
675 * do the work to zero fill a page and
676 * inject it into the correct paging queue
677 *
678 * m->object must be locked
679 * page queue lock must NOT be held
680 */
681static int
682vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
683{
684        int my_fault = DBG_ZERO_FILL_FAULT;
685
	/*
	 * This is a zero-fill page fault...
	 *
	 * Checking the page lock is a waste of
	 * time; this page was absent, so
	 * it can't be page locked by a pager.
	 *
	 * We also consider it undefined
	 * with respect to instruction
	 * execution, i.e. it is the responsibility
	 * of higher layers to call for an instruction
	 * sync after changing the contents and before
	 * sending a program into this area.  We
	 * choose this approach for performance.
	 */
701	m->pmapped = TRUE;
702
703	m->cs_validated = FALSE;
704	m->cs_tainted = FALSE;
705
706	if (no_zero_fill == TRUE) {
707		my_fault = DBG_NZF_PAGE_FAULT;
708	} else {
709		vm_page_zero_fill(m);
710
711		VM_STAT_INCR(zero_fill_count);
712		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
713	}
714	assert(!m->laundry);
715	assert(m->object != kernel_object);
716	//assert(m->pageq.next == NULL && m->pageq.prev == NULL);
717
718	if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
719		(m->object->purgable == VM_PURGABLE_DENY ||
720		 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
721		 m->object->purgable == VM_PURGABLE_VOLATILE )) {
722
723		vm_page_lockspin_queues();
724
725		if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
726			assert(!VM_PAGE_WIRED(m));
727
728			/*
729			 * can't be on the pageout queue since we don't
730			 * have a pager to try and clean to
731			 */
732			assert(!m->pageout_queue);
733
734			VM_PAGE_QUEUES_REMOVE(m);
735
736			queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
737			m->throttled = TRUE;
738			vm_page_throttled_count++;
739		}
740		vm_page_unlock_queues();
741	}
742	return (my_fault);
743}
744
745
746/*
747 *	Routine:	vm_fault_page
748 *	Purpose:
749 *		Find the resident page for the virtual memory
750 *		specified by the given virtual memory object
751 *		and offset.
752 *	Additional arguments:
 *		The required permissions for the page are given
754 *		in "fault_type".  Desired permissions are included
755 *		in "protection".
756 *		fault_info is passed along to determine pagein cluster
757 *		limits... it contains the expected reference pattern,
758 *		cluster size if available, etc...
759 *
760 *		If the desired page is known to be resident (for
761 *		example, because it was previously wired down), asserting
762 *		the "unwiring" parameter will speed the search.
763 *
764 *		If the operation can be interrupted (by thread_abort
765 *		or thread_terminate), then the "interruptible"
766 *		parameter should be asserted.
767 *
768 *	Results:
769 *		The page containing the proper data is returned
770 *		in "result_page".
771 *
772 *	In/out conditions:
773 *		The source object must be locked and referenced,
774 *		and must donate one paging reference.  The reference
775 *		is not affected.  The paging reference and lock are
776 *		consumed.
777 *
778 *		If the call succeeds, the object in which "result_page"
779 *		resides is left locked and holding a paging reference.
780 *		If this is not the original object, a busy page in the
781 *		original object is returned in "top_page", to prevent other
782 *		callers from pursuing this same data, along with a paging
783 *		reference for the original object.  The "top_page" should
784 *		be destroyed when this guarantee is no longer required.
785 *		The "result_page" is also left busy.  It is not removed
786 *		from the pageout queues.
787 *	Special Case:
788 *		A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
789 *		fault succeeded but there's no VM page (i.e. the VM object
790 * 		does not actually hold VM pages, but device memory or
791 *		large pages).  The object is still locked and we still hold a
792 *		paging_in_progress reference.
793 */
794unsigned int vm_fault_page_blocked_access = 0;
795unsigned int vm_fault_page_forced_retry = 0;
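
/*
 * Illustrative sketch only (not compiled): the minimal calling sequence implied by the
 * contract described above.  "vm_fault_page_example" is a hypothetical caller; real callers
 * layer map/pmap handling, retry loops and VM_FAULT_SUCCESS_NO_PAGE handling on top of this.
 */
#if 0
static kern_return_t
vm_fault_page_example(vm_object_t object, vm_object_offset_t offset)
{
	vm_page_t			result_page, top_page;
	vm_prot_t			prot = VM_PROT_READ;
	kern_return_t			error_code;
	vm_fault_return_t		result;
	struct vm_object_fault_info	fault_info;

	bzero(&fault_info, sizeof (fault_info));
	fault_info.interruptible = THREAD_UNINT;

	vm_object_lock(object);
	vm_object_paging_begin(object);		/* donate the paging reference */

	result = vm_fault_page(object, offset, VM_PROT_READ,
			       FALSE,		/* must_be_resident */
			       FALSE,		/* caller_lookup */
			       &prot, &result_page, &top_page,
			       NULL,		/* type_of_fault */
			       &error_code,
			       FALSE,		/* no_zero_fill */
			       FALSE,		/* data_supply */
			       &fault_info);

	if (result != VM_FAULT_SUCCESS) {
		/*
		 * on failure the object lock and paging reference have been consumed
		 * (VM_FAULT_SUCCESS_NO_PAGE, which this sketch ignores, is the exception)
		 */
		return KERN_FAILURE;
	}
	/*
	 * use "result_page" (left busy, its object locked with a paging reference),
	 * then wake it up and let vm_fault_cleanup() drop the reference, the lock
	 * and any "top_page" that was left busy in the original object.
	 */
	PAGE_WAKEUP_DONE(result_page);
	vm_fault_cleanup(result_page->object, top_page);

	return KERN_SUCCESS;
}
#endif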
796
797vm_fault_return_t
798vm_fault_page(
799	/* Arguments: */
800	vm_object_t	first_object,	/* Object to begin search */
801	vm_object_offset_t first_offset,	/* Offset into object */
802	vm_prot_t	fault_type,	/* What access is requested */
803	boolean_t	must_be_resident,/* Must page be resident? */
804	boolean_t	caller_lookup,	/* caller looked up page */
805	/* Modifies in place: */
806	vm_prot_t	*protection,	/* Protection for mapping */
807	vm_page_t	*result_page,	/* Page found, if successful */
808	/* Returns: */
809	vm_page_t	*top_page,	/* Page in top object, if
810					 * not result_page.  */
811	int             *type_of_fault, /* if non-null, fill in with type of fault
812					 * COW, zero-fill, etc... returned in trace point */
813	/* More arguments: */
814	kern_return_t	*error_code,	/* code if page is in error */
815	boolean_t	no_zero_fill,	/* don't zero fill absent pages */
816	boolean_t	data_supply,	/* treat as data_supply if
817					 * it is a write fault and a full
818					 * page is provided */
819	vm_object_fault_info_t fault_info)
820{
821	vm_page_t		m;
822	vm_object_t		object;
823	vm_object_offset_t	offset;
824	vm_page_t		first_m;
825	vm_object_t		next_object;
826	vm_object_t		copy_object;
827	boolean_t		look_for_page;
828	boolean_t		force_fault_retry = FALSE;
829	vm_prot_t		access_required = fault_type;
830	vm_prot_t		wants_copy_flag;
831	CLUSTER_STAT(int pages_at_higher_offsets;)
832	CLUSTER_STAT(int pages_at_lower_offsets;)
833	kern_return_t		wait_result;
834	boolean_t		interruptible_state;
835	boolean_t		data_already_requested = FALSE;
836	vm_behavior_t		orig_behavior;
837	vm_size_t		orig_cluster_size;
838	vm_fault_return_t	error;
839	int			my_fault;
840	uint32_t		try_failed_count;
	int			interruptible; /* how may the fault be interrupted? */
842	int			external_state = VM_EXTERNAL_STATE_UNKNOWN;
843	memory_object_t		pager;
844	vm_fault_return_t	retval;
845
846/*
847 * MACH page map - an optional optimization where a bit map is maintained
848 * by the VM subsystem for internal objects to indicate which pages of
849 * the object currently reside on backing store.  This existence map
850 * duplicates information maintained by the vnode pager.  It is
851 * created at the time of the first pageout against the object, i.e.
 * at the same time the pager for the object is created.  The optimization
853 * is designed to eliminate pager interaction overhead, if it is
854 * 'known' that the page does not exist on backing store.
855 *
856 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
857 * either marked as paged out in the existence map for the object or no
858 * existence map exists for the object.  MUST_ASK_PAGER() is one of the
859 * criteria in the decision to invoke the pager.   It is also used as one
860 * of the criteria to terminate the scan for adjacent pages in a clustered
861 * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
862 * permanent objects.  Note also that if the pager for an internal object
863 * has not been created, the pager is not invoked regardless of the value
864 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
865 * for which a pager has been created.
866 *
867 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 * is marked as paged out in the existence map for the object.
 * PAGED_OUT() is used to determine if a page has already been pushed
870 * into a copy object in order to avoid a redundant page out operation.
871 */
872#if MACH_PAGEMAP
873#define MUST_ASK_PAGER(o, f, s)					\
874	((vm_external_state_get((o)->existence_map, (f))	\
875	  != VM_EXTERNAL_STATE_ABSENT) &&			\
876	 (s = (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))))	\
877	 != VM_EXTERNAL_STATE_ABSENT)
878#define PAGED_OUT(o, f)						\
879	((vm_external_state_get((o)->existence_map, (f))	\
880	  == VM_EXTERNAL_STATE_EXISTS) ||			\
881	 (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))		\
882	  == VM_EXTERNAL_STATE_EXISTS))
883#else /* MACH_PAGEMAP */
884#define MUST_ASK_PAGER(o, f, s)					\
885	((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
886#define PAGED_OUT(o, f) \
887	(VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
888#endif /* MACH_PAGEMAP */
889
890/*
891 *	Recovery actions
892 */
893#define RELEASE_PAGE(m)					\
894	MACRO_BEGIN					\
895	PAGE_WAKEUP_DONE(m);				\
896	if (!m->active && !m->inactive && !m->throttled) {		\
897		vm_page_lockspin_queues();				\
898		if (!m->active && !m->inactive && !m->throttled) {	\
899			if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)	\
900                                vm_page_deactivate(m);                  \
901                        else						\
902				vm_page_activate(m);			\
903		}							\
904		vm_page_unlock_queues();				\
905	}								\
906	MACRO_END
907
908#if TRACEFAULTPAGE
909	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset);	/* (TEST/DEBUG) */
910#endif
911
912	interruptible = fault_info->interruptible;
913	interruptible_state = thread_interrupt_level(interruptible);
914
915	/*
916	 *	INVARIANTS (through entire routine):
917	 *
918	 *	1)	At all times, we must either have the object
919	 *		lock or a busy page in some object to prevent
920	 *		some other thread from trying to bring in
921	 *		the same page.
922	 *
923	 *		Note that we cannot hold any locks during the
924	 *		pager access or when waiting for memory, so
925	 *		we use a busy page then.
926	 *
927	 *	2)	To prevent another thread from racing us down the
928	 *		shadow chain and entering a new page in the top
929	 *		object before we do, we must keep a busy page in
930	 *		the top object while following the shadow chain.
931	 *
932	 *	3)	We must increment paging_in_progress on any object
933	 *		for which we have a busy page before dropping
934	 *		the object lock
935	 *
936	 *	4)	We leave busy pages on the pageout queues.
937	 *		If the pageout daemon comes across a busy page,
938	 *		it will remove the page from the pageout queues.
939	 */
940
941	object = first_object;
942	offset = first_offset;
943	first_m = VM_PAGE_NULL;
944	access_required = fault_type;
945
946
947	XPR(XPR_VM_FAULT,
948		"vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
949		object, offset, fault_type, *protection, 0);
950
951	/*
952	 * default type of fault
953	 */
954	my_fault = DBG_CACHE_HIT_FAULT;
955
956	while (TRUE) {
957#if TRACEFAULTPAGE
958		dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
959#endif
960		if (!object->alive) {
		        /*
			 * object is no longer valid;
			 * clean up and return an error
			 */
965			vm_fault_cleanup(object, first_m);
966			thread_interrupt_level(interruptible_state);
967
968			return (VM_FAULT_MEMORY_ERROR);
969		}
970
971		if (!object->pager_created && object->phys_contiguous) {
972			/*
973			 * A physically-contiguous object without a pager:
974			 * must be a "large page" object.  We do not deal
975			 * with VM pages for this object.
976			 */
977			caller_lookup = FALSE;
978			m = VM_PAGE_NULL;
979			goto phys_contig_object;
980		}
981
982		if (object->blocked_access) {
983			/*
984			 * Access to this VM object has been blocked.
985			 * Replace our "paging_in_progress" reference with
986			 * a "activity_in_progress" reference and wait for
987			 * access to be unblocked.
988			 */
989			caller_lookup = FALSE; /* no longer valid after sleep */
990			vm_object_activity_begin(object);
991			vm_object_paging_end(object);
992			while (object->blocked_access) {
993				vm_object_sleep(object,
994						VM_OBJECT_EVENT_UNBLOCKED,
995						THREAD_UNINT);
996			}
997			vm_fault_page_blocked_access++;
998			vm_object_paging_begin(object);
999			vm_object_activity_end(object);
1000		}
1001
1002		/*
1003		 * See whether the page at 'offset' is resident
1004		 */
1005		if (caller_lookup == TRUE) {
1006			/*
1007			 * The caller has already looked up the page
1008			 * and gave us the result in "result_page".
1009			 * We can use this for the first lookup but
1010			 * it loses its validity as soon as we unlock
1011			 * the object.
1012			 */
1013			m = *result_page;
1014			caller_lookup = FALSE; /* no longer valid after that */
1015		} else {
1016			m = vm_page_lookup(object, offset);
1017		}
1018#if TRACEFAULTPAGE
1019		dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
1020#endif
1021		if (m != VM_PAGE_NULL) {
1022
1023			if (m->busy) {
1024			        /*
1025				 * The page is being brought in,
1026				 * wait for it and then retry.
1027				 */
1028#if TRACEFAULTPAGE
1029				dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1030#endif
1031				wait_result = PAGE_SLEEP(object, m, interruptible);
1032
1033				XPR(XPR_VM_FAULT,
1034				    "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1035				    object, offset,
1036				    m, 0, 0);
1037				counter(c_vm_fault_page_block_busy_kernel++);
1038
1039				if (wait_result != THREAD_AWAKENED) {
1040					vm_fault_cleanup(object, first_m);
1041					thread_interrupt_level(interruptible_state);
1042
1043					if (wait_result == THREAD_RESTART)
1044						return (VM_FAULT_RETRY);
1045					else
1046						return (VM_FAULT_INTERRUPTED);
1047				}
1048				continue;
1049			}
1050			if (m->laundry) {
1051				m->pageout = FALSE;
1052
1053				if (!m->cleaning)
1054					vm_pageout_steal_laundry(m, FALSE);
1055			}
1056			if (m->phys_page == vm_page_guard_addr) {
1057				/*
1058				 * Guard page: off limits !
1059				 */
1060				if (fault_type == VM_PROT_NONE) {
1061					/*
1062					 * The fault is not requesting any
1063					 * access to the guard page, so it must
1064					 * be just to wire or unwire it.
1065					 * Let's pretend it succeeded...
1066					 */
1067					m->busy = TRUE;
1068					*result_page = m;
1069					assert(first_m == VM_PAGE_NULL);
1070					*top_page = first_m;
1071					if (type_of_fault)
1072						*type_of_fault = DBG_GUARD_FAULT;
1073					thread_interrupt_level(interruptible_state);
1074					return VM_FAULT_SUCCESS;
1075				} else {
1076					/*
1077					 * The fault requests access to the
1078					 * guard page: let's deny that !
1079					 */
1080					vm_fault_cleanup(object, first_m);
1081					thread_interrupt_level(interruptible_state);
1082					return VM_FAULT_MEMORY_ERROR;
1083				}
1084			}
1085
1086			if (m->error) {
1087			        /*
1088				 * The page is in error, give up now.
1089				 */
1090#if TRACEFAULTPAGE
1091				dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);	/* (TEST/DEBUG) */
1092#endif
1093				if (error_code)
1094				        *error_code = KERN_MEMORY_ERROR;
1095				VM_PAGE_FREE(m);
1096
1097				vm_fault_cleanup(object, first_m);
1098				thread_interrupt_level(interruptible_state);
1099
1100				return (VM_FAULT_MEMORY_ERROR);
1101			}
1102			if (m->restart) {
			        /*
				 * The pager wants us to restart
				 * at the top of the chain,
				 * typically because it has moved the
				 * page to another pager; do so.
				 */
1109#if TRACEFAULTPAGE
1110				dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1111#endif
1112				VM_PAGE_FREE(m);
1113
1114				vm_fault_cleanup(object, first_m);
1115				thread_interrupt_level(interruptible_state);
1116
1117				return (VM_FAULT_RETRY);
1118			}
1119			if (m->absent) {
1120			        /*
1121				 * The page isn't busy, but is absent,
1122				 * therefore it's deemed "unavailable".
1123				 *
1124				 * Remove the non-existent page (unless it's
1125				 * in the top object) and move on down to the
1126				 * next object (if there is one).
1127				 */
1128#if TRACEFAULTPAGE
1129				dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);	/* (TEST/DEBUG) */
1130#endif
1131				next_object = object->shadow;
1132
1133				if (next_object == VM_OBJECT_NULL) {
1134					/*
1135					 * Absent page at bottom of shadow
1136					 * chain; zero fill the page we left
1137					 * busy in the first object, and free
1138					 * the absent page.
1139					 */
1140					assert(!must_be_resident);
1141
					/*
					 * check for any conditions that prevent
					 * us from creating a new zero-fill page;
					 * vm_fault_check will do all of the
					 * fault cleanup in the case of an error condition,
					 * including resetting the thread_interrupt_level
					 */
1149					error = vm_fault_check(object, m, first_m, interruptible_state);
1150
1151					if (error != VM_FAULT_SUCCESS)
1152					        return (error);
1153
1154					XPR(XPR_VM_FAULT,
1155					    "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1156						object, offset,
1157						m,
1158						first_object, 0);
1159
1160					if (object != first_object) {
1161					        /*
1162						 * free the absent page we just found
1163						 */
1164						VM_PAGE_FREE(m);
1165
1166						/*
1167						 * drop reference and lock on current object
1168						 */
1169						vm_object_paging_end(object);
1170						vm_object_unlock(object);
1171
1172						/*
1173						 * grab the original page we
1174						 * 'soldered' in place and
1175						 * retake lock on 'first_object'
1176						 */
1177						m = first_m;
1178						first_m = VM_PAGE_NULL;
1179
1180						object = first_object;
1181						offset = first_offset;
1182
1183						vm_object_lock(object);
1184					} else {
1185					        /*
1186						 * we're going to use the absent page we just found
1187						 * so convert it to a 'busy' page
1188						 */
1189					        m->absent = FALSE;
1190						m->busy = TRUE;
1191					}
1192					/*
1193					 * zero-fill the page and put it on
1194					 * the correct paging queue
1195					 */
1196					my_fault = vm_fault_zero_page(m, no_zero_fill);
1197
1198					if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1199						m->absent = TRUE;
1200
1201					break;
1202				} else {
1203					if (must_be_resident)
1204						vm_object_paging_end(object);
1205					else if (object != first_object) {
1206						vm_object_paging_end(object);
1207						VM_PAGE_FREE(m);
1208					} else {
1209						first_m = m;
1210						m->absent = FALSE;
1211						m->busy = TRUE;
1212
1213						vm_page_lockspin_queues();
1214
1215						assert(!m->pageout_queue);
1216						VM_PAGE_QUEUES_REMOVE(m);
1217
1218						vm_page_unlock_queues();
1219					}
1220					XPR(XPR_VM_FAULT,
1221					    "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1222						object, offset,
1223						next_object,
1224						offset+object->vo_shadow_offset,0);
1225
1226					offset += object->vo_shadow_offset;
1227					fault_info->lo_offset += object->vo_shadow_offset;
1228					fault_info->hi_offset += object->vo_shadow_offset;
1229					access_required = VM_PROT_READ;
1230
1231					vm_object_lock(next_object);
1232					vm_object_unlock(object);
1233					object = next_object;
1234					vm_object_paging_begin(object);
1235
1236					/*
1237					 * reset to default type of fault
1238					 */
1239					my_fault = DBG_CACHE_HIT_FAULT;
1240
1241					continue;
1242				}
1243			}
1244			if ((m->cleaning)
1245			    && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1246			    && (fault_type & VM_PROT_WRITE)) {
1247				/*
1248				 * This is a copy-on-write fault that will
1249				 * cause us to revoke access to this page, but
1250				 * this page is in the process of being cleaned
1251				 * in a clustered pageout. We must wait until
1252				 * the cleaning operation completes before
1253				 * revoking access to the original page,
1254				 * otherwise we might attempt to remove a
1255				 * wired mapping.
1256				 */
1257#if TRACEFAULTPAGE
1258				dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);	/* (TEST/DEBUG) */
1259#endif
1260				XPR(XPR_VM_FAULT,
1261				    "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1262					object, offset,
1263					m, 0, 0);
1264				/*
1265				 * take an extra ref so that object won't die
1266				 */
1267				vm_object_reference_locked(object);
1268
1269				vm_fault_cleanup(object, first_m);
1270
1271				counter(c_vm_fault_page_block_backoff_kernel++);
1272				vm_object_lock(object);
1273				assert(object->ref_count > 0);
1274
1275				m = vm_page_lookup(object, offset);
1276
1277				if (m != VM_PAGE_NULL && m->cleaning) {
1278					PAGE_ASSERT_WAIT(m, interruptible);
1279
1280					vm_object_unlock(object);
1281					wait_result = thread_block(THREAD_CONTINUE_NULL);
1282					vm_object_deallocate(object);
1283
1284					goto backoff;
1285				} else {
1286					vm_object_unlock(object);
1287
1288					vm_object_deallocate(object);
1289					thread_interrupt_level(interruptible_state);
1290
1291					return (VM_FAULT_RETRY);
1292				}
1293			}
1294			if (type_of_fault == NULL && m->speculative &&
1295			    !(fault_info != NULL && fault_info->stealth)) {
1296			        /*
1297				 * If we were passed a non-NULL pointer for
				 * "type_of_fault", then we came from
1299				 * vm_fault... we'll let it deal with
1300				 * this condition, since it
1301				 * needs to see m->speculative to correctly
1302				 * account the pageins, otherwise...
1303				 * take it off the speculative queue, we'll
1304				 * let the caller of vm_fault_page deal
1305				 * with getting it onto the correct queue
1306				 *
1307				 * If the caller specified in fault_info that
1308				 * it wants a "stealth" fault, we also leave
1309				 * the page in the speculative queue.
1310				 */
1311			        vm_page_lockspin_queues();
1312				if (m->speculative)
1313					VM_PAGE_QUEUES_REMOVE(m);
1314			        vm_page_unlock_queues();
1315			}
1316
1317			if (m->encrypted) {
1318				/*
1319				 * ENCRYPTED SWAP:
1320				 * the user needs access to a page that we
1321				 * encrypted before paging it out.
1322				 * Decrypt the page now.
1323				 * Keep it busy to prevent anyone from
1324				 * accessing it during the decryption.
1325				 */
1326				m->busy = TRUE;
1327				vm_page_decrypt(m, 0);
1328				assert(object == m->object);
1329				assert(m->busy);
1330				PAGE_WAKEUP_DONE(m);
1331
1332				/*
1333				 * Retry from the top, in case
1334				 * something changed while we were
1335				 * decrypting.
1336				 */
1337				continue;
1338			}
1339			ASSERT_PAGE_DECRYPTED(m);
1340
1341			if (m->object->code_signed) {
1342				/*
1343				 * CODE SIGNING:
1344				 * We just paged in a page from a signed
1345				 * memory object but we don't need to
				 * validate it now.  We'll validate it
				 * when it gets mapped into a user address
1348				 * space for the first time or when the page
1349				 * gets copied to another object as a result
1350				 * of a copy-on-write.
1351				 */
1352			}
1353
1354			/*
1355			 * We mark the page busy and leave it on
1356			 * the pageout queues.  If the pageout
			 * daemon comes across it, then it will
1358			 * remove the page from the queue, but not the object
1359			 */
1360#if TRACEFAULTPAGE
1361			dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1362#endif
1363			XPR(XPR_VM_FAULT,
1364			    "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1365				object, offset, m, 0, 0);
1366			assert(!m->busy);
1367			assert(!m->absent);
1368
1369			m->busy = TRUE;
1370			break;
1371		}
1372
1373
1374		/*
1375		 * we get here when there is no page present in the object at
1376		 * the offset we're interested in... we'll allocate a page
1377		 * at this point if the pager associated with
1378		 * this object can provide the data or we're the top object...
1379		 * object is locked;  m == NULL
1380		 */
1381		if (must_be_resident) {
1382			if (fault_type == VM_PROT_NONE &&
1383			    object == kernel_object) {
1384				/*
1385				 * We've been called from vm_fault_unwire()
1386				 * while removing a map entry that was allocated
1387				 * with KMA_KOBJECT and KMA_VAONLY.  This page
1388				 * is not present and there's nothing more to
1389				 * do here (nothing to unwire).
1390				 */
1391				vm_fault_cleanup(object, first_m);
1392				thread_interrupt_level(interruptible_state);
1393
1394				return VM_FAULT_MEMORY_ERROR;
1395			}
1396
1397			goto dont_look_for_page;
1398		}
1399
1400#if !MACH_PAGEMAP
1401		data_supply = FALSE;
1402#endif /* !MACH_PAGEMAP */
1403
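		/*
		 * Only ask the pager for the data if a pager has been created for this object,
		 * the existence/compressor state doesn't already tell us the page is absent,
		 * and the caller isn't supplying the data itself.
		 */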
1404		look_for_page =	(object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1405
1406#if TRACEFAULTPAGE
1407		dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);	/* (TEST/DEBUG) */
1408#endif
1409		if (!look_for_page && object == first_object && !object->phys_contiguous) {
1410			/*
1411			 * Allocate a new page for this object/offset pair as a placeholder
1412			 */
1413			m = vm_page_grab();
1414#if TRACEFAULTPAGE
1415			dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
1416#endif
1417			if (m == VM_PAGE_NULL) {
1418
1419				vm_fault_cleanup(object, first_m);
1420				thread_interrupt_level(interruptible_state);
1421
1422				return (VM_FAULT_MEMORY_SHORTAGE);
1423			}
1424
1425			if (fault_info && fault_info->batch_pmap_op == TRUE) {
1426				vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1427			} else {
1428				vm_page_insert(m, object, offset);
1429			}
1430		}
1431		if (look_for_page) {
1432			kern_return_t	rc;
1433			int		my_fault_type;
1434
1435			/*
1436			 *	If the memory manager is not ready, we
1437			 *	cannot make requests.
1438			 */
1439			if (!object->pager_ready) {
1440#if TRACEFAULTPAGE
1441				dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);	/* (TEST/DEBUG) */
1442#endif
1443				if (m != VM_PAGE_NULL)
1444				        VM_PAGE_FREE(m);
1445
1446				XPR(XPR_VM_FAULT,
1447				"vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1448					object, offset, 0, 0, 0);
1449
1450				/*
1451				 * take an extra ref so object won't die
1452				 */
1453				vm_object_reference_locked(object);
1454				vm_fault_cleanup(object, first_m);
1455				counter(c_vm_fault_page_block_backoff_kernel++);
1456
1457				vm_object_lock(object);
1458				assert(object->ref_count > 0);
1459
1460				if (!object->pager_ready) {
1461					wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1462
1463					vm_object_unlock(object);
1464					if (wait_result == THREAD_WAITING)
1465						wait_result = thread_block(THREAD_CONTINUE_NULL);
1466					vm_object_deallocate(object);
1467
1468					goto backoff;
1469				} else {
1470					vm_object_unlock(object);
1471					vm_object_deallocate(object);
1472					thread_interrupt_level(interruptible_state);
1473
1474					return (VM_FAULT_RETRY);
1475				}
1476			}
1477			if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1478				/*
1479				 * If there are too many outstanding page
1480				 * requests pending on this external object, we
1481				 * wait for them to be resolved now.
1482				 */
1483#if TRACEFAULTPAGE
1484				dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);	/* (TEST/DEBUG) */
1485#endif
1486				if (m != VM_PAGE_NULL)
1487					VM_PAGE_FREE(m);
1488				/*
1489				 * take an extra ref so object won't die
1490				 */
1491				vm_object_reference_locked(object);
1492
1493				vm_fault_cleanup(object, first_m);
1494
1495				counter(c_vm_fault_page_block_backoff_kernel++);
1496
1497				vm_object_lock(object);
1498				assert(object->ref_count > 0);
1499
1500				if (object->paging_in_progress >= vm_object_pagein_throttle) {
1501				        vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1502
1503					vm_object_unlock(object);
1504					wait_result = thread_block(THREAD_CONTINUE_NULL);
1505					vm_object_deallocate(object);
1506
1507					goto backoff;
1508				} else {
1509					vm_object_unlock(object);
1510					vm_object_deallocate(object);
1511					thread_interrupt_level(interruptible_state);
1512
1513					return (VM_FAULT_RETRY);
1514				}
1515			}
1516			if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && object->internal) {
1517
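				/*
				 * Compressor path: for internal objects the data is recovered directly
				 * via vm_compressor_pager_get() below, with the object lock dropped around
				 * the decompression, instead of going through memory_object_data_request().
				 */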
1518				if (m == VM_PAGE_NULL) {
1519					/*
1520					 * Allocate a new page for this object/offset pair as a placeholder
1521					 */
1522					m = vm_page_grab();
1523#if TRACEFAULTPAGE
1524					dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);	/* (TEST/DEBUG) */
1525#endif
1526					if (m == VM_PAGE_NULL) {
1527
1528						vm_fault_cleanup(object, first_m);
1529						thread_interrupt_level(interruptible_state);
1530
1531						return (VM_FAULT_MEMORY_SHORTAGE);
1532					}
1533
1534					m->absent = TRUE;
1535					if (fault_info && fault_info->batch_pmap_op == TRUE) {
1536						vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1537					} else {
1538						vm_page_insert(m, object, offset);
1539					}
1540				}
1541				assert(m->busy);
1542
1543				m->absent = TRUE;
1544				pager = object->pager;
1545
1546				vm_object_unlock(object);
1547
1548				rc = vm_compressor_pager_get(pager, offset + object->paging_offset, m->phys_page, &my_fault_type, 0);
1549
1550				vm_object_lock(object);
1551
1552				switch (rc) {
1553				case KERN_SUCCESS:
1554					m->absent = FALSE;
1555					m->dirty = TRUE;
1556					if ((m->object->wimg_bits &
1557					     VM_WIMG_MASK) !=
1558					    VM_WIMG_USE_DEFAULT) {
1559						/*
1560						 * If the page is not cacheable,
1561						 * we can't let its contents
1562						 * linger in the data cache
1563						 * after the decompression.
1564						 */
1565						pmap_sync_page_attributes_phys(
1566							m->phys_page);
1567					} else
1568						m->written_by_kernel = TRUE;
1569					break;
1570				case KERN_MEMORY_FAILURE:
1571					m->unusual = TRUE;
1572					m->error = TRUE;
1573					m->absent = FALSE;
1574					break;
1575				case KERN_MEMORY_ERROR:
1576					assert(m->absent);
1577					break;
1578				default:
1579					panic("?");
1580				}
1581				PAGE_WAKEUP_DONE(m);
1582
1583				rc = KERN_SUCCESS;
1584				goto data_requested;
1585			}
1586			my_fault_type = DBG_PAGEIN_FAULT;
1587
1588			if (m != VM_PAGE_NULL) {
1589				VM_PAGE_FREE(m);
1590				m = VM_PAGE_NULL;
1591			}
1592
1593#if TRACEFAULTPAGE
1594			dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);	/* (TEST/DEBUG) */
1595#endif
1596
1597			/*
1598			 * It's possible someone called vm_object_destroy while we weren't
1599			 * holding the object lock.  If that has happened, then bail out
1600			 * here.
1601			 */
1602
1603			pager = object->pager;
1604
1605			if (pager == MEMORY_OBJECT_NULL) {
1606				vm_fault_cleanup(object, first_m);
1607				thread_interrupt_level(interruptible_state);
1608				return VM_FAULT_MEMORY_ERROR;
1609			}
1610
1611			/*
1612			 * We have an absent page in place for the faulting offset,
1613			 * so we can release the object lock.
1614			 */
1615
1616			vm_object_unlock(object);
1617
1618			/*
1619			 * If this object uses a copy_call strategy,
1620			 * and we are interested in a copy of this object
1621			 * (having gotten here only by following a
1622			 * shadow chain), then tell the memory manager
1623			 * via a flag added to the desired_access
1624			 * parameter, so that it can detect a race
1625			 * between our walking down the shadow chain
1626			 * and its pushing pages up into a copy of
1627			 * the object that it manages.
1628			 */
1629			if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1630				wants_copy_flag = VM_PROT_WANTS_COPY;
1631			else
1632				wants_copy_flag = VM_PROT_NONE;
1633
1634			XPR(XPR_VM_FAULT,
1635			    "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1636				object, offset, m,
1637				access_required | wants_copy_flag, 0);
1638
1639			if (object->copy == first_object) {
1640				/*
1641				 * if we issue the memory_object_data_request in
1642				 * this state, we are subject to a deadlock with
1643				 * the underlying filesystem if it is trying to
1644				 * shrink the file resulting in a push of pages
1645				 * into the copy object...  that push will stall
1646				 * on the placeholder page, and if the pushing thread
1647				 * is holding a lock that is required on the pagein
1648				 * path (such as a truncate lock), we'll deadlock...
1649				 * to avoid this potential deadlock, we throw away
1650				 * our placeholder page before calling memory_object_data_request
1651				 * and force this thread to retry the vm_fault_page after
1652				 * we have issued the I/O.  the second time through this path
1653				 * we will find the page already in the cache (presumably still
1654				 * busy waiting for the I/O to complete) and then complete
1655				 * the fault w/o having to go through memory_object_data_request again
1656				 */
1657				assert(first_m != VM_PAGE_NULL);
1658				assert(first_m->object == first_object);
1659
1660				vm_object_lock(first_object);
1661				VM_PAGE_FREE(first_m);
1662				vm_object_paging_end(first_object);
1663				vm_object_unlock(first_object);
1664
1665				first_m = VM_PAGE_NULL;
1666				force_fault_retry = TRUE;
1667
1668				vm_fault_page_forced_retry++;
1669			}
1670
1671			if (data_already_requested == TRUE) {
1672				orig_behavior = fault_info->behavior;
1673				orig_cluster_size = fault_info->cluster_size;
1674
1675				fault_info->behavior = VM_BEHAVIOR_RANDOM;
1676				fault_info->cluster_size = PAGE_SIZE;
1677			}
1678			/*
1679			 * Call the memory manager to retrieve the data.
1680			 */
1681			rc = memory_object_data_request(
1682				pager,
1683				offset + object->paging_offset,
1684				PAGE_SIZE,
1685				access_required | wants_copy_flag,
1686				(memory_object_fault_info_t)fault_info);
1687
1688			if (data_already_requested == TRUE) {
1689				fault_info->behavior = orig_behavior;
1690				fault_info->cluster_size = orig_cluster_size;
1691			} else
1692				data_already_requested = TRUE;
1693
1694#if TRACEFAULTPAGE
1695			dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc);	/* (TEST/DEBUG) */
1696#endif
1697			vm_object_lock(object);
1698
1699		data_requested:
1700			if (rc != KERN_SUCCESS) {
1701
1702				vm_fault_cleanup(object, first_m);
1703				thread_interrupt_level(interruptible_state);
1704
1705				return ((rc == MACH_SEND_INTERRUPTED) ?
1706					VM_FAULT_INTERRUPTED :
1707					VM_FAULT_MEMORY_ERROR);
1708			} else {
1709				clock_sec_t     tv_sec;
1710				clock_usec_t    tv_usec;
1711
1712				if (my_fault_type == DBG_PAGEIN_FAULT) {
1713					clock_get_system_microtime(&tv_sec, &tv_usec);
1714					current_thread()->t_page_creation_time = tv_sec;
1715					current_thread()->t_page_creation_count = 0;
1716				}
1717			}
1718			if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1719
1720				vm_fault_cleanup(object, first_m);
1721				thread_interrupt_level(interruptible_state);
1722
1723				return (VM_FAULT_INTERRUPTED);
1724			}
1725			if (force_fault_retry == TRUE) {
1726
1727				vm_fault_cleanup(object, first_m);
1728				thread_interrupt_level(interruptible_state);
1729
1730				return (VM_FAULT_RETRY);
1731			}
1732			if (m == VM_PAGE_NULL && object->phys_contiguous) {
1733				/*
1734				 * No page here means that the object we
1735				 * initially looked up was "physically
1736				 * contiguous" (i.e. device memory).  However,
1737				 * with Virtual VRAM, the object might not
1738				 * be backed by that device memory anymore,
1739				 * so we're done here only if the object is
1740				 * still "phys_contiguous".
1741				 * Otherwise, if the object is no longer
1742				 * "phys_contiguous", we need to retry the
1743				 * page fault against the object's new backing
1744				 * store (different memory object).
1745				 */
1746			phys_contig_object:
1747				goto done;
1748			}
1749			/*
1750			 * potentially a pagein fault
1751			 * if we make it through the state checks
1752			 * above, then we'll count it as such
1753			 */
1754			my_fault = my_fault_type;
1755
1756			/*
1757			 * Retry with same object/offset, since new data may
1758			 * be in a different page (i.e., m is meaningless at
1759			 * this point).
1760			 */
1761			continue;
1762		}
1763dont_look_for_page:
1764		/*
1765		 * We get here if the object has no pager, or an existence map
1766		 * exists and indicates the page isn't present on the pager,
1767		 * or we're unwiring a page.  If a pager exists, but there
1768		 * is no existence map, then the m->absent case above handles
1769		 * the ZF case when the pager can't provide the page.
1770		 */
1771#if TRACEFAULTPAGE
1772		dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
1773#endif
1774		if (object == first_object)
1775			first_m = m;
1776		else
1777			assert(m == VM_PAGE_NULL);
1778
1779		XPR(XPR_VM_FAULT,
1780		    "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1781			object, offset, m,
1782			object->shadow, 0);
1783
1784		next_object = object->shadow;
1785
1786		if (next_object == VM_OBJECT_NULL) {
1787			/*
1788			 * we've hit the bottom of the shadow chain,
1789			 * fill the page in the top object with zeros.
1790			 */
1791			assert(!must_be_resident);
1792
1793			if (object != first_object) {
1794				vm_object_paging_end(object);
1795				vm_object_unlock(object);
1796
1797				object = first_object;
1798				offset = first_offset;
1799				vm_object_lock(object);
1800			}
1801			m = first_m;
1802			assert(m->object == object);
1803			first_m = VM_PAGE_NULL;
1804
1805			/*
1806			 * check for any conditions that prevent
1807			 * us from creating a new zero-fill page;
1808			 * vm_fault_check will do all of the
1809			 * fault cleanup in the case of an error condition,
1810			 * including resetting the thread_interrupt_level
1811			 */
1812			error = vm_fault_check(object, m, first_m, interruptible_state);
1813
1814			if (error != VM_FAULT_SUCCESS)
1815			        return (error);
1816
1817			if (m == VM_PAGE_NULL) {
1818				m = vm_page_grab();
1819
1820				if (m == VM_PAGE_NULL) {
1821					vm_fault_cleanup(object, VM_PAGE_NULL);
1822					thread_interrupt_level(interruptible_state);
1823
1824					return (VM_FAULT_MEMORY_SHORTAGE);
1825				}
1826				vm_page_insert(m, object, offset);
1827			}
1828			my_fault = vm_fault_zero_page(m, no_zero_fill);
1829
1830			if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1831				m->absent = TRUE;
1832			break;
1833
1834		} else {
1835		        /*
1836			 * Move on to the next object.  Lock the next
1837			 * object before unlocking the current one.
1838			 */
1839			if ((object != first_object) || must_be_resident)
1840				vm_object_paging_end(object);
1841
1842			offset += object->vo_shadow_offset;
1843			fault_info->lo_offset += object->vo_shadow_offset;
1844			fault_info->hi_offset += object->vo_shadow_offset;
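			/*
			 * Once we descend past the top object, read access is
			 * all we need from the lower levels: a write fault is
			 * ultimately satisfied by copying the page up into the
			 * top object (the COW path after "PAGE HAS BEEN FOUND").
			 */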
1845			access_required = VM_PROT_READ;
1846
1847			vm_object_lock(next_object);
1848			vm_object_unlock(object);
1849
1850			object = next_object;
1851			vm_object_paging_begin(object);
1852		}
1853	}
1854
1855	/*
1856	 *	PAGE HAS BEEN FOUND.
1857	 *
1858	 *	This page (m) is:
1859	 *		busy, so that we can play with it;
1860	 *		not absent, so that nobody else will fill it;
1861	 *		possibly eligible for pageout;
1862	 *
1863	 *	The top-level page (first_m) is:
1864	 *		VM_PAGE_NULL if the page was found in the
1865	 *		 top-level object;
1866	 *		busy, not absent, and ineligible for pageout.
1867	 *
1868	 *	The current object (object) is locked.  A paging
1869	 *	reference is held for the current and top-level
1870	 *	objects.
1871	 */
1872
1873#if TRACEFAULTPAGE
1874	dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);	/* (TEST/DEBUG) */
1875#endif
1876#if	EXTRA_ASSERTIONS
1877	assert(m->busy && !m->absent);
1878	assert((first_m == VM_PAGE_NULL) ||
1879	       (first_m->busy && !first_m->absent &&
1880		!first_m->active && !first_m->inactive));
1881#endif	/* EXTRA_ASSERTIONS */
1882
1883	/*
1884	 * ENCRYPTED SWAP:
1885	 * If we found a page, we must have decrypted it before we
1886	 * get here...
1887	 */
1888	ASSERT_PAGE_DECRYPTED(m);
1889
1890	XPR(XPR_VM_FAULT,
1891	    "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1892		object, offset, m,
1893		first_object, first_m);
1894
1895	/*
1896	 * If the page is being written, but isn't
1897	 * already owned by the top-level object,
1898	 * we have to copy it into a new page owned
1899	 * by the top-level object.
1900	 */
1901	if (object != first_object) {
1902
1903#if TRACEFAULTPAGE
1904		dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type);	/* (TEST/DEBUG) */
1905#endif
1906	    	if (fault_type & VM_PROT_WRITE) {
1907			vm_page_t copy_m;
1908
1909			/*
1910			 * We only really need to copy if we
1911			 * want to write it.
1912			 */
1913			assert(!must_be_resident);
1914
1915			/*
1916			 * Are we protecting the system from
1917			 * backing store exhaustion?  If so,
1918			 * sleep unless we are privileged.
1919			 */
1920			if (vm_backing_store_low) {
1921				if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1922
1923					RELEASE_PAGE(m);
1924					vm_fault_cleanup(object, first_m);
1925
1926					assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1927
1928					thread_block(THREAD_CONTINUE_NULL);
1929					thread_interrupt_level(interruptible_state);
1930
1931					return (VM_FAULT_RETRY);
1932				}
1933			}
1934			/*
1935			 * If we try to collapse first_object at this
1936			 * point, we may deadlock when we try to get
1937			 * the lock on an intermediate object (since we
1938			 * have the bottom object locked).  We can't
1939			 * unlock the bottom object, because the page
1940			 * we found may move (by collapse) if we do.
1941			 *
1942			 * Instead, we first copy the page.  Then, when
1943			 * we have no more use for the bottom object,
1944			 * we unlock it and try to collapse.
1945			 *
1946			 * Note that we copy the page even if we didn't
1947			 * need to... that's the breaks.
1948			 */
1949
1950			/*
1951			 * Allocate a page for the copy
1952			 */
1953			copy_m = vm_page_grab();
1954
1955			if (copy_m == VM_PAGE_NULL) {
1956				RELEASE_PAGE(m);
1957
1958				vm_fault_cleanup(object, first_m);
1959				thread_interrupt_level(interruptible_state);
1960
1961				return (VM_FAULT_MEMORY_SHORTAGE);
1962			}
1963			XPR(XPR_VM_FAULT,
1964			    "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1965				object, offset,
1966				m, copy_m, 0);
1967
1968			vm_page_copy(m, copy_m);
1969
1970			/*
1971			 * If another map is truly sharing this
1972			 * page with us, we have to flush all
1973			 * uses of the original page, since we
1974			 * can't distinguish those which want the
1975			 * original from those which need the
1976			 * new copy.
1977			 *
1978			 * XXXO If we know that only one map has
1979			 * access to this page, then we could
1980			 * avoid the pmap_disconnect() call.
1981			 */
1982			if (m->pmapped)
1983			        pmap_disconnect(m->phys_page);
1984
1985			assert(!m->cleaning);
1986
1987			/*
1988			 * We no longer need the old page or object.
1989			 */
1990			RELEASE_PAGE(m);
1991
1992			vm_object_paging_end(object);
1993			vm_object_unlock(object);
1994
1995			my_fault = DBG_COW_FAULT;
1996			VM_STAT_INCR(cow_faults);
1997			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1998			current_task()->cow_faults++;
1999
2000			object = first_object;
2001			offset = first_offset;
2002
2003			vm_object_lock(object);
2004			/*
2005			 * get rid of the place holder
2006			 * page that we soldered in earlier
2007			 */
2008			VM_PAGE_FREE(first_m);
2009			first_m = VM_PAGE_NULL;
2010
2011			/*
2012			 * and replace it with the
2013			 * page we just copied into
2014			 */
2015			assert(copy_m->busy);
2016			vm_page_insert(copy_m, object, offset);
2017			SET_PAGE_DIRTY(copy_m, TRUE);
2018
2019			m = copy_m;
2020			/*
2021			 * Now that we've gotten the copy out of the
2022			 * way, let's try to collapse the top object.
2023			 * But we have to play ugly games with
2024			 * paging_in_progress to do that...
2025			 */
2026			vm_object_paging_end(object);
2027			vm_object_collapse(object, offset, TRUE);
2028			vm_object_paging_begin(object);
2029
2030		} else
2031		    	*protection &= (~VM_PROT_WRITE);
2032	}
2033	/*
2034	 * Now check whether the page needs to be pushed into the
2035	 * copy object.  The use of asymmetric copy on write for
2036	 * shared temporary objects means that we may do two copies to
2037	 * satisfy the fault; one above to get the page from a
2038	 * shadowed object, and one here to push it into the copy.
2039	 */
2040	try_failed_count = 0;
2041
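	/*
	 * This loop normally executes at most once: it either pushes the
	 * page into first_object's copy object and breaks out at the bottom,
	 * or bails out early.  It only loops back (via "continue") when the
	 * copy_object's lock couldn't be taken without blocking, or when the
	 * copy_object had to be deallocated the hard way after its lock was
	 * dropped.
	 */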
2042	while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2043		vm_object_offset_t	copy_offset;
2044		vm_page_t		copy_m;
2045
2046#if TRACEFAULTPAGE
2047		dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);	/* (TEST/DEBUG) */
2048#endif
2049		/*
2050		 * If the page is being written, but hasn't been
2051		 * copied to the copy-object, we have to copy it there.
2052		 */
2053		if ((fault_type & VM_PROT_WRITE) == 0) {
2054			*protection &= ~VM_PROT_WRITE;
2055			break;
2056		}
2057
2058		/*
2059		 * If the page was guaranteed to be resident,
2060		 * we must have already performed the copy.
2061		 */
2062		if (must_be_resident)
2063			break;
2064
2065		/*
2066		 * Try to get the lock on the copy_object.
2067		 */
2068		if (!vm_object_lock_try(copy_object)) {
2069
2070			vm_object_unlock(object);
2071			try_failed_count++;
2072
2073			mutex_pause(try_failed_count);	/* wait a bit */
2074			vm_object_lock(object);
2075
2076			continue;
2077		}
2078		try_failed_count = 0;
2079
2080		/*
2081		 * Make another reference to the copy-object,
2082		 * to keep it from disappearing during the
2083		 * copy.
2084		 */
2085		vm_object_reference_locked(copy_object);
2086
2087		/*
2088		 * Does the page exist in the copy?
2089		 */
2090		copy_offset = first_offset - copy_object->vo_shadow_offset;
2091
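		/*
		 * Three possible outcomes below:
		 *   - the copy object doesn't cover this offset: nothing to do;
		 *   - a page already exists in the copy object: if it's busy we
		 *     back off and retry, otherwise the push has already happened;
		 *   - the offset hasn't been paged out from the copy object: we
		 *     allocate a page there and push a copy of 'm' into it.
		 */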
2092		if (copy_object->vo_size <= copy_offset)
2093			/*
2094			 * Copy object doesn't cover this page -- do nothing.
2095			 */
2096			;
2097		else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2098			/*
2099			 * Page currently exists in the copy object
2100			 */
2101			if (copy_m->busy) {
2102				/*
2103				 * If the page is being brought
2104				 * in, wait for it and then retry.
2105				 */
2106				RELEASE_PAGE(m);
2107
2108				/*
2109				 * take an extra ref so object won't die
2110				 */
2111				vm_object_reference_locked(copy_object);
2112				vm_object_unlock(copy_object);
2113				vm_fault_cleanup(object, first_m);
2114				counter(c_vm_fault_page_block_backoff_kernel++);
2115
2116				vm_object_lock(copy_object);
2117				assert(copy_object->ref_count > 0);
2118				VM_OBJ_RES_DECR(copy_object);
2119				vm_object_lock_assert_exclusive(copy_object);
2120				copy_object->ref_count--;
2121				assert(copy_object->ref_count > 0);
2122				copy_m = vm_page_lookup(copy_object, copy_offset);
2123				/*
2124				 * ENCRYPTED SWAP:
2125				 * it's OK if the "copy_m" page is encrypted,
2126				 * because we're not moving it nor handling its
2127				 * contents.
2128				 */
2129				if (copy_m != VM_PAGE_NULL && copy_m->busy) {
2130					PAGE_ASSERT_WAIT(copy_m, interruptible);
2131
2132					vm_object_unlock(copy_object);
2133					wait_result = thread_block(THREAD_CONTINUE_NULL);
2134					vm_object_deallocate(copy_object);
2135
2136					goto backoff;
2137				} else {
2138					vm_object_unlock(copy_object);
2139					vm_object_deallocate(copy_object);
2140					thread_interrupt_level(interruptible_state);
2141
2142					return (VM_FAULT_RETRY);
2143				}
2144			}
2145		}
2146		else if (!PAGED_OUT(copy_object, copy_offset)) {
2147			/*
2148			 * If PAGED_OUT is TRUE, then the page used to exist
2149			 * in the copy-object, and has already been paged out.
2150			 * We don't need to repeat this. If PAGED_OUT is
2151			 * FALSE, then either we don't know (!pager_created,
2152			 * for example) or it hasn't been paged out.
2153			 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2154			 * We must copy the page to the copy object.
2155			 */
2156
2157			if (vm_backing_store_low) {
2158			        /*
2159				 * we are protecting the system from
2160				 * backing store exhaustion, so
2161				 * sleep unless we are privileged.
2162				 */
2163				if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2164					assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2165
2166					RELEASE_PAGE(m);
2167					VM_OBJ_RES_DECR(copy_object);
2168					vm_object_lock_assert_exclusive(copy_object);
2169					copy_object->ref_count--;
2170					assert(copy_object->ref_count > 0);
2171
2172					vm_object_unlock(copy_object);
2173					vm_fault_cleanup(object, first_m);
2174					thread_block(THREAD_CONTINUE_NULL);
2175					thread_interrupt_level(interruptible_state);
2176
2177					return (VM_FAULT_RETRY);
2178				}
2179			}
2180			/*
2181			 * Allocate a page for the copy
2182			 */
2183			copy_m = vm_page_alloc(copy_object, copy_offset);
2184
2185			if (copy_m == VM_PAGE_NULL) {
2186				RELEASE_PAGE(m);
2187
2188				VM_OBJ_RES_DECR(copy_object);
2189				vm_object_lock_assert_exclusive(copy_object);
2190				copy_object->ref_count--;
2191				assert(copy_object->ref_count > 0);
2192
2193				vm_object_unlock(copy_object);
2194				vm_fault_cleanup(object, first_m);
2195				thread_interrupt_level(interruptible_state);
2196
2197				return (VM_FAULT_MEMORY_SHORTAGE);
2198			}
2199			/*
2200			 * Must copy page into copy-object.
2201			 */
2202			vm_page_copy(m, copy_m);
2203
2204			/*
2205			 * If the old page was in use by any users
2206			 * of the copy-object, it must be removed
2207			 * from all pmaps.  (We can't know which
2208			 * pmaps use it.)
2209			 */
2210			if (m->pmapped)
2211			        pmap_disconnect(m->phys_page);
2212
2213			/*
2214			 * If there's a pager, then immediately
2215			 * page out this page, using the "initialize"
2216			 * option.  Else, we use the copy.
2217			 */
2218		 	if ((!copy_object->pager_created)
2219#if MACH_PAGEMAP
2220			    || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2221#endif
2222			    || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2223			    ) {
2224
2225				vm_page_lockspin_queues();
2226				assert(!m->cleaning);
2227				vm_page_activate(copy_m);
2228				vm_page_unlock_queues();
2229
2230				SET_PAGE_DIRTY(copy_m, TRUE);
2231				PAGE_WAKEUP_DONE(copy_m);
2232
2233			} else if (copy_object->internal &&
2234				   (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE)) {
2235				/*
2236				 * For internal objects check with the pager to see
2237				 * if the page already exists in the backing store.
2238				 * If yes, then we can drop the copy page. If not,
2239				 * then we'll activate it, mark it dirty and keep it
2240				 * around.
2241				 */
2242
2243				kern_return_t kr = KERN_SUCCESS;
2244
2245				memory_object_t	copy_pager = copy_object->pager;
2246				assert(copy_pager != MEMORY_OBJECT_NULL);
2247				vm_object_paging_begin(copy_object);
2248
2249				vm_object_unlock(copy_object);
2250
2251				kr = memory_object_data_request(
2252					copy_pager,
2253					copy_offset + copy_object->paging_offset,
2254					0, /* Only query the pager. */
2255					VM_PROT_READ,
2256					NULL);
2257
2258				vm_object_lock(copy_object);
2259
2260				vm_object_paging_end(copy_object);
2261
2262				/*
2263				 * Since we dropped the copy_object's lock,
2264				 * check whether we'll have to deallocate
2265				 * the hard way.
2266				 */
2267				if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2268					vm_object_unlock(copy_object);
2269					vm_object_deallocate(copy_object);
2270					vm_object_lock(object);
2271
2272					continue;
2273				}
2274				if (kr == KERN_SUCCESS) {
2275					/*
2276					 * The pager has the page. We don't want to overwrite
2277					 * that page by sending this one out to the backing store.
2278					 * So we drop the copy page.
2279					 */
2280					VM_PAGE_FREE(copy_m);
2281
2282				} else {
2283					/*
2284					 * The pager doesn't have the page. We'll keep this one
2285					 * around in the copy object. It might get sent out to
2286					 * the backing store under memory pressure.
2287					 */
2288					vm_page_lockspin_queues();
2289					assert(!m->cleaning);
2290					vm_page_activate(copy_m);
2291					vm_page_unlock_queues();
2292
2293					SET_PAGE_DIRTY(copy_m, TRUE);
2294					PAGE_WAKEUP_DONE(copy_m);
2295				}
2296			} else {
2297
2298				assert(copy_m->busy == TRUE);
2299				assert(!m->cleaning);
2300
2301				/*
2302				 * dirty is protected by the object lock
2303				 */
2304				SET_PAGE_DIRTY(copy_m, TRUE);
2305
2306				/*
2307				 * The page is already ready for pageout:
2308				 * not on pageout queues and busy.
2309				 * Unlock everything except the
2310				 * copy_object itself.
2311				 */
2312				vm_object_unlock(object);
2313
2314				/*
2315				 * Write the page to the copy-object,
2316				 * flushing it from the kernel.
2317				 */
2318				vm_pageout_initialize_page(copy_m);
2319
2320				/*
2321				 * Since the pageout may have
2322				 * temporarily dropped the
2323				 * copy_object's lock, we
2324				 * check whether we'll have
2325				 * to deallocate the hard way.
2326				 */
2327				if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2328					vm_object_unlock(copy_object);
2329					vm_object_deallocate(copy_object);
2330					vm_object_lock(object);
2331
2332					continue;
2333				}
2334				/*
2335				 * Pick back up the old object's
2336				 * lock.  [It is safe to do so,
2337				 * since it must be deeper in the
2338				 * object tree.]
2339				 */
2340				vm_object_lock(object);
2341			}
2342
2343			/*
2344			 * Because we're pushing a page upward
2345			 * in the object tree, we must restart
2346			 * any faults that are waiting here.
2347			 * [Note that this is an expansion of
2348			 * PAGE_WAKEUP that uses the THREAD_RESTART
2349			 * wait result].  Can't turn off the page's
2350			 * busy bit because we're not done with it.
2351			 */
2352			if (m->wanted) {
2353				m->wanted = FALSE;
2354				thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2355			}
2356		}
2357		/*
2358		 * The reference count on copy_object must be
2359		 * at least 2: one for our extra reference,
2360		 * and at least one from the outside world
2361		 * (we checked that when we last locked
2362		 * copy_object).
2363		 */
2364		vm_object_lock_assert_exclusive(copy_object);
2365		copy_object->ref_count--;
2366		assert(copy_object->ref_count > 0);
2367
2368		VM_OBJ_RES_DECR(copy_object);
2369		vm_object_unlock(copy_object);
2370
2371		break;
2372	}
2373
2374done:
2375	*result_page = m;
2376	*top_page = first_m;
2377
2378	XPR(XPR_VM_FAULT,
2379		"vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2380		object, offset, m, first_m, 0);
2381
2382	if (m != VM_PAGE_NULL) {
2383		retval = VM_FAULT_SUCCESS;
2384		if (my_fault == DBG_PAGEIN_FAULT) {
2385
2386			if (!m->object->internal || (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE))
2387				VM_STAT_INCR(pageins);
2388			DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2389			DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2390			current_task()->pageins++;
2391
2392			if (m->object->internal) {
2393				DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2394				my_fault = DBG_PAGEIND_FAULT;
2395			} else {
2396				DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2397				my_fault = DBG_PAGEINV_FAULT;
2398			}
2399
2400		        /*
2401			 * evaluate access pattern and update state;
2402			 * vm_fault_deactivate_behind depends on the
2403			 * state being up to date
2404			 */
2405		        vm_fault_is_sequential(object, offset, fault_info->behavior);
2406
2407			vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2408		} else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2409
2410			VM_STAT_INCR(decompressions);
2411		}
2412		if (type_of_fault)
2413		        *type_of_fault = my_fault;
2414	} else {
2415		retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2416		assert(first_m == VM_PAGE_NULL);
2417		assert(object == first_object);
2418	}
2419
2420	thread_interrupt_level(interruptible_state);
2421
2422#if TRACEFAULTPAGE
2423	dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);	/* (TEST/DEBUG) */
2424#endif
2425	return retval;
2426
2427backoff:
2428	thread_interrupt_level(interruptible_state);
2429
2430	if (wait_result == THREAD_INTERRUPTED)
2431		return (VM_FAULT_INTERRUPTED);
2432	return (VM_FAULT_RETRY);
2433
2434#undef	RELEASE_PAGE
2435}
2436
2437
2438
2439/*
2440 * CODE SIGNING:
2441 * When soft faulting a page, we have to validate the page if:
2442 * 1. the page is being mapped in user space
2443 * 2. the page hasn't already been found to be "tainted"
2444 * 3. the page belongs to a code-signed object
2445 * 4. the page has not been validated yet or has been mapped for write.
2446 */
2447#define VM_FAULT_NEED_CS_VALIDATION(pmap, page)				\
2448	((pmap) != kernel_pmap /*1*/ &&					\
2449	 !(page)->cs_tainted /*2*/ &&					\
2450	 (page)->object->code_signed /*3*/ &&				\
2451	 (!(page)->cs_validated || (page)->wpmapped /*4*/))
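/*
 * Callers use this check to decide whether the page's VM object lock must
 * be held exclusively so that vm_page_validate_cs() can be called before
 * the page is entered into the pmap -- see vm_fault_enter() below and the
 * "upgrade_for_validation" path in vm_fault().
 */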
2452
2453
2454/*
2455 * page queue lock must NOT be held
2456 * m->object must be locked
2457 *
2458 * NOTE: m->object could be locked "shared" only if we are called
2459 * from vm_fault() as part of a soft fault.  If so, we must be
2460 * careful not to modify the VM object in any way that is not
2461 * legal under a shared lock...
2462 */
2463extern int proc_selfpid(void);
2464extern char *proc_name_address(void *p);
2465unsigned long cs_enter_tainted_rejected = 0;
2466unsigned long cs_enter_tainted_accepted = 0;
2467kern_return_t
2468vm_fault_enter(vm_page_t m,
2469	       pmap_t pmap,
2470	       vm_map_offset_t vaddr,
2471	       vm_prot_t prot,
2472	       vm_prot_t fault_type,
2473	       boolean_t wired,
2474	       boolean_t change_wiring,
2475	       boolean_t no_cache,
2476	       boolean_t cs_bypass,
2477	       boolean_t *need_retry,
2478	       int *type_of_fault)
2479{
2480	kern_return_t	kr, pe_result;
2481	boolean_t	previously_pmapped = m->pmapped;
2482	boolean_t	must_disconnect = 0;
2483	boolean_t	map_is_switched, map_is_switch_protected;
2484	int		cs_enforcement_enabled;
2485
2486	vm_object_lock_assert_held(m->object);
2487#if DEBUG
2488	lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2489#endif /* DEBUG */
2490
2491	if (m->phys_page == vm_page_guard_addr) {
2492		assert(m->fictitious);
2493		return KERN_SUCCESS;
2494	}
2495
2496	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2497
2498		vm_object_lock_assert_exclusive(m->object);
2499
2500	} else if ((fault_type & VM_PROT_WRITE) == 0) {
2501		/*
2502		 * This is not a "write" fault, so we
2503		 * might not have taken the object lock
2504		 * exclusively and we might not be able
2505		 * to update the "wpmapped" bit in
2506		 * vm_fault_enter().
2507		 * Let's just grant read access to
2508		 * the page for now and we'll
2509		 * soft-fault again if we need write
2510		 * access later...
2511		 */
2512		prot &= ~VM_PROT_WRITE;
2513	}
2514	if (m->pmapped == FALSE) {
2515
2516		if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2517		        /*
2518			 * found it in the cache, but this
2519			 * is the first fault-in of the page (m->pmapped == FALSE)
2520			 * so it must have come in as part of
2521			 * a cluster... account 1 pagein against it
2522			 */
2523		        VM_STAT_INCR(pageins);
2524			DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2525
2526			if (m->object->internal) {
2527				DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2528				*type_of_fault = DBG_PAGEIND_FAULT;
2529			} else {
2530				DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2531				*type_of_fault = DBG_PAGEINV_FAULT;
2532			}
2533
2534			current_task()->pageins++;
2535		}
2536		VM_PAGE_CONSUME_CLUSTERED(m);
2537
2538	}
2539
2540	if (*type_of_fault != DBG_COW_FAULT) {
2541		DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2542
2543		if (pmap == kernel_pmap) {
2544			DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2545		}
2546	}
2547
2548	/* Validate code signature if necessary. */
2549	if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2550		vm_object_lock_assert_exclusive(m->object);
2551
2552		if (m->cs_validated) {
2553			vm_cs_revalidates++;
2554		}
2555
2556		/* VM map is locked, so 1 ref will remain on VM object -
2557		 * no harm if vm_page_validate_cs drops the object lock */
2558		vm_page_validate_cs(m);
2559	}
2560
2561#define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
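/*
 * Note: with the VM_PROT_EXECUTE check commented out above, page_immutable()
 * currently treats any cs_validated page as immutable, regardless of the
 * protection being requested.
 */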
2562
2563	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2564			   (pmap == vm_map_pmap(current_thread()->map)));
2565	map_is_switch_protected = current_thread()->map->switch_protect;
2566
2567	/* If the map is switched, and is switch-protected, we must protect
2568	 * some pages from being write-faulted: immutable pages because by
2569	 * definition they may not be written, and executable pages because that
2570	 * would provide a way to inject unsigned code.
2571	 * If the page is immutable, we can simply return. However, we can't
2572	 * immediately determine whether a page is executable anywhere. But,
2573	 * we can disconnect it everywhere and remove the executable protection
2574	 * from the current map. We do that below right before we do the
2575	 * PMAP_ENTER.
2576	 */
2577	cs_enforcement_enabled = cs_enforcement(NULL);
2578
2579	if(cs_enforcement_enabled && map_is_switched &&
2580	   map_is_switch_protected && page_immutable(m, prot) &&
2581	   (prot & VM_PROT_WRITE))
2582	{
2583		return KERN_CODESIGN_ERROR;
2584	}
2585
2586	/* A page could be tainted, or pose a risk of being tainted later.
2587	 * Check whether the receiving process wants it, and make it feel
2588	 * the consequences (that happens in cs_invalid_page()).
2589	 * For CS Enforcement, two other conditions will
2590	 * cause that page to be tainted as well:
2591	 * - pmapping an unsigned page executable - this means unsigned code;
2592	 * - writeable mapping of a validated page - the content of that page
2593	 *   can be changed without the kernel noticing, therefore unsigned
2594	 *   code can be created
2595	 */
2596	if (m->cs_tainted ||
2597	    ((cs_enforcement_enabled && !cs_bypass ) &&
2598	     (/* The page is unsigned and wants to be executable */
2599	      (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2600	      /* The page should be immutable, but is in danger of being modified
2601		* This is the case where we want policy from the code directory -
2602		* is the page immutable or not? For now we have to assume that
2603		* code pages will be immutable, data pages not.
2604		* We'll assume a page is a code page if it has a code directory
2605		* and we fault for execution.
2606		* That is good enough since if we faulted the code page for
2607		* writing in another map before, it is wpmapped; if we fault
2608		* it for writing in this map later it will also be faulted for executing
2609		* at the same time; and if we fault for writing in another map
2610		* later, we will disconnect it from this pmap so we'll notice
2611		* the change.
2612		*/
2613	      (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2614	      ))
2615		)
2616	{
2617		/* We will have a tainted page. Have to handle the special case
2618		 * of a switched map now. If the map is not switched, standard
2619		 * procedure applies - call cs_invalid_page().
2620		 * If the map is switched, the real owner is invalid already.
2621		 * There is no point in invalidating the switching process since
2622		 * it will not be executing from the map. So we don't call
2623		 * cs_invalid_page() in that case. */
2624		boolean_t reject_page;
2625		if(map_is_switched) {
2626			assert(pmap==vm_map_pmap(current_thread()->map));
2627			assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2628			reject_page = FALSE;
2629		} else {
2630			if (cs_debug > 5)
2631				printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n",
2632				       m->object->code_signed ? "yes" : "no",
2633				       m->cs_validated ? "yes" : "no",
2634				       m->cs_tainted ? "yes" : "no",
2635				       m->wpmapped ? "yes" : "no",
2636				       m->slid ? "yes" : "no",
2637				       (int)prot);
2638			reject_page = cs_invalid_page((addr64_t) vaddr);
2639		}
2640
2641		if (reject_page) {
2642			/* reject the tainted page: abort the page fault */
2643			int			pid;
2644			const char		*procname;
2645			task_t			task;
2646			vm_object_t		file_object, shadow;
2647			vm_object_offset_t	file_offset;
2648			char			*pathname, *filename;
2649			vm_size_t		pathname_len, filename_len;
2650			boolean_t		truncated_path;
2651#define __PATH_MAX 1024
2652			struct timespec		mtime, cs_mtime;
2653
2654			kr = KERN_CODESIGN_ERROR;
2655			cs_enter_tainted_rejected++;
2656
2657			/* get process name and pid */
2658			procname = "?";
2659			task = current_task();
2660			pid = proc_selfpid();
2661			if (task->bsd_info != NULL)
2662				procname = proc_name_address(task->bsd_info);
2663
2664			/* get file's VM object */
2665			file_object = m->object;
2666			file_offset = m->offset;
2667			for (shadow = file_object->shadow;
2668			     shadow != VM_OBJECT_NULL;
2669			     shadow = file_object->shadow) {
2670				vm_object_lock_shared(shadow);
2671				if (file_object != m->object) {
2672					vm_object_unlock(file_object);
2673				}
2674				file_offset += file_object->vo_shadow_offset;
2675				file_object = shadow;
2676			}
2677
2678			mtime.tv_sec = 0;
2679			mtime.tv_nsec = 0;
2680			cs_mtime.tv_sec = 0;
2681			cs_mtime.tv_nsec = 0;
2682
2683			/* get file's pathname and/or filename */
2684			pathname = NULL;
2685			filename = NULL;
2686			pathname_len = 0;
2687			filename_len = 0;
2688			truncated_path = FALSE;
2689			if (file_object->pager == NULL) {
2690				/* no pager -> no file -> no pathname */
2691				pathname = (char *) "<nil>";
2692			} else {
2693				pathname = (char *)kalloc(__PATH_MAX * 2);
2694				if (pathname) {
2695					pathname_len = __PATH_MAX;
2696					filename = pathname + pathname_len;
2697					filename_len = __PATH_MAX;
2698				}
2699				vnode_pager_get_object_name(file_object->pager,
2700							    pathname,
2701							    pathname_len,
2702							    filename,
2703							    filename_len,
2704							    &truncated_path);
2705				vnode_pager_get_object_mtime(file_object->pager,
2706							     &mtime,
2707							     &cs_mtime);
2708			}
2709			printf("CODE SIGNING: process %d[%s]: "
2710			       "rejecting invalid page at address 0x%llx "
2711			       "from offset 0x%llx in file \"%s%s%s\" "
2712			       "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2713			       "(signed:%d validated:%d tainted:%d "
2714			       "wpmapped:%d slid:%d)\n",
2715			       pid, procname, (addr64_t) vaddr,
2716			       file_offset,
2717			       pathname,
2718			       (truncated_path ? "/.../" : ""),
2719			       (truncated_path ? filename : ""),
2720			       cs_mtime.tv_sec, cs_mtime.tv_nsec,
2721			       ((cs_mtime.tv_sec == mtime.tv_sec &&
2722				 cs_mtime.tv_nsec == mtime.tv_nsec)
2723				? "=="
2724				: "!="),
2725			       mtime.tv_sec, mtime.tv_nsec,
2726			       m->object->code_signed,
2727			       m->cs_validated,
2728			       m->cs_tainted,
2729			       m->wpmapped,
2730			       m->slid);
2731			if (file_object != m->object) {
2732				vm_object_unlock(file_object);
2733			}
2734			if (pathname_len != 0) {
2735				kfree(pathname, __PATH_MAX * 2);
2736				pathname = NULL;
2737				filename = NULL;
2738			}
2739		} else {
2740			/* proceed with the tainted page */
2741			kr = KERN_SUCCESS;
2742			/* Page might have been tainted before or not; now it
2743			 * definitively is. If the page wasn't tainted, we must
2744			 * disconnect it from all pmaps later. */
2745			must_disconnect = !m->cs_tainted;
2746			m->cs_tainted = TRUE;
2747			cs_enter_tainted_accepted++;
2748		}
2749		if (kr != KERN_SUCCESS) {
2750			if (cs_debug) {
2751				printf("CODESIGNING: vm_fault_enter(0x%llx): "
2752				       "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2753				       (long long)vaddr, m, m->object, m->offset);
2754			}
2755#if !SECURE_KERNEL
2756			if (cs_enforcement_panic) {
2757				panic("CODESIGNING: panicking on invalid page\n");
2758			}
2759#endif
2760		}
2761
2762	} else {
2763		/* proceed with the valid page */
2764		kr = KERN_SUCCESS;
2765	}
2766
2767	boolean_t	page_queues_locked = FALSE;
2768#define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()	\
2769MACRO_BEGIN			    		\
2770	if (! page_queues_locked) {		\
2771		page_queues_locked = TRUE;	\
2772		vm_page_lockspin_queues();	\
2773	}					\
2774MACRO_END
2775#define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()	\
2776MACRO_BEGIN			    		\
2777	if (page_queues_locked) {		\
2778		page_queues_locked = FALSE;	\
2779		vm_page_unlock_queues();	\
2780	}					\
2781MACRO_END
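	/*
	 * These helpers make taking and dropping the page queues lock
	 * idempotent: the code below grabs the lock lazily, only when a
	 * queue actually has to be manipulated, and the final
	 * __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() releases it if it is
	 * still held.
	 */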
2782
2783	/*
2784	 * Hold queues lock to manipulate
2785	 * the page queues.  Change wiring
2786	 * case is obvious.
2787	 */
2788	assert(m->compressor || m->object != compressor_object);
2789	if (m->compressor) {
2790		/*
2791		 * Compressor pages are neither wired
2792		 * nor pageable and should never change.
2793		 */
2794		assert(m->object == compressor_object);
2795	} else if (change_wiring) {
2796	        __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2797
2798		if (wired) {
2799			if (kr == KERN_SUCCESS) {
2800				vm_page_wire(m);
2801			}
2802		} else {
2803		        vm_page_unwire(m, TRUE);
2804		}
2805		/* we keep the page queues lock, if we need it later */
2806
2807	} else {
2808	        if (kr != KERN_SUCCESS) {
2809		        __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2810		        vm_page_deactivate(m);
2811			/* we keep the page queues lock, if we need it later */
2812		} else if (((!m->active && !m->inactive) ||
2813			    m->clean_queue ||
2814			    no_cache) &&
2815			   !VM_PAGE_WIRED(m) && !m->throttled) {
2816
2817			if (vm_page_local_q &&
2818			    !no_cache &&
2819			    (*type_of_fault == DBG_COW_FAULT ||
2820			     *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2821				struct vpl	*lq;
2822				uint32_t	lid;
2823
2824				__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2825				vm_object_lock_assert_exclusive(m->object);
2826
2827				/*
2828				 * we got a local queue to stuff this
2829				 * new page on...
2830				 * it's safe to manipulate local and
2831				 * local_id at this point since we're
2832				 * behind an exclusive object lock and
2833				 * the page is not on any global queue.
2834				 *
2835				 * we'll use the current cpu number to
2836				 * select the queue; note that we don't
2837				 * need to disable preemption... we're
2838				 * going to be behind the local queue's
2839				 * lock to do the real work
2840				 */
2841				lid = cpu_number();
2842
2843				lq = &vm_page_local_q[lid].vpl_un.vpl;
2844
2845				VPL_LOCK(&lq->vpl_lock);
2846
2847				queue_enter(&lq->vpl_queue, m,
2848					    vm_page_t, pageq);
2849				m->local = TRUE;
2850				m->local_id = lid;
2851				lq->vpl_count++;
2852
2853				if (m->object->internal)
2854					lq->vpl_internal_count++;
2855				else
2856					lq->vpl_external_count++;
2857
2858				VPL_UNLOCK(&lq->vpl_lock);
2859
2860				if (lq->vpl_count > vm_page_local_q_soft_limit)
2861				{
2862					/*
2863					 * we're beyond the soft limit
2864					 * for the local queue...
2865					 * vm_page_reactivate_local will
2866					 * 'try' to take the global page
2867					 * queue lock... if it can't,
2868					 * that's ok... we'll let the
2869					 * queue continue to grow up
2870					 * to the hard limit... at that
2871					 * point we'll wait for the
2872					 * lock... once we've got the
2873					 * lock, we'll transfer all of
2874					 * the pages from the local
2875					 * queue to the global active
2876					 * queue
2877					 */
2878					vm_page_reactivate_local(lid, FALSE, FALSE);
2879				}
2880			} else {
2881
2882				__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2883
2884				/*
2885				 * test again now that we hold the
2886				 * page queue lock
2887				 */
2888				if (!VM_PAGE_WIRED(m)) {
2889					if (m->clean_queue) {
2890						VM_PAGE_QUEUES_REMOVE(m);
2891
2892						vm_pageout_cleaned_reactivated++;
2893						vm_pageout_cleaned_fault_reactivated++;
2894					}
2895
2896					if ((!m->active &&
2897					     !m->inactive) ||
2898					    no_cache) {
2899						/*
2900						 * If this is a no_cache mapping
2901						 * and the page has never been
2902						 * mapped before or was
2903						 * previously a no_cache page,
2904						 * then we want to leave pages
2905						 * in the speculative state so
2906						 * that they can be readily
2907						 * recycled if free memory runs
2908						 * low.  Otherwise the page is
2909						 * activated as normal.
2910						 */
2911
2912						if (no_cache &&
2913						    (!previously_pmapped ||
2914						     m->no_cache)) {
2915							m->no_cache = TRUE;
2916
2917							if (!m->speculative)
2918								vm_page_speculate(m, FALSE);
2919
2920						} else if (!m->active &&
2921							   !m->inactive) {
2922
2923							vm_page_activate(m);
2924						}
2925					}
2926				}
2927				/* we keep the page queues lock, if we need it later */
2928			}
2929		}
2930	}
2931
2932	if ((prot & VM_PROT_EXECUTE) &&
2933	    ! m->xpmapped) {
2934
2935		__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2936
2937		/*
2938		 * xpmapped is protected by the page queues lock
2939		 * so it matters not that we might only hold the
2940		 * object lock in the shared state
2941		 */
2942
2943		if (! m->xpmapped) {
2944
2945			m->xpmapped = TRUE;
2946			__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2947
2948			if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) &&
2949			    m->object->internal &&
2950			    m->object->pager != NULL) {
2951				/*
2952				 * This page could have been
2953				 * uncompressed by the
2954				 * compressor pager and its
2955				 * contents might be only in
2956				 * the data cache.
2957				 * Since it's being mapped for
2958				 * "execute" for the first time,
2959				 * make sure the icache is in
2960				 * sync.
2961				 */
2962				pmap_sync_page_data_phys(m->phys_page);
2963			}
2964
2965		}
2966	}
2967	/* we're done with the page queues lock, if we ever took it */
2968	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2969
2970
2971	/* If we have a KERN_SUCCESS from the previous checks, we either have
2972	 * a good page, or a tainted page that has been accepted by the process.
2973	 * In both cases the page will be entered into the pmap.
2974	 * If the page is writeable, we need to disconnect it from other pmaps
2975	 * now so those processes can take note.
2976	 */
2977	if (kr == KERN_SUCCESS) {
2978	        /*
2979		 * NOTE: we may only hold the vm_object lock SHARED
2980		 * at this point, but the update of pmapped is ok
2981		 * since this is the ONLY bit updated behind the SHARED
2982		 * lock... however, we need to figure out how to do an atomic
2983		 * update on a bit field to make this less fragile... right
2984		 * now I don't know how to coerce 'C' to give me the offset info
2985		 * that's needed for an AtomicCompareAndSwap
2986		 */
2987		m->pmapped = TRUE;
2988		if(vm_page_is_slideable(m)) {
2989			boolean_t was_busy = m->busy;
2990
2991			vm_object_lock_assert_exclusive(m->object);
2992
2993			m->busy = TRUE;
2994			kr = vm_page_slide(m, 0);
2995			assert(m->busy);
2996			if(!was_busy) {
2997				PAGE_WAKEUP_DONE(m);
2998			}
2999			if (kr != KERN_SUCCESS) {
3000				/*
3001				 * This page has not been slid correctly,
3002				 * do not do the pmap_enter() !
3003				 * Let vm_fault_enter() return the error
3004				 * so the caller can fail the fault.
3005				 */
3006				goto after_the_pmap_enter;
3007			}
3008		}
3009
3010		if (fault_type & VM_PROT_WRITE) {
3011
3012			if (m->wpmapped == FALSE) {
3013				vm_object_lock_assert_exclusive(m->object);
3014
3015				m->wpmapped = TRUE;
3016			}
3017			if (must_disconnect) {
3018				/*
3019				 * We can only get here
3020				 * because of the CSE logic
3021				 */
3022				assert(cs_enforcement_enabled);
3023				pmap_disconnect(m->phys_page);
3024				/*
3025				 * If we are faulting for a write, we can clear
3026				 * the execute bit - that will ensure the page is
3027				 * checked again before being executable, which
3028				 * protects against a map switch.
3029				 * This only happens the first time the page
3030				 * gets tainted, so we won't get stuck here
3031				 * to make an already writeable page executable.
3032				 */
3033				if (!cs_bypass){
3034					prot &= ~VM_PROT_EXECUTE;
3035				}
3036			}
3037		}
3038
3039		/* Prevent a deadlock by not
3040		 * holding the object lock if we need to wait for a page in
3041		 * pmap_enter() - <rdar://problem/7138958> */
3042		PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3043				  wired, PMAP_OPTIONS_NOWAIT, pe_result);
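		/*
		 * If the nonblocking enter fails with KERN_RESOURCE_SHORTAGE,
		 * there are two ways out below: hand the problem back to the
		 * caller via *need_retry (when the caller also holds the lock
		 * on the top object of the chain), or mark the page busy, drop
		 * this object's lock and redo the mapping with the blocking
		 * PMAP_ENTER().
		 */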
3044
3045		if(pe_result == KERN_RESOURCE_SHORTAGE) {
3046
3047			if (need_retry) {
3048				/*
3049				 * this will be non-null in the case where we hold the lock
3050				 * on the top-object in this chain... we can't just drop
3051				 * the lock on the object we're inserting the page into
3052				 * and recall the PMAP_ENTER since we can still cause
3053				 * a deadlock if one of the critical paths tries to
3054				 * acquire the lock on the top-object and we're blocked
3055				 * in PMAP_ENTER waiting for memory... our only recourse
3056				 * is to deal with it at a higher level where we can
3057				 * drop both locks.
3058				 */
3059				*need_retry = TRUE;
3060				vm_pmap_enter_retried++;
3061				goto after_the_pmap_enter;
3062			}
3063			/* The nonblocking version of pmap_enter did not succeed,
3064			 * and we don't need to drop other locks and retry
3065			 * at the level above us, so
3066			 * use the blocking version instead.  This requires marking
3067			 * the page busy and unlocking the object. */
3068			boolean_t was_busy = m->busy;
3069
3070			vm_object_lock_assert_exclusive(m->object);
3071
3072			m->busy = TRUE;
3073			vm_object_unlock(m->object);
3074
3075			PMAP_ENTER(pmap, vaddr, m, prot, fault_type, 0, wired);
3076
3077			/* Take the object lock again. */
3078			vm_object_lock(m->object);
3079
3080			/* If the page was busy, someone else will wake it up.
3081			 * Otherwise, we have to do it now. */
3082			assert(m->busy);
3083			if(!was_busy) {
3084				PAGE_WAKEUP_DONE(m);
3085			}
3086			vm_pmap_enter_blocked++;
3087		}
3088	}
3089
3090after_the_pmap_enter:
3091	return kr;
3092}
3093
3094
3095/*
3096 *	Routine:	vm_fault
3097 *	Purpose:
3098 *		Handle page faults, including pseudo-faults
3099 *		used to change the wiring status of pages.
3100 *	Returns:
3101 *		Explicit continuations have been removed.
3102 *	Implementation:
3103 *		vm_fault and vm_fault_page save mucho state
3104 *		in the moral equivalent of a closure.  The state
3105 *		structure is allocated when first entering vm_fault
3106 *		and deallocated when leaving vm_fault.
3107 */
3108
3109extern int _map_enter_debug;
3110
3111unsigned long vm_fault_collapse_total = 0;
3112unsigned long vm_fault_collapse_skipped = 0;
3113
3114
3115kern_return_t
3116vm_fault(
3117	vm_map_t	map,
3118	vm_map_offset_t	vaddr,
3119	vm_prot_t	fault_type,
3120	boolean_t	change_wiring,
3121	int		interruptible,
3122	pmap_t		caller_pmap,
3123	vm_map_offset_t	caller_pmap_addr)
3124{
3125	vm_map_version_t	version;	/* Map version for verification */
3126	boolean_t		wired;		/* Should mapping be wired down? */
3127	vm_object_t		object;		/* Top-level object */
3128	vm_object_offset_t	offset;		/* Top-level offset */
3129	vm_prot_t		prot;		/* Protection for mapping */
3130	vm_object_t		old_copy_object; /* Saved copy object */
3131	vm_page_t		result_page;	/* Result of vm_fault_page */
3132	vm_page_t		top_page;	/* Placeholder page */
3133	kern_return_t		kr;
3134
3135	vm_page_t		m;	/* Fast access to result_page */
3136	kern_return_t		error_code;
3137	vm_object_t		cur_object;
3138	vm_object_offset_t	cur_offset;
3139	vm_page_t		cur_m;
3140	vm_object_t		new_object;
3141	int                     type_of_fault;
3142	pmap_t			pmap;
3143	boolean_t		interruptible_state;
3144	vm_map_t		real_map = map;
3145	vm_map_t		original_map = map;
3146	vm_prot_t		original_fault_type;
3147	struct vm_object_fault_info fault_info;
3148	boolean_t		need_collapse = FALSE;
3149	boolean_t		need_retry = FALSE;
3150	boolean_t		*need_retry_ptr = NULL;
3151	int			object_lock_type = 0;
3152	int			cur_object_lock_type;
3153	vm_object_t		top_object = VM_OBJECT_NULL;
3154	int			throttle_delay;
3155
3156
3157	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3158	              (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3159			      ((uint64_t)vaddr >> 32),
3160			      vaddr,
3161			      (map == kernel_map),
3162			      0,
3163			      0);
3164
3165	if (get_preemption_level() != 0) {
3166	        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3167				      (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3168				      ((uint64_t)vaddr >> 32),
3169				      vaddr,
3170				      KERN_FAILURE,
3171				      0,
3172				      0);
3173
3174		return (KERN_FAILURE);
3175	}
3176
3177	interruptible_state = thread_interrupt_level(interruptible);
3178
3179	VM_STAT_INCR(faults);
3180	current_task()->faults++;
3181	original_fault_type = fault_type;
3182
3183	if (fault_type & VM_PROT_WRITE)
3184	        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3185	else
3186	        object_lock_type = OBJECT_LOCK_SHARED;
3187
3188	cur_object_lock_type = OBJECT_LOCK_SHARED;
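	/*
	 * Lock strategy for the fast path: the top object is taken
	 * exclusive only for write faults, and lower (shadow) objects
	 * always start out shared.  The loop below upgrades a lock on
	 * demand; a failed upgrade on cur_object forces a full RetryFault,
	 * while a failed upgrade on the top object just re-takes it
	 * exclusively and re-looks up the page.
	 */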
3189
3190RetryFault:
3191	/*
3192	 * assume we will hit a page in the cache;
3193	 * otherwise, explicitly override with
3194	 * the real fault type once we determine it
3195	 */
3196	type_of_fault = DBG_CACHE_HIT_FAULT;
3197
3198	/*
3199	 *	Find the backing store object and offset into
3200	 *	it to begin the search.
3201	 */
3202	fault_type = original_fault_type;
3203	map = original_map;
3204	vm_map_lock_read(map);
3205
3206	kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3207				  object_lock_type, &version,
3208				  &object, &offset, &prot, &wired,
3209				  &fault_info,
3210				  &real_map);
3211
3212	if (kr != KERN_SUCCESS) {
3213		vm_map_unlock_read(map);
3214		goto done;
3215	}
3216	pmap = real_map->pmap;
3217	fault_info.interruptible = interruptible;
3218	fault_info.stealth = FALSE;
3219	fault_info.io_sync = FALSE;
3220	fault_info.mark_zf_absent = FALSE;
3221	fault_info.batch_pmap_op = FALSE;
3222
3223	/*
3224	 * If the page is wired, we must fault for the current protection
3225	 * value, to avoid further faults.
3226	 */
3227	if (wired) {
3228		fault_type = prot | VM_PROT_WRITE;
3229		/*
3230		 * since we're treating this fault as a 'write'
3231		 * we must hold the top object lock exclusively
3232		 */
3233		if (object_lock_type == OBJECT_LOCK_SHARED) {
3234
3235		        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3236
3237			if (vm_object_lock_upgrade(object) == FALSE) {
3238			        /*
3239				 * couldn't upgrade, so explicitly
3240				 * take the lock exclusively
3241				 */
3242			        vm_object_lock(object);
3243			}
3244		}
3245	}
3246
3247#if	VM_FAULT_CLASSIFY
3248	/*
3249	 *	Temporary data gathering code
3250	 */
3251	vm_fault_classify(object, offset, fault_type);
3252#endif
3253	/*
3254	 *	Fast fault code.  The basic idea is to do as much as
3255	 *	possible while holding the map lock and object locks.
3256	 *      Busy pages are not used until the object lock has to
3257	 *	be dropped to do something (copy, zero fill, pmap enter).
3258	 *	Similarly, paging references aren't acquired until that
3259	 *	point, and object references aren't used.
3260	 *
3261	 *	If we can figure out what to do
3262	 *	(zero fill, copy on write, pmap enter) while holding
3263	 *	the locks, then it gets done.  Otherwise, we give up,
3264	 *	and use the original fault path (which doesn't hold
3265	 *	the map lock, and relies on busy pages).
3266	 *	The give up cases include:
3267	 * 		- Have to talk to pager.
3268	 *		- Page is busy, absent or in error.
3269	 *		- Pager has locked out desired access.
3270	 *		- Fault needs to be restarted.
3271	 *		- Have to push page into copy object.
3272	 *
3273	 *	The code is an infinite loop that moves one level down
3274	 *	the shadow chain each time.  cur_object and cur_offset
3275	 * 	refer to the current object being examined. object and offset
3276	 *	are the original object from the map.  The loop is at the
3277	 *	top level if and only if object and cur_object are the same.
3278	 *
3279	 *	Invariants:  Map lock is held throughout.  Lock is held on
3280	 *		original object and cur_object (if different) when
3281	 *		continuing or exiting loop.
3282	 *
3283	 */
3284
3285
3286	/*
3287	 * If this page is to be inserted in a copy delay object
3288	 * for writing, and if the object has a copy, then the
3289	 * copy delay strategy is implemented in the slow fault path (vm_fault_page).
3290	 */
3291	if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3292	    object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
3293	        goto handle_copy_delay;
3294
3295	cur_object = object;
3296	cur_offset = offset;
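	/*
	 * Start the walk at the top of the shadow chain; each pass through
	 * the loop below either handles the fault at the current level or
	 * drops down one level, keeping both 'object' (the top) and
	 * 'cur_object' (the current level) locked as described above.
	 */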
3297
3298	while (TRUE) {
3299		if (!cur_object->pager_created &&
3300		    cur_object->phys_contiguous) /* superpage */
3301			break;
3302
3303		if (cur_object->blocked_access) {
3304			/*
3305			 * Access to this VM object has been blocked.
3306			 * Let the slow path handle it.
3307			 */
3308			break;
3309		}
3310
3311		m = vm_page_lookup(cur_object, cur_offset);
3312
3313		if (m != VM_PAGE_NULL) {
3314			if (m->busy) {
3315			        wait_result_t	result;
3316
3317				/*
3318				 * in order to do the PAGE_ASSERT_WAIT, we must
3319				 * have the object that 'm' belongs to locked exclusively
3320				 */
3321				if (object != cur_object) {
3322
3323					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3324
3325					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3326
3327						if (vm_object_lock_upgrade(cur_object) == FALSE) {
3328						        /*
3329							 * couldn't upgrade so go do a full retry
3330							 * immediately since we can no longer be
3331							 * certain about cur_object (since we
3332							 * don't hold a reference on it)...
3333							 * first drop the top object lock
3334							 */
3335							vm_object_unlock(object);
3336
3337						        vm_map_unlock_read(map);
3338							if (real_map != map)
3339							        vm_map_unlock(real_map);
3340
3341							goto RetryFault;
3342						}
3343					}
3344				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3345
3346				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3347
3348					if (vm_object_lock_upgrade(object) == FALSE) {
3349					        /*
3350						 * couldn't upgrade, so explicitly take the lock
3351						 * exclusively and go relookup the page since we
3352						 * will have dropped the object lock and
3353						 * a different thread could have inserted
3354						 * a page at this offset
3355						 * no need for a full retry since we're
3356						 * at the top level of the object chain
3357						 */
3358					        vm_object_lock(object);
3359
3360						continue;
3361					}
3362				}
3363				if (m->pageout_queue && m->object->internal && COMPRESSED_PAGER_IS_ACTIVE) {
3364					/*
3365					 * m->busy == TRUE and the object is locked exclusively;
3366					 * if m->pageout_queue == TRUE after we acquire the
3367					 * queues lock, we are guaranteed that it is stable on
3368					 * the pageout queue and therefore reclaimable
3369					 *
3370					 * NOTE: this is only true for the internal pageout queue
3371					 * in the compressor world
3372					 */
3373					vm_page_lock_queues();
3374
3375					if (m->pageout_queue) {
3376						vm_pageout_throttle_up(m);
3377						vm_page_unlock_queues();
3378
3379						PAGE_WAKEUP_DONE(m);
3380						goto reclaimed_from_pageout;
3381					}
3382					vm_page_unlock_queues();
3383				}
3384				if (object != cur_object)
3385					vm_object_unlock(object);
3386
3387				vm_map_unlock_read(map);
3388				if (real_map != map)
3389				        vm_map_unlock(real_map);
3390
3391				result = PAGE_ASSERT_WAIT(m, interruptible);
3392
3393				vm_object_unlock(cur_object);
3394
3395				if (result == THREAD_WAITING) {
3396				        result = thread_block(THREAD_CONTINUE_NULL);
3397
3398					counter(c_vm_fault_page_block_busy_kernel++);
3399				}
3400				if (result == THREAD_AWAKENED || result == THREAD_RESTART)
3401				        goto RetryFault;
3402
3403				kr = KERN_ABORTED;
3404				goto done;
3405			}
3406reclaimed_from_pageout:
3407			if (m->laundry) {
3408				if (object != cur_object) {
3409					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3410						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3411
3412						vm_object_unlock(object);
3413						vm_object_unlock(cur_object);
3414
3415						vm_map_unlock_read(map);
3416						if (real_map != map)
3417							vm_map_unlock(real_map);
3418
3419						goto RetryFault;
3420					}
3421
3422				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3423
3424					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3425
3426					if (vm_object_lock_upgrade(object) == FALSE) {
3427						/*
3428						 * couldn't upgrade, so explicitly take the lock
3429						 * exclusively and go relookup the page since we
3430						 * will have dropped the object lock and
3431						 * a different thread could have inserted
3432						 * a page at this offset
3433						 * no need for a full retry since we're
3434						 * at the top level of the object chain
3435						 */
3436						vm_object_lock(object);
3437
3438						continue;
3439					}
3440				}
3441				m->pageout = FALSE;
3442
3443				vm_pageout_steal_laundry(m, FALSE);
3444			}
3445
3446			if (m->phys_page == vm_page_guard_addr) {
3447				/*
3448				 * Guard page: let the slow path deal with it
3449				 */
3450				break;
3451			}
3452			if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3453			        /*
3454				 * Unusual case... let the slow path deal with it
3455				 */
3456				break;
3457			}
3458			if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3459				if (object != cur_object)
3460					vm_object_unlock(object);
3461				vm_map_unlock_read(map);
3462				if (real_map != map)
3463				        vm_map_unlock(real_map);
3464				vm_object_unlock(cur_object);
3465				kr = KERN_MEMORY_ERROR;
3466				goto done;
3467			}
3468
3469			if (m->encrypted) {
3470				/*
3471				 * ENCRYPTED SWAP:
3472				 * We've soft-faulted (because it's not in the page
3473				 * table) on an encrypted page.
3474				 * Keep the page "busy" so that no one messes with
3475				 * it during the decryption.
3476				 * Release the extra locks we're holding, keep only
3477				 * the page's VM object lock.
3478				 *
3479				 * in order to set 'busy' on 'm', we must
3480				 * have the object that 'm' belongs to locked exclusively
3481				 */
3482			        if (object != cur_object) {
3483					vm_object_unlock(object);
3484
3485					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3486
3487					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3488
3489						if (vm_object_lock_upgrade(cur_object) == FALSE) {
3490						        /*
3491							 * couldn't upgrade so go do a full retry
3492							 * immediately since we've already dropped
3493							 * the top object lock associated with this page
3494							 * and the current one got dropped due to the
3495							 * failed upgrade... the state is no longer valid
3496							 */
3497						        vm_map_unlock_read(map);
3498							if (real_map != map)
3499							        vm_map_unlock(real_map);
3500
3501							goto RetryFault;
3502						}
3503					}
3504				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3505
3506				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3507
3508					if (vm_object_lock_upgrade(object) == FALSE) {
3509					        /*
3510						 * couldn't upgrade, so explicitly take the lock
3511						 * exclusively and go relookup the page since we
3512						 * will have dropped the object lock and
3513						 * a different thread could have inserted
3514						 * a page at this offset
3515						 * no need for a full retry since we're
3516						 * at the top level of the object chain
3517						 */
3518					        vm_object_lock(object);
3519
3520						continue;
3521					}
3522				}
3523				m->busy = TRUE;
3524
3525				vm_map_unlock_read(map);
3526				if (real_map != map)
3527					vm_map_unlock(real_map);
3528
3529				vm_page_decrypt(m, 0);
3530
3531				assert(m->busy);
3532				PAGE_WAKEUP_DONE(m);
3533
3534				vm_object_unlock(cur_object);
3535				/*
3536				 * Retry from the top, in case anything
3537				 * changed while we were decrypting...
3538				 */
3539				goto RetryFault;
3540			}
3541			ASSERT_PAGE_DECRYPTED(m);
3542
3543			if(vm_page_is_slideable(m)) {
3544				/*
3545				 * We might need to slide this page, and so,
3546				 * we want to hold the VM object exclusively.
3547				 */
3548			        if (object != cur_object) {
3549					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3550						vm_object_unlock(object);
3551						vm_object_unlock(cur_object);
3552
3553					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3554
3555						vm_map_unlock_read(map);
3556						if (real_map != map)
3557							vm_map_unlock(real_map);
3558
3559						goto RetryFault;
3560					}
3561				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3562
3563					vm_object_unlock(object);
3564				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3565					vm_map_unlock_read(map);
3566					goto RetryFault;
3567				}
3568			}
3569
3570			if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
3571upgrade_for_validation:
3572				/*
3573				 * We might need to validate this page
3574				 * against its code signature, so we
3575				 * want to hold the VM object exclusively.
3576				 */
3577			        if (object != cur_object) {
3578					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3579						vm_object_unlock(object);
3580						vm_object_unlock(cur_object);
3581
3582					        cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3583
3584						vm_map_unlock_read(map);
3585						if (real_map != map)
3586							vm_map_unlock(real_map);
3587
3588						goto RetryFault;
3589					}
3590
3591				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3592
3593				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3594
3595					if (vm_object_lock_upgrade(object) == FALSE) {
3596					        /*
3597						 * couldn't upgrade, so explicitly take the lock
3598						 * exclusively and go relookup the page since we
3599						 * will have dropped the object lock and
3600						 * a different thread could have inserted
3601						 * a page at this offset
3602						 * no need for a full retry since we're
3603						 * at the top level of the object chain
3604						 */
3605					        vm_object_lock(object);
3606
3607						continue;
3608					}
3609				}
3610			}
3611			/*
3612			 *	Two cases of map-in faults:
3613			 *	    - At top level w/o copy object.
3614			 *	    - Read fault anywhere.
3615			 *		--> must disallow write.
3616			 */
3617
3618			if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3619
3620				goto FastPmapEnter;
3621			}
3622
3623			if ((fault_type & VM_PROT_WRITE) == 0) {
3624
3625			  	if (object != cur_object) {
3626				        /*
3627					 * We still need to hold the top object
3628					 * lock here to prevent a race between
3629					 * a read fault (taking only "shared"
3630					 * locks) and a write fault (taking
3631					 * an "exclusive" lock on the top
3632					 * object).
3633					 * Otherwise, as soon as we release the
3634					 * top lock, the write fault could
3635					 * proceed and actually complete before
3636					 * the read fault, and the copied page's
3637					 * translation could then be overwritten
3638					 * by the read fault's translation for
3639					 * the original page.
3640					 *
3641					 * Let's just record what the top object
3642					 * is and we'll release it later.
3643					 */
3644					top_object = object;
3645
3646					/*
3647					 * switch to the object that has the new page
3648					 */
3649					object = cur_object;
3650					object_lock_type = cur_object_lock_type;
3651				}
3652FastPmapEnter:
3653				/*
3654				 * prepare for the pmap_enter...
3655				 * object and map are both locked
3656				 * m contains valid data
3657				 * object == m->object
3658				 * cur_object == NULL or it's been unlocked
3659				 * no paging references on either object or cur_object
3660				 */
3661				if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE)
3662					need_retry_ptr = &need_retry;
3663				else
3664					need_retry_ptr = NULL;
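
				/*
				 * Note: we only ask vm_fault_enter() to report
				 * "need_retry" (presumably so it won't block in
				 * the pmap layer) when we are in a state where
				 * blocking would be unsafe: we still hold the
				 * top object lock recorded above, or we hold the
				 * object lock only shared.  The retry itself is
				 * handled further down, after all locks are dropped.
				 */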
3665
3666				if (caller_pmap) {
3667				        kr = vm_fault_enter(m,
3668							    caller_pmap,
3669							    caller_pmap_addr,
3670							    prot,
3671							    fault_type,
3672							    wired,
3673							    change_wiring,
3674							    fault_info.no_cache,
3675							    fault_info.cs_bypass,
3676							    need_retry_ptr,
3677							    &type_of_fault);
3678				} else {
3679				        kr = vm_fault_enter(m,
3680							    pmap,
3681							    vaddr,
3682							    prot,
3683							    fault_type,
3684							    wired,
3685							    change_wiring,
3686							    fault_info.no_cache,
3687							    fault_info.cs_bypass,
3688							    need_retry_ptr,
3689							    &type_of_fault);
3690				}
3691
3692				if (top_object != VM_OBJECT_NULL) {
3693					/*
3694					 * It's safe to drop the top object
3695					 * now that we've done our
3696					 * vm_fault_enter().  Any other fault
3697					 * in progress for that virtual
3698					 * address will either find our page
3699					 * and translation or put in a new page
3700					 * and translation.
3701					 */
3702					vm_object_unlock(top_object);
3703					top_object = VM_OBJECT_NULL;
3704				}
3705
3706				if (need_collapse == TRUE)
3707				        vm_object_collapse(object, offset, TRUE);
3708
3709				if (need_retry == FALSE &&
3710				    (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3711				        /*
3712					 * evaluate access pattern and update state
3713					 * vm_fault_deactivate_behind depends on the
3714					 * state being up to date
3715					 */
3716				        vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3717
3718					vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3719				}
3720				/*
3721				 * That's it, clean up and return.
3722				 */
3723				if (m->busy)
3724				        PAGE_WAKEUP_DONE(m);
3725
3726				vm_object_unlock(object);
3727
3728				vm_map_unlock_read(map);
3729				if (real_map != map)
3730					vm_map_unlock(real_map);
3731
3732				if (need_retry == TRUE) {
3733					/*
3734					 * vm_fault_enter couldn't complete the PMAP_ENTER...
3735					 * at this point we don't hold any locks so it's safe
3736					 * to ask the pmap layer to expand the page table to
3737					 * accommodate this mapping... once expanded, we'll
3738					 * re-drive the fault which should result in vm_fault_enter
3739					 * being able to successfully enter the mapping this time around
3740					 */
3741					(void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, PMAP_OPTIONS_NOENTER, NULL);
3742
3743					need_retry = FALSE;
3744					goto RetryFault;
3745				}
3746				goto done;
3747			}
3748			/*
3749			 * COPY ON WRITE FAULT
3750			 */
3751			assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3752
3753			if ((throttle_delay = vm_page_throttled())) {
3754				/*
3755				 * drop all of our locks...
3756				 * wait until the free queue is
3757				 * pumped back up and then
3758				 * redrive the fault
3759				 */
3760				if (object != cur_object)
3761					vm_object_unlock(cur_object);
3762				vm_object_unlock(object);
3763				vm_map_unlock_read(map);
3764				if (real_map != map)
3765					vm_map_unlock(real_map);
3766
3767				VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3768
3769				delay(throttle_delay);
3770
3771				if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3772						 THREAD_UNINT :
3773						 THREAD_ABORTSAFE))
3774					goto RetryFault;
3775				kr = KERN_ABORTED;
3776				goto done;
3777			}
3778                        /*
3779			 * If objects match, then
3780			 * object->copy must not be NULL (else control
3781			 * would be in the previous code block), and we
3782			 * have a potential push into the copy object
3783			 * which we can't cope with here.
3784			 */
3785			if (cur_object == object) {
3786			        /*
3787				 * must take the slow path to
3788				 * deal with the copy push
3789				 */
3790				break;
3791			}
3792
3793			/*
3794			 * This is now a shadow based copy on write
3795			 * fault -- it requires a copy up the shadow
3796			 * chain.
3797			 */
3798
3799			if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3800			    VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3801				goto upgrade_for_validation;
3802			}
3803
3804			/*
3805			 * Allocate a page in the original top level
3806			 * object. Give up if allocate fails.  Also
3807			 * need to remember current page, as it's the
3808			 * source of the copy.
3809			 *
3810			 * at this point we hold locks on both
3811			 * object and cur_object... no need to take
3812			 * paging refs or mark pages BUSY since
3813			 * we don't drop either object lock until
3814			 * the page has been copied and inserted
3815			 */
3816			cur_m = m;
3817			m = vm_page_grab();
3818
3819			if (m == VM_PAGE_NULL) {
3820			        /*
3821				 * no free page currently available...
3822				 * must take the slow path
3823				 */
3824				break;
3825			}
3826			/*
3827			 * Now do the copy.  Mark the source page busy...
3828			 *
3829			 *	NOTE: This code holds the map lock across
3830			 *	the page copy.
3831			 */
3832			vm_page_copy(cur_m, m);
3833			vm_page_insert(m, object, offset);
3834			SET_PAGE_DIRTY(m, FALSE);
3835
3836			/*
3837			 * Now cope with the source page and object
3838			 */
3839			if (object->ref_count > 1 && cur_m->pmapped)
3840			        pmap_disconnect(cur_m->phys_page);
3841
3842			need_collapse = TRUE;
3843
3844			if (!cur_object->internal &&
3845			    cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3846			        /*
3847				 * The object from which we've just
3848				 * copied a page is most probably backed
3849				 * by a vnode.  We don't want to waste too
3850				 * much time trying to collapse the VM objects
3851				 * and create a bottleneck when several tasks
3852				 * map the same file.
3853				 */
3854			        if (cur_object->copy == object) {
3855				        /*
3856					 * Shared mapping or no COW yet.
3857					 * We can never collapse a copy
3858					 * object into its backing object.
3859					 */
3860				        need_collapse = FALSE;
3861				} else if (cur_object->copy == object->shadow &&
3862					   object->shadow->resident_page_count == 0) {
3863				        /*
3864					 * Shared mapping after a COW occurred.
3865					 */
3866				        need_collapse = FALSE;
3867				}
3868			}
3869			vm_object_unlock(cur_object);
3870
3871			if (need_collapse == FALSE)
3872			        vm_fault_collapse_skipped++;
3873			vm_fault_collapse_total++;
3874
3875			type_of_fault = DBG_COW_FAULT;
3876			VM_STAT_INCR(cow_faults);
3877			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
3878			current_task()->cow_faults++;
3879
3880			goto FastPmapEnter;
3881
3882		} else {
3883			/*
3884			 * No page at cur_object, cur_offset... m == NULL
3885			 */
3886			if (cur_object->pager_created) {
3887				int	compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
3888
3889			        if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
3890					int		my_fault_type;
3891					int		c_flags = C_DONT_BLOCK;
3892					boolean_t	insert_cur_object = FALSE;
3893
3894				        /*
3895					 * May have to talk to a pager...
3896					 * if so, take the slow path by
3897					 * doing a 'break' from the while (TRUE) loop
3898					 *
3899					 * compressor_external_state will only be set to VM_EXTERNAL_STATE_EXISTS
3900					 * if the compressor is active and the page exists there
3901					 */
3902					if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS)
3903						break;
3904
3905					if (map == kernel_map || real_map == kernel_map) {
3906						/*
3907						 * can't call into the compressor with the kernel_map
3908						 * lock held, since the compressor may try to operate
3909						 * on the kernel map in order to return an empty c_segment
3910						 */
3911						break;
3912					}
3913					if (object != cur_object) {
3914						if (fault_type & VM_PROT_WRITE)
3915							c_flags |= C_KEEP;
3916						else
3917							insert_cur_object = TRUE;
3918					}
3919					if (insert_cur_object == TRUE) {
3920
3921						if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3922
3923							cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3924
3925							if (vm_object_lock_upgrade(cur_object) == FALSE) {
3926								/*
3927								 * couldn't upgrade so go do a full retry
3928								 * immediately since we can no longer be
3929								 * certain about cur_object (since we
3930								 * don't hold a reference on it)...
3931								 * first drop the top object lock
3932								 */
3933								vm_object_unlock(object);
3934
3935								vm_map_unlock_read(map);
3936								if (real_map != map)
3937									vm_map_unlock(real_map);
3938
3939								goto RetryFault;
3940							}
3941						}
3942					} else if (object_lock_type == OBJECT_LOCK_SHARED) {
3943
3944						object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3945
3946						if (object != cur_object) {
3947							/*
3948							 * we can't go for the upgrade on the top
3949							 * lock since the upgrade may block waiting
3950							 * for readers to drain... since we hold
3951							 * cur_object locked at this point, waiting
3952							 * for the readers to drain would represent
3953							 * a lock order inversion since the lock order
3954							 * for objects is the reference order in the
3955							 * shadow chain
3956							 */
3957							vm_object_unlock(object);
3958							vm_object_unlock(cur_object);
3959
3960							vm_map_unlock_read(map);
3961							if (real_map != map)
3962								vm_map_unlock(real_map);
3963
3964							goto RetryFault;
3965						}
3966						if (vm_object_lock_upgrade(object) == FALSE) {
3967							/*
3968							 * couldn't upgrade, so explicitly take the lock
3969							 * exclusively and go relookup the page since we
3970							 * will have dropped the object lock and
3971							 * a different thread could have inserted
3972							 * a page at this offset
3973							 * no need for a full retry since we're
3974							 * at the top level of the object chain
3975							 */
3976							vm_object_lock(object);
3977
3978							continue;
3979						}
3980					}
3981					m = vm_page_grab();
3982
3983					if (m == VM_PAGE_NULL) {
3984						/*
3985						 * no free page currently available...
3986						 * must take the slow path
3987						 */
3988						break;
3989					}
3990					if (vm_compressor_pager_get(cur_object->pager, cur_offset + cur_object->paging_offset,
3991								    m->phys_page, &my_fault_type, c_flags) != KERN_SUCCESS) {
3992						vm_page_release(m);
3993						break;
3994					}
3995					m->dirty = TRUE;
3996
3997					if (insert_cur_object)
3998						vm_page_insert(m, cur_object, cur_offset);
3999					else
4000						vm_page_insert(m, object, offset);
4001
4002					if ((m->object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4003                                                /*
4004						 * If the page is not cacheable,
4005						 * we can't let its contents
4006						 * linger in the data cache
4007						 * after the decompression.
4008						 */
4009						pmap_sync_page_attributes_phys(m->phys_page);
4010					}
4011					type_of_fault = my_fault_type;
4012
4013					VM_STAT_INCR(decompressions);
4014
4015					if (cur_object != object) {
4016						if (insert_cur_object) {
4017							top_object = object;
4018							/*
4019							 * switch to the object that has the new page
4020							 */
4021							object = cur_object;
4022							object_lock_type = cur_object_lock_type;
4023						} else {
4024							vm_object_unlock(cur_object);
4025							cur_object = object;
4026						}
4027					}
4028					goto FastPmapEnter;
4029				}
4030				/*
4031				 * existence map present and indicates
4032				 * that the pager doesn't have this page
4033				 */
4034			}
4035			if (cur_object->shadow == VM_OBJECT_NULL) {
4036				/*
4037				 * Zero fill fault.  Page gets
4038				 * inserted into the original object.
4039				 */
4040				if (cur_object->shadow_severed ||
4041				    VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
4042				{
4043					if (object != cur_object)
4044					        vm_object_unlock(cur_object);
4045					vm_object_unlock(object);
4046
4047					vm_map_unlock_read(map);
4048					if (real_map != map)
4049						vm_map_unlock(real_map);
4050
4051					kr = KERN_MEMORY_ERROR;
4052					goto done;
4053				}
4054				if ((throttle_delay = vm_page_throttled())) {
4055					/*
4056					 * drop all of our locks...
4057					 * wait until the free queue is
4058					 * pumped back up and then
4059					 * redrive the fault
4060					 */
4061					if (object != cur_object)
4062						vm_object_unlock(cur_object);
4063					vm_object_unlock(object);
4064					vm_map_unlock_read(map);
4065					if (real_map != map)
4066						vm_map_unlock(real_map);
4067
4068					VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4069
4070					delay(throttle_delay);
4071
4072					if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
4073							 THREAD_UNINT :
4074							 THREAD_ABORTSAFE))
4075						goto RetryFault;
4076					kr = KERN_ABORTED;
4077					goto done;
4078				}
4079				if (vm_backing_store_low) {
4080				        /*
4081					 * we are protecting the system from
4082					 * backing store exhaustion...
4083					 * must take the slow path if we're
4084					 * not privileged
4085					 */
4086					if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
4087					        break;
4088				}
4089			  	if (cur_object != object) {
4090					vm_object_unlock(cur_object);
4091
4092					cur_object = object;
4093				}
4094				if (object_lock_type == OBJECT_LOCK_SHARED) {
4095
4096				        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4097
4098					if (vm_object_lock_upgrade(object) == FALSE) {
4099					        /*
4100						 * couldn't upgrade so do a full retry on the fault
4101						 * since we dropped the object lock which
4102						 * could allow another thread to insert
4103						 * a page at this offset
4104						 */
4105					        vm_map_unlock_read(map);
4106						if (real_map != map)
4107						        vm_map_unlock(real_map);
4108
4109						goto RetryFault;
4110					}
4111				}
4112				m = vm_page_alloc(object, offset);
4113
4114				if (m == VM_PAGE_NULL) {
4115				        /*
4116					 * no free page currently available...
4117					 * must take the slow path
4118					 */
4119					break;
4120				}
4121
4122				/*
4123				 * Now zero fill page...
4124				 * the page is probably going to
4125				 * be written soon, so don't bother
4126				 * to clear the modified bit
4127				 *
4128				 *   NOTE: This code holds the map
4129				 *   lock across the zero fill.
4130				 */
4131				type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4132
4133				goto FastPmapEnter;
4134		        }
4135			/*
4136			 * On to the next level in the shadow chain
4137			 */
4138			cur_offset += cur_object->vo_shadow_offset;
4139			new_object = cur_object->shadow;
4140
4141			/*
4142			 * take the new_object's lock with the indicated state
4143			 */
4144			if (cur_object_lock_type == OBJECT_LOCK_SHARED)
4145			        vm_object_lock_shared(new_object);
4146			else
4147			        vm_object_lock(new_object);
4148
4149			if (cur_object != object)
4150				vm_object_unlock(cur_object);
4151
4152			cur_object = new_object;
4153
4154			continue;
4155		}
4156	}
4157	/*
4158	 * Cleanup from fast fault failure.  Drop any object
4159	 * lock other than original and drop map lock.
4160	 */
4161	if (object != cur_object)
4162		vm_object_unlock(cur_object);
4163
4164	/*
4165	 * must own the object lock exclusively at this point
4166	 */
4167	if (object_lock_type == OBJECT_LOCK_SHARED) {
4168	        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4169
4170		if (vm_object_lock_upgrade(object) == FALSE) {
4171		        /*
4172			 * couldn't upgrade, so explicitly
4173			 * take the lock exclusively
4174			 * no need to retry the fault at this
4175			 * point since "vm_fault_page" will
4176			 * completely re-evaluate the state
4177			 */
4178		        vm_object_lock(object);
4179		}
4180	}
4181
4182handle_copy_delay:
4183	vm_map_unlock_read(map);
4184	if (real_map != map)
4185		vm_map_unlock(real_map);
4186
4187   	/*
4188	 * Make a reference to this object to
4189	 * prevent its disposal while we are messing with
4190	 * it.  Once we have the reference, the map is free
4191	 * to be diddled.  Since objects reference their
4192	 * shadows (and copies), they will stay around as well.
4193	 */
4194	vm_object_reference_locked(object);
4195	vm_object_paging_begin(object);
4196
4197	XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
4198
4199	error_code = 0;
4200
4201	result_page = VM_PAGE_NULL;
4202	kr = vm_fault_page(object, offset, fault_type,
4203			   (change_wiring && !wired),
4204			   FALSE, /* page not looked up */
4205			   &prot, &result_page, &top_page,
4206			   &type_of_fault,
4207			   &error_code, map->no_zero_fill,
4208			   FALSE, &fault_info);
4209
4210	/*
4211	 * if kr != VM_FAULT_SUCCESS, then the paging reference
4212	 * has been dropped and the object unlocked... the ref_count
4213	 * is still held
4214	 *
4215	 * if kr == VM_FAULT_SUCCESS, then the paging reference
4216	 * is still held along with the ref_count on the original object
4217	 *
4218	 *	the object is returned locked with a paging reference
4219	 *
4220	 *	if top_page != NULL, then it's BUSY and the
4221	 *	object it belongs to has a paging reference
4222	 *	but is returned unlocked
4223	 */
4224	if (kr != VM_FAULT_SUCCESS &&
4225	    kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4226	        /*
4227		 * we didn't succeed, lose the object reference immediately.
4228		 */
4229		vm_object_deallocate(object);
4230
4231		/*
4232		 * See why we failed, and take corrective action.
4233		 */
4234		switch (kr) {
4235		case VM_FAULT_MEMORY_SHORTAGE:
4236			if (vm_page_wait((change_wiring) ?
4237					 THREAD_UNINT :
4238					 THREAD_ABORTSAFE))
4239				goto RetryFault;
4240			/*
4241			 * fall thru
4242			 */
4243		case VM_FAULT_INTERRUPTED:
4244			kr = KERN_ABORTED;
4245			goto done;
4246		case VM_FAULT_RETRY:
4247			goto RetryFault;
4248		case VM_FAULT_MEMORY_ERROR:
4249			if (error_code)
4250				kr = error_code;
4251			else
4252				kr = KERN_MEMORY_ERROR;
4253			goto done;
4254		default:
4255			panic("vm_fault: unexpected error 0x%x from "
4256			      "vm_fault_page()\n", kr);
4257		}
4258	}
4259	m = result_page;
4260
4261	if (m != VM_PAGE_NULL) {
4262		assert((change_wiring && !wired) ?
4263	   	    (top_page == VM_PAGE_NULL) :
4264	   	    ((top_page == VM_PAGE_NULL) == (m->object == object)));
4265	}
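
	/*
	 * Sanity check on vm_fault_page()'s result: in the unwire case
	 * (change_wiring && !wired) no copy should have been pushed, so
	 * top_page must be NULL; otherwise top_page is NULL exactly when
	 * the returned page belongs to the original top-level object.
	 */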
4266
4267	/*
4268	 * What to do with the resulting page from vm_fault_page
4269	 * if it doesn't get entered into the physical map:
4270	 */
4271#define RELEASE_PAGE(m)					\
4272	MACRO_BEGIN					\
4273	PAGE_WAKEUP_DONE(m);				\
4274	if (!m->active && !m->inactive && !m->throttled) {		\
4275		vm_page_lockspin_queues();				\
4276		if (!m->active && !m->inactive && !m->throttled)	\
4277			vm_page_activate(m);				\
4278		vm_page_unlock_queues();				\
4279	}								\
4280	MACRO_END
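
	/*
	 * RELEASE_PAGE clears "busy", wakes any waiters and, if the page
	 * is not already on one of the paging queues, activates it so the
	 * pageout daemon can find it again.  The queue state is re-checked
	 * under the page queues lock since it may change between the
	 * unlocked test and taking the lock.
	 */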
4281
4282	/*
4283	 * We must verify that the maps have not changed
4284	 * since our last lookup.
4285	 */
4286	if (m != VM_PAGE_NULL) {
4287		old_copy_object = m->object->copy;
4288		vm_object_unlock(m->object);
4289	} else {
4290		old_copy_object = VM_OBJECT_NULL;
4291		vm_object_unlock(object);
4292	}
4293
4294	/*
4295	 * no object locks are held at this point
4296	 */
4297	if ((map != original_map) || !vm_map_verify(map, &version)) {
4298		vm_object_t		retry_object;
4299		vm_object_offset_t	retry_offset;
4300		vm_prot_t		retry_prot;
4301
4302		/*
4303		 * To avoid trying to write_lock the map while another
4304		 * thread has it read_locked (in vm_map_pageable), we
4305		 * do not try for write permission.  If the page is
4306		 * still writable, we will get write permission.  If it
4307		 * is not, or has been marked needs_copy, we enter the
4308		 * mapping without write permission, and will merely
4309		 * take another fault.
4310		 */
4311		map = original_map;
4312		vm_map_lock_read(map);
4313
4314		kr = vm_map_lookup_locked(&map, vaddr,
4315					  fault_type & ~VM_PROT_WRITE,
4316					  OBJECT_LOCK_EXCLUSIVE, &version,
4317					  &retry_object, &retry_offset, &retry_prot,
4318					  &wired,
4319					  &fault_info,
4320					  &real_map);
4321		pmap = real_map->pmap;
4322
4323		if (kr != KERN_SUCCESS) {
4324			vm_map_unlock_read(map);
4325
4326			if (m != VM_PAGE_NULL) {
4327			        /*
4328				 * retake the lock so that
4329				 * we can drop the paging reference
4330				 * in vm_fault_cleanup and do the
4331				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
4332				 */
4333				vm_object_lock(m->object);
4334
4335				RELEASE_PAGE(m);
4336
4337				vm_fault_cleanup(m->object, top_page);
4338			} else {
4339			        /*
4340				 * retake the lock so that
4341				 * we can drop the paging reference
4342				 * in vm_fault_cleanup
4343				 */
4344			        vm_object_lock(object);
4345
4346			        vm_fault_cleanup(object, top_page);
4347			}
4348			vm_object_deallocate(object);
4349
4350			goto done;
4351		}
4352		vm_object_unlock(retry_object);
4353
4354		if ((retry_object != object) || (retry_offset != offset)) {
4355
4356			vm_map_unlock_read(map);
4357			if (real_map != map)
4358				vm_map_unlock(real_map);
4359
4360			if (m != VM_PAGE_NULL) {
4361			        /*
4362				 * retake the lock so that
4363				 * we can drop the paging reference
4364				 * in vm_fault_cleanup and do the
4365				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
4366				 */
4367			        vm_object_lock(m->object);
4368
4369				RELEASE_PAGE(m);
4370
4371				vm_fault_cleanup(m->object, top_page);
4372			} else {
4373			        /*
4374				 * retake the lock so that
4375				 * we can drop the paging reference
4376				 * in vm_fault_cleanup
4377				 */
4378			        vm_object_lock(object);
4379
4380			        vm_fault_cleanup(object, top_page);
4381			}
4382			vm_object_deallocate(object);
4383
4384			goto RetryFault;
4385		}
4386		/*
4387		 * Check whether the protection has changed or the object
4388		 * has been copied while we left the map unlocked.
4389		 */
4390		prot &= retry_prot;
4391	}
4392	if (m != VM_PAGE_NULL) {
4393		vm_object_lock(m->object);
4394
4395		if (m->object->copy != old_copy_object) {
4396		        /*
4397			 * The copy object changed while the top-level object
4398			 * was unlocked, so take away write permission.
4399			 */
4400			prot &= ~VM_PROT_WRITE;
4401		}
4402	} else
4403		vm_object_lock(object);
4404
4405	/*
4406	 * If we want to wire down this page, but no longer have
4407	 * adequate permissions, we must start all over.
4408	 */
4409	if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4410
4411		vm_map_verify_done(map, &version);
4412		if (real_map != map)
4413			vm_map_unlock(real_map);
4414
4415		if (m != VM_PAGE_NULL) {
4416			RELEASE_PAGE(m);
4417
4418			vm_fault_cleanup(m->object, top_page);
4419		} else
4420		        vm_fault_cleanup(object, top_page);
4421
4422		vm_object_deallocate(object);
4423
4424		goto RetryFault;
4425	}
4426	if (m != VM_PAGE_NULL) {
4427		/*
4428		 * Put this page into the physical map.
4429		 * We had to do the unlock above because pmap_enter
4430		 * may cause other faults.  The page may be on
4431		 * the pageout queues.  If the pageout daemon comes
4432		 * across the page, it will remove it from the queues.
4433		 */
4434		if (caller_pmap) {
4435			kr = vm_fault_enter(m,
4436					    caller_pmap,
4437					    caller_pmap_addr,
4438					    prot,
4439					    fault_type,
4440					    wired,
4441					    change_wiring,
4442					    fault_info.no_cache,
4443					    fault_info.cs_bypass,
4444					    NULL,
4445					    &type_of_fault);
4446		} else {
4447			kr = vm_fault_enter(m,
4448					    pmap,
4449					    vaddr,
4450					    prot,
4451					    fault_type,
4452					    wired,
4453					    change_wiring,
4454					    fault_info.no_cache,
4455					    fault_info.cs_bypass,
4456					    NULL,
4457					    &type_of_fault);
4458		}
4459		if (kr != KERN_SUCCESS) {
4460			/* abort this page fault */
4461			vm_map_verify_done(map, &version);
4462			if (real_map != map)
4463				vm_map_unlock(real_map);
4464			PAGE_WAKEUP_DONE(m);
4465			vm_fault_cleanup(m->object, top_page);
4466			vm_object_deallocate(object);
4467			goto done;
4468		}
4469	} else {
4470
4471		vm_map_entry_t		entry;
4472		vm_map_offset_t		laddr;
4473		vm_map_offset_t		ldelta, hdelta;
4474
4475		/*
4476		 * do a pmap block mapping from the physical address
4477		 * in the object
4478		 */
4479
4480#ifdef ppc
4481		/* While we do not worry about execution protection in   */
4482		/* general, certain pages may have instruction execution */
4483		/* disallowed.  We will check here, and if not allowed   */
4484		/* to execute, we return with a protection failure.      */
4485
4486		if ((fault_type & VM_PROT_EXECUTE) &&
4487			(!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
4488
4489			vm_map_verify_done(map, &version);
4490
4491			if (real_map != map)
4492				vm_map_unlock(real_map);
4493
4494			vm_fault_cleanup(object, top_page);
4495			vm_object_deallocate(object);
4496
4497			kr = KERN_PROTECTION_FAILURE;
4498			goto done;
4499		}
4500#endif	/* ppc */
4501
4502		if (real_map != map)
4503			vm_map_unlock(real_map);
4504
4505		if (original_map != map) {
4506			vm_map_unlock_read(map);
4507			vm_map_lock_read(original_map);
4508			map = original_map;
4509		}
4510		real_map = map;
4511
4512		laddr = vaddr;
4513		hdelta = 0xFFFFF000;
4514		ldelta = 0xFFFFF000;
4515
4516		while (vm_map_lookup_entry(map, laddr, &entry)) {
4517			if (ldelta > (laddr - entry->vme_start))
4518				ldelta = laddr - entry->vme_start;
4519			if (hdelta > (entry->vme_end - laddr))
4520				hdelta = entry->vme_end - laddr;
4521			if (entry->is_sub_map) {
4522
4523				laddr = (laddr - entry->vme_start)
4524							+ entry->offset;
4525				vm_map_lock_read(entry->object.sub_map);
4526
4527				if (map != real_map)
4528					vm_map_unlock_read(map);
4529				if (entry->use_pmap) {
4530					vm_map_unlock_read(real_map);
4531					real_map = entry->object.sub_map;
4532				}
4533				map = entry->object.sub_map;
4534
4535			} else {
4536				break;
4537			}
4538		}
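
		/*
		 * At this point "laddr" has been translated down through any
		 * submaps to the map that directly contains the object, and
		 * ldelta/hdelta bound the largest range around laddr that lies
		 * within every map entry traversed; that range limits the size
		 * of the block mapping set up below.
		 */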
4539
4540		if (vm_map_lookup_entry(map, laddr, &entry) &&
4541					(entry->object.vm_object != NULL) &&
4542					(entry->object.vm_object == object)) {
4543
4544			int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
4545			if (caller_pmap) {
4546				/*
4547				 * Set up a block mapped area
4548				 */
4549				assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4550				pmap_map_block(caller_pmap,
4551					       (addr64_t)(caller_pmap_addr - ldelta),
4552					       (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4553							  entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4554					       (uint32_t)((ldelta + hdelta) >> 12), prot,
4555					       (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4556			} else {
4557				/*
4558				 * Set up a block mapped area
4559				 */
4560				assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4561				pmap_map_block(real_map->pmap,
4562					       (addr64_t)(vaddr - ldelta),
4563					       (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4564							  entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4565					       (uint32_t)((ldelta + hdelta) >> 12), prot,
4566					       (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4567			}
4568		}
4569	}
4570
4571	/*
4572	 * Unlock everything, and return
4573	 */
4574	vm_map_verify_done(map, &version);
4575	if (real_map != map)
4576		vm_map_unlock(real_map);
4577
4578	if (m != VM_PAGE_NULL) {
4579		PAGE_WAKEUP_DONE(m);
4580
4581		vm_fault_cleanup(m->object, top_page);
4582	} else
4583	        vm_fault_cleanup(object, top_page);
4584
4585	vm_object_deallocate(object);
4586
4587#undef	RELEASE_PAGE
4588
4589	kr = KERN_SUCCESS;
4590done:
4591	thread_interrupt_level(interruptible_state);
4592
4593	/*
4594	 * Only throttle on faults which cause a pagein.
4595	 */
4596	if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
4597		throttle_lowpri_io(1);
4598	}
4599
4600	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4601			      (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4602			      ((uint64_t)vaddr >> 32),
4603			      vaddr,
4604			      kr,
4605			      type_of_fault,
4606			      0);
4607
4608	return (kr);
4609}
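
/*
 * For reference, a caller such as a machine-dependent trap handler drives
 * this routine roughly as in the hypothetical sketch below; "fault_addr" is
 * assumed to have been page-aligned by the trap code, and the protection and
 * interruptibility shown are illustrative only, not this file's policy:
 *
 *	kern_return_t kr;
 *
 *	kr = vm_fault(current_map(), fault_addr,
 *		      VM_PROT_READ | VM_PROT_WRITE,	(fault_type)
 *		      FALSE,				(change_wiring)
 *		      THREAD_ABORTSAFE,			(interruptible)
 *		      NULL, 0);				(no caller pmap)
 *	if (kr != KERN_SUCCESS)
 *		... deliver an exception to the faulting thread ...
 */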
4610
4611/*
4612 *	vm_fault_wire:
4613 *
4614 *	Wire down a range of virtual addresses in a map.
4615 */
4616kern_return_t
4617vm_fault_wire(
4618	vm_map_t	map,
4619	vm_map_entry_t	entry,
4620	pmap_t		pmap,
4621	vm_map_offset_t	pmap_addr)
4622{
4623
4624	register vm_map_offset_t	va;
4625	register vm_map_offset_t	end_addr = entry->vme_end;
4626	register kern_return_t	rc;
4627
4628	assert(entry->in_transition);
4629
4630	if ((entry->object.vm_object != NULL) &&
4631			!entry->is_sub_map &&
4632			entry->object.vm_object->phys_contiguous) {
4633		return KERN_SUCCESS;
4634	}
4635
4636	/*
4637	 *	Inform the physical mapping system that the
4638	 *	range of addresses may not fault, so that
4639	 *	page tables and such can be locked down as well.
4640	 */
4641
4642	pmap_pageable(pmap, pmap_addr,
4643		pmap_addr + (end_addr - entry->vme_start), FALSE);
4644
4645	/*
4646	 *	We simulate a fault to get the page and enter it
4647	 *	in the physical map.
4648	 */
4649
4650	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4651		if ((rc = vm_fault_wire_fast(
4652			map, va, entry, pmap,
4653			pmap_addr + (va - entry->vme_start)
4654			)) != KERN_SUCCESS) {
4655			rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
4656			  	(pmap == kernel_pmap) ?
4657					THREAD_UNINT : THREAD_ABORTSAFE,
4658				pmap, pmap_addr + (va - entry->vme_start));
4659			DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4660		}
4661
4662		if (rc != KERN_SUCCESS) {
4663			struct vm_map_entry	tmp_entry = *entry;
4664
4665			/* unwire wired pages */
4666			tmp_entry.vme_end = va;
4667			vm_fault_unwire(map,
4668				&tmp_entry, FALSE, pmap, pmap_addr);
4669
4670			return rc;
4671		}
4672	}
4673	return KERN_SUCCESS;
4674}
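
/*
 * Note that if either the fast path or the full vm_fault() fails for some
 * page, the portion of the range already wired ([vme_start, va)) is unwired
 * again via a temporary copy of the entry before the error is returned, so
 * the caller never sees a half-wired entry.
 */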
4675
4676/*
4677 *	vm_fault_unwire:
4678 *
4679 *	Unwire a range of virtual addresses in a map.
4680 */
4681void
4682vm_fault_unwire(
4683	vm_map_t	map,
4684	vm_map_entry_t	entry,
4685	boolean_t	deallocate,
4686	pmap_t		pmap,
4687	vm_map_offset_t	pmap_addr)
4688{
4689	register vm_map_offset_t	va;
4690	register vm_map_offset_t	end_addr = entry->vme_end;
4691	vm_object_t		object;
4692	struct vm_object_fault_info fault_info;
4693
4694	object = (entry->is_sub_map)
4695			? VM_OBJECT_NULL : entry->object.vm_object;
4696
4697	/*
4698	 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4699	 * do anything since such memory is wired by default.  So we don't have
4700	 * anything to undo here.
4701	 */
4702
4703	if (object != VM_OBJECT_NULL && object->phys_contiguous)
4704		return;
4705
4706	fault_info.interruptible = THREAD_UNINT;
4707	fault_info.behavior = entry->behavior;
4708	fault_info.user_tag = entry->alias;
4709	fault_info.lo_offset = entry->offset;
4710	fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4711	fault_info.no_cache = entry->no_cache;
4712	fault_info.stealth = TRUE;
4713	fault_info.io_sync = FALSE;
4714	fault_info.cs_bypass = FALSE;
4715	fault_info.mark_zf_absent = FALSE;
4716	fault_info.batch_pmap_op = FALSE;
4717
4718	/*
4719	 *	Since the pages are wired down, we must be able to
4720	 *	get their mappings from the physical map system.
4721	 */
4722
4723	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4724
4725		if (object == VM_OBJECT_NULL) {
4726			if (pmap) {
4727				pmap_change_wiring(pmap,
4728						   pmap_addr + (va - entry->vme_start), FALSE);
4729			}
4730			(void) vm_fault(map, va, VM_PROT_NONE,
4731					TRUE, THREAD_UNINT, pmap, pmap_addr);
4732		} else {
4733		 	vm_prot_t	prot;
4734			vm_page_t	result_page;
4735			vm_page_t	top_page;
4736			vm_object_t	result_object;
4737			vm_fault_return_t result;
4738
4739			if (end_addr - va > (vm_size_t) -1) {
4740				/* 32-bit overflow */
4741				fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4742			} else {
4743				fault_info.cluster_size = (vm_size_t) (end_addr - va);
4744				assert(fault_info.cluster_size == end_addr - va);
4745			}
4746
4747			do {
4748				prot = VM_PROT_NONE;
4749
4750				vm_object_lock(object);
4751				vm_object_paging_begin(object);
4752				XPR(XPR_VM_FAULT,
4753					"vm_fault_unwire -> vm_fault_page\n",
4754					0,0,0,0,0);
4755				result_page = VM_PAGE_NULL;
4756			 	result = vm_fault_page(
4757					object,
4758					entry->offset + (va - entry->vme_start),
4759					VM_PROT_NONE, TRUE,
4760					FALSE, /* page not looked up */
4761					&prot, &result_page, &top_page,
4762					(int *)0,
4763					NULL, map->no_zero_fill,
4764					FALSE, &fault_info);
4765			} while (result == VM_FAULT_RETRY);
4766
4767			/*
4768			 * If this was a mapping to a file on a device that has been forcibly
4769			 * unmounted, then we won't get a page back from vm_fault_page().  Just
4770			 * move on to the next one in case the remaining pages are mapped from
4771			 * different objects.  During a forced unmount, the object is terminated
4772			 * so the alive flag will be false if this happens.  A forced unmount will
4773			 * occur when an external disk is unplugged before the user does an
4774			 * eject, so we don't want to panic in that situation.
4775			 */
4776
4777			if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
4778				continue;
4779
4780			if (result == VM_FAULT_MEMORY_ERROR &&
4781			    object == kernel_object) {
4782				/*
4783				 * This must have been allocated with
4784				 * KMA_KOBJECT and KMA_VAONLY and there's
4785				 * no physical page at this offset.
4786				 * We're done (no page to free).
4787				 */
4788				assert(deallocate);
4789				continue;
4790			}
4791
4792			if (result != VM_FAULT_SUCCESS)
4793				panic("vm_fault_unwire: failure");
4794
4795			result_object = result_page->object;
4796
4797			if (deallocate) {
4798				assert(result_page->phys_page !=
4799				       vm_page_fictitious_addr);
4800				pmap_disconnect(result_page->phys_page);
4801				VM_PAGE_FREE(result_page);
4802			} else {
4803				if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
4804					pmap_change_wiring(pmap,
4805					    pmap_addr + (va - entry->vme_start), FALSE);
4806
4807
4808				if (VM_PAGE_WIRED(result_page)) {
4809					vm_page_lockspin_queues();
4810					vm_page_unwire(result_page, TRUE);
4811					vm_page_unlock_queues();
4812				}
4813				if(entry->zero_wired_pages) {
4814					pmap_zero_page(result_page->phys_page);
4815					entry->zero_wired_pages = FALSE;
4816				}
4817
4818				PAGE_WAKEUP_DONE(result_page);
4819			}
4820			vm_fault_cleanup(result_object, top_page);
4821		}
4822	}
4823
4824	/*
4825	 *	Inform the physical mapping system that the range
4826	 *	of addresses may fault, so that page tables and
4827	 *	such may be unwired themselves.
4828	 */
4829
4830	pmap_pageable(pmap, pmap_addr,
4831		pmap_addr + (end_addr - entry->vme_start), TRUE);
4832
4833}
4834
4835/*
4836 *	vm_fault_wire_fast:
4837 *
4838 *	Handle common case of a wire down page fault at the given address.
4839 *	If successful, the page is inserted into the associated physical map.
4840 *	The map entry is passed in to avoid the overhead of a map lookup.
4841 *
4842 *	NOTE: the given address should be truncated to the
4843 *	proper page address.
4844 *
4845 *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
4846 *	a standard error specifying why the fault is fatal is returned.
4847 *
4848 *	The map in question must be referenced, and remains so.
4849 *	Caller has a read lock on the map.
4850 *
4851 *	This is a stripped version of vm_fault() for wiring pages.  Anything
4852 *	other than the common case will return KERN_FAILURE, and the caller
4853 *	is expected to call vm_fault().
4854 */
4855kern_return_t
4856vm_fault_wire_fast(
4857	__unused vm_map_t	map,
4858	vm_map_offset_t	va,
4859	vm_map_entry_t	entry,
4860	pmap_t			pmap,
4861	vm_map_offset_t	pmap_addr)
4862{
4863	vm_object_t		object;
4864	vm_object_offset_t	offset;
4865	register vm_page_t	m;
4866	vm_prot_t		prot;
4867	thread_t           	thread = current_thread();
4868	int			type_of_fault;
4869	kern_return_t		kr;
4870
4871	VM_STAT_INCR(faults);
4872
4873	if (thread != THREAD_NULL && thread->task != TASK_NULL)
4874	  thread->task->faults++;
4875
4876/*
4877 *	Recovery actions
4878 */
4879
4880#undef	RELEASE_PAGE
4881#define RELEASE_PAGE(m)	{				\
4882	PAGE_WAKEUP_DONE(m);				\
4883	vm_page_lockspin_queues();			\
4884	vm_page_unwire(m, TRUE);			\
4885	vm_page_unlock_queues();			\
4886}
4887
4888
4889#undef	UNLOCK_THINGS
4890#define UNLOCK_THINGS	{				\
4891	vm_object_paging_end(object);			   \
4892	vm_object_unlock(object);			   \
4893}
4894
4895#undef	UNLOCK_AND_DEALLOCATE
4896#define UNLOCK_AND_DEALLOCATE	{			\
4897	UNLOCK_THINGS;					\
4898	vm_object_deallocate(object);			\
4899}
4900/*
4901 *	Give up and have caller do things the hard way.
4902 */
4903
4904#define GIVE_UP {					\
4905	UNLOCK_AND_DEALLOCATE;				\
4906	return(KERN_FAILURE);				\
4907}
4908
4909
4910	/*
4911	 *	If this entry is not directly to a vm_object, bail out.
4912	 */
4913	if (entry->is_sub_map)
4914		return(KERN_FAILURE);
4915
4916	/*
4917	 *	Find the backing store object and offset into it.
4918	 */
4919
4920	object = entry->object.vm_object;
4921	offset = (va - entry->vme_start) + entry->offset;
4922	prot = entry->protection;
4923
4924   	/*
4925	 *	Make a reference to this object to prevent its
4926	 *	disposal while we are messing with it.
4927	 */
4928
4929	vm_object_lock(object);
4930	vm_object_reference_locked(object);
4931	vm_object_paging_begin(object);
4932
4933	/*
4934	 *	INVARIANTS (through entire routine):
4935	 *
4936	 *	1)	At all times, we must either have the object
4937	 *		lock or a busy page in some object to prevent
4938	 *		some other thread from trying to bring in
4939	 *		the same page.
4940	 *
4941	 *	2)	Once we have a busy page, we must remove it from
4942	 *		the pageout queues, so that the pageout daemon
4943	 *		will not grab it away.
4944	 *
4945	 */
4946
4947	/*
4948	 *	Look for page in top-level object.  If it's not there or
4949	 *	there's something going on, give up.
4950	 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
4951	 * decrypt the page before wiring it down.
4952	 */
4953	m = vm_page_lookup(object, offset);
4954	if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
4955	    (m->unusual && ( m->error || m->restart || m->absent))) {
4956
4957		GIVE_UP;
4958	}
4959	ASSERT_PAGE_DECRYPTED(m);
4960
4961	if (m->fictitious &&
4962	    m->phys_page == vm_page_guard_addr) {
4963		/*
4964		 * Guard pages are fictitious pages and are never
4965		 * entered into a pmap, so let's say it's been wired...
4966		 */
4967		kr = KERN_SUCCESS;
4968		goto done;
4969	}
4970
4971	/*
4972	 *	Wire the page down now.  All bail outs beyond this
4973	 *	point must unwire the page.
4974	 */
4975
4976	vm_page_lockspin_queues();
4977	vm_page_wire(m);
4978	vm_page_unlock_queues();
4979
4980	/*
4981	 *	Mark page busy for other threads.
4982	 */
4983	assert(!m->busy);
4984	m->busy = TRUE;
4985	assert(!m->absent);
4986
4987	/*
4988	 *	Give up if the page is being written and there's a copy object
4989	 */
4990	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
4991		RELEASE_PAGE(m);
4992		GIVE_UP;
4993	}
4994
4995	/*
4996	 *	Put this page into the physical map.
4997	 */
4998	type_of_fault = DBG_CACHE_HIT_FAULT;
4999	kr = vm_fault_enter(m,
5000			    pmap,
5001			    pmap_addr,
5002			    prot,
5003			    prot,
5004			    TRUE,
5005			    FALSE,
5006			    FALSE,
5007			    FALSE,
5008			    NULL,
5009			    &type_of_fault);
5010
5011done:
5012	/*
5013	 *	Unlock everything, and return
5014	 */
5015
5016	PAGE_WAKEUP_DONE(m);
5017	UNLOCK_AND_DEALLOCATE;
5018
5019	return kr;
5020
5021}
5022
5023/*
5024 *	Routine:	vm_fault_copy_cleanup
5025 *	Purpose:
5026 *		Release a page used by vm_fault_copy.
5027 */
5028
5029void
5030vm_fault_copy_cleanup(
5031	vm_page_t	page,
5032	vm_page_t	top_page)
5033{
5034	vm_object_t	object = page->object;
5035
5036	vm_object_lock(object);
5037	PAGE_WAKEUP_DONE(page);
5038	if (!page->active && !page->inactive && !page->throttled) {
5039		vm_page_lockspin_queues();
5040		if (!page->active && !page->inactive && !page->throttled)
5041			vm_page_activate(page);
5042		vm_page_unlock_queues();
5043	}
5044	vm_fault_cleanup(object, top_page);
5045}
5046
5047void
5048vm_fault_copy_dst_cleanup(
5049	vm_page_t	page)
5050{
5051	vm_object_t	object;
5052
5053	if (page != VM_PAGE_NULL) {
5054		object = page->object;
5055		vm_object_lock(object);
5056		vm_page_lockspin_queues();
5057		vm_page_unwire(page, TRUE);
5058		vm_page_unlock_queues();
5059		vm_object_paging_end(object);
5060		vm_object_unlock(object);
5061	}
5062}
5063
5064/*
5065 *	Routine:	vm_fault_copy
5066 *
5067 *	Purpose:
5068 *		Copy pages from one virtual memory object to another --
5069 *		neither the source nor destination pages need be resident.
5070 *
5071 *		Before actually copying a page, the version associated with
5072 *		the destination address map will be verified.
5073 *
5074 *	In/out conditions:
5075 *		The caller must hold a reference, but not a lock, to
5076 *		each of the source and destination objects and to the
5077 *		destination map.
5078 *
5079 *	Results:
5080 *		Returns KERN_SUCCESS if no errors were encountered in
5081 *		reading or writing the data.  Returns KERN_INTERRUPTED if
5082 *		the operation was interrupted (only possible if the
5083 *		"interruptible" argument is asserted).  Other return values
5084 *		indicate a permanent error in copying the data.
5085 *
5086 *		The actual amount of data copied will be returned in the
5087 *		"copy_size" argument.  In the event that the destination map
5088 *		verification failed, this amount may be less than the amount
5089 *		requested.
5090 */
5091kern_return_t
5092vm_fault_copy(
5093	vm_object_t		src_object,
5094	vm_object_offset_t	src_offset,
5095	vm_map_size_t		*copy_size,		/* INOUT */
5096	vm_object_t		dst_object,
5097	vm_object_offset_t	dst_offset,
5098	vm_map_t		dst_map,
5099	vm_map_version_t	 *dst_version,
5100	int			interruptible)
5101{
5102	vm_page_t		result_page;
5103
5104	vm_page_t		src_page;
5105	vm_page_t		src_top_page;
5106	vm_prot_t		src_prot;
5107
5108	vm_page_t		dst_page;
5109	vm_page_t		dst_top_page;
5110	vm_prot_t		dst_prot;
5111
5112	vm_map_size_t		amount_left;
5113	vm_object_t		old_copy_object;
5114	kern_return_t		error = 0;
5115	vm_fault_return_t	result;
5116
5117	vm_map_size_t		part_size;
5118	struct vm_object_fault_info fault_info_src;
5119	struct vm_object_fault_info fault_info_dst;
5120
5121	/*
5122	 * In order not to confuse the clustered pageins, align
5123	 * the different offsets on a page boundary.
5124	 */
5125
5126#define	RETURN(x)					\
5127	MACRO_BEGIN					\
5128	*copy_size -= amount_left;			\
5129	MACRO_RETURN(x);				\
5130	MACRO_END
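
	/*
	 * RETURN() reports progress through "copy_size": whatever is still
	 * in "amount_left" was not copied, so *copy_size ends up holding the
	 * number of bytes actually copied, even on an early exit such as a
	 * destination map verification failure.
	 */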
5131
5132	amount_left = *copy_size;
5133
5134	fault_info_src.interruptible = interruptible;
5135	fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5136	fault_info_src.user_tag  = 0;
5137	fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5138	fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5139	fault_info_src.no_cache   = FALSE;
5140	fault_info_src.stealth = TRUE;
5141	fault_info_src.io_sync = FALSE;
5142	fault_info_src.cs_bypass = FALSE;
5143	fault_info_src.mark_zf_absent = FALSE;
5144	fault_info_src.batch_pmap_op = FALSE;
5145
5146	fault_info_dst.interruptible = interruptible;
5147	fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5148	fault_info_dst.user_tag  = 0;
5149	fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5150	fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5151	fault_info_dst.no_cache   = FALSE;
5152	fault_info_dst.stealth = TRUE;
5153	fault_info_dst.io_sync = FALSE;
5154	fault_info_dst.cs_bypass = FALSE;
5155	fault_info_dst.mark_zf_absent = FALSE;
5156	fault_info_dst.batch_pmap_op = FALSE;
5157
5158	do { /* while (amount_left > 0) */
5159		/*
5160		 * There may be a deadlock if both source and destination
5161		 * pages are the same. To avoid this deadlock, the copy must
5162		 * start by getting the destination page in order to apply
5163		 * COW semantics if any.
5164		 */
5165
5166	RetryDestinationFault: ;
5167
5168		dst_prot = VM_PROT_WRITE|VM_PROT_READ;
5169
5170		vm_object_lock(dst_object);
5171		vm_object_paging_begin(dst_object);
5172
5173		if (amount_left > (vm_size_t) -1) {
5174			/* 32-bit overflow */
5175			fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5176		} else {
5177			fault_info_dst.cluster_size = (vm_size_t) amount_left;
5178			assert(fault_info_dst.cluster_size == amount_left);
5179		}
5180
5181		XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
5182		dst_page = VM_PAGE_NULL;
5183		result = vm_fault_page(dst_object,
5184				       vm_object_trunc_page(dst_offset),
5185				       VM_PROT_WRITE|VM_PROT_READ,
5186				       FALSE,
5187				       FALSE, /* page not looked up */
5188				       &dst_prot, &dst_page, &dst_top_page,
5189				       (int *)0,
5190				       &error,
5191				       dst_map->no_zero_fill,
5192				       FALSE, &fault_info_dst);
5193		switch (result) {
5194		case VM_FAULT_SUCCESS:
5195			break;
5196		case VM_FAULT_RETRY:
5197			goto RetryDestinationFault;
5198		case VM_FAULT_MEMORY_SHORTAGE:
5199			if (vm_page_wait(interruptible))
5200				goto RetryDestinationFault;
5201			/* fall thru */
5202		case VM_FAULT_INTERRUPTED:
5203			RETURN(MACH_SEND_INTERRUPTED);
5204		case VM_FAULT_SUCCESS_NO_VM_PAGE:
5205			/* success but no VM page: fail the copy */
5206			vm_object_paging_end(dst_object);
5207			vm_object_unlock(dst_object);
5208			/*FALLTHROUGH*/
5209		case VM_FAULT_MEMORY_ERROR:
5210			if (error)
5211				return (error);
5212			else
5213				return(KERN_MEMORY_ERROR);
5214		default:
5215			panic("vm_fault_copy: unexpected error 0x%x from "
5216			      "vm_fault_page()\n", result);
5217		}
5218		assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5219
5220		old_copy_object = dst_page->object->copy;
5221
5222		/*
5223		 * There exists the possibility that the source and
5224		 * destination page are the same.  But we can't
5225		 * easily determine that now.  If they are the
5226		 * same, the call to vm_fault_page() for the
5227		 * destination page will deadlock.  To prevent this we
5228		 * wire the page so we can drop busy without having
5229		 * the page daemon steal the page.  We clean up the
5230		 * top page but keep the paging reference on the object
5231		 * holding the dest page so it doesn't go away.
5232		 */
5233
5234		vm_page_lockspin_queues();
5235		vm_page_wire(dst_page);
5236		vm_page_unlock_queues();
5237		PAGE_WAKEUP_DONE(dst_page);
5238		vm_object_unlock(dst_page->object);
5239
5240		if (dst_top_page != VM_PAGE_NULL) {
5241			vm_object_lock(dst_object);
5242			VM_PAGE_FREE(dst_top_page);
5243			vm_object_paging_end(dst_object);
5244			vm_object_unlock(dst_object);
5245		}
5246
5247	RetrySourceFault: ;
5248
5249		if (src_object == VM_OBJECT_NULL) {
5250			/*
5251			 *	No source object.  We will just
5252			 *	zero-fill the page in dst_object.
5253			 */
5254			src_page = VM_PAGE_NULL;
5255			result_page = VM_PAGE_NULL;
5256		} else {
5257			vm_object_lock(src_object);
5258			src_page = vm_page_lookup(src_object,
5259						  vm_object_trunc_page(src_offset));
5260			if (src_page == dst_page) {
5261				src_prot = dst_prot;
5262				result_page = VM_PAGE_NULL;
5263			} else {
5264				src_prot = VM_PROT_READ;
5265				vm_object_paging_begin(src_object);
5266
5267				if (amount_left > (vm_size_t) -1) {
5268					/* 32-bit overflow */
5269					fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5270				} else {
5271					fault_info_src.cluster_size = (vm_size_t) amount_left;
5272					assert(fault_info_src.cluster_size == amount_left);
5273				}
5274
5275				XPR(XPR_VM_FAULT,
5276					"vm_fault_copy(2) -> vm_fault_page\n",
5277					0,0,0,0,0);
5278				result_page = VM_PAGE_NULL;
5279				result = vm_fault_page(
5280					src_object,
5281					vm_object_trunc_page(src_offset),
5282					VM_PROT_READ, FALSE,
5283					FALSE, /* page not looked up */
5284					&src_prot,
5285					&result_page, &src_top_page,
5286					(int *)0, &error, FALSE,
5287					FALSE, &fault_info_src);
5288
5289				switch (result) {
5290				case VM_FAULT_SUCCESS:
5291					break;
5292				case VM_FAULT_RETRY:
5293					goto RetrySourceFault;
5294				case VM_FAULT_MEMORY_SHORTAGE:
5295					if (vm_page_wait(interruptible))
5296						goto RetrySourceFault;
5297					/* fall thru */
5298				case VM_FAULT_INTERRUPTED:
5299					vm_fault_copy_dst_cleanup(dst_page);
5300					RETURN(MACH_SEND_INTERRUPTED);
5301				case VM_FAULT_SUCCESS_NO_VM_PAGE:
5302					/* success but no VM page: fail */
5303					vm_object_paging_end(src_object);
5304					vm_object_unlock(src_object);
5305					/*FALLTHROUGH*/
5306				case VM_FAULT_MEMORY_ERROR:
5307					vm_fault_copy_dst_cleanup(dst_page);
5308					if (error)
5309						return (error);
5310					else
5311						return(KERN_MEMORY_ERROR);
5312				default:
5313					panic("vm_fault_copy(2): unexpected "
5314					      "error 0x%x from "
5315					      "vm_fault_page()\n", result);
5316				}
5317
5318
5319				assert((src_top_page == VM_PAGE_NULL) ==
5320				       (result_page->object == src_object));
5321			}
5322			assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
5323			vm_object_unlock(result_page->object);
5324		}
5325
5326		if (!vm_map_verify(dst_map, dst_version)) {
5327			if (result_page != VM_PAGE_NULL && src_page != dst_page)
5328				vm_fault_copy_cleanup(result_page, src_top_page);
5329			vm_fault_copy_dst_cleanup(dst_page);
5330			break;
5331		}
5332
5333		vm_object_lock(dst_page->object);
5334
5335		if (dst_page->object->copy != old_copy_object) {
5336			vm_object_unlock(dst_page->object);
5337			vm_map_verify_done(dst_map, dst_version);
5338			if (result_page != VM_PAGE_NULL && src_page != dst_page)
5339				vm_fault_copy_cleanup(result_page, src_top_page);
5340			vm_fault_copy_dst_cleanup(dst_page);
5341			break;
5342		}
5343		vm_object_unlock(dst_page->object);
5344
5345		/*
5346		 *	Copy the page, and note that it is dirty
5347		 *	immediately.
5348		 */
5349
		if (!page_aligned(src_offset) ||
			!page_aligned(dst_offset) ||
			!page_aligned(amount_left)) {

			vm_object_offset_t	src_po,
						dst_po;

			src_po = src_offset - vm_object_trunc_page(src_offset);
			dst_po = dst_offset - vm_object_trunc_page(dst_offset);

			if (dst_po > src_po) {
				part_size = PAGE_SIZE - dst_po;
			} else {
				part_size = PAGE_SIZE - src_po;
			}
			if (part_size > amount_left) {
				part_size = amount_left;
			}

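			/*
			 * With no source page to copy from, zero-fill the
			 * affected portion of the destination page;
			 * otherwise copy just the overlapping bytes and
			 * make sure the destination page is marked dirty.
			 */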
			if (result_page == VM_PAGE_NULL) {
				assert((vm_offset_t) dst_po == dst_po);
				assert((vm_size_t) part_size == part_size);
				vm_page_part_zero_fill(dst_page,
						       (vm_offset_t) dst_po,
						       (vm_size_t) part_size);
			} else {
				assert((vm_offset_t) src_po == src_po);
				assert((vm_offset_t) dst_po == dst_po);
				assert((vm_size_t) part_size == part_size);
				vm_page_part_copy(result_page,
						  (vm_offset_t) src_po,
						  dst_page,
						  (vm_offset_t) dst_po,
						  (vm_size_t) part_size);
				if (!dst_page->dirty) {
					/* the page's object lock protects the dirty bit */
					vm_object_lock(dst_page->object);
					SET_PAGE_DIRTY(dst_page, TRUE);
					vm_object_unlock(dst_page->object);
				}

			}
		} else {
			part_size = PAGE_SIZE;

			if (result_page == VM_PAGE_NULL)
				vm_page_zero_fill(dst_page);
			else {
				vm_object_lock(result_page->object);
				vm_page_copy(result_page, dst_page);
				vm_object_unlock(result_page->object);

				if (!dst_page->dirty) {
					/* the page's object lock protects the dirty bit */
					vm_object_lock(dst_page->object);
					SET_PAGE_DIRTY(dst_page, TRUE);
					vm_object_unlock(dst_page->object);
				}
			}

		}

		/*
		 *	Unlock everything, then advance to the next
		 *	portion of the copy.
		 */

		vm_map_verify_done(dst_map, dst_version);

		if (result_page != VM_PAGE_NULL && src_page != dst_page)
			vm_fault_copy_cleanup(result_page, src_top_page);
		vm_fault_copy_dst_cleanup(dst_page);

		amount_left -= part_size;
		src_offset += part_size;
		dst_offset += part_size;
	} while (amount_left > 0);

	RETURN(KERN_SUCCESS);
#undef	RETURN

	/*NOTREACHED*/
}

#if	VM_FAULT_CLASSIFY
/*
 *	Temporary statistics gathering support.
 */

/*
 *	Statistics arrays:
 */
#define	VM_FAULT_TYPES_MAX	5
#define	VM_FAULT_LEVEL_MAX	8

int	vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];

#define	VM_FAULT_TYPE_ZERO_FILL	0
#define	VM_FAULT_TYPE_MAP_IN	1
#define	VM_FAULT_TYPE_PAGER	2
#define	VM_FAULT_TYPE_COPY	3
#define	VM_FAULT_TYPE_OTHER	4

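/*
 *	Classify a fault by walking the shadow chain from the faulting
 *	object: a resident page is counted as a direct map-in or as a
 *	copy (for writes that can't simply reuse the page), an absent
 *	page defers to the pager if one was created, and falling off
 *	the end of the chain means zero-fill.  Busy, absent, errored or
 *	restarted pages count as "other".  The shadow depth at which the
 *	decision was made is recorded as the level.
 */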
void
vm_fault_classify(vm_object_t		object,
		  vm_object_offset_t	offset,
		  vm_prot_t		fault_type)
{
	int		type, level = 0;
	vm_page_t	m;

	while (TRUE) {
		m = vm_page_lookup(object, offset);
		if (m != VM_PAGE_NULL) {
			if (m->busy || m->error || m->restart || m->absent) {
				type = VM_FAULT_TYPE_OTHER;
				break;
			}
			if (((fault_type & VM_PROT_WRITE) == 0) ||
			    ((level == 0) && object->copy == VM_OBJECT_NULL)) {
				type = VM_FAULT_TYPE_MAP_IN;
				break;
			}
			type = VM_FAULT_TYPE_COPY;
			break;
		}
		else {
			if (object->pager_created) {
				type = VM_FAULT_TYPE_PAGER;
				break;
			}
			if (object->shadow == VM_OBJECT_NULL) {
				type = VM_FAULT_TYPE_ZERO_FILL;
				break;
			}

			offset += object->vo_shadow_offset;
			object = object->shadow;
			level++;
			continue;
		}
	}

	if (level >= VM_FAULT_LEVEL_MAX)
		level = VM_FAULT_LEVEL_MAX - 1;	/* clamp to the last bucket */

	vm_fault_stats[type][level] += 1;

	return;
}

/* reset the statistics counters; intended to be called from the debugger */

void
vm_fault_classify_init(void)
{
	int type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			vm_fault_stats[type][level] = 0;
		}
	}

	return;
}
#endif	/* VM_FAULT_CLASSIFY */


void
vm_page_validate_cs_mapped(
	vm_page_t	page,
	const void	*kaddr)
{
	vm_object_t		object;
	vm_object_offset_t	offset;
	kern_return_t		kr;
	memory_object_t		pager;
	void			*blobs;
	boolean_t		validated, tainted;

	assert(page->busy);
	vm_object_lock_assert_exclusive(page->object);

	if (!cs_validation) {
		return;
	}

	if (page->wpmapped && !page->cs_tainted) {
		/*
		 * This page was mapped for "write" access sometime in the
		 * past and could still be modifiable in the future.
		 * Consider it tainted.
		 * [ If the page was already found to be "tainted", no
		 * need to re-validate. ]
		 */
		page->cs_validated = TRUE;
		page->cs_tainted = TRUE;
		if (cs_debug) {
			printf("CODESIGNING: vm_page_validate_cs: "
			       "page %p obj %p off 0x%llx "
			       "was modified\n",
			       page, page->object, page->offset);
		}
		vm_cs_validated_dirtied++;
	}

	if (page->cs_validated) {
		return;
	}

	vm_cs_validates++;

	object = page->object;
	assert(object->code_signed);
	offset = page->offset;

	if (!object->alive || object->terminating || object->pager == NULL) {
		/*
		 * The object is dead or being terminated, or it no longer
		 * has a pager, so we can't validate the data...
		 */
		return;
	}
	/*
	 * Since we get here to validate a page that was brought in by
	 * the pager, we know that this pager is fully set up and ready
	 * by now.
	 */
	assert(!object->internal);
	assert(object->pager != NULL);
	assert(object->pager_ready);

	pager = object->pager;
	assert(object->paging_in_progress);
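	/*
	 * Ask the vnode pager for the code-signing blobs attached to this
	 * vnode; if the lookup fails, we're left with no blobs to validate
	 * against.
	 */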
	kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
	if (kr != KERN_SUCCESS) {
		blobs = NULL;
	}

	/* verify the SHA1 hash for this page */
	validated = cs_validate_page(blobs,
				     pager,
				     offset + object->paging_offset,
				     (const void *)kaddr,
				     &tainted);

	page->cs_validated = validated;
	if (validated) {
		page->cs_tainted = tainted;
	}
}

extern int panic_on_cs_killed;
void
vm_page_validate_cs(
	vm_page_t	page)
{
	vm_object_t		object;
	vm_object_offset_t	offset;
	vm_map_offset_t		koffset;
	vm_map_size_t		ksize;
	vm_offset_t		kaddr;
	kern_return_t		kr;
	boolean_t		busy_page;
	boolean_t		need_unmap;

	vm_object_lock_assert_held(page->object);

	if (!cs_validation) {
		return;
	}

	if (page->wpmapped && !page->cs_tainted) {
		vm_object_lock_assert_exclusive(page->object);

		/*
		 * This page was mapped for "write" access sometime in the
		 * past and could still be modifiable in the future.
		 * Consider it tainted.
		 * [ If the page was already found to be "tainted", no
		 * need to re-validate. ]
		 */
		page->cs_validated = TRUE;
		page->cs_tainted = TRUE;
		if (cs_debug) {
			printf("CODESIGNING: vm_page_validate_cs: "
			       "page %p obj %p off 0x%llx "
			       "was modified\n",
			       page, page->object, page->offset);
		}
		vm_cs_validated_dirtied++;
	}

	if (page->cs_validated) {
		return;
	}

	if (panic_on_cs_killed &&
	    page->slid) {
		panic("vm_page_validate_cs(%p): page is slid\n", page);
	}
	assert(!page->slid);

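	/*
	 * Fast path: if the pager's validation bitmap already records this
	 * page as validated, accept that result without re-hashing the
	 * page contents.
	 */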
#if CHECK_CS_VALIDATION_BITMAP
	if (vnode_pager_cs_check_validation_bitmap(
		    page->object->pager,
		    trunc_page(page->offset + page->object->paging_offset),
		    CS_BITMAP_CHECK) == KERN_SUCCESS) {
		page->cs_validated = TRUE;
		page->cs_tainted = FALSE;
		vm_cs_bitmap_validated++;
		return;
	}
#endif
	vm_object_lock_assert_exclusive(page->object);

	object = page->object;
	assert(object->code_signed);
	offset = page->offset;

	busy_page = page->busy;
	if (!busy_page) {
		/* keep page busy while we map (and unlock) the VM object */
		page->busy = TRUE;
	}

	/*
	 * Take a paging reference on the VM object
	 * to protect it from collapse or bypass,
	 * and keep it from disappearing too.
	 */
	vm_object_paging_begin(object);

	/* map the page in the kernel address space */
	ksize = PAGE_SIZE_64;
	koffset = 0;
	need_unmap = FALSE;
	kr = vm_paging_map_object(page,
				  object,
				  offset,
				  VM_PROT_READ,
				  FALSE, /* can't unlock object ! */
				  &ksize,
				  &koffset,
				  &need_unmap);
	if (kr != KERN_SUCCESS) {
		panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
	}
	kaddr = CAST_DOWN(vm_offset_t, koffset);

	/* validate the mapped page */
	vm_page_validate_cs_mapped(page, (const void *) kaddr);

#if CHECK_CS_VALIDATION_BITMAP
	if (page->cs_validated && !page->cs_tainted) {
		vnode_pager_cs_check_validation_bitmap(
			object->pager,
			trunc_page(offset + object->paging_offset),
			CS_BITMAP_SET);
	}
#endif
	assert(page->busy);
	assert(object == page->object);
	vm_object_lock_assert_exclusive(object);

	if (!busy_page) {
		PAGE_WAKEUP_DONE(page);
	}
	if (need_unmap) {
		/* unmap the page from the kernel address space */
		vm_paging_unmap_object(object, koffset, koffset + ksize);
		koffset = 0;
		ksize = 0;
		kaddr = 0;
	}
	vm_object_paging_end(object);
}
