1/*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 *	File:	vm/vm_pageout.c
60 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61 *	Date:	1985
62 *
63 *	The proverbial page-out daemon.
64 */
65
66#include <stdint.h>
67
68#include <debug.h>
69#include <mach_pagemap.h>
70#include <mach_cluster_stats.h>
71#include <advisory_pageout.h>
72
73#include <mach/mach_types.h>
74#include <mach/memory_object.h>
75#include <mach/memory_object_default.h>
76#include <mach/memory_object_control_server.h>
77#include <mach/mach_host_server.h>
78#include <mach/upl.h>
79#include <mach/vm_map.h>
80#include <mach/vm_param.h>
81#include <mach/vm_statistics.h>
82#include <mach/sdt.h>
83
84#include <kern/kern_types.h>
85#include <kern/counters.h>
86#include <kern/host_statistics.h>
87#include <kern/machine.h>
88#include <kern/misc_protos.h>
89#include <kern/sched.h>
90#include <kern/thread.h>
91#include <kern/xpr.h>
92#include <kern/kalloc.h>
93
94#include <machine/vm_tuning.h>
95#include <machine/commpage.h>
96
97#include <vm/pmap.h>
98#include <vm/vm_fault.h>
99#include <vm/vm_map.h>
100#include <vm/vm_object.h>
101#include <vm/vm_page.h>
102#include <vm/vm_pageout.h>
103#include <vm/vm_protos.h> /* must be last */
104#include <vm/memory_object.h>
105#include <vm/vm_purgeable_internal.h>
106#include <vm/vm_shared_region.h>
107/*
108 * ENCRYPTED SWAP:
109 */
110#include <libkern/crypto/aes.h>
111extern u_int32_t random(void);	/* from <libkern/libkern.h> */
112
113extern int cs_debug;
114
115#if UPL_DEBUG
116#include <libkern/OSDebug.h>
117#endif
118
119#if VM_PRESSURE_EVENTS
120extern void consider_vm_pressure_events(void);
121#endif
122
123#ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE   /* maximum iterations of the active queue to move pages to inactive */
124#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  100
125#endif
126
127#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
128#ifdef	CONFIG_EMBEDDED
129#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
130#else
131#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
132#endif
133#endif
134
135#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
136#define VM_PAGEOUT_DEADLOCK_RELIEF 100	/* number of pages to move to break deadlock */
137#endif
138
139#ifndef VM_PAGEOUT_INACTIVE_RELIEF
140#define VM_PAGEOUT_INACTIVE_RELIEF 50	/* minimum number of pages to move to the inactive q */
141#endif
142
143#ifndef	VM_PAGE_LAUNDRY_MAX
144#define	VM_PAGE_LAUNDRY_MAX	128UL	/* maximum pageouts on a given pageout queue */
145#endif	/* VM_PAGE_LAUNDRY_MAX */
146
147#ifndef	VM_PAGEOUT_BURST_WAIT
148#define	VM_PAGEOUT_BURST_WAIT	30	/* milliseconds */
149#endif	/* VM_PAGEOUT_BURST_WAIT */
150
151#ifndef	VM_PAGEOUT_EMPTY_WAIT
152#define VM_PAGEOUT_EMPTY_WAIT	200	/* milliseconds */
153#endif	/* VM_PAGEOUT_EMPTY_WAIT */
154
155#ifndef	VM_PAGEOUT_DEADLOCK_WAIT
156#define VM_PAGEOUT_DEADLOCK_WAIT	300	/* milliseconds */
157#endif	/* VM_PAGEOUT_DEADLOCK_WAIT */
158
159#ifndef	VM_PAGEOUT_IDLE_WAIT
160#define VM_PAGEOUT_IDLE_WAIT	10	/* milliseconds */
161#endif	/* VM_PAGEOUT_IDLE_WAIT */
162
163#ifndef VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED
164#define VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED		1000	/* maximum pages considered before we issue a pressure event */
165#endif /* VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED */
166
167#ifndef VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS
168#define VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS		5	/* seconds */
169#endif /* VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS */
170
171unsigned int	vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
172unsigned int	vm_page_speculative_percentage = 5;
173
174#ifndef VM_PAGE_SPECULATIVE_TARGET
175#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_page_speculative_percentage))
176#endif /* VM_PAGE_SPECULATIVE_TARGET */
177
178
179#ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
180#define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
181#endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
182
183
184/*
185 *	To obtain a reasonable LRU approximation, the inactive queue
186 *	needs to be large enough to give pages on it a chance to be
187 *	referenced a second time.  This macro defines the fraction
188 *	of active+inactive pages that should be inactive.
189 *	The pageout daemon uses it to update vm_page_inactive_target.
190 *
191 *	If vm_page_free_count falls below vm_page_free_target and
192 *	vm_page_inactive_count is below vm_page_inactive_target,
193 *	then the pageout daemon starts running.
194 */
195
196#ifndef	VM_PAGE_INACTIVE_TARGET
197#define	VM_PAGE_INACTIVE_TARGET(avail)	((avail) * 1 / 2)
198#endif	/* VM_PAGE_INACTIVE_TARGET */
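/*
 *	Worked example (illustrative only, hypothetical counts): with
 *	1,000,000 active+inactive pages, VM_PAGE_INACTIVE_TARGET yields
 *	1,000,000 / 2 = 500,000 pages, i.e. roughly half of the pages
 *	eligible for reclaim should be sitting on the inactive queue.
 */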
199
200/*
201 *	Once the pageout daemon starts running, it keeps going
202 *	until vm_page_free_count meets or exceeds vm_page_free_target.
203 */
204
205#ifndef	VM_PAGE_FREE_TARGET
206#ifdef	CONFIG_EMBEDDED
207#define	VM_PAGE_FREE_TARGET(free)	(15 + (free) / 100)
208#else
209#define	VM_PAGE_FREE_TARGET(free)	(15 + (free) / 80)
210#endif
211#endif	/* VM_PAGE_FREE_TARGET */
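/*
 *	Worked example (illustrative only, hypothetical count): applying
 *	the non-embedded form of VM_PAGE_FREE_TARGET to 1,000,000 free
 *	pages gives 15 + 1,000,000 / 80 = 12,515 pages as the free level
 *	the daemon keeps running until it has restored.
 */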
212
213/*
214 *	The pageout daemon always starts running once vm_page_free_count
215 *	falls below vm_page_free_min.
216 */
217
218#ifndef	VM_PAGE_FREE_MIN
219#ifdef	CONFIG_EMBEDDED
220#define	VM_PAGE_FREE_MIN(free)		(10 + (free) / 200)
221#else
222#define	VM_PAGE_FREE_MIN(free)		(10 + (free) / 100)
223#endif
224#endif	/* VM_PAGE_FREE_MIN */
225
226#define VM_PAGE_FREE_RESERVED_LIMIT	100
227#define VM_PAGE_FREE_MIN_LIMIT		1500
228#define VM_PAGE_FREE_TARGET_LIMIT	2000
229
230
231/*
232 *	When vm_page_free_count falls below vm_page_free_reserved,
233 *	only vm-privileged threads can allocate pages.  vm-privilege
234 *	allows the pageout daemon and default pager (and any other
235 *	associated threads needed for default pageout) to continue
236 *	operation by dipping into the reserved pool of pages.
237 */
238
239#ifndef	VM_PAGE_FREE_RESERVED
240#define	VM_PAGE_FREE_RESERVED(n)	\
241	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
242#endif	/* VM_PAGE_FREE_RESERVED */
243
244/*
245 *	When we dequeue pages from the inactive list, they are
246 *	reactivated (ie, put back on the active queue) if referenced.
247 *	However, it is possible to starve the free list if other
248 *	processors are referencing pages faster than we can turn off
249 *	the referenced bit.  So we limit the number of reactivations
250 *	we will make per call of vm_pageout_scan().
251 */
252#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
253#ifndef	VM_PAGE_REACTIVATE_LIMIT
254#ifdef	CONFIG_EMBEDDED
255#define	VM_PAGE_REACTIVATE_LIMIT(avail)	(VM_PAGE_INACTIVE_TARGET(avail) / 2)
256#else
257#define	VM_PAGE_REACTIVATE_LIMIT(avail)	(MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
258#endif
259#endif	/* VM_PAGE_REACTIVATE_LIMIT */
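/*
 *	Worked example (illustrative only, hypothetical counts): the
 *	non-embedded VM_PAGE_REACTIVATE_LIMIT(1,000,000) evaluates to
 *	MAX(1,000,000 / 20, 20000) = 50,000 reactivations per call of
 *	vm_pageout_scan(); with only 200,000 available pages the
 *	VM_PAGE_REACTIVATE_LIMIT_MAX term of 20,000 dominates instead.
 */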
260#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM	100
261
262
263extern boolean_t hibernate_cleaning_in_progress;
264
265/*
266 * Exported variable used to broadcast the activation of the pageout scan
267 * Working Set uses this to throttle its use of pmap removes.  In this
268 * way, code which runs within memory in an uncontested context does
269 * not keep encountering soft faults.
270 */
271
272unsigned int	vm_pageout_scan_event_counter = 0;
273
274/*
275 * Forward declarations for internal routines.
276 */
277
278static void vm_pressure_thread(void);
279static void vm_pageout_garbage_collect(int);
280static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
281static void vm_pageout_iothread_external(void);
282static void vm_pageout_iothread_internal(void);
283static void vm_pageout_adjust_io_throttles(struct vm_pageout_queue *, struct vm_pageout_queue *, boolean_t);
284
285extern void vm_pageout_continue(void);
286extern void vm_pageout_scan(void);
287
288static thread_t	vm_pageout_external_iothread = THREAD_NULL;
289static thread_t	vm_pageout_internal_iothread = THREAD_NULL;
290
291unsigned int vm_pageout_reserved_internal = 0;
292unsigned int vm_pageout_reserved_really = 0;
293
294unsigned int vm_pageout_idle_wait = 0;		/* milliseconds */
295unsigned int vm_pageout_empty_wait = 0;		/* milliseconds */
296unsigned int vm_pageout_burst_wait = 0;		/* milliseconds */
297unsigned int vm_pageout_deadlock_wait = 0;	/* milliseconds */
298unsigned int vm_pageout_deadlock_relief = 0;
299unsigned int vm_pageout_inactive_relief = 0;
300unsigned int vm_pageout_burst_active_throttle = 0;
301unsigned int vm_pageout_burst_inactive_throttle = 0;
302
303int	vm_upl_wait_for_pages = 0;
304
305
306/*
307 *	These variables record the pageout daemon's actions:
308 *	how many pages it looks at and what happens to those pages.
309 *	No locking needed because only one thread modifies the variables.
310 */
311
312unsigned int vm_pageout_active = 0;		/* debugging */
313unsigned int vm_pageout_active_busy = 0;	/* debugging */
314unsigned int vm_pageout_inactive = 0;		/* debugging */
315unsigned int vm_pageout_inactive_throttled = 0;	/* debugging */
316unsigned int vm_pageout_inactive_forced = 0;	/* debugging */
317unsigned int vm_pageout_inactive_nolock = 0;	/* debugging */
318unsigned int vm_pageout_inactive_avoid = 0;	/* debugging */
319unsigned int vm_pageout_inactive_busy = 0;	/* debugging */
320unsigned int vm_pageout_inactive_error = 0;	/* debugging */
321unsigned int vm_pageout_inactive_absent = 0;	/* debugging */
322unsigned int vm_pageout_inactive_notalive = 0;	/* debugging */
323unsigned int vm_pageout_inactive_used = 0;	/* debugging */
324unsigned int vm_pageout_cache_evicted = 0;	/* debugging */
325unsigned int vm_pageout_inactive_clean = 0;	/* debugging */
326unsigned int vm_pageout_speculative_clean = 0;	/* debugging */
327
328unsigned int vm_pageout_freed_from_cleaned = 0;
329unsigned int vm_pageout_freed_from_speculative = 0;
330unsigned int vm_pageout_freed_from_inactive_clean = 0;
331
332unsigned int vm_pageout_enqueued_cleaned_from_inactive_clean = 0;
333unsigned int vm_pageout_enqueued_cleaned_from_inactive_dirty = 0;
334
335unsigned int vm_pageout_cleaned_reclaimed = 0;		/* debugging; how many cleaned pages are reclaimed by the pageout scan */
336unsigned int vm_pageout_cleaned_reactivated = 0;	/* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */
337unsigned int vm_pageout_cleaned_reference_reactivated = 0;
338unsigned int vm_pageout_cleaned_volatile_reactivated = 0;
339unsigned int vm_pageout_cleaned_fault_reactivated = 0;
340unsigned int vm_pageout_cleaned_commit_reactivated = 0;	/* debugging; how many cleaned pages are found to be referenced on commit (and are therefore reactivated) */
341unsigned int vm_pageout_cleaned_busy = 0;
342unsigned int vm_pageout_cleaned_nolock = 0;
343
344unsigned int vm_pageout_inactive_dirty_internal = 0;	/* debugging */
345unsigned int vm_pageout_inactive_dirty_external = 0;	/* debugging */
346unsigned int vm_pageout_inactive_deactivated = 0;	/* debugging */
347unsigned int vm_pageout_inactive_anonymous = 0;	/* debugging */
348unsigned int vm_pageout_dirty_no_pager = 0;	/* debugging */
349unsigned int vm_pageout_purged_objects = 0;	/* debugging */
350unsigned int vm_stat_discard = 0;		/* debugging */
351unsigned int vm_stat_discard_sent = 0;		/* debugging */
352unsigned int vm_stat_discard_failure = 0;	/* debugging */
353unsigned int vm_stat_discard_throttle = 0;	/* debugging */
354unsigned int vm_pageout_reactivation_limit_exceeded = 0;	/* debugging */
355unsigned int vm_pageout_catch_ups = 0;				/* debugging */
356unsigned int vm_pageout_inactive_force_reclaim = 0;	/* debugging */
357
358unsigned int vm_pageout_scan_reclaimed_throttled = 0;
359unsigned int vm_pageout_scan_active_throttled = 0;
360unsigned int vm_pageout_scan_inactive_throttled_internal = 0;
361unsigned int vm_pageout_scan_inactive_throttled_external = 0;
362unsigned int vm_pageout_scan_throttle = 0;			/* debugging */
363unsigned int vm_pageout_scan_burst_throttle = 0;		/* debugging */
364unsigned int vm_pageout_scan_empty_throttle = 0;		/* debugging */
365unsigned int vm_pageout_scan_deadlock_detected = 0;		/* debugging */
366unsigned int vm_pageout_scan_active_throttle_success = 0;	/* debugging */
367unsigned int vm_pageout_scan_inactive_throttle_success = 0;	/* debugging */
368unsigned int vm_pageout_inactive_external_forced_reactivate_count = 0;	/* debugging */
369unsigned int vm_pageout_inactive_external_forced_jetsam_count = 0;	/* debugging */
370unsigned int vm_page_speculative_count_drifts = 0;
371unsigned int vm_page_speculative_count_drift_max = 0;
372
373
374unsigned int vm_precleaning_aborted = 0;
375
376static boolean_t vm_pageout_need_to_refill_clean_queue = FALSE;
377static boolean_t vm_pageout_precleaning_delayed = FALSE;
378
379/*
380 * Backing store throttle when BS is exhausted
381 */
382unsigned int	vm_backing_store_low = 0;
383
384unsigned int vm_pageout_out_of_line  = 0;
385unsigned int vm_pageout_in_place  = 0;
386
387unsigned int vm_page_steal_pageout_page = 0;
388
389/*
390 * ENCRYPTED SWAP:
391 * counters and statistics...
392 */
393unsigned long vm_page_decrypt_counter = 0;
394unsigned long vm_page_decrypt_for_upl_counter = 0;
395unsigned long vm_page_encrypt_counter = 0;
396unsigned long vm_page_encrypt_abort_counter = 0;
397unsigned long vm_page_encrypt_already_encrypted_counter = 0;
398boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
399
400struct	vm_pageout_queue vm_pageout_queue_internal;
401struct	vm_pageout_queue vm_pageout_queue_external;
402
403unsigned int vm_page_speculative_target = 0;
404
405vm_object_t 	vm_pageout_scan_wants_object = VM_OBJECT_NULL;
406
407boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;
408
409#if DEVELOPMENT || DEBUG
410unsigned long vm_cs_validated_resets = 0;
411#endif
412
413int	vm_debug_events	= 0;
414
415#if CONFIG_MEMORYSTATUS
416extern int memorystatus_wakeup;
417#endif
418#if CONFIG_JETSAM
419extern int memorystatus_kill_top_proc_from_VM(void);
420#endif
421
422/*
423 *	Routine:	vm_backing_store_disable
424 *	Purpose:
425 *		Suspend non-privileged threads wishing to extend
426 *		backing store when we are low on backing store
427 *		(Synchronized by caller)
428 */
429void
430vm_backing_store_disable(
431	boolean_t	disable)
432{
433	if(disable) {
434		vm_backing_store_low = 1;
435	} else {
436		if(vm_backing_store_low) {
437			vm_backing_store_low = 0;
438			thread_wakeup((event_t) &vm_backing_store_low);
439		}
440	}
441}
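/*
 * Illustrative sketch (not compiled): how a hypothetical non-privileged
 * thread wanting to extend backing store could block until the shortage
 * clears.  The real callers live elsewhere; what they rely on is the
 * thread_wakeup() on &vm_backing_store_low issued above when the flag
 * drops back to 0.
 */
#if 0
static void
example_wait_for_backing_store(void)	/* hypothetical helper */
{
	/*
	 * assumes the caller provides the synchronization that
	 * vm_backing_store_disable() notes it expects from its caller
	 */
	while (vm_backing_store_low) {
		assert_wait((event_t) &vm_backing_store_low, THREAD_UNINT);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
#endif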
442
443
444#if MACH_CLUSTER_STATS
445unsigned long vm_pageout_cluster_dirtied = 0;
446unsigned long vm_pageout_cluster_cleaned = 0;
447unsigned long vm_pageout_cluster_collisions = 0;
448unsigned long vm_pageout_cluster_clusters = 0;
449unsigned long vm_pageout_cluster_conversions = 0;
450unsigned long vm_pageout_target_collisions = 0;
451unsigned long vm_pageout_target_page_dirtied = 0;
452unsigned long vm_pageout_target_page_freed = 0;
453#define CLUSTER_STAT(clause)	clause
454#else	/* MACH_CLUSTER_STATS */
455#define CLUSTER_STAT(clause)
456#endif	/* MACH_CLUSTER_STATS */
457
458/*
459 *	Routine:	vm_pageout_object_terminate
460 *	Purpose:
461 *		Destroy the pageout_object, and perform all of the
462 *		required cleanup actions.
463 *
464 *	In/Out conditions:
465 *		The object must be locked, and will be returned locked.
466 */
467void
468vm_pageout_object_terminate(
469	vm_object_t	object)
470{
471	vm_object_t	shadow_object;
472
473	/*
474	 * Deal with the deallocation (last reference) of a pageout object
475	 * (used for cleaning-in-place) by dropping the paging references/
476	 * freeing pages in the original object.
477	 */
478
479	assert(object->pageout);
480	shadow_object = object->shadow;
481	vm_object_lock(shadow_object);
482
483	while (!queue_empty(&object->memq)) {
484		vm_page_t 		p, m;
485		vm_object_offset_t	offset;
486
487		p = (vm_page_t) queue_first(&object->memq);
488
489		assert(p->private);
490		assert(p->pageout);
491		p->pageout = FALSE;
492		assert(!p->cleaning);
493		assert(!p->laundry);
494
495		offset = p->offset;
496		VM_PAGE_FREE(p);
497		p = VM_PAGE_NULL;
498
499		m = vm_page_lookup(shadow_object,
500			offset + object->vo_shadow_offset);
501
502		if(m == VM_PAGE_NULL)
503			continue;
504
505		assert((m->dirty) || (m->precious) ||
506				(m->busy && m->cleaning));
507
508		/*
509		 * Handle the trusted pager throttle.
510		 * Also decrement the burst throttle (if external).
511		 */
512		vm_page_lock_queues();
513		if (m->laundry)
514			vm_pageout_throttle_up(m);
515
516		/*
517		 * Handle the "target" page(s). These pages are to be freed if
518		 * successfully cleaned. Target pages are always busy, and are
519		 * wired exactly once. The initial target pages are not mapped,
520		 * (so cannot be referenced or modified) but converted target
521		 * pages may have been modified between the selection as an
522		 * adjacent page and conversion to a target.
523		 */
524		if (m->pageout) {
525			assert(m->busy);
526			assert(m->wire_count == 1);
527			m->cleaning = FALSE;
528			m->encrypted_cleaning = FALSE;
529			m->pageout = FALSE;
530#if MACH_CLUSTER_STATS
531			if (m->wanted) vm_pageout_target_collisions++;
532#endif
533			/*
534			 * Revoke all access to the page. Since the object is
535			 * locked, and the page is busy, this prevents the page
536			 * from being dirtied after the pmap_disconnect() call
537			 * returns.
538			 *
539			 * Since the page is left "dirty" but "not modified", we
540			 * can detect whether the page was redirtied during
541			 * pageout by checking the modify state.
542			 */
543			if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) {
544				SET_PAGE_DIRTY(m, FALSE);
545			} else {
546				m->dirty = FALSE;
547			}
548
549			if (m->dirty) {
550				CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
551				vm_page_unwire(m, TRUE);	/* reactivates */
552				VM_STAT_INCR(reactivations);
553				PAGE_WAKEUP_DONE(m);
554			} else {
555				CLUSTER_STAT(vm_pageout_target_page_freed++;)
556				vm_page_free(m);/* clears busy, etc. */
557			}
558			vm_page_unlock_queues();
559			continue;
560		}
561		/*
562		 * Handle the "adjacent" pages. These pages were cleaned in
563		 * place, and should be left alone.
564		 * If prep_pin_count is nonzero, then someone is using the
565		 * page, so make it active.
566		 */
567		if (!m->active && !m->inactive && !m->throttled && !m->private) {
568			if (m->reference)
569				vm_page_activate(m);
570			else
571				vm_page_deactivate(m);
572		}
573		if (m->overwriting) {
574			/*
575			 * the (COPY_OUT_FROM == FALSE) request_page_list case
576			 */
577			if (m->busy) {
578				/*
579				 * We do not re-set m->dirty !
580				 * The page was busy so no extraneous activity
581				 * could have occurred. COPY_INTO is a read into the
582				 * new pages. CLEAN_IN_PLACE does actually write
583				 * out the pages but handling outside of this code
584				 * will take care of resetting dirty. We clear the
585				 * modify however for the Programmed I/O case.
586				 */
587				pmap_clear_modify(m->phys_page);
588
589				m->busy = FALSE;
590				m->absent = FALSE;
591			} else {
592				/*
593				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
594				 * Occurs when the original page was wired
595				 * at the time of the list request
596				 */
597				 assert(VM_PAGE_WIRED(m));
598				 vm_page_unwire(m, TRUE);	/* reactivates */
599			}
600			m->overwriting = FALSE;
601		} else {
602			/*
603			 * Set the dirty state according to whether or not the page was
604			 * modified during the pageout. Note that we purposefully do
605			 * NOT call pmap_clear_modify since the page is still mapped.
606			 * If the page were to be dirtied between the 2 calls, this
607			 * fact would be lost. This code is only necessary to
608			 * maintain statistics, since the pmap module is always
609			 * consulted if m->dirty is false.
610			 */
611#if MACH_CLUSTER_STATS
612			m->dirty = pmap_is_modified(m->phys_page);
613
614			if (m->dirty)	vm_pageout_cluster_dirtied++;
615			else		vm_pageout_cluster_cleaned++;
616			if (m->wanted)	vm_pageout_cluster_collisions++;
617#else
618			m->dirty = FALSE;
619#endif
620		}
621		if (m->encrypted_cleaning == TRUE) {
622			m->encrypted_cleaning = FALSE;
623			m->busy = FALSE;
624		}
625		m->cleaning = FALSE;
626
627		/*
628		 * Wakeup any thread waiting for the page to be un-cleaning.
629		 */
630		PAGE_WAKEUP(m);
631		vm_page_unlock_queues();
632	}
633	/*
634	 * Account for the paging reference taken in vm_paging_object_allocate.
635	 */
636	vm_object_activity_end(shadow_object);
637	vm_object_unlock(shadow_object);
638
639	assert(object->ref_count == 0);
640	assert(object->paging_in_progress == 0);
641	assert(object->activity_in_progress == 0);
642	assert(object->resident_page_count == 0);
643	return;
644}
645
646/*
647 * Routine:	vm_pageclean_setup
648 *
649 * Purpose:	set up a page to be cleaned (made non-dirty), but not
650 *		necessarily flushed from the VM page cache.
651 *		This is accomplished by cleaning in place.
652 *
653 *		The page must not be busy, and new_object
654 *		must be locked.
655 *
656 */
657void
658vm_pageclean_setup(
659	vm_page_t		m,
660	vm_page_t		new_m,
661	vm_object_t		new_object,
662	vm_object_offset_t	new_offset)
663{
664	assert(!m->busy);
665#if 0
666	assert(!m->cleaning);
667#endif
668
669	XPR(XPR_VM_PAGEOUT,
670    "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
671		m->object, m->offset, m,
672		new_m, new_offset);
673
674	pmap_clear_modify(m->phys_page);
675
676	/*
677	 * Mark original page as cleaning in place.
678	 */
679	m->cleaning = TRUE;
680	SET_PAGE_DIRTY(m, FALSE);
681	m->precious = FALSE;
682
683	/*
684	 * Convert the fictitious page to a private shadow of
685	 * the real page.
686	 */
687	assert(new_m->fictitious);
688	assert(new_m->phys_page == vm_page_fictitious_addr);
689	new_m->fictitious = FALSE;
690	new_m->private = TRUE;
691	new_m->pageout = TRUE;
692	new_m->phys_page = m->phys_page;
693
694	vm_page_lockspin_queues();
695	vm_page_wire(new_m);
696	vm_page_unlock_queues();
697
698	vm_page_insert(new_m, new_object, new_offset);
699	assert(!new_m->wanted);
700	new_m->busy = FALSE;
701}
702
703/*
704 *	Routine:	vm_pageout_initialize_page
705 *	Purpose:
706 *		Causes the specified page to be initialized in
707 *		the appropriate memory object. This routine is used to push
708 *		pages into a copy-object when they are modified in the
709 *		permanent object.
710 *
711 *		The page is moved to a temporary object and paged out.
712 *
713 *	In/out conditions:
714 *		The page in question must not be on any pageout queues.
715 *		The object to which it belongs must be locked.
716 *		The page must be busy, but not hold a paging reference.
717 *
718 *	Implementation:
719 *		Move this page to a completely new object.
720 */
721void
722vm_pageout_initialize_page(
723	vm_page_t	m)
724{
725	vm_object_t		object;
726	vm_object_offset_t	paging_offset;
727	memory_object_t		pager;
728
729	XPR(XPR_VM_PAGEOUT,
730		"vm_pageout_initialize_page, page 0x%X\n",
731		m, 0, 0, 0, 0);
732	assert(m->busy);
733
734	/*
735	 *	Verify that we really want to clean this page
736	 */
737	assert(!m->absent);
738	assert(!m->error);
739	assert(m->dirty);
740
741	/*
742	 *	Create a paging reference to let us play with the object.
743	 */
744	object = m->object;
745	paging_offset = m->offset + object->paging_offset;
746
747	if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
748		VM_PAGE_FREE(m);
749		panic("reservation without pageout?"); /* alan */
750		vm_object_unlock(object);
751
752		return;
753	}
754
755	/*
756	 * If there's no pager, then we can't clean the page.  This should
757	 * never happen since this should be a copy object and therefore not
758	 * an external object, so the pager should always be there.
759	 */
760
761	pager = object->pager;
762
763	if (pager == MEMORY_OBJECT_NULL) {
764		VM_PAGE_FREE(m);
765		panic("missing pager for copy object");
766		return;
767	}
768
769	/*
770	 * set the page for future call to vm_fault_list_request
771	 */
772	pmap_clear_modify(m->phys_page);
773	SET_PAGE_DIRTY(m, FALSE);
774	m->pageout = TRUE;
775
776	/*
777	 * keep the object from collapsing or terminating
778	 */
779	vm_object_paging_begin(object);
780	vm_object_unlock(object);
781
782	/*
783	 *	Write the data to its pager.
784	 *	Note that the data is passed by naming the new object,
785	 *	not a virtual address; the pager interface has been
786	 *	manipulated to use the "internal memory" data type.
787	 *	[The object reference from its allocation is donated
788	 *	to the eventual recipient.]
789	 */
790	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
791
792	vm_object_lock(object);
793	vm_object_paging_end(object);
794}
795
796#if	MACH_CLUSTER_STATS
797#define MAXCLUSTERPAGES	16
798struct {
799	unsigned long pages_in_cluster;
800	unsigned long pages_at_higher_offsets;
801	unsigned long pages_at_lower_offsets;
802} cluster_stats[MAXCLUSTERPAGES];
803#endif	/* MACH_CLUSTER_STATS */
804
805
806/*
807 * vm_pageout_cluster:
808 *
809 * Given a page, queue it to the appropriate I/O thread,
810 * which will page it out and attempt to clean adjacent pages
811 * in the same operation.
812 *
813 * The page must be busy, and the object and queues locked. We will take a
814 * paging reference to prevent deallocation or collapse when we
815 * release the object lock back at the call site.  The I/O thread
816 * is responsible for consuming this reference
817 *
818 * The page must not be on any pageout queue.
819 */
820
821void
822vm_pageout_cluster(vm_page_t m, boolean_t pageout)
823{
824	vm_object_t	object = m->object;
825        struct		vm_pageout_queue *q;
826
827
828	XPR(XPR_VM_PAGEOUT,
829		"vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
830		object, m->offset, m, 0, 0);
831
832	VM_PAGE_CHECK(m);
833#if DEBUG
834	lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
835#endif
836	vm_object_lock_assert_exclusive(object);
837
838	/*
839	 * Only a certain kind of page is appreciated here.
840	 */
841	assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
842	assert(!m->cleaning && !m->pageout && !m->laundry);
843#ifndef CONFIG_FREEZE
844	assert(!m->inactive && !m->active);
845	assert(!m->throttled);
846#endif
847
848	/*
849	 * protect the object from collapse or termination
850	 */
851	vm_object_activity_begin(object);
852
853	m->pageout = pageout;
854
855	if (object->internal == TRUE)
856	        q = &vm_pageout_queue_internal;
857	else
858	        q = &vm_pageout_queue_external;
859
860        /*
861	 * pgo_laundry count is tied to the laundry bit
862	 */
863	m->laundry = TRUE;
864	q->pgo_laundry++;
865
866	m->pageout_queue = TRUE;
867	queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
868
869	if (q->pgo_idle == TRUE) {
870	        q->pgo_idle = FALSE;
871	        thread_wakeup((event_t) &q->pgo_pending);
872	}
873
874	VM_PAGE_CHECK(m);
875}
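/*
 * Illustrative sketch (not compiled, hypothetical caller): the locking
 * and page-state preconditions documented above for vm_pageout_cluster().
 */
#if 0
static void
example_queue_page_for_pageout(vm_page_t m)	/* hypothetical helper */
{
	vm_object_lock(m->object);	/* exclusive object lock */
	vm_page_lockspin_queues();	/* page queues lock */

	assert(m->busy);		/* caller must have made the page busy */
	assert(m->dirty || m->precious);
	assert(!m->laundry && !m->pageout_queue);

	vm_pageout_cluster(m, TRUE);	/* hand it to the appropriate I/O thread */

	vm_page_unlock_queues();
	vm_object_unlock(m->object);
}
#endif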
876
877
878unsigned long vm_pageout_throttle_up_count = 0;
879
880/*
881 * A page is back from laundry or we are stealing it back from
882 * the laundering state.  See if there are some pages waiting to
883 * go to laundry and if we can let some of them go now.
884 *
885 * Object and page queues must be locked.
886 */
887void
888vm_pageout_throttle_up(
889       vm_page_t       m)
890{
891       struct vm_pageout_queue *q;
892
893       assert(m->object != VM_OBJECT_NULL);
894       assert(m->object != kernel_object);
895
896#if DEBUG
897       lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
898       vm_object_lock_assert_exclusive(m->object);
899#endif
900
901       vm_pageout_throttle_up_count++;
902
903       if (m->object->internal == TRUE)
904               q = &vm_pageout_queue_internal;
905       else
906               q = &vm_pageout_queue_external;
907
908       if (m->pageout_queue == TRUE) {
909
910	       queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
911	       m->pageout_queue = FALSE;
912
913	       m->pageq.next = NULL;
914	       m->pageq.prev = NULL;
915
916	       vm_object_activity_end(m->object);
917       }
918       if (m->laundry == TRUE) {
919
920	       m->laundry = FALSE;
921	       q->pgo_laundry--;
922
923	       if (q->pgo_throttled == TRUE) {
924		       q->pgo_throttled = FALSE;
925                       thread_wakeup((event_t) &q->pgo_laundry);
926               }
927	       if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
928		       q->pgo_draining = FALSE;
929		       thread_wakeup((event_t) (&q->pgo_laundry+1));
930	       }
931	       if (vm_pageout_precleaning_delayed == TRUE) {
932		       /*
933			* since the pageout scan can return on laundry congestion, wake it up this way;
934			* don't depend on pgo_throttled == TRUE to indicate that the pageout scan thread
935			* is blocked on &q->pgo_laundry, since the hibernation mechanism utilizes both
936			* pgo_throttled and pgo_draining
937			*/
938		       vm_pageout_precleaning_delayed = FALSE;
939		       thread_wakeup((event_t)(&vm_page_free_wanted));
940	       }
941	}
942}
943
944
945/*
946 * VM memory pressure monitoring.
947 *
948 * vm_pageout_scan() keeps track of the number of pages it considers and
949 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
950 *
951 * compute_memory_pressure() is called every second from compute_averages()
952 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
953 * of reclaimed pages in a new vm_pageout_stat[] bucket.
954 *
955 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
956 * The caller provides the number of seconds ("nsecs") worth of statistics
957 * it wants, up to 30 seconds.
958 * It computes the number of pages reclaimed in the past "nsecs" seconds and
959 * also returns the number of pages the system still needs to reclaim at this
960 * moment in time.
961 */
962#define VM_PAGEOUT_STAT_SIZE	31
963struct vm_pageout_stat {
964	unsigned int considered;
965	unsigned int reclaimed;
966} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0}, };
967unsigned int vm_pageout_stat_now = 0;
968unsigned int vm_memory_pressure = 0;
969
970#define VM_PAGEOUT_STAT_BEFORE(i) \
971	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
972#define VM_PAGEOUT_STAT_AFTER(i) \
973	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
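/*
 * Illustrative sketch (not compiled): walking the one-bucket-per-second
 * ring backwards from "now" to total the pages reclaimed over the last
 * nsecs seconds -- essentially the loop mach_vm_pressure_monitor()
 * performs below.
 */
#if 0
static unsigned int
example_pages_reclaimed_last_nsecs(unsigned int nsecs)	/* hypothetical helper */
{
	unsigned int i = vm_pageout_stat_now;
	unsigned int total = 0;

	while (nsecs-- != 0) {
		i = VM_PAGEOUT_STAT_BEFORE(i);
		if (i == vm_pageout_stat_now)
			break;		/* wrapped around the whole ring */
		total += vm_pageout_stats[i].reclaimed;
	}
	return total;
}
#endif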
974
975/*
976 * Called from compute_averages().
977 */
978void
979compute_memory_pressure(
980	__unused void *arg)
981{
982	unsigned int vm_pageout_next;
983
984	vm_memory_pressure =
985		vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed;
986
987	commpage_set_memory_pressure( vm_memory_pressure );
988
989	/* move "now" forward */
990	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
991	vm_pageout_stats[vm_pageout_next].considered = 0;
992	vm_pageout_stats[vm_pageout_next].reclaimed = 0;
993	vm_pageout_stat_now = vm_pageout_next;
994}
995
996
997/*
998 * IMPORTANT
999 * mach_vm_ctl_page_free_wanted() is called indirectly, via
1000 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
1001 * it must be safe in the restricted stackshot context. Locks and/or
1002 * blocking are not allowable.
1003 */
1004unsigned int
1005mach_vm_ctl_page_free_wanted(void)
1006{
1007	unsigned int page_free_target, page_free_count, page_free_wanted;
1008
1009	page_free_target = vm_page_free_target;
1010	page_free_count = vm_page_free_count;
1011	if (page_free_target > page_free_count) {
1012		page_free_wanted = page_free_target - page_free_count;
1013	} else {
1014		page_free_wanted = 0;
1015	}
1016
1017	return page_free_wanted;
1018}
1019
1020
1021/*
1022 * IMPORTANT:
1023 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1024 * wait_for_pressure FALSE, so that code path must remain safe in the
1025 * restricted stackshot context. No blocking or locks are allowable
1026 * on that code path.
1027 */
1028
1029kern_return_t
1030mach_vm_pressure_monitor(
1031	boolean_t	wait_for_pressure,
1032	unsigned int	nsecs_monitored,
1033	unsigned int	*pages_reclaimed_p,
1034	unsigned int	*pages_wanted_p)
1035{
1036	wait_result_t	wr;
1037	unsigned int	vm_pageout_then, vm_pageout_now;
1038	unsigned int	pages_reclaimed;
1039
1040	/*
1041	 * We don't take the vm_page_queue_lock here because we don't want
1042	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1043	 * thread when it's trying to reclaim memory.  We don't need fully
1044	 * accurate monitoring anyway...
1045	 */
1046
1047	if (wait_for_pressure) {
1048		/* wait until there's memory pressure */
1049		while (vm_page_free_count >= vm_page_free_target) {
1050			wr = assert_wait((event_t) &vm_page_free_wanted,
1051					 THREAD_INTERRUPTIBLE);
1052			if (wr == THREAD_WAITING) {
1053				wr = thread_block(THREAD_CONTINUE_NULL);
1054			}
1055			if (wr == THREAD_INTERRUPTED) {
1056				return KERN_ABORTED;
1057			}
1058			if (wr == THREAD_AWAKENED) {
1059				/*
1060				 * The memory pressure might have already
1061				 * been relieved but let's not block again
1062				 * and let's report that there was memory
1063				 * pressure at some point.
1064				 */
1065				break;
1066			}
1067		}
1068	}
1069
1070	/* provide the number of pages the system wants to reclaim */
1071	if (pages_wanted_p != NULL) {
1072		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
1073	}
1074
1075	if (pages_reclaimed_p == NULL) {
1076		return KERN_SUCCESS;
1077	}
1078
1079	/* provide number of pages reclaimed in the last "nsecs_monitored" */
1080	do {
1081		vm_pageout_now = vm_pageout_stat_now;
1082		pages_reclaimed = 0;
1083		for (vm_pageout_then =
1084			     VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1085		     vm_pageout_then != vm_pageout_now &&
1086			     nsecs_monitored-- != 0;
1087		     vm_pageout_then =
1088			     VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1089			pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed;
1090		}
1091	} while (vm_pageout_now != vm_pageout_stat_now);
1092	*pages_reclaimed_p = pages_reclaimed;
1093
1094	return KERN_SUCCESS;
1095}
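/*
 * Illustrative sketch (not compiled, hypothetical caller): polling the
 * monitor without blocking, e.g. to log how many pages were reclaimed
 * over the last 10 seconds and how many are still wanted right now.
 */
#if 0
static void
example_poll_vm_pressure(void)	/* hypothetical helper */
{
	unsigned int reclaimed = 0;
	unsigned int wanted = 0;

	if (mach_vm_pressure_monitor(FALSE,	/* don't wait for pressure */
				     10,	/* last 10 seconds of stats */
				     &reclaimed,
				     &wanted) == KERN_SUCCESS)
		printf("vm pressure: reclaimed %u pages, %u still wanted\n",
		       reclaimed, wanted);
}
#endif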
1096
1097
1098
1099/*
1100 * function in BSD to apply I/O throttle to the pageout thread
1101 */
1102extern void vm_pageout_io_throttle(void);
1103
1104
1105/*
1106 * Page States: Used below to maintain the page state
1107 * before it's removed from its Q. This saved state
1108 * helps us do the right accounting in certain cases
1109 */
1110#define PAGE_STATE_SPECULATIVE		1
1111#define PAGE_STATE_ANONYMOUS		2
1112#define PAGE_STATE_INACTIVE		3
1113#define PAGE_STATE_INACTIVE_FIRST	4
1114#define PAGE_STATE_CLEAN      5
1115
1116#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m)				\
1117	MACRO_BEGIN							\
1118	/*								\
1119	 * If a "reusable" page somehow made it back into		\
1120	 * the active queue, it's been re-used and is not		\
1121	 * quite re-usable.						\
1122	 * If the VM object was "all_reusable", consider it		\
1123	 * as "all re-used" instead of converting it to			\
1124	 * "partially re-used", which could be expensive.		\
1125	 */								\
1126	if ((m)->reusable ||						\
1127	    (m)->object->all_reusable) {				\
1128		vm_object_reuse_pages((m)->object,			\
1129				      (m)->offset,			\
1130				      (m)->offset + PAGE_SIZE_64,	\
1131				      FALSE);				\
1132	}								\
1133	MACRO_END
1134
1135
1136#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT  	128
1137#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX	1024
1138
1139#define	FCS_IDLE		0
1140#define FCS_DELAYED		1
1141#define FCS_DEADLOCK_DETECTED	2
1142
1143struct flow_control {
1144        int		state;
1145        mach_timespec_t	ts;
1146};
1147
1148uint32_t vm_pageout_considered_page = 0;
1149
1150
1151/*
1152 *	vm_pageout_scan does the dirty work for the pageout daemon.
1153 *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
1154 *	held and vm_page_free_wanted == 0.
1155 */
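/*
 *	Rough shape of the loop below (illustrative outline only; the
 *	code is authoritative):
 *
 *	Restart:
 *	  recompute the inactive and speculative targets
 *	  loop:
 *	    deactivate pages from the active queue until the inactive
 *	      target is met
 *	    if the free/cleaned targets are met and nobody is waiting
 *	      for a page, return
 *	    otherwise purge a ripe volatile object, age the speculative
 *	      bins, or evict from the object cache when appropriate
 *	    if the inactive queues are empty, the burst limit is hit,
 *	      or the default pageout queue is throttled, pause under
 *	      flow control (FCS_* states)
 *	    else take a page from the cleaned, speculative, anonymous,
 *	      or inactive queue and reclaim it, reactivate it, or send
 *	      it to the laundry
 */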
1156void
1157vm_pageout_scan(void)
1158{
1159	unsigned int loop_count = 0;
1160	unsigned int inactive_burst_count = 0;
1161	unsigned int active_burst_count = 0;
1162	unsigned int reactivated_this_call;
1163	unsigned int reactivate_limit;
1164	vm_page_t   local_freeq = NULL;
1165	int         local_freed = 0;
1166	int         delayed_unlock;
1167	int	    delayed_unlock_limit = 0;
1168	int	    refmod_state = 0;
1169        int	vm_pageout_deadlock_target = 0;
1170	struct	vm_pageout_queue *iq;
1171	struct	vm_pageout_queue *eq;
1172        struct	vm_speculative_age_q *sq;
1173	struct  flow_control	flow_control = { 0, { 0, 0 } };
1174        boolean_t inactive_throttled = FALSE;
1175	boolean_t try_failed;
1176	mach_timespec_t	ts;
1177	unsigned	int msecs = 0;
1178	vm_object_t	object;
1179	vm_object_t	last_object_tried;
1180	uint32_t	catch_up_count = 0;
1181	uint32_t	inactive_reclaim_run;
1182	boolean_t	forced_reclaim;
1183	boolean_t	exceeded_burst_throttle;
1184	boolean_t	grab_anonymous = FALSE;
1185	int		page_prev_state = 0;
1186	int		cache_evict_throttle = 0;
1187	uint32_t	vm_pageout_inactive_external_forced_reactivate_limit = 0;
1188
1189	VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
1190		       vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1191		       vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
1192
1193	flow_control.state = FCS_IDLE;
1194	iq = &vm_pageout_queue_internal;
1195	eq = &vm_pageout_queue_external;
1196	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1197
1198
1199        XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1200
1201
1202	vm_page_lock_queues();
1203	delayed_unlock = 1;	/* must be nonzero if Qs are locked, 0 if unlocked */
1204
1205	/*
1206	 *	Calculate the max number of referenced pages on the inactive
1207	 *	queue that we will reactivate.
1208	 */
1209	reactivated_this_call = 0;
1210	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
1211						    vm_page_inactive_count);
1212	inactive_reclaim_run = 0;
1213
1214	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
1215
1216	/*
1217	 *	We want to gradually dribble pages from the active queue
1218	 *	to the inactive queue.  If we let the inactive queue get
1219	 *	very small, and then suddenly dump many pages into it,
1220	 *	those pages won't get a sufficient chance to be referenced
1221	 *	before we start taking them from the inactive queue.
1222	 *
1223	 *	We must limit the rate at which we send pages to the pagers
1224	 *	so that we don't tie up too many pages in the I/O queues.
1225	 *	We implement a throttling mechanism using the laundry count
1226	 * 	to limit the number of pages outstanding to the default
1227	 *	and external pagers.  We can bypass the throttles and look
1228	 *	for clean pages if the pageout queues don't drain in a timely
1229	 *	fashion since this may indicate that the pageout paths are
1230	 *	stalled waiting for memory, which only we can provide.
1231	 */
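	/*
	 *	Concretely (illustrative summary): each struct vm_pageout_queue
	 *	counts its in-flight pages in pgo_laundry (incremented in
	 *	vm_pageout_cluster(), decremented in vm_pageout_throttle_up());
	 *	VM_PAGE_Q_THROTTLED() is assumed here to report the queue as
	 *	congested once that count reaches the queue's configured
	 *	maximum, and the FCS_* flow-control states below are what the
	 *	scan uses while it waits for that laundry to drain.
	 */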
1232
1233
1234Restart:
1235	assert(delayed_unlock!=0);
1236
1237	/*
1238	 *	Recalculate vm_page_inactive_target.
1239	 */
1240	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1241							  vm_page_inactive_count +
1242							  vm_page_speculative_count);
1243
1244	vm_page_anonymous_min = vm_page_inactive_target / 3;
1245
1246	/*
1247	 * don't want to wake the pageout_scan thread up every time we fall below
1248	 * the targets... set a low water mark at 0.25% below the target
1249	 */
1250	vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
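	/*
	 * e.g. (illustrative, hypothetical target): with an inactive target
	 * of 400,000 pages the low water mark is 400,000 - 1,000 = 399,000
	 */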
1251
1252	if (vm_page_speculative_percentage > 50)
1253		vm_page_speculative_percentage = 50;
1254	else if (vm_page_speculative_percentage <= 0)
1255		vm_page_speculative_percentage = 1;
1256
1257	vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1258								vm_page_inactive_count);
1259
1260	object = NULL;
1261	last_object_tried = NULL;
1262	try_failed = FALSE;
1263
1264	if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
1265	        catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1266	else
1267	        catch_up_count = 0;
1268
1269	for (;;) {
1270		vm_page_t m;
1271
1272		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1273
1274		if (delayed_unlock == 0) {
1275		        vm_page_lock_queues();
1276			delayed_unlock = 1;
1277		}
1278		if (vm_upl_wait_for_pages < 0)
1279			vm_upl_wait_for_pages = 0;
1280
1281		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
1282
1283		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX)
1284			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
1285
1286		/*
1287		 * Move pages from active to inactive if we're below the target
1288		 */
1289		/* if we are trying to make clean, we need to make sure we actually have inactive - mj */
1290		if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1291			goto done_moving_active_pages;
1292
1293		if (object != NULL) {
1294			vm_object_unlock(object);
1295			object = NULL;
1296			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1297		}
1298		/*
1299		 * Don't sweep through active queue more than the throttle
1300		 * which should be kept relatively low
1301		 */
1302		active_burst_count = MIN(vm_pageout_burst_active_throttle,
1303					 vm_page_active_count);
1304
1305		VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_START,
1306			       vm_pageout_inactive, vm_pageout_inactive_used, vm_page_free_count, local_freed);
1307
1308		VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_NONE,
1309			       vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1310			       vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
1311
1312		while (!queue_empty(&vm_page_queue_active) && active_burst_count--) {
1313
1314			vm_pageout_active++;
1315
1316			m = (vm_page_t) queue_first(&vm_page_queue_active);
1317
1318			assert(m->active && !m->inactive);
1319			assert(!m->laundry);
1320			assert(m->object != kernel_object);
1321			assert(m->phys_page != vm_page_guard_addr);
1322
1323			DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1324
1325			/*
1326			 * The page might be absent or busy,
1327			 * but vm_page_deactivate can handle that.
1328			 */
1329			vm_page_deactivate(m);
1330
1331			if (delayed_unlock++ > delayed_unlock_limit) {
1332
1333			        if (local_freeq) {
1334					vm_page_unlock_queues();
1335
1336					VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1337						       vm_page_free_count, local_freed, delayed_unlock_limit, 1);
1338
1339				        vm_page_free_list(local_freeq, TRUE);
1340
1341					VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1342						       vm_page_free_count, 0, 0, 1);
1343
1344					local_freeq = NULL;
1345					local_freed = 0;
1346					vm_page_lock_queues();
1347				} else
1348					lck_mtx_yield(&vm_page_queue_lock);
1349
1350				delayed_unlock = 1;
1351
1352				/*
1353				 * continue the while loop processing
1354				 * the active queue... need to hold
1355				 * the page queues lock
1356				 */
1357			}
1358		}
1359
1360		VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_END,
1361			       vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_inactive_target);
1362
1363
1364		/**********************************************************************
1365		 * above this point we're playing with the active queue
1366		 * below this point we're playing with the throttling mechanisms
1367		 * and the inactive queue
1368		 **********************************************************************/
1369
1370done_moving_active_pages:
1371
1372		if (vm_page_cleaned_count < VM_PAGE_CLEANED_MIN && vm_page_anonymous_count > vm_page_anonymous_min)
1373			vm_pageout_need_to_refill_clean_queue = TRUE;
1374
1375		if (vm_page_free_count + local_freed >= vm_page_free_target) {
1376			if (object != NULL) {
1377			        vm_object_unlock(object);
1378				object = NULL;
1379			}
1380			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1381
1382			if (local_freeq) {
1383				vm_page_unlock_queues();
1384
1385				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1386					       vm_page_free_count, local_freed, delayed_unlock_limit, 2);
1387
1388				vm_page_free_list(local_freeq, TRUE);
1389
1390				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1391					       vm_page_free_count, local_freed, 0, 2);
1392
1393				local_freeq = NULL;
1394				local_freed = 0;
1395				vm_page_lock_queues();
1396			}
1397			/*
1398			 * make sure the pageout I/O threads are running
1399			 * throttled in case there are still requests
1400			 * in the laundry... since we have met our targets
1401			 * we don't need the laundry to be cleaned in a timely
1402			 * fashion... so let's avoid interfering with foreground
1403			 * activity
1404			 */
1405			vm_pageout_adjust_io_throttles(iq, eq, TRUE);
1406
1407			/*
1408			 * recalculate vm_page_inactive_target
1409			 */
1410			vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1411									  vm_page_inactive_count +
1412									  vm_page_speculative_count);
1413#ifndef	CONFIG_EMBEDDED
1414			if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1415			    !queue_empty(&vm_page_queue_active)) {
1416				/*
1417				 * inactive target still not met... keep going
1418				 * until we get the queues balanced...
1419				 */
1420			        continue;
1421			}
1422#endif
1423		        lck_mtx_lock(&vm_page_queue_free_lock);
1424
1425			if ((vm_page_free_count >= vm_page_free_target) &&
1426			    (vm_page_cleaned_count >= VM_PAGE_CLEANED_TARGET || vm_pageout_need_to_refill_clean_queue == FALSE) &&
1427			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1428				/*
1429				 * done - we have met our target *and*
1430				 * there is no one waiting for a page.
1431				 */
1432				vm_pageout_need_to_refill_clean_queue = FALSE;
1433return_from_scan:
1434				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1435
1436				VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
1437					       vm_pageout_inactive, vm_pageout_inactive_used, vm_pageout_need_to_refill_clean_queue, 0);
1438				VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
1439					       vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1440					       vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
1441
1442				return;
1443			}
1444			lck_mtx_unlock(&vm_page_queue_free_lock);
1445		}
1446
1447		/*
1448		 * Before anything, we check if we have any ripe volatile
1449		 * objects around. If so, try to purge the first object.
1450		 * If the purge fails, fall through to reclaim a page instead.
1451		 * If the purge succeeds, go back to the top and re-evaluate
1452		 * the new memory situation.
1453		 */
1454		assert(available_for_purge >= 0);
1455		if (available_for_purge)
1456		{
1457		        if (object != NULL) {
1458			        vm_object_unlock(object);
1459				object = NULL;
1460			}
1461
1462			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1463
1464			if (TRUE == vm_purgeable_object_purge_one()) {
1465
1466				VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1467
1468				continue;
1469			}
1470			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1471		}
1472		if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1473		        /*
1474			 * try to pull pages from the aging bins...
1475			 * see vm_page.h for an explanation of how
1476			 * this mechanism works
1477			 */
1478		        struct vm_speculative_age_q	*aq;
1479			mach_timespec_t	ts_fully_aged;
1480			boolean_t	can_steal = FALSE;
1481			int num_scanned_queues;
1482
1483			aq = &vm_page_queue_speculative[speculative_steal_index];
1484
1485			num_scanned_queues = 0;
1486			while (queue_empty(&aq->age_q) &&
1487			       num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1488
1489			        speculative_steal_index++;
1490
1491				if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1492				        speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1493
1494				aq = &vm_page_queue_speculative[speculative_steal_index];
1495			}
1496
1497			if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1498				/*
1499				 * XXX We've scanned all the speculative
1500				 * queues but still haven't found one
1501				 * that is not empty, even though
1502				 * vm_page_speculative_count is not 0.
1503				 *
1504				 * report the anomaly...
1505				 */
1506				printf("vm_pageout_scan: "
1507				       "all speculative queues empty "
1508				       "but count=%d.  Re-adjusting.\n",
1509				       vm_page_speculative_count);
1510				if (vm_page_speculative_count > vm_page_speculative_count_drift_max)
1511					vm_page_speculative_count_drift_max = vm_page_speculative_count;
1512				vm_page_speculative_count_drifts++;
1513#if 6553678
1514				Debugger("vm_pageout_scan: no speculative pages");
1515#endif
1516				/* readjust... */
1517				vm_page_speculative_count = 0;
1518				/* ... and continue */
1519				continue;
1520			}
1521
1522			if (vm_page_speculative_count > vm_page_speculative_target)
1523			        can_steal = TRUE;
1524			else {
1525			        ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) / 1000;
1526				ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) % 1000)
1527				                      * 1000 * NSEC_PER_USEC;
1528
1529				ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1530
1531				clock_sec_t sec;
1532				clock_nsec_t nsec;
1533			        clock_get_system_nanotime(&sec, &nsec);
1534				ts.tv_sec = (unsigned int) sec;
1535				ts.tv_nsec = nsec;
1536
1537				if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1538				        can_steal = TRUE;
1539			}
1540			if (can_steal == TRUE)
1541			        vm_page_speculate_ageit(aq);
1542		}
1543		if (queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
1544			int 	pages_evicted;
1545
1546		        if (object != NULL) {
1547			        vm_object_unlock(object);
1548				object = NULL;
1549			}
1550			pages_evicted = vm_object_cache_evict(100, 10);
1551
1552			if (pages_evicted) {
1553
1554				vm_pageout_cache_evicted += pages_evicted;
1555
1556				VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
1557					       vm_page_free_count, pages_evicted, vm_pageout_cache_evicted, 0);
1558
1559				/*
1560				 * we just freed up to 100 pages,
1561				 * so go back to the top of the main loop
1562				 * and re-evaluate the memory situation
1563				 */
1564				continue;
1565			} else
1566				cache_evict_throttle = 100;
1567		}
1568		if  (cache_evict_throttle)
1569			cache_evict_throttle--;
1570
1571
1572		exceeded_burst_throttle = FALSE;
1573		/*
1574		 * Sometimes we have to pause:
1575		 *	1) No inactive pages - nothing to do.
1576		 *	2) Loop control - no acceptable pages found on the inactive queue
1577		 *         within the last vm_pageout_burst_inactive_throttle iterations
1578		 *	3) Flow control - default pageout queue is full
1579		 */
1580		if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_anonymous) && queue_empty(&sq->age_q)) {
1581		        vm_pageout_scan_empty_throttle++;
1582			msecs = vm_pageout_empty_wait;
1583			goto vm_pageout_scan_delay;
1584
1585		} else if (inactive_burst_count >=
1586			   MIN(vm_pageout_burst_inactive_throttle,
1587			       (vm_page_inactive_count +
1588				vm_page_speculative_count))) {
1589		        vm_pageout_scan_burst_throttle++;
1590			msecs = vm_pageout_burst_wait;
1591
1592			exceeded_burst_throttle = TRUE;
1593			goto vm_pageout_scan_delay;
1594
1595		} else if (VM_PAGE_Q_THROTTLED(iq) &&
1596				  VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
1597			clock_sec_t sec;
1598			clock_nsec_t nsec;
1599
1600		        switch (flow_control.state) {
1601
1602			case FCS_IDLE:
1603				if ((vm_page_free_count + local_freed) < vm_page_free_target) {
1604					if (vm_page_inactive_count - vm_page_anonymous_count > 0) {
1605						grab_anonymous = FALSE;
1606						goto consider_inactive;
1607					}
1608					if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target)
1609						continue;
1610				}
1611reset_deadlock_timer:
1612			        ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1613				ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1614			        clock_get_system_nanotime(&sec, &nsec);
1615				flow_control.ts.tv_sec = (unsigned int) sec;
1616				flow_control.ts.tv_nsec = nsec;
1617				ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1618
1619				flow_control.state = FCS_DELAYED;
1620				msecs = vm_pageout_deadlock_wait;
1621
1622				break;
1623
1624			case FCS_DELAYED:
1625			        clock_get_system_nanotime(&sec, &nsec);
1626				ts.tv_sec = (unsigned int) sec;
1627				ts.tv_nsec = nsec;
1628
1629				if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1630				        /*
1631					 * the pageout thread for the default pager is potentially
1632					 * deadlocked since the
1633					 * default pager queue has been throttled for more than the
1634					 * allowable time... we need to move some clean pages or dirty
1635					 * pages belonging to the external pagers if they aren't throttled
1636					 * vm_page_free_wanted represents the number of threads currently
1637					 * blocked waiting for pages... we'll move one page for each of
1638					 * these plus a fixed amount to break the logjam... once we're done
1639					 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1640					 * with a new timeout target since we have no way of knowing
1641					 * whether we've broken the deadlock except through observation
1642					 * of the queue associated with the default pager... we need to
1643					 * stop moving pages and allow the system to run to see what
1644					 * state it settles into.
1645					 */
1646				        vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1647					vm_pageout_scan_deadlock_detected++;
1648					flow_control.state = FCS_DEADLOCK_DETECTED;
1649					thread_wakeup((event_t) &vm_pageout_garbage_collect);
1650					goto consider_inactive;
1651				}
1652				/*
1653				 * just resniff instead of trying
1654				 * to compute a new delay time... we're going to be
1655				 * awakened immediately upon a laundry completion,
1656				 * so we won't wait any longer than necessary
1657				 */
1658				msecs = vm_pageout_idle_wait;
1659				break;
1660
1661			case FCS_DEADLOCK_DETECTED:
1662			        if (vm_pageout_deadlock_target)
1663				        goto consider_inactive;
1664				goto reset_deadlock_timer;
1665
1666			}
1667vm_pageout_scan_delay:
1668			if (object != NULL) {
1669			        vm_object_unlock(object);
1670				object = NULL;
1671			}
1672			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1673
1674			if (local_freeq) {
1675				vm_page_unlock_queues();
1676
1677				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1678					       vm_page_free_count, local_freed, delayed_unlock_limit, 3);
1679
1680				vm_page_free_list(local_freeq, TRUE);
1681
1682				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1683					       vm_page_free_count, local_freed, 0, 3);
1684
1685				local_freeq = NULL;
1686				local_freed = 0;
1687				vm_page_lock_queues();
1688
1689				if (flow_control.state == FCS_DELAYED &&
1690				    !VM_PAGE_Q_THROTTLED(iq)) {
1691					flow_control.state = FCS_IDLE;
1692					goto consider_inactive;
1693				}
1694			}
1695
1696			if (vm_page_free_count >= vm_page_free_target) {
1697				/*
1698				 * we're here because either
1699				 *  1) someone else freed up some pages while we had
1700				 *     the queues unlocked above or
1701				 *  2) we're precleaning and we haven't yet met
1702				 *     our cleaned target
1703				 * and we've hit one of the 3 conditions that
1704				 * cause us to pause the pageout scan thread
1705				 *
1706				 * since we already have enough free pages,
1707				 * let's avoid stalling and return normally
1708				 *
1709				 * before we return, make sure the pageout I/O threads
1710				 * are running throttled in case there are still requests
1711				 * in the laundry... since we have enough free pages
1712				 * we don't need the laundry to be cleaned in a timely
1713				 * fashion... so let's avoid interfering with foreground
1714				 * activity
1715				 *
1716				 * we don't want to hold vm_page_queue_free_lock when
1717				 * calling vm_pageout_adjust_io_throttles (since it
1718				 * may cause other locks to be taken), so we do the initial
1719				 * check outside of the lock.  Once we take the lock,
1720				 * we recheck the condition since it may have changed.
1721				 * if it has, no problem, we will make the threads
1722				 * non-throttled before actually blocking
1723				 */
1724				vm_pageout_adjust_io_throttles(iq, eq, TRUE);
1725			}
1726			lck_mtx_lock(&vm_page_queue_free_lock);
1727
1728			if (vm_page_free_count >= vm_page_free_target) {
1729				if (vm_page_cleaned_count < VM_PAGE_CLEANED_TARGET) {
1730					vm_precleaning_aborted++;
1731					vm_pageout_precleaning_delayed = TRUE;
1732				}
1733				goto return_from_scan;
1734			}
1735			lck_mtx_unlock(&vm_page_queue_free_lock);
1736
1737			if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
1738				/*
1739				 * we're most likely about to block due to one of
1740				 * the 3 conditions that cause vm_pageout_scan to
1741				 * not be able to make forward progress w/r
1742				 * to providing new pages to the free queue,
1743				 * so unthrottle the I/O threads in case we
1744				 * have laundry to be cleaned... it needs
1745				 * to be completed ASAP.
1746				 *
1747				 * even if we don't block, we want the io threads
1748				 * running unthrottled since the sum of free +
1749				 * clean pages is still under our free target
1750				 */
1751				vm_pageout_adjust_io_throttles(iq, eq, FALSE);
1752			}
1753			if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
1754				/*
1755				 * if we get here we're below our free target and
1756				 * we're stalling due to a full laundry queue or
1757				 * we don't have any inactive pages other than
1758				 * those in the clean queue...
1759				 * however, we have pages on the clean queue that
1760				 * can be moved to the free queue, so let's not
1761				 * stall the pageout scan
1762				 */
1763				flow_control.state = FCS_IDLE;
1764				goto consider_inactive;
1765			}
1766			VM_CHECK_MEMORYSTATUS;
1767
1768			if (flow_control.state != FCS_IDLE)
1769				vm_pageout_scan_throttle++;
1770			iq->pgo_throttled = TRUE;
1771
1772			assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1773			counter(c_vm_pageout_scan_block++);
1774
1775			vm_page_unlock_queues();
1776
1777			assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1778
1779			VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
1780				       iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
1781
1782			thread_block(THREAD_CONTINUE_NULL);
1783
1784			VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
1785				       iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
1786
1787			vm_page_lock_queues();
1788			delayed_unlock = 1;
1789
1790			iq->pgo_throttled = FALSE;
1791
1792			if (loop_count >= vm_page_inactive_count)
1793				loop_count = 0;
1794			inactive_burst_count = 0;
1795
1796			goto Restart;
1797			/*NOTREACHED*/
1798		}
1799
1800
1801		flow_control.state = FCS_IDLE;
1802consider_inactive:
1803		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
1804									    vm_pageout_inactive_external_forced_reactivate_limit);
1805		loop_count++;
1806		inactive_burst_count++;
1807		vm_pageout_inactive++;
1808
1809		boolean_t pageout_making_free = ((vm_page_free_count + local_freed) < vm_page_free_target); /* TRUE if making free, FALSE if making clean */
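		/*
		 * pageout_making_free == TRUE means we are still short of the
		 * free target even after counting the pages sitting on our
		 * local free list, so this pass needs to produce free pages...
		 * FALSE means the free target is met and we are only trying
		 * to refill the cleaned queue
		 */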
1810
1811		/*
1812		 * Choose a victim.
1813		 */
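		/*
		 * victim selection order when making free pages:
		 * speculative pages first, then the cleaned queue, then the
		 * regular inactive queue... anonymous pages are only taken
		 * when we need to refill the cleaned queue
		 */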
1814		while (1) {
1815			m = NULL;
1816
1817			if (VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
1818				assert(vm_page_throttled_count == 0);
1819				assert(queue_empty(&vm_page_queue_throttled));
1820			}
1821
1822			/*
1823			 * If we are still below the free target, try speculative
1824			 * and clean queue pages.
1825			 */
1826			if (pageout_making_free) {
1827				/*
1828				 * The most eligible pages are ones we paged in speculatively,
1829				 * but which have not yet been touched.
1830				 */
1831				if ( !queue_empty(&sq->age_q) ) {
1832					m = (vm_page_t) queue_first(&sq->age_q);
1833
1834					page_prev_state = PAGE_STATE_SPECULATIVE;
1835
1836					break;
1837				}
1838
1839				/*
1840				 * Try a clean-queue inactive page, if we are still trying to fill the free list.
1841				 */
1842				if ( !queue_empty(&vm_page_queue_cleaned) ) {
1843					m = (vm_page_t) queue_first(&vm_page_queue_cleaned);
1844
1845					page_prev_state = PAGE_STATE_CLEAN;
1846
1847					break;
1848				}
1849
1850				if (grab_anonymous == FALSE || queue_empty(&vm_page_queue_anonymous)) {
1851
1852					if ( !queue_empty(&vm_page_queue_inactive) ) {
1853						m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1854
1855						page_prev_state = PAGE_STATE_INACTIVE;
1856						if (vm_pageout_need_to_refill_clean_queue == TRUE)
1857							grab_anonymous = TRUE;
1858						break;
1859					}
1860				}
1861			}
1862			if (vm_pageout_need_to_refill_clean_queue == TRUE) {
1863				if ( !queue_empty(&vm_page_queue_anonymous) ) {
1864					m = (vm_page_t) queue_first(&vm_page_queue_anonymous);
1865
1866					page_prev_state = PAGE_STATE_ANONYMOUS;
1867					grab_anonymous = FALSE;
1868					break;
1869				}
1870			}
1871
1872			/*
1873			 * if we've gotten here, we have no victim page.
1874			 * if making clean, free the local freed list and return.
1875			 * if making free, check to see if we've finished balancing the queues
1876			 * yet; if we haven't, just continue, else panic
1877			 */
1878			vm_page_unlock_queues();
1879
1880			if (object != NULL) {
1881				vm_object_unlock(object);
1882				object = NULL;
1883			}
1884			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1885
1886			if (local_freeq) {
1887				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1888					       vm_page_free_count, local_freed, delayed_unlock_limit, 5);
1889
1890				vm_page_free_list(local_freeq, TRUE);
1891
1892				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1893					       vm_page_free_count, local_freed, 0, 5);
1894
1895				local_freeq = NULL;
1896				local_freed = 0;
1897			}
1898			vm_page_lock_queues();
1899			delayed_unlock = 1;
1900
1901			if (pageout_making_free == FALSE) {
1902				if (vm_pageout_need_to_refill_clean_queue == TRUE)
1903					DTRACE_VM(novictimforclean);
1904
1905				lck_mtx_lock(&vm_page_queue_free_lock);
1906				goto return_from_scan;
1907
1908			}
1909			if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target)
1910				goto Restart;
1911
1912			panic("vm_pageout: no victim");
1913
1914			/* NOTREACHED */
1915		}
1916
1917		/*
1918		 * we just found this page on one of our queues...
1919		 * it can't also be on the pageout queue, so safe
1920		 * to call VM_PAGE_QUEUES_REMOVE
1921		 */
1922		assert(!m->pageout_queue);
1923
1924		VM_PAGE_QUEUES_REMOVE(m);
1925
1926		assert(!m->laundry);
1927		assert(!m->private);
1928		assert(!m->fictitious);
1929		assert(m->object != kernel_object);
1930		assert(m->phys_page != vm_page_guard_addr);
1931
1932
1933		if (page_prev_state != PAGE_STATE_SPECULATIVE)
1934			vm_pageout_stats[vm_pageout_stat_now].considered++;
1935
1936		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1937
1938		/*
1939		 * check to see if we currently are working
1940		 * with the same object... if so, we've
1941		 * already got the lock
1942		 */
1943		if (m->object != object) {
1944		        /*
1945			 * the object associated with candidate page is
1946			 * different from the one we were just working
1947			 * with... dump the lock if we still own it
1948			 */
1949		        if (object != NULL) {
1950			        vm_object_unlock(object);
1951				object = NULL;
1952				vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1953			}
1954			/*
1955			 * Try to lock object; since we've already got the
1956			 * page queues lock, we can only 'try' for this one.
1957			 * if the 'try' fails, we need to do a mutex_pause
1958			 * to allow the owner of the object lock a chance to
1959			 * run... otherwise, we're likely to trip over this
1960			 * object in the same state as we work our way through
1961			 * the queue... clumps of pages associated with the same
1962			 * object are fairly typical on the inactive and active queues
1963			 */
1964			if (!vm_object_lock_try_scan(m->object)) {
1965				vm_page_t m_want = NULL;
1966
1967				vm_pageout_inactive_nolock++;
1968
1969				if (page_prev_state == PAGE_STATE_CLEAN)
1970					vm_pageout_cleaned_nolock++;
1971
1972				if (page_prev_state == PAGE_STATE_SPECULATIVE)
1973					page_prev_state = PAGE_STATE_INACTIVE_FIRST;
1974
1975				pmap_clear_reference(m->phys_page);
1976				m->reference = FALSE;
1977
1978				/*
1979				 * m->object must be stable since we hold the page queues lock...
1980				 * we can update the scan_collisions field sans the object lock
1981				 * since it is a separate field and this is the only spot that does
1982				 * a read-modify-write operation and it is never executed concurrently...
1983				 * we can asynchronously set this field to 0 when creating a UPL, so it
1984				 * is possible for the value to be a bit non-deterministic, but that's ok
1985				 * since it's only used as a hint
1986				 */
1987				m->object->scan_collisions++;
1988
1989				if (pageout_making_free) {
1990					if ( !queue_empty(&sq->age_q) )
1991						m_want = (vm_page_t) queue_first(&sq->age_q);
1992					else if (!queue_empty(&vm_page_queue_cleaned))
1993						m_want = (vm_page_t) queue_first(&vm_page_queue_cleaned);
1994					else if (grab_anonymous == FALSE || queue_empty(&vm_page_queue_anonymous))
1995						m_want = (vm_page_t) queue_first(&vm_page_queue_inactive);
1996				}
1997				if (m_want == NULL && vm_pageout_need_to_refill_clean_queue == TRUE) {
1998				        if ( !queue_empty(&vm_page_queue_anonymous) )
1999					        m_want = (vm_page_t) queue_first(&vm_page_queue_anonymous);
2000				}
2001				/*
2002				 * this is the next object we're going to be interested in
2003				 * try to make sure it's available after the mutex_yield
2004				 * returns control
2005				 */
2006				if (m_want)
2007					vm_pageout_scan_wants_object = m_want->object;
2008
2009				/*
2010				 * force us to dump any collected free pages
2011				 * and to pause before moving on
2012				 */
2013				try_failed = TRUE;
2014
2015				goto requeue_page;
2016			}
2017			object = m->object;
2018			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2019
2020			try_failed = FALSE;
2021		}
2022		if (catch_up_count)
2023		        catch_up_count--;
2024
2025		if (m->busy) {
2026			if (m->encrypted_cleaning) {
2027				/*
2028				 * ENCRYPTED SWAP:
2029				 * if this page has already been picked up as
2030				 * part of a page-out cluster, it will be busy
2031				 * because it is being encrypted (see
2032				 * vm_object_upl_request()).  But we still
2033				 * want to demote it from "clean-in-place"
2034				 * (aka "adjacent") to "clean-and-free" (aka
2035				 * "target"), so let's ignore its "busy" bit
2036				 * here and proceed to check for "cleaning" a
2037				 * little bit below...
2038				 *
2039				 * CAUTION CAUTION:
2040				 * A "busy" page should still be left alone for
2041				 * most purposes, so we have to be very careful
2042				 * not to process that page too much.
2043				 */
2044				assert(m->cleaning);
2045				goto consider_inactive_page;
2046			}
2047
2048			/*
2049			 *	Somebody is already playing with this page.
2050			 *	Put it back on the appropriate queue
2051			 *
2052			 */
2053			vm_pageout_inactive_busy++;
2054
2055			if (page_prev_state == PAGE_STATE_CLEAN)
2056				vm_pageout_cleaned_busy++;
2057
2058requeue_page:
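			/*
			 * put the page back where it came from... speculative
			 * pages return to the speculative queue, everything else
			 * goes back on the inactive queue (at the head if the
			 * page was tagged PAGE_STATE_INACTIVE_FIRST)
			 */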
2059			switch (page_prev_state) {
2060
2061			case PAGE_STATE_SPECULATIVE:
2062				vm_page_speculate(m, FALSE);
2063				break;
2064
2065			case PAGE_STATE_ANONYMOUS:
2066			case PAGE_STATE_CLEAN:
2067			case PAGE_STATE_INACTIVE:
2068				VM_PAGE_ENQUEUE_INACTIVE(m, FALSE);
2069				break;
2070
2071			case PAGE_STATE_INACTIVE_FIRST:
2072				VM_PAGE_ENQUEUE_INACTIVE(m, TRUE);
2073				break;
2074			}
2075			goto done_with_inactivepage;
2076		}
2077
2078
2079		/*
2080		 *	If it's absent, in error or the object is no longer alive,
2081		 *	we can reclaim the page... in the no longer alive case,
2082		 *	there are 2 states the page can be in that preclude us
2083		 *	from reclaiming it - busy or cleaning - that we've already
2084		 *	dealt with
2085		 */
2086		if (m->absent || m->error || !object->alive) {
2087
2088			if (m->absent)
2089				vm_pageout_inactive_absent++;
2090			else if (!object->alive)
2091				vm_pageout_inactive_notalive++;
2092			else
2093				vm_pageout_inactive_error++;
2094reclaim_page:
2095			if (vm_pageout_deadlock_target) {
2096				vm_pageout_scan_inactive_throttle_success++;
2097			        vm_pageout_deadlock_target--;
2098			}
2099
2100			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
2101
2102			if (object->internal) {
2103				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
2104			} else {
2105				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
2106			}
2107			assert(!m->cleaning);
2108			assert(!m->laundry);
2109
2110			m->busy = TRUE;
2111
2112			/*
2113			 * remove page from object here since we're already
2114			 * behind the object lock... defer the rest of the work
2115			 * we'd normally do in vm_page_free_prepare_object
2116			 * until 'vm_page_free_list' is called
2117			 */
2118			if (m->tabled)
2119				vm_page_remove(m, TRUE);
2120
2121			assert(m->pageq.next == NULL &&
2122			       m->pageq.prev == NULL);
2123			m->pageq.next = (queue_entry_t)local_freeq;
2124			local_freeq = m;
2125			local_freed++;
2126
2127			if (page_prev_state == PAGE_STATE_SPECULATIVE)
2128				vm_pageout_freed_from_speculative++;
2129			else if (page_prev_state == PAGE_STATE_CLEAN)
2130				vm_pageout_freed_from_cleaned++;
2131			else
2132				vm_pageout_freed_from_inactive_clean++;
2133
2134			inactive_burst_count = 0;
2135
2136			if (page_prev_state != PAGE_STATE_SPECULATIVE)
2137				vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
2138
2139			goto done_with_inactivepage;
2140		}
2141		/*
2142		 * If the object is empty, the page must be reclaimed even
2143		 * if dirty or used.
2144		 * If the page belongs to a volatile object, we stick it back
2145		 * on a paging queue (reactivate it).
2146		 */
2147		if (object->copy == VM_OBJECT_NULL) {
2148			if (object->purgable == VM_PURGABLE_EMPTY) {
2149				if (m->pmapped == TRUE) {
2150					/* unmap the page */
2151					refmod_state = pmap_disconnect(m->phys_page);
2152					if (refmod_state & VM_MEM_MODIFIED) {
2153						SET_PAGE_DIRTY(m, FALSE);
2154					}
2155				}
2156				if (m->dirty || m->precious) {
2157					/* we saved the cost of cleaning this page ! */
2158					vm_page_purged_count++;
2159				}
2160				goto reclaim_page;
2161			}
2162			if (object->purgable == VM_PURGABLE_VOLATILE) {
2163				/* if it's wired, we can't put it on our queue */
2164				assert(!VM_PAGE_WIRED(m));
2165
2166				/* just stick it back on! */
2167				reactivated_this_call++;
2168
2169				if (page_prev_state == PAGE_STATE_CLEAN)
2170					vm_pageout_cleaned_volatile_reactivated++;
2171
2172				goto reactivate_page;
2173			}
2174		}
2175
2176consider_inactive_page:
2177		if (m->busy) {
2178			/*
2179			 * CAUTION CAUTION:
2180			 * A "busy" page should always be left alone, except...
2181			 */
2182			if (m->cleaning && m->encrypted_cleaning) {
2183				/*
2184				 * ENCRYPTED_SWAP:
2185				 * We could get here with a "busy" page
2186				 * if it's being encrypted during a
2187				 * "clean-in-place" operation.  We'll deal
2188				 * with it right away by testing if it has been
2189				 * referenced and either reactivating it or
2190				 * promoting it from "clean-in-place" to
2191				 * "clean-and-free".
2192				 */
2193			} else {
2194				panic("\"busy\" page considered for pageout\n");
2195			}
2196		}
2197
2198		/*
2199		 *	If it's being used, reactivate.
2200		 *	(Fictitious pages are either busy or absent.)
2201		 *	First, update the reference and dirty bits
2202		 *	to make sure the page is unreferenced.
2203		 */
2204		refmod_state = -1;
2205
2206		if (m->reference == FALSE && m->pmapped == TRUE) {
2207		        refmod_state = pmap_get_refmod(m->phys_page);
2208
2209		        if (refmod_state & VM_MEM_REFERENCED)
2210			        m->reference = TRUE;
2211		        if (refmod_state & VM_MEM_MODIFIED) {
2212				SET_PAGE_DIRTY(m, FALSE);
2213			}
2214		}
2215
2216		/*
2217		 *   if (m->cleaning)
2218		 *	If already cleaning this page in place and it hasn't
2219		 *  been recently referenced, just pull off the queue.
2220		 *  We can leave the page mapped, and upl_commit_range
2221		 *  will put it on the clean queue.
2222		 *
2223		 *	note: if m->encrypted_cleaning == TRUE, then
2224		 *		m->cleaning == TRUE
2225		 *	and we'll handle it here
2226		 *
2227		 *   if (m->pageout && !m->cleaning)
2228		 *	an msync INVALIDATE is in progress...
2229		 *	this page has been marked for destruction
2230		 * 	after it has been cleaned,
2231		 * 	but not yet gathered into a UPL
2232		 *	where 'cleaning' will be set...
2233		 *	just leave it off the paging queues
2234		 *
2235		 *   if (m->pageout && m->cleaning)
2236		 *	an msync INVALIDATE is in progress
2237		 *	and the UPL has already gathered this page...
2238		 *	just leave it off the paging queues
2239		 */
2240
2241		/*
2242		 * a page with m->pageout set that is still on the queues means an
2243		 * MS_INVALIDATE is in progress on this page... leave it alone
2244		 */
2245		if (m->pageout) {
2246			inactive_burst_count = 0;
2247			goto done_with_inactivepage;
2248		}
2249
2250		/* if cleaning, reactivate if referenced.  otherwise, just pull off queue */
2251		if (m->cleaning) {
2252			if (m->reference == TRUE) {
2253				reactivated_this_call++;
2254				goto reactivate_page;
2255			} else {
2256				inactive_burst_count = 0;
2257				goto done_with_inactivepage;
2258			}
2259		}
2260
2261		if (m->reference || m->dirty) {
2262			/* deal with a rogue "reusable" page */
2263			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);
2264		}
2265
2266		if (m->reference && !m->no_cache) {
2267			/*
2268			 * The page we pulled off the inactive list has
2269			 * been referenced.  It is possible for other
2270			 * processors to be touching pages faster than we
2271			 * can clear the referenced bit and traverse the
2272			 * inactive queue, so we limit the number of
2273			 * reactivations.
2274			 */
2275			if (++reactivated_this_call >= reactivate_limit) {
2276				vm_pageout_reactivation_limit_exceeded++;
2277			} else if (catch_up_count) {
2278				vm_pageout_catch_ups++;
2279			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
2280				vm_pageout_inactive_force_reclaim++;
2281			} else {
2282				uint32_t isinuse;
2283
2284				if (page_prev_state == PAGE_STATE_CLEAN)
2285					vm_pageout_cleaned_reference_reactivated++;
2286
2287reactivate_page:
2288				if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
2289				     vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
2290					/*
2291					 * no explicit mappings of this object exist
2292					 * and it's not open via the filesystem
2293					 */
2294					vm_page_deactivate(m);
2295					vm_pageout_inactive_deactivated++;
2296				} else {
2297					/*
2298					 * The page was/is being used, so put back on active list.
2299					 */
2300					vm_page_activate(m);
2301					VM_STAT_INCR(reactivations);
2302				}
2303
2304				if (page_prev_state == PAGE_STATE_CLEAN)
2305					vm_pageout_cleaned_reactivated++;
2306
2307				vm_pageout_inactive_used++;
2308				inactive_burst_count = 0;
2309
2310				goto done_with_inactivepage;
2311			}
2312			/*
2313			 * Make sure we call pmap_get_refmod() if it
2314			 * wasn't already called just above, to update
2315			 * the dirty bit.
2316			 */
2317			if ((refmod_state == -1) && !m->dirty && m->pmapped) {
2318				refmod_state = pmap_get_refmod(m->phys_page);
2319				if (refmod_state & VM_MEM_MODIFIED) {
2320					SET_PAGE_DIRTY(m, FALSE);
2321				}
2322			}
2323			forced_reclaim = TRUE;
2324		} else {
2325			forced_reclaim = FALSE;
2326		}
2327
2328                XPR(XPR_VM_PAGEOUT,
2329                "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
2330                object, m->offset, m, 0,0);
2331
2332		/*
2333		 * we've got a candidate page to steal...
2334		 *
2335		 * m->dirty is up to date courtesy of the
2336		 * preceding check for m->reference... if
2337		 * we get here, then m->reference had to be
2338		 * FALSE (or possibly "reactivate_limit" was
2339                 * exceeded), but in either case we called
2340                 * pmap_get_refmod() and updated both
2341                 * m->reference and m->dirty
2342		 *
2343		 * if it's dirty or precious we need to
2344		 * see if the target queue is throttled...
2345		 * if it is, we need to skip over it by moving it back
2346		 * to the end of the inactive queue
2347		 */
2348
2349		inactive_throttled = FALSE;
2350
2351		if (m->dirty || m->precious) {
2352		        if (object->internal) {
2353				if (VM_PAGE_Q_THROTTLED(iq))
2354				        inactive_throttled = TRUE;
2355			} else if (VM_PAGE_Q_THROTTLED(eq)) {
2356				inactive_throttled = TRUE;
2357			}
2358		}
2359throttle_inactive:
2360		if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
2361		    object->internal && m->dirty &&
2362		    (object->purgable == VM_PURGABLE_DENY ||
2363		     object->purgable == VM_PURGABLE_NONVOLATILE ||
2364		     object->purgable == VM_PURGABLE_VOLATILE)) {
2365			queue_enter(&vm_page_queue_throttled, m,
2366				    vm_page_t, pageq);
2367			m->throttled = TRUE;
2368			vm_page_throttled_count++;
2369
2370			vm_pageout_scan_reclaimed_throttled++;
2371
2372			goto done_with_inactivepage;
2373		}
2374		if (inactive_throttled == TRUE) {
2375
2376			if (object->internal)
2377				vm_pageout_scan_inactive_throttled_internal++;
2378			else
2379				vm_pageout_scan_inactive_throttled_external++;
2380
2381			if (page_prev_state == PAGE_STATE_SPECULATIVE)
2382				page_prev_state = PAGE_STATE_INACTIVE;
2383
2384			if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && object->internal == FALSE) {
2385				/*
2386				 * a) The external pageout queue is throttled
2387				 * b) We're done with the active queue and moved on to the inactive queue
2388				 * c) We start noticing dirty pages and usually we would put them at the end of the inactive queue, but,
2389				 * d) We don't have a default pager, and so,
2390				 * e) We push these onto the active queue in an effort to cause a re-evaluation of the active queue
2391				 *    and get back some, possibly clean, pages.
2392				 *
2393				 * We also keep a count of the pages of this kind, since these will be a good indicator that we are in a deadlock
2394				 * on systems without a dynamic pager, where:
2395				 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2396				 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2397				 * c) Most of the pages in the inactive queue belong to this file.
2398				 */
2399				queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
2400				m->active = TRUE;
2401				vm_page_active_count++;
2402
2403				vm_pageout_adjust_io_throttles(iq, eq, FALSE);
2404
2405				vm_pageout_inactive_external_forced_reactivate_count++;
2406				vm_pageout_inactive_external_forced_reactivate_limit--;
2407
2408				if (vm_pageout_inactive_external_forced_reactivate_limit <= 0){
2409					vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2410#if CONFIG_JETSAM
2411					/*
2412					 * Possible deadlock scenario so request jetsam action
2413					 */
2414					assert(object);
2415					vm_object_unlock(object);
2416					object = VM_OBJECT_NULL;
2417					vm_page_unlock_queues();
2418
2419					if (memorystatus_kill_top_proc_from_VM() < 0){
2420						panic("vm_pageout_scan: Jetsam request failed\n");
2421					}
2422
2423					vm_pageout_inactive_external_forced_jetsam_count++;
2424					vm_page_lock_queues();
2425					delayed_unlock = 1;
2426#endif
2427				}
2428				inactive_burst_count = 0;
2429				goto done_with_inactivepage;
2430			} else {
2431				goto requeue_page;
2432			}
2433		}
2434
2435		/*
2436		 * we've got a page that we can steal...
2437		 * eliminate all mappings and make sure
2438		 * we have the up-to-date modified state
2439		 *
2440		 * if we need to do a pmap_disconnect then we
2441		 * need to re-evaluate m->dirty since the pmap_disconnect
2442		 * provides the true state atomically... the
2443		 * page was still mapped up to the pmap_disconnect
2444		 * and may have been dirtied at the last microsecond
2445		 *
2446		 * we also check for the page being referenced 'late'
2447		 * and reactivate it for that case
2448		 *
2449		 * Note that if 'pmapped' is FALSE then the page is not in,
2450		 * and never has been in, any pmap, so there is no point calling
2451		 * pmap_disconnect().  m->dirty and/or m->reference could
2452		 * have been set in anticipation of likely usage of the page.
2453		 */
2454		if (m->pmapped == TRUE) {
2455		        refmod_state = pmap_disconnect(m->phys_page);
2456
2457		        if (refmod_state & VM_MEM_MODIFIED) {
2458				SET_PAGE_DIRTY(m, FALSE);
2459			}
2460		        if (refmod_state & VM_MEM_REFERENCED) {
2461
2462				/* If m->reference is already set, this page must have
2463				 * already failed the reactivate_limit test, so don't
2464				 * bump the counts twice.
2465				 */
2466				if ( ! m->reference ) {
2467					m->reference = TRUE;
2468					if (forced_reclaim ||
2469					    ++reactivated_this_call >= reactivate_limit)
2470						vm_pageout_reactivation_limit_exceeded++;
2471					else {
2472						if (page_prev_state == PAGE_STATE_CLEAN)
2473							vm_pageout_cleaned_reference_reactivated++;
2474						goto reactivate_page;
2475					}
2476				}
2477			}
2478		}
2479		/*
2480		 * reset our count of pages that have been reclaimed
2481		 * since the last page was 'stolen'
2482		 */
2483		inactive_reclaim_run = 0;
2484
2485		/*
2486		 *	If it's clean and not precious, we can free the page.
2487		 */
2488		if (!m->dirty && !m->precious) {
2489
2490			if (page_prev_state == PAGE_STATE_SPECULATIVE)
2491				vm_pageout_speculative_clean++;
2492			else {
2493				if (page_prev_state == PAGE_STATE_ANONYMOUS)
2494					vm_pageout_inactive_anonymous++;
2495				else if (page_prev_state == PAGE_STATE_CLEAN)
2496					vm_pageout_cleaned_reclaimed++;
2497
2498				if (m->was_dirty) {
2499					/* page on clean queue used to be dirty; we should increment the vm_stat pageout count here */
2500					VM_STAT_INCR(pageouts);
2501					DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
2502				}
2503				vm_pageout_inactive_clean++;
2504			}
2505			/* FYI: (!pageout_making_free) == (!m->clean_queue && !m->speculative) */
2506			if (((vm_page_free_count + local_freed) >= vm_page_free_target) && !pageout_making_free) {
2507
2508				assert(!m->clean_queue);
2509				assert(!m->speculative);
2510
2511				/*
2512				 * we have met our free page target and this page wasn't just pulled
2513				 * from the clean or speculative queues, so put it on the clean queue
2514				 */
2515				if (m->reference == TRUE) {
2516					/*
2517					 * must have come through the forced reclaim path.
2518					 * we need to clear out the reference state in this case
2519					 * so that we don't just reactivate the page when we
2520					 * find it in the clean queue based on an old reference.
2521					 * if it gets re-referenced while on the queue, then
2522					 * the reactivation is justified
2523					 */
2524					m->reference = FALSE;
2525					pmap_clear_reference(m->phys_page);
2526				}
2527
2528				vm_pageout_enqueued_cleaned_from_inactive_clean++;
2529				vm_page_enqueue_cleaned(m);
2530
2531				inactive_burst_count = 0; /* we found a usable page on the inactive queue, hooray */
2532
2533				goto done_with_inactivepage;
2534
2535			}
2536			/*
2537			 * OK, at this point we have found a page we are going to free.
2538			 */
2539
2540#ifndef CONFIG_EMBEDDED
2541
2542#define VM_PRESSURE_INTERVAL_NS		250000000	/* nanoseconds; == .25 seconds */
2543			if (vm_pageout_need_to_refill_clean_queue == TRUE || page_prev_state == PAGE_STATE_CLEAN) {
2544				static uint64_t vm_pressure_last_time_ns = 0;
2545				uint64_t cur_time_ns = 0;
2546				absolutetime_to_nanoseconds(mach_absolute_time(), &cur_time_ns);
2547				if (cur_time_ns >= vm_pressure_last_time_ns + VM_PRESSURE_INTERVAL_NS) {
2548					vm_pressure_last_time_ns = cur_time_ns;
2549					thread_wakeup(&vm_pressure_thread);
2550#if CONFIG_MEMORYSTATUS
2551					/* Wake up idle-exit thread */
2552					thread_wakeup((event_t)&memorystatus_wakeup);
2553#endif
2554				}
2555			}
2556#endif /* !CONFIG_EMBEDDED */
2557
2558			goto reclaim_page;
2559		}
2560
2561		/*
2562		 * The page may have been dirtied since the last check
2563		 * for a throttled target queue (which may have been skipped
2564		 * if the page was clean then).  With the dirty page
2565		 * disconnected here, we can make one final check.
2566		 */
2567		if (object->internal) {
2568			if (VM_PAGE_Q_THROTTLED(iq))
2569				inactive_throttled = TRUE;
2570		} else if (VM_PAGE_Q_THROTTLED(eq)) {
2571			inactive_throttled = TRUE;
2572		}
2573
2574		if (inactive_throttled == TRUE)
2575			goto throttle_inactive;
2576
2577		/*
2578		 * do NOT set the pageout bit!
2579		 * sure, we might need free pages, but this page is going to take time to become free
2580		 * anyway, so we may as well put it on the clean queue first and take it from there later
2581		 * if necessary.  that way, we'll ensure we don't free up too much. -mj
2582		 */
2583		vm_pageout_cluster(m, FALSE);
2584
2585		if (page_prev_state == PAGE_STATE_ANONYMOUS)
2586			vm_pageout_inactive_anonymous++;
2587		if (object->internal)
2588			vm_pageout_inactive_dirty_internal++;
2589		else
2590			vm_pageout_inactive_dirty_external++;
2591
2592		inactive_burst_count = 0;
2593
2594done_with_inactivepage:
2595		if (delayed_unlock++ > delayed_unlock_limit || try_failed == TRUE) {
2596
2597		        if (object != NULL) {
2598				vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2599			        vm_object_unlock(object);
2600				object = NULL;
2601			}
2602		        if (local_freeq) {
2603				vm_page_unlock_queues();
2604
2605				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2606					       vm_page_free_count, local_freed, delayed_unlock_limit, 4);
2607
2608				vm_page_free_list(local_freeq, TRUE);
2609
2610				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2611					       vm_page_free_count, local_freed, 0, 4);
2612
2613				local_freeq = NULL;
2614				local_freed = 0;
2615				vm_page_lock_queues();
2616			} else
2617				lck_mtx_yield(&vm_page_queue_lock);
2618
2619			delayed_unlock = 1;
2620		}
2621		vm_pageout_considered_page++;
2622
2623		/*
2624		 * back to top of pageout scan loop
2625		 */
2626	}
2627}
2628
2629
2630int vm_page_free_count_init;
2631
2632void
2633vm_page_free_reserve(
2634	int pages)
2635{
2636	int		free_after_reserve;
2637
2638	vm_page_free_reserved += pages;
2639
2640	if (vm_page_free_reserved > VM_PAGE_FREE_RESERVED_LIMIT)
2641		vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
2642
2643	free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
2644
2645	vm_page_free_min = vm_page_free_reserved +
2646		VM_PAGE_FREE_MIN(free_after_reserve);
2647
2648	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
2649	        vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
2650
2651	vm_page_free_target = vm_page_free_reserved +
2652		VM_PAGE_FREE_TARGET(free_after_reserve);
2653
2654	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
2655	        vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
2656
2657	if (vm_page_free_target < vm_page_free_min + 5)
2658		vm_page_free_target = vm_page_free_min + 5;
2659
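	/*
	 * derive the remaining thresholds from the free target...
	 * the throttle limit is two thirds of the free target and
	 * the creation throttle is three times the free target
	 */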
2660	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3);
2661	vm_page_creation_throttle = vm_page_free_target * 3;
2662}
2663
2664/*
2665 *	vm_pageout is the high level pageout daemon.
2666 */
2667
2668void
2669vm_pageout_continue(void)
2670{
2671	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
2672	vm_pageout_scan_event_counter++;
2673
2674	vm_pageout_scan();
2675	/*
2676	 * we hold both the vm_page_queue_free_lock
2677	 * and the vm_page_queues_lock at this point
2678	 */
2679	assert(vm_page_free_wanted == 0);
2680	assert(vm_page_free_wanted_privileged == 0);
2681	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2682
2683	lck_mtx_unlock(&vm_page_queue_free_lock);
2684	vm_page_unlock_queues();
2685
2686	counter(c_vm_pageout_block++);
2687	thread_block((thread_continue_t)vm_pageout_continue);
2688	/*NOTREACHED*/
2689}
2690
2691
2692#ifdef FAKE_DEADLOCK
2693
2694#define FAKE_COUNT	5000
2695
2696int internal_count = 0;
2697int fake_deadlock = 0;
2698
2699#endif
2700
2701static void
2702vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2703{
2704	vm_page_t	m = NULL;
2705	vm_object_t	object;
2706	vm_object_offset_t offset;
2707	memory_object_t	pager;
2708	thread_t	self = current_thread();
2709
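	/*
	 * once the internal pageout thread exists, the external thread
	 * no longer needs the VM-privileged option it was started with,
	 * so drop it here
	 */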
2710	if ((vm_pageout_internal_iothread != THREAD_NULL)
2711	    && (self == vm_pageout_external_iothread )
2712	    && (self->options & TH_OPT_VMPRIV))
2713		self->options &= ~TH_OPT_VMPRIV;
2714
2715	vm_page_lockspin_queues();
2716
2717        while ( !queue_empty(&q->pgo_pending) ) {
2718
2719		   q->pgo_busy = TRUE;
2720		   queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2721		   if (m->object == slide_info.slide_object) {
2722			   panic("slid page %p not allowed on this path\n", m);
2723		   }
2724		   VM_PAGE_CHECK(m);
2725		   m->pageout_queue = FALSE;
2726		   m->pageq.next = NULL;
2727		   m->pageq.prev = NULL;
2728
2729		   /*
2730		    * grab a snapshot of the object and offset this
2731		    * page is tabled in so that we can relookup this
2732		    * page after we've taken the object lock - these
2733		    * fields are stable while we hold the page queues lock
2734		    * but as soon as we drop it, there is nothing to keep
2735		    * this page in this object... we hold an activity_in_progress
2736		    * on this object which will keep it from terminating
2737		    */
2738		   object = m->object;
2739		   offset = m->offset;
2740
2741		   vm_page_unlock_queues();
2742
2743#ifdef FAKE_DEADLOCK
2744		   if (q == &vm_pageout_queue_internal) {
2745		           vm_offset_t addr;
2746			   int	pg_count;
2747
2748			   internal_count++;
2749
2750			   if ((internal_count == FAKE_COUNT)) {
2751
2752				   pg_count = vm_page_free_count + vm_page_free_reserved;
2753
2754			           if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2755				           kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2756				   }
2757				   internal_count = 0;
2758				   fake_deadlock++;
2759			   }
2760		   }
2761#endif
2762		   vm_object_lock(object);
2763
2764		   m = vm_page_lookup(object, offset);
2765
2766		   if (m == NULL ||
2767		       m->busy || m->cleaning || m->pageout_queue || !m->laundry) {
2768			   /*
2769			    * it's either the same page that someone else has
2770			    * started cleaning (or it's finished cleaning or
2771			    * been put back on the pageout queue), or
2772			    * the page has been freed or we have found a
2773			    * new page at this offset... in all of these cases
2774			    * we merely need to release the activity_in_progress
2775			    * we took when we put the page on the pageout queue
2776			    */
2777			   vm_object_activity_end(object);
2778			   vm_object_unlock(object);
2779
2780			   vm_page_lockspin_queues();
2781			   continue;
2782		   }
2783		   if (!object->pager_initialized) {
2784
2785			   /*
2786			    *	If there is no memory object for the page, create
2787			    *	one and hand it to the default pager.
2788			    */
2789
2790			   if (!object->pager_initialized)
2791			           vm_object_collapse(object,
2792						      (vm_object_offset_t) 0,
2793						      TRUE);
2794			   if (!object->pager_initialized)
2795			           vm_object_pager_create(object);
2796			   if (!object->pager_initialized) {
2797			           /*
2798				    *	Still no pager for the object.
2799				    *	Reactivate the page.
2800				    *
2801				    *	Should only happen if there is no
2802				    *	default pager.
2803				    */
2804				   m->pageout = FALSE;
2805
2806			           vm_page_lockspin_queues();
2807
2808				   vm_pageout_throttle_up(m);
2809				   vm_page_activate(m);
2810				   vm_pageout_dirty_no_pager++;
2811
2812				   vm_page_unlock_queues();
2813
2814				   /*
2815				    *	And we are done with it.
2816				    */
2817			           vm_object_activity_end(object);
2818				   vm_object_unlock(object);
2819
2820				   vm_page_lockspin_queues();
2821				   continue;
2822			   }
2823		   }
2824		   pager = object->pager;
2825
2826	           if (pager == MEMORY_OBJECT_NULL) {
2827		           /*
2828			    * This pager has been destroyed by either
2829			    * memory_object_destroy or vm_object_destroy, and
2830			    * so there is nowhere for the page to go.
2831			    */
2832			   if (m->pageout) {
2833				   /*
2834				    * Just free the page... VM_PAGE_FREE takes
2835				    * care of cleaning up all the state...
2836				    * including doing the vm_pageout_throttle_up
2837				    */
2838				   VM_PAGE_FREE(m);
2839			   } else {
2840			           vm_page_lockspin_queues();
2841
2842				   vm_pageout_throttle_up(m);
2843				   vm_page_activate(m);
2844
2845				   vm_page_unlock_queues();
2846
2847				   /*
2848				    *	And we are done with it.
2849				    */
2850			   }
2851			   vm_object_activity_end(object);
2852			   vm_object_unlock(object);
2853
2854			   vm_page_lockspin_queues();
2855			   continue;
2856		   }
2857#if 0
2858		   /*
2859		    * we don't hold the page queue lock
2860		    * so this check isn't safe to make
2861		    */
2862		   VM_PAGE_CHECK(m);
2863#endif
2864		   /*
2865		    * give back the activity_in_progress reference we
2866		    * took when we queued up this page and replace it
2867		    * with a paging_in_progress reference that will
2868		    * also keep the paging offset from changing and
2869		    * prevent the object from terminating
2870		    */
2871		   vm_object_activity_end(object);
2872		   vm_object_paging_begin(object);
2873		   vm_object_unlock(object);
2874
2875                   /*
2876		    * Send the data to the pager.
2877		    * any pageout clustering happens there
2878		    */
2879		   memory_object_data_return(pager,
2880					     m->offset + object->paging_offset,
2881					     PAGE_SIZE,
2882					     NULL,
2883					     NULL,
2884					     FALSE,
2885					     FALSE,
2886					     0);
2887
2888		   vm_object_lock(object);
2889		   vm_object_paging_end(object);
2890		   vm_object_unlock(object);
2891
2892		   vm_pageout_io_throttle();
2893
2894		   vm_page_lockspin_queues();
2895	}
2896	q->pgo_busy = FALSE;
2897	q->pgo_idle = TRUE;
2898
2899	assert_wait((event_t) q, THREAD_UNINT);
2900	vm_page_unlock_queues();
2901
2902	thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2903	/*NOTREACHED*/
2904}
2905
2906
2907
2908static void
2909vm_pageout_adjust_io_throttles(struct vm_pageout_queue *iq, struct vm_pageout_queue *eq, boolean_t req_lowpriority)
2910{
2911	uint32_t 	policy;
2912	boolean_t	set_iq = FALSE;
2913	boolean_t	set_eq = FALSE;
2914
2915	if (hibernate_cleaning_in_progress == TRUE)
2916		req_lowpriority = FALSE;
2917
2918	if (iq->pgo_inited == TRUE && iq->pgo_lowpriority != req_lowpriority)
2919		set_iq = TRUE;
2920
2921	if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority)
2922		set_eq = TRUE;
2923
2924	if (set_iq == TRUE || set_eq == TRUE) {
2925
2926		vm_page_unlock_queues();
2927
2928		if (req_lowpriority == TRUE) {
2929			policy = TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE;
2930			DTRACE_VM(laundrythrottle);
2931		} else {
2932			policy = TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL;
2933			DTRACE_VM(laundryunthrottle);
2934		}
2935		if (set_iq == TRUE) {
2936			proc_apply_thread_diskacc(kernel_task, iq->pgo_tid, policy);
2937			iq->pgo_lowpriority = req_lowpriority;
2938		}
2939		if (set_eq == TRUE) {
2940			proc_apply_thread_diskacc(kernel_task, eq->pgo_tid, policy);
2941			eq->pgo_lowpriority = req_lowpriority;
2942		}
2943		vm_page_lock_queues();
2944	}
2945}
2946
2947
2948static void
2949vm_pageout_iothread_external(void)
2950{
2951	thread_t	self = current_thread();
2952
2953	self->options |= TH_OPT_VMPRIV;
2954
2955	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
2956	proc_apply_thread_diskacc(kernel_task, self->thread_id, TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE);
2957
2958	vm_page_lock_queues();
2959
2960	vm_pageout_queue_external.pgo_tid = self->thread_id;
2961	vm_pageout_queue_external.pgo_lowpriority = TRUE;
2962	vm_pageout_queue_external.pgo_inited = TRUE;
2963
2964	vm_page_unlock_queues();
2965
2966	vm_pageout_iothread_continue(&vm_pageout_queue_external);
2967
2968	/*NOTREACHED*/
2969}
2970
2971static void
2972vm_pageout_iothread_internal(void)
2973{
2974	thread_t	self = current_thread();
2975
2976	self->options |= TH_OPT_VMPRIV;
2977
2978	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
2979	proc_apply_thread_diskacc(kernel_task, self->thread_id, TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE);
2980
2981	vm_page_lock_queues();
2982
2983	vm_pageout_queue_internal.pgo_tid = self->thread_id;
2984	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
2985	vm_pageout_queue_internal.pgo_inited = TRUE;
2986
2987	vm_page_unlock_queues();
2988
2989	vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2990
2991	/*NOTREACHED*/
2992}
2993
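/*
 * register the buffer cache garbage collection callout...
 * only the first registration succeeds, since we only swap
 * the function pointer in while it is still NULL
 */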
2994kern_return_t
2995vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
2996{
2997	if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
2998		return KERN_SUCCESS;
2999	} else {
3000		return KERN_FAILURE; /* Already set */
3001	}
3002}
3003
3004static void
3005vm_pressure_thread(void) {
3006	static boolean_t set_up_thread = FALSE;
3007
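	/*
	 * the first time through we just set up and block...
	 * subsequent wakeups on &vm_pressure_thread actually
	 * process the pending VM pressure events
	 */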
3008	if (set_up_thread) {
3009#if VM_PRESSURE_EVENTS
3010		consider_vm_pressure_events();
3011#endif /* VM_PRESSURE_EVENTS */
3012	}
3013
3014	set_up_thread = TRUE;
3015	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
3016	thread_block((thread_continue_t)vm_pressure_thread);
3017}
3018
3019uint32_t vm_pageout_considered_page_last = 0;
3020
3021/*
3022 * called once per second via "compute_averages"
3023 */
3024void
3025compute_pageout_gc_throttle()
3026{
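	/*
	 * only poke the garbage collection thread if the pageout scan
	 * has considered at least one new page since our last check,
	 * so an otherwise idle system doesn't wake it needlessly
	 */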
3027	if (vm_pageout_considered_page != vm_pageout_considered_page_last) {
3028
3029		vm_pageout_considered_page_last = vm_pageout_considered_page;
3030
3031		thread_wakeup((event_t) &vm_pageout_garbage_collect);
3032	}
3033}
3034
3035
3036static void
3037vm_pageout_garbage_collect(int collect)
3038{
3039
3040	if (collect) {
3041		boolean_t buf_large_zfree = FALSE;
3042		boolean_t first_try = TRUE;
3043
3044		stack_collect();
3045
3046		consider_machine_collect();
3047
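		/*
		 * repeat while the buffer cache callout keeps freeing large
		 * zone elements and we remain below the free page target
		 */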
3048		do {
3049			if (consider_buffer_cache_collect != NULL) {
3050				buf_large_zfree = (*consider_buffer_cache_collect)(0);
3051			}
3052			if (first_try == TRUE || buf_large_zfree == TRUE) {
3053				/*
3054				 * consider_zone_gc should be last, because the other operations
3055				 * might return memory to zones.
3056				 */
3057				consider_zone_gc(buf_large_zfree);
3058			}
3059			first_try = FALSE;
3060
3061		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
3062
3063		consider_machine_adjust();
3064	}
3065	assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
3066
3067	thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
3068	/*NOTREACHED*/
3069}
3070
3071
3072
3073void
3074vm_pageout(void)
3075{
3076	thread_t	self = current_thread();
3077	thread_t	thread;
3078	kern_return_t	result;
3079	spl_t		s;
3080
3081	/*
3082	 * Set thread privileges.
3083	 */
3084	s = splsched();
3085	thread_lock(self);
3086	self->priority = BASEPRI_PREEMPT - 1;
3087	set_sched_pri(self, self->priority);
3088	thread_unlock(self);
3089
3090	if (!self->reserved_stack)
3091		self->reserved_stack = self->kernel_stack;
3092
3093	splx(s);
3094
3095	/*
3096	 *	Initialize some paging parameters.
3097	 */
3098
3099	if (vm_pageout_idle_wait == 0)
3100		vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
3101
3102	if (vm_pageout_burst_wait == 0)
3103		vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
3104
3105	if (vm_pageout_empty_wait == 0)
3106		vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
3107
3108	if (vm_pageout_deadlock_wait == 0)
3109		vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
3110
3111	if (vm_pageout_deadlock_relief == 0)
3112		vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
3113
3114	if (vm_pageout_inactive_relief == 0)
3115		vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
3116
3117	if (vm_pageout_burst_active_throttle == 0)
3118	        vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
3119
3120	if (vm_pageout_burst_inactive_throttle == 0)
3121	        vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
3122
3123	/*
3124	 * Set kernel task to low backing store privileged
3125	 * status
3126	 */
3127	task_lock(kernel_task);
3128	kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
3129	task_unlock(kernel_task);
3130
3131	vm_page_free_count_init = vm_page_free_count;
3132
3133	/*
3134	 * even if we've already called vm_page_free_reserve,
3135	 * call it again here to ensure that the targets are
3136	 * accurately calculated (it uses vm_page_free_count_init)...
3137	 * calling it with an arg of 0 will not change the reserve
3138	 * but will re-calculate free_min and free_target
3139	 */
3140	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
3141		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
3142	} else
3143		vm_page_free_reserve(0);
3144
3145
3146	queue_init(&vm_pageout_queue_external.pgo_pending);
3147	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
3148	vm_pageout_queue_external.pgo_laundry = 0;
3149	vm_pageout_queue_external.pgo_idle = FALSE;
3150	vm_pageout_queue_external.pgo_busy = FALSE;
3151	vm_pageout_queue_external.pgo_throttled = FALSE;
3152	vm_pageout_queue_external.pgo_draining = FALSE;
3153	vm_pageout_queue_external.pgo_lowpriority = FALSE;
3154	vm_pageout_queue_external.pgo_tid = -1;
3155	vm_pageout_queue_external.pgo_inited = FALSE;
3156
3157
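	/*
	 * note that the internal queue starts out with pgo_maxlaundry == 0...
	 * vm_pageout_internal_start() raises it to VM_PAGE_LAUNDRY_MAX when
	 * the internal iothread is created
	 */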
3158	queue_init(&vm_pageout_queue_internal.pgo_pending);
3159	vm_pageout_queue_internal.pgo_maxlaundry = 0;
3160	vm_pageout_queue_internal.pgo_laundry = 0;
3161	vm_pageout_queue_internal.pgo_idle = FALSE;
3162	vm_pageout_queue_internal.pgo_busy = FALSE;
3163	vm_pageout_queue_internal.pgo_throttled = FALSE;
3164	vm_pageout_queue_internal.pgo_draining = FALSE;
3165	vm_pageout_queue_internal.pgo_lowpriority = FALSE;
3166	vm_pageout_queue_internal.pgo_tid = -1;
3167	vm_pageout_queue_internal.pgo_inited = FALSE;
3168
3169	/* internal pageout thread started when default pager registered first time */
3170	/* external pageout and garbage collection threads started here */
3171
3172	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
3173					      BASEPRI_PREEMPT - 1,
3174					      &vm_pageout_external_iothread);
3175	if (result != KERN_SUCCESS)
3176		panic("vm_pageout_iothread_external: create failed");
3177
3178	thread_deallocate(vm_pageout_external_iothread);
3179
3180	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
3181					      BASEPRI_DEFAULT,
3182					      &thread);
3183	if (result != KERN_SUCCESS)
3184		panic("vm_pageout_garbage_collect: create failed");
3185
3186	thread_deallocate(thread);
3187
3188	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
3189						BASEPRI_DEFAULT,
3190						&thread);
3191
3192	if (result != KERN_SUCCESS)
3193		panic("vm_pressure_thread: create failed");
3194
3195	thread_deallocate(thread);
3196
3197	vm_object_reaper_init();
3198
3199
3200	vm_pageout_continue();
3201
3202	/*
3203	 * Unreached code!
3204	 *
3205	 * The vm_pageout_continue() call above never returns, so the code below is never
3206	 * executed.  We take advantage of this to declare several DTrace VM related probe
3207	 * points that our kernel doesn't have an analog for.  These are probe points that
3208	 * exist in Solaris and are in the DTrace documentation, so people may have written
3209	 * scripts that use them.  Declaring the probe points here means their scripts will
3210	 * compile and execute which we want for portability of the scripts, but since this
3211	 * section of code is never reached, the probe points will simply never fire.  Yes,
3212	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
3213	 * Solaris specific VM events in mind, not portability to different VM implementations.
3214	 */
3215
3216	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
3217	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
3218	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
3219	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
3220	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
3221	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
3222	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
3223	/*NOTREACHED*/
3224}
3225
3226kern_return_t
3227vm_pageout_internal_start(void)
3228{
3229	kern_return_t result;
3230
3231	vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
3232	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
3233	if (result == KERN_SUCCESS)
3234		thread_deallocate(vm_pageout_internal_iothread);
3235	return result;
3236}
3237
3238
3239static upl_t
3240upl_create(int type, int flags, upl_size_t size)
3241{
3242	upl_t	upl;
3243	int	page_field_size = 0;
3244	int	upl_flags = 0;
3245	int	upl_size  = sizeof(struct upl);
3246
3247	size = round_page_32(size);
3248
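	/*
	 * sizing: a LITE upl carries a bitmap with one bit per page,
	 * rounded up to a whole byte and then to a 4 byte boundary...
	 * an INTERNAL upl additionally embeds one upl_page_info entry
	 * per page directly after the upl structure.  As a purely
	 * illustrative example, a 1MB UPL with 4KB pages would need a
	 * 32 byte lite bitmap (256 pages -> 32 bytes, already 4 byte
	 * aligned)
	 */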
3249	if (type & UPL_CREATE_LITE) {
3250		page_field_size = (atop(size) + 7) >> 3;
3251		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
3252
3253		upl_flags |= UPL_LITE;
3254	}
3255	if (type & UPL_CREATE_INTERNAL) {
3256		upl_size += (int) sizeof(struct upl_page_info) * atop(size);
3257
3258		upl_flags |= UPL_INTERNAL;
3259	}
3260	upl = (upl_t)kalloc(upl_size + page_field_size);
3261
3262	if (page_field_size)
3263	        bzero((char *)upl + upl_size, page_field_size);
3264
3265	upl->flags = upl_flags | flags;
3266	upl->src_object = NULL;
3267	upl->kaddr = (vm_offset_t)0;
3268	upl->size = 0;
3269	upl->map_object = NULL;
3270	upl->ref_count = 1;
3271	upl->ext_ref_count = 0;
3272	upl->highest_page = 0;
3273	upl_lock_init(upl);
3274	upl->vector_upl = NULL;
3275#if UPL_DEBUG
3276	upl->ubc_alias1 = 0;
3277	upl->ubc_alias2 = 0;
3278
3279	upl->upl_creator = current_thread();
3280	upl->upl_state = 0;
3281	upl->upl_commit_index = 0;
3282	bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
3283
3284	upl->uplq.next = 0;
3285	upl->uplq.prev = 0;
3286
3287	(void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
3288#endif /* UPL_DEBUG */
3289
3290	return(upl);
3291}
3292
3293static void
3294upl_destroy(upl_t upl)
3295{
3296	int	page_field_size;  /* bit field in word size buf */
3297        int	size;
3298
3299	if (upl->ext_ref_count) {
3300		panic("upl(%p) ext_ref_count", upl);
3301	}
3302
3303#if UPL_DEBUG
3304	if ( !(upl->flags & UPL_VECTOR)) {
3305		vm_object_t	object;
3306
3307		if (upl->flags & UPL_SHADOWED) {
3308			object = upl->map_object->shadow;
3309		} else {
3310			object = upl->map_object;
3311		}
3312		vm_object_lock(object);
3313		queue_remove(&object->uplq, upl, upl_t, uplq);
3314		vm_object_activity_end(object);
3315		vm_object_collapse(object, 0, TRUE);
3316		vm_object_unlock(object);
3317	}
3318#endif /* UPL_DEBUG */
3319	/*
3320	 * drop a reference on the map_object whether or
3321	 * not a pageout object is inserted
3322	 */
3323	if (upl->flags & UPL_SHADOWED)
3324		vm_object_deallocate(upl->map_object);
3325
3326        if (upl->flags & UPL_DEVICE_MEMORY)
3327	        size = PAGE_SIZE;
3328	else
3329	        size = upl->size;
3330	page_field_size = 0;
3331
3332	if (upl->flags & UPL_LITE) {
3333		page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
3334		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
3335	}
3336	upl_lock_destroy(upl);
3337	upl->vector_upl = (vector_upl_t) 0xfeedbeef;
3338
3339	if (upl->flags & UPL_INTERNAL) {
3340		kfree(upl,
3341		      sizeof(struct upl) +
3342		      (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
3343		      + page_field_size);
3344	} else {
3345		kfree(upl, sizeof(struct upl) + page_field_size);
3346	}
3347}
3348
3349void
3350upl_deallocate(upl_t upl)
3351{
3352	if (--upl->ref_count == 0) {
3353		if(vector_upl_is_valid(upl))
3354			vector_upl_deallocate(upl);
3355		upl_destroy(upl);
3356	}
3357}
3358
3359#if DEVELOPMENT || DEBUG
3360/*
3361 * Statistics about UPL enforcement of copy-on-write obligations.
3362 */
3363unsigned long upl_cow = 0;
3364unsigned long upl_cow_again = 0;
3365unsigned long upl_cow_pages = 0;
3366unsigned long upl_cow_again_pages = 0;
3367
3368unsigned long iopl_cow = 0;
3369unsigned long iopl_cow_pages = 0;
3370#endif
3371
3372/*
3373 *	Routine:	vm_object_upl_request
3374 *	Purpose:
3375 *		Cause the population of a portion of a vm_object.
3376 *		Depending on the nature of the request, the pages
3377 *		returned may contain valid data or be uninitialized.
3378 *		A page list structure, listing the physical pages
3379 *		will be returned upon request.
3380 *		This function is called by the file system or any other
3381 *		supplier of backing store to a pager.
3382 *		IMPORTANT NOTE: The caller must still respect the relationship
3383 *		between the vm_object and its backing memory object.  The
3384 *		caller MUST NOT substitute changes in the backing file
3385 *		without first doing a memory_object_lock_request on the
3386 *		target range unless it is known that the pages are not
3387 *		shared with another entity at the pager level.
3388 *		Copy_in_to:
3389 *			if a page list structure is present
3390 *			return the mapped physical pages; where a
3391 *			page is not present, return a non-initialized
3392 *			one.  If the no_sync bit is turned on, don't
3393 *			call the pager unlock to synchronize with other
3394 *			possible copies of the page. Leave pages busy
3395 *			in the original object, if a page list structure
3396 *			was specified.  When a commit of the page list
3397 *			pages is done, the dirty bit will be set for each one.
3398 *		Copy_out_from:
3399 *			If a page list structure is present, return
3400 *			all mapped pages.  Where a page does not exist
3401 *			map a zero filled one. Leave pages busy in
3402 *			the original object.  If a page list structure
3403 *			is not specified, this call is a no-op.
3404 *
3405 *		Note:  access of default pager objects has a rather interesting
3406 *		twist.  The caller of this routine, presumably the file system
3407 *		page cache handling code, will never actually make a request
3408 *		against a default pager backed object.  Only the default
3409 *		pager will make requests on backing store related vm_objects.
3410 *		In this way the default pager can maintain the relationship
3411 *		between backing store files (abstract memory objects) and
3412 *		the vm_objects (cache objects) they support.
3413 *
3414 */
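/*
 * Illustrative sketch only (hypothetical caller, not part of the build):
 * a backing-store supplier asking for an internal "lite" UPL covering one
 * cluster, doing its I/O, then committing the range and dropping the UPL.
 * The object, offset and size are whatever the caller is working against;
 * the flag combination shown is just one plausible choice.
 *
 *	upl_t			upl;
 *	upl_page_info_t		*pl;
 *	unsigned int		count = MAX_UPL_SIZE;
 *	boolean_t		empty;
 *
 *	if (vm_object_upl_request(object, offset, size, &upl, NULL, &count,
 *				  UPL_SET_INTERNAL | UPL_SET_LITE |
 *				  UPL_COPYOUT_FROM) == KERN_SUCCESS) {
 *		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 *		... issue the I/O described by pl ...
 *		upl_commit_range(upl, 0, size, 0, pl,
 *				 size / PAGE_SIZE, &empty);
 *		upl_deallocate(upl);
 *	}
 */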
3415
3416__private_extern__ kern_return_t
3417vm_object_upl_request(
3418	vm_object_t		object,
3419	vm_object_offset_t	offset,
3420	upl_size_t		size,
3421	upl_t			*upl_ptr,
3422	upl_page_info_array_t	user_page_list,
3423	unsigned int		*page_list_count,
3424	int			cntrl_flags)
3425{
3426	vm_page_t		dst_page = VM_PAGE_NULL;
3427	vm_object_offset_t	dst_offset;
3428	upl_size_t		xfer_size;
3429	unsigned int		size_in_pages;
3430	boolean_t		dirty;
3431	boolean_t		hw_dirty;
3432	upl_t			upl = NULL;
3433	unsigned int		entry;
3434#if MACH_CLUSTER_STATS
3435	boolean_t		encountered_lrp = FALSE;
3436#endif
3437	vm_page_t		alias_page = NULL;
3438        int			refmod_state = 0;
3439	wpl_array_t 		lite_list = NULL;
3440	vm_object_t		last_copy_object;
3441	struct	vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
3442	struct	vm_page_delayed_work	*dwp;
3443	int			dw_count;
3444	int			dw_limit;
3445
3446	if (cntrl_flags & ~UPL_VALID_FLAGS) {
3447		/*
3448		 * For forward compatibility's sake,
3449		 * reject any unknown flag.
3450		 */
3451		return KERN_INVALID_VALUE;
3452	}
3453	if ( (!object->internal) && (object->paging_offset != 0) )
3454		panic("vm_object_upl_request: external object with non-zero paging offset\n");
3455	if (object->phys_contiguous)
3456	        panic("vm_object_upl_request: contiguous object specified\n");
3457
3458
3459	if ((size / PAGE_SIZE) > MAX_UPL_SIZE)
3460		size = MAX_UPL_SIZE * PAGE_SIZE;
3461
3462	if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
3463	        *page_list_count = MAX_UPL_SIZE;
3464
3465	if (cntrl_flags & UPL_SET_INTERNAL) {
3466	        if (cntrl_flags & UPL_SET_LITE) {
3467
3468			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
3469
3470			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
3471			lite_list = (wpl_array_t)
3472					(((uintptr_t)user_page_list) +
3473					((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3474			if (size == 0) {
3475				user_page_list = NULL;
3476				lite_list = NULL;
3477			}
3478		} else {
3479		        upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
3480
3481			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
3482			if (size == 0) {
3483				user_page_list = NULL;
3484			}
3485		}
3486	} else {
3487	        if (cntrl_flags & UPL_SET_LITE) {
3488
3489			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
3490
3491			lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3492			if (size == 0) {
3493				lite_list = NULL;
3494			}
3495		} else {
3496		        upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
3497		}
3498	}
3499	*upl_ptr = upl;
3500
3501	if (user_page_list)
3502	        user_page_list[0].device = FALSE;
3503
3504	if (cntrl_flags & UPL_SET_LITE) {
3505	        upl->map_object = object;
3506	} else {
3507	        upl->map_object = vm_object_allocate(size);
3508		/*
3509		 * No need to lock the new object: nobody else knows
3510		 * about it yet, so it's all ours so far.
3511		 */
3512		upl->map_object->shadow = object;
3513		upl->map_object->pageout = TRUE;
3514		upl->map_object->can_persist = FALSE;
3515		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3516		upl->map_object->vo_shadow_offset = offset;
3517		upl->map_object->wimg_bits = object->wimg_bits;
3518
3519		VM_PAGE_GRAB_FICTITIOUS(alias_page);
3520
3521		upl->flags |= UPL_SHADOWED;
3522	}
3523	/*
3524	 * ENCRYPTED SWAP:
3525	 * Just mark the UPL as "encrypted" here.
3526	 * We'll actually encrypt the pages later,
3527	 * in upl_encrypt(), when the caller has
3528	 * selected which pages need to go to swap.
3529	 */
3530	if (cntrl_flags & UPL_ENCRYPT)
3531		upl->flags |= UPL_ENCRYPTED;
3532
3533	if (cntrl_flags & UPL_FOR_PAGEOUT)
3534		upl->flags |= UPL_PAGEOUT;
3535
3536	vm_object_lock(object);
3537	vm_object_activity_begin(object);
3538
3539	/*
3540	 * we can lock in the paging_offset once paging_in_progress is set
3541	 */
3542	upl->size = size;
3543	upl->offset = offset + object->paging_offset;
3544
3545#if UPL_DEBUG
3546	vm_object_activity_begin(object);
3547	queue_enter(&object->uplq, upl, upl_t, uplq);
3548#endif /* UPL_DEBUG */
3549
3550	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
3551		/*
3552		 * Honor copy-on-write obligations
3553		 *
3554		 * The caller is gathering these pages and
3555		 * might modify their contents.  We need to
3556		 * make sure that the copy object has its own
3557		 * private copies of these pages before we let
3558		 * the caller modify them.
3559		 */
3560		vm_object_update(object,
3561				 offset,
3562				 size,
3563				 NULL,
3564				 NULL,
3565				 FALSE,	/* should_return */
3566				 MEMORY_OBJECT_COPY_SYNC,
3567				 VM_PROT_NO_CHANGE);
3568#if DEVELOPMENT || DEBUG
3569		upl_cow++;
3570		upl_cow_pages += size >> PAGE_SHIFT;
3571#endif
3572	}
3573	/*
3574	 * remember which copy object we synchronized with
3575	 */
3576	last_copy_object = object->copy;
3577	entry = 0;
3578
3579	xfer_size = size;
3580	dst_offset = offset;
3581	size_in_pages = size / PAGE_SIZE;
3582
3583	dwp = &dw_array[0];
3584	dw_count = 0;
3585	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
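	/*
	 * Page-queue state changes (activate, wire, etc.) are not applied
	 * one page at a time here; they are batched in dw_array and flushed
	 * through vm_page_do_delayed_work() once dw_limit entries have
	 * accumulated, so the page-queues lock is taken per batch rather
	 * than per page.
	 */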
3586
3587	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
3588	    object->resident_page_count < (MAX_UPL_SIZE * 2))
3589		object->scan_collisions = 0;
3590
3591	while (xfer_size) {
3592
3593		dwp->dw_mask = 0;
3594
3595		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
3596			vm_object_unlock(object);
3597			VM_PAGE_GRAB_FICTITIOUS(alias_page);
3598			vm_object_lock(object);
3599		}
3600		if (cntrl_flags & UPL_COPYOUT_FROM) {
3601		        upl->flags |= UPL_PAGE_SYNC_DONE;
3602
3603			if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
3604				dst_page->fictitious ||
3605				dst_page->absent ||
3606				dst_page->error ||
3607			        dst_page->cleaning ||
3608			        (VM_PAGE_WIRED(dst_page))) {
3609
3610				if (user_page_list)
3611					user_page_list[entry].phys_addr = 0;
3612
3613				goto try_next_page;
3614			}
3615			/*
3616			 * grab this up front...
3617			 * a high percentage of the time we're going to
3618			 * need the hardware modification state a bit later
3619			 * anyway... so we can eliminate an extra call into
3620			 * the pmap layer by grabbing it here and recording it
3621			 */
3622			if (dst_page->pmapped)
3623			        refmod_state = pmap_get_refmod(dst_page->phys_page);
3624			else
3625			        refmod_state = 0;
3626
3627			if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
3628			        /*
3629				 * page is on inactive list and referenced...
3630				 * reactivate it now... this gets it out of the
3631				 * way of vm_pageout_scan which would have to
3632				 * reactivate it upon tripping over it
3633				 */
3634				dwp->dw_mask |= DW_vm_page_activate;
3635			}
3636			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
3637			        /*
3638				 * we're only asking for DIRTY pages to be returned
3639				 */
3640			        if (dst_page->pageout || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
3641				        /*
3642					 * if this is the page stolen by vm_pageout_scan to be
3643					 * cleaned (as opposed to a buddy being clustered in),
3644					 * or this request is not being driven by a PAGEOUT cluster,
3645					 * then we only need to check for the page being dirty or
3646					 * precious to decide whether to return it
3647					 */
3648				        if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
3649					        goto check_busy;
3650					goto dont_return;
3651				}
3652				/*
3653				 * this is a request for a PAGEOUT cluster and this page
3654				 * is merely along for the ride as a 'buddy'... not only
3655				 * does it have to be dirty to be returned, but it also
3656				 * can't have been referenced recently...
3657				 */
3658				if ( (hibernate_cleaning_in_progress == TRUE ||
3659				      (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) || dst_page->throttled)) &&
3660				      ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
3661				        goto check_busy;
3662				}
3663dont_return:
3664				/*
3665				 * if we reach here, we're not to return
3666				 * the page... go on to the next one
3667				 */
3668				if (dst_page->laundry == TRUE) {
3669					/*
3670					 * if we get here, the page is not 'cleaning' (filtered out above).
3671					 * since it has been referenced, remove it from the laundry
3672					 * so we don't pay the cost of an I/O to clean a page
3673					 * we're just going to take back
3674					 */
3675					vm_page_lockspin_queues();
3676
3677					vm_pageout_steal_laundry(dst_page, TRUE);
3678					vm_page_activate(dst_page);
3679
3680					vm_page_unlock_queues();
3681				}
3682				if (user_page_list)
3683				        user_page_list[entry].phys_addr = 0;
3684
3685				goto try_next_page;
3686			}
3687check_busy:
3688			if (dst_page->busy) {
3689			        if (cntrl_flags & UPL_NOBLOCK) {
3690				        if (user_page_list)
3691					        user_page_list[entry].phys_addr = 0;
3692
3693					goto try_next_page;
3694				}
3695				/*
3696				 * someone else is playing with the
3697				 * page.  We will have to wait.
3698				 */
3699				PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3700
3701				continue;
3702			}
3703			/*
3704			 * ENCRYPTED SWAP:
3705			 * The caller is gathering this page and might
3706			 * access its contents later on.  Decrypt the
3707			 * page before adding it to the UPL, so that
3708			 * the caller never sees encrypted data.
3709			 */
3710			if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
3711			        int  was_busy;
3712
3713				/*
3714				 * save the current state of busy
3715				 * mark page as busy while decrypt
3716				 * is in progress since it will drop
3717				 * the object lock...
3718				 */
3719				was_busy = dst_page->busy;
3720				dst_page->busy = TRUE;
3721
3722				vm_page_decrypt(dst_page, 0);
3723				vm_page_decrypt_for_upl_counter++;
3724				/*
3725				 * restore to original busy state
3726				 */
3727				dst_page->busy = was_busy;
3728			}
3729			if (dst_page->pageout_queue == TRUE) {
3730
3731				vm_page_lockspin_queues();
3732
3733				if (dst_page->pageout_queue == TRUE) {
3734					/*
3735					 * we've buddied up a page for a clustered pageout
3736					 * that has already been moved to the pageout
3737					 * queue by pageout_scan... we need to remove
3738					 * it from the queue and drop the laundry count
3739					 * on that queue
3740					 */
3741					vm_pageout_throttle_up(dst_page);
3742				}
3743				vm_page_unlock_queues();
3744			}
3745#if MACH_CLUSTER_STATS
3746			/*
3747			 * pageout statistics gathering.  count
3748			 * all the pages we will page out that
3749			 * were not counted in the initial
3750			 * vm_pageout_scan work
3751			 */
3752			if (dst_page->pageout)
3753			        encountered_lrp = TRUE;
3754			if ((dst_page->dirty ||	(dst_page->object->internal && dst_page->precious))) {
3755			        if (encountered_lrp)
3756				        CLUSTER_STAT(pages_at_higher_offsets++;)
3757				else
3758				        CLUSTER_STAT(pages_at_lower_offsets++;)
3759			}
3760#endif
3761			hw_dirty = refmod_state & VM_MEM_MODIFIED;
3762			dirty = hw_dirty ? TRUE : dst_page->dirty;
3763
3764			if (dst_page->phys_page > upl->highest_page)
3765			        upl->highest_page = dst_page->phys_page;
3766
3767			if (cntrl_flags & UPL_SET_LITE) {
3768				unsigned int	pg_num;
3769
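				/*
				 * lite_list is a bitmap with one bit per
				 * page of the UPL: 32-bit word (pg_num >> 5)
				 * holds the bit for page pg_num at position
				 * (pg_num & 31).
				 */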
3770				pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
3771				assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
3772				lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3773
3774				if (hw_dirty)
3775				        pmap_clear_modify(dst_page->phys_page);
3776
3777				/*
3778				 * Mark original page as cleaning
3779				 * in place.
3780				 */
3781				dst_page->cleaning = TRUE;
3782				dst_page->precious = FALSE;
3783			} else {
3784			        /*
3785				 * use pageclean setup, it is more
3786				 * convenient even for the pageout
3787				 * cases here
3788				 */
3789			        vm_object_lock(upl->map_object);
3790				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3791				vm_object_unlock(upl->map_object);
3792
3793				alias_page->absent = FALSE;
3794				alias_page = NULL;
3795			}
3796#if     MACH_PAGEMAP
3797			/*
3798			 * Record that this page has been
3799			 * written out
3800			 */
3801			vm_external_state_set(object->existence_map, dst_page->offset);
3802#endif  /*MACH_PAGEMAP*/
3803			if (dirty) {
3804				SET_PAGE_DIRTY(dst_page, FALSE);
3805			} else {
3806				dst_page->dirty = FALSE;
3807			}
3808
3809			if (!dirty)
3810				dst_page->precious = TRUE;
3811
3812			if ( (cntrl_flags & UPL_ENCRYPT) ) {
3813			        /*
3814				 * ENCRYPTED SWAP:
3815				 * We want to deny access to the target page
3816				 * because its contents are about to be
3817				 * encrypted and the user would be very
3818				 * confused to see encrypted data instead
3819				 * of their data.
3820				 * We also set "encrypted_cleaning" to allow
3821				 * vm_pageout_scan() to demote that page
3822				 * from "adjacent/clean-in-place" to
3823				 * "target/clean-and-free" if it bumps into
3824				 * this page during its scanning while we're
3825				 * still processing this cluster.
3826				 */
3827			        dst_page->busy = TRUE;
3828				dst_page->encrypted_cleaning = TRUE;
3829			}
3830			if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
3831				if ( !VM_PAGE_WIRED(dst_page))
3832					dst_page->pageout = TRUE;
3833			}
3834		} else {
3835			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
3836				/*
3837				 * Honor copy-on-write obligations
3838				 *
3839				 * The copy object has changed since we
3840				 * last synchronized for copy-on-write.
3841				 * Another copy object might have been
3842				 * inserted while we released the object's
3843				 * lock.  Since someone could have seen the
3844				 * original contents of the remaining pages
3845				 * through that new object, we have to
3846				 * synchronize with it again for the remaining
3847				 * pages only.  The previous pages are "busy"
3848				 * so they can not be seen through the new
3849				 * mapping.  The new mapping will see our
3850				 * upcoming changes for those previous pages,
3851				 * but that's OK since they couldn't see what
3852				 * was there before.  It's just a race anyway
3853				 * and there's no guarantee of consistency or
3854				 * atomicity.  We just don't want new mappings
3855				 * to see both the *before* and *after* pages.
3856				 */
3857				if (object->copy != VM_OBJECT_NULL) {
3858					vm_object_update(
3859						object,
3860						dst_offset,/* current offset */
3861						xfer_size, /* remaining size */
3862						NULL,
3863						NULL,
3864						FALSE,	   /* should_return */
3865						MEMORY_OBJECT_COPY_SYNC,
3866						VM_PROT_NO_CHANGE);
3867
3868#if DEVELOPMENT || DEBUG
3869					upl_cow_again++;
3870					upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3871#endif
3872				}
3873				/*
3874				 * remember the copy object we synced with
3875				 */
3876				last_copy_object = object->copy;
3877			}
3878			dst_page = vm_page_lookup(object, dst_offset);
3879
3880			if (dst_page != VM_PAGE_NULL) {
3881
3882				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3883					/*
3884					 * skip over pages already present in the cache
3885					 */
3886					if (user_page_list)
3887						user_page_list[entry].phys_addr = 0;
3888
3889					goto try_next_page;
3890				}
3891				if (dst_page->fictitious) {
3892					panic("need corner case for fictitious page");
3893				}
3894
3895				if (dst_page->busy || dst_page->cleaning) {
3896					/*
3897					 * someone else is playing with the
3898					 * page.  We will have to wait.
3899					 */
3900					PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3901
3902					continue;
3903				}
3904				if (dst_page->laundry) {
3905					dst_page->pageout = FALSE;
3906
3907					vm_pageout_steal_laundry(dst_page, FALSE);
3908				}
3909			} else {
3910				if (object->private) {
3911					/*
3912					 * This is a nasty wrinkle for users
3913					 * of upl who encounter device or
3914					 * private memory; however, it is
3915					 * unavoidable.  Only a fault can
3916					 * resolve the actual backing
3917					 * physical page by asking the
3918					 * backing device.
3919					 */
3920					if (user_page_list)
3921						user_page_list[entry].phys_addr = 0;
3922
3923					goto try_next_page;
3924				}
3925				if (object->scan_collisions) {
3926					/*
3927					 * the pageout_scan thread is trying to steal
3928					 * pages from this object, but has run into our
3929					 * lock... grab 2 pages from the head of the object...
3930					 * the first is freed on behalf of pageout_scan, the
3931					 * 2nd is for our own use... we use vm_object_page_grab
3932					 * in both cases to avoid taking pages from the free
3933					 * list since we are under memory pressure and our
3934					 * lock on this object is getting in the way of
3935					 * relieving it
3936					 */
3937					dst_page = vm_object_page_grab(object);
3938
3939					if (dst_page != VM_PAGE_NULL)
3940						vm_page_release(dst_page);
3941
3942					dst_page = vm_object_page_grab(object);
3943				}
3944				if (dst_page == VM_PAGE_NULL) {
3945					/*
3946					 * need to allocate a page
3947					 */
3948					dst_page = vm_page_grab();
3949				}
3950				if (dst_page == VM_PAGE_NULL) {
3951				        if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3952					       /*
3953						* we don't want to stall waiting for pages to come onto the free list
3954						* while we're already holding absent pages in this UPL
3955						* the caller will deal with the empty slots
3956						*/
3957					        if (user_page_list)
3958						        user_page_list[entry].phys_addr = 0;
3959
3960						goto try_next_page;
3961					}
3962				        /*
3963					 * no pages available... wait
3964					 * then try again for the same
3965					 * offset...
3966					 */
3967					vm_object_unlock(object);
3968
3969					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
3970
3971					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
3972
3973					VM_PAGE_WAIT();
3974					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
3975
3976					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
3977
3978					vm_object_lock(object);
3979
3980					continue;
3981				}
3982				vm_page_insert(dst_page, object, dst_offset);
3983
3984				dst_page->absent = TRUE;
3985				dst_page->busy = FALSE;
3986
3987				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3988				        /*
3989					 * if UPL_RET_ONLY_ABSENT was specified,
3990					 * then we're definitely setting up a
3991					 * UPL for a clustered read/pagein
3992					 * operation... mark the pages as clustered
3993					 * so upl_commit_range can put them on the
3994					 * speculative list
3995					 */
3996				        dst_page->clustered = TRUE;
3997				}
3998			}
3999			/*
4000			 * ENCRYPTED SWAP:
4001			 */
4002			if (cntrl_flags & UPL_ENCRYPT) {
4003				/*
4004				 * The page is going to be encrypted when we
4005				 * get it from the pager, so mark it so.
4006				 */
4007				dst_page->encrypted = TRUE;
4008			} else {
4009				/*
4010				 * Otherwise, the page will not contain
4011				 * encrypted data.
4012				 */
4013				dst_page->encrypted = FALSE;
4014			}
4015			dst_page->overwriting = TRUE;
4016
4017			if (dst_page->pmapped) {
4018			        if ( !(cntrl_flags & UPL_FILE_IO))
4019				        /*
4020					 * eliminate all mappings from the
4021					 * original object and its progeny
4022					 */
4023				        refmod_state = pmap_disconnect(dst_page->phys_page);
4024				else
4025				        refmod_state = pmap_get_refmod(dst_page->phys_page);
4026			} else
4027			        refmod_state = 0;
4028
4029			hw_dirty = refmod_state & VM_MEM_MODIFIED;
4030			dirty = hw_dirty ? TRUE : dst_page->dirty;
4031
4032			if (cntrl_flags & UPL_SET_LITE) {
4033				unsigned int	pg_num;
4034
4035				pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
4036				assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
4037				lite_list[pg_num>>5] |= 1 << (pg_num & 31);
4038
4039				if (hw_dirty)
4040				        pmap_clear_modify(dst_page->phys_page);
4041
4042				/*
4043				 * Mark original page as cleaning
4044				 * in place.
4045				 */
4046				dst_page->cleaning = TRUE;
4047				dst_page->precious = FALSE;
4048			} else {
4049				/*
4050				 * use pageclean setup, it is more
4051				 * convenient even for the pageout
4052				 * cases here
4053				 */
4054			        vm_object_lock(upl->map_object);
4055				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
4056			        vm_object_unlock(upl->map_object);
4057
4058				alias_page->absent = FALSE;
4059				alias_page = NULL;
4060			}
4061
4062			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
4063				upl->flags &= ~UPL_CLEAR_DIRTY;
4064				upl->flags |= UPL_SET_DIRTY;
4065				dirty = TRUE;
4067			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
4068				/*
4069				 * clean in place for read implies
4070				 * that a write will be done on all
4071				 * the pages that are dirty before
4072				 * a upl commit is done.  The caller
4073				 * is obligated to preserve the
4074				 * contents of all pages marked dirty
4075				 */
4076				upl->flags |= UPL_CLEAR_DIRTY;
4077			}
4078			dst_page->dirty = dirty;
4079
4080			if (!dirty)
4081				dst_page->precious = TRUE;
4082
4083			if ( !VM_PAGE_WIRED(dst_page)) {
4084			        /*
4085				 * deny access to the target page while
4086				 * it is being worked on
4087				 */
4088				dst_page->busy = TRUE;
4089			} else
4090				dwp->dw_mask |= DW_vm_page_wire;
4091
4092			/*
4093			 * We might be about to satisfy a fault which has been
4094			 * requested. So no need for the "restart" bit.
4095			 */
4096			dst_page->restart = FALSE;
4097			if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
4098			        /*
4099				 * expect the page to be used
4100				 */
4101				dwp->dw_mask |= DW_set_reference;
4102			}
4103			if (cntrl_flags & UPL_PRECIOUS) {
4104				if (dst_page->object->internal) {
4105					SET_PAGE_DIRTY(dst_page, FALSE);
4106					dst_page->precious = FALSE;
4107				} else {
4108					dst_page->precious = TRUE;
4109				}
4110			} else {
4111				dst_page->precious = FALSE;
4112			}
4113		}
4114		if (dst_page->busy)
4115			upl->flags |= UPL_HAS_BUSY;
4116
4117		if (dst_page->phys_page > upl->highest_page)
4118		        upl->highest_page = dst_page->phys_page;
4119		if (user_page_list) {
4120			user_page_list[entry].phys_addr = dst_page->phys_page;
4121			user_page_list[entry].pageout	= dst_page->pageout;
4122			user_page_list[entry].absent	= dst_page->absent;
4123			user_page_list[entry].dirty	= dst_page->dirty;
4124			user_page_list[entry].precious	= dst_page->precious;
4125			user_page_list[entry].device	= FALSE;
4126			user_page_list[entry].needed    = FALSE;
4127			if (dst_page->clustered == TRUE)
4128			        user_page_list[entry].speculative = dst_page->speculative;
4129			else
4130			        user_page_list[entry].speculative = FALSE;
4131			user_page_list[entry].cs_validated = dst_page->cs_validated;
4132			user_page_list[entry].cs_tainted = dst_page->cs_tainted;
4133		}
4134	        /*
4135		 * if UPL_RET_ONLY_ABSENT is set, then
4136		 * we are working with a fresh page and we've
4137		 * just set the clustered flag on it to
4138		 * indicate that it was dragged in as part of a
4139		 * speculative cluster... so leave it alone
4140		 */
4141		if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
4142		        /*
4143			 * someone is explicitly grabbing this page...
4144			 * update clustered and speculative state
4145			 *
4146			 */
4147		        VM_PAGE_CONSUME_CLUSTERED(dst_page);
4148		}
4149try_next_page:
4150		if (dwp->dw_mask) {
4151			if (dwp->dw_mask & DW_vm_page_activate)
4152				VM_STAT_INCR(reactivations);
4153
4154			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
4155
4156			if (dw_count >= dw_limit) {
4157				vm_page_do_delayed_work(object, &dw_array[0], dw_count);
4158
4159				dwp = &dw_array[0];
4160				dw_count = 0;
4161			}
4162		}
4163		entry++;
4164		dst_offset += PAGE_SIZE_64;
4165		xfer_size -= PAGE_SIZE;
4166	}
4167	if (dw_count)
4168		vm_page_do_delayed_work(object, &dw_array[0], dw_count);
4169
4170	if (alias_page != NULL) {
4171		VM_PAGE_FREE(alias_page);
4172	}
4173
4174	if (page_list_count != NULL) {
4175	        if (upl->flags & UPL_INTERNAL)
4176			*page_list_count = 0;
4177		else if (*page_list_count > entry)
4178			*page_list_count = entry;
4179	}
4180#if UPL_DEBUG
4181	upl->upl_state = 1;
4182#endif
4183	vm_object_unlock(object);
4184
4185	return KERN_SUCCESS;
4186}
4187
4188/* JMM - Backward compatibility for now */
4189kern_return_t
4190vm_fault_list_request(			/* forward */
4191	memory_object_control_t		control,
4192	vm_object_offset_t	offset,
4193	upl_size_t		size,
4194	upl_t			*upl_ptr,
4195	upl_page_info_t		**user_page_list_ptr,
4196	unsigned int		page_list_count,
4197	int			cntrl_flags);
4198kern_return_t
4199vm_fault_list_request(
4200	memory_object_control_t		control,
4201	vm_object_offset_t	offset,
4202	upl_size_t		size,
4203	upl_t			*upl_ptr,
4204	upl_page_info_t		**user_page_list_ptr,
4205	unsigned int		page_list_count,
4206	int			cntrl_flags)
4207{
4208	unsigned int		local_list_count;
4209	upl_page_info_t		*user_page_list;
4210	kern_return_t		kr;
4211
4212	if((cntrl_flags & UPL_VECTOR)==UPL_VECTOR)
4213		 return KERN_INVALID_ARGUMENT;
4214
4215	if (user_page_list_ptr != NULL) {
4216		local_list_count = page_list_count;
4217		user_page_list = *user_page_list_ptr;
4218	} else {
4219		local_list_count = 0;
4220		user_page_list = NULL;
4221	}
4222	kr =  memory_object_upl_request(control,
4223				offset,
4224				size,
4225				upl_ptr,
4226				user_page_list,
4227				&local_list_count,
4228				cntrl_flags);
4229
4230	if(kr != KERN_SUCCESS)
4231		return kr;
4232
4233	if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
4234		*user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
4235	}
4236
4237	return KERN_SUCCESS;
4238}
4239
4240
4241
4242/*
4243 *	Routine:	vm_object_super_upl_request
4244 *	Purpose:
4245 *		Cause the population of a portion of a vm_object
4246 *		in much the same way as memory_object_upl_request.
4247 *		Depending on the nature of the request, the pages
4248 *		returned may contain valid data or be uninitialized.
4249 *		However, the region may be expanded up to the super
4250 *		cluster size provided.
4251 */
4252
4253__private_extern__ kern_return_t
4254vm_object_super_upl_request(
4255	vm_object_t object,
4256	vm_object_offset_t	offset,
4257	upl_size_t		size,
4258	upl_size_t		super_cluster,
4259	upl_t			*upl,
4260	upl_page_info_t		*user_page_list,
4261	unsigned int		*page_list_count,
4262	int			cntrl_flags)
4263{
4264	if (object->paging_offset > offset  || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
4265		return KERN_FAILURE;
4266
4267	assert(object->paging_in_progress);
4268	offset = offset - object->paging_offset;
4269
4270	if (super_cluster > size) {
4271
4272		vm_object_offset_t	base_offset;
4273		upl_size_t		super_size;
4274		vm_object_size_t	super_size_64;
4275
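		/*
		 * super_cluster is assumed to be a power of two, so the
		 * masking below rounds the request down to a cluster
		 * boundary.  Illustrative values only: with super_cluster =
		 * 0x10000, offset = 0x11000 and size = 0x3000, base_offset
		 * becomes 0x10000 and, since offset + size still fits within
		 * that one cluster, super_size remains 0x10000.
		 */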
4276		base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
4277		super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
4278		super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
4279		super_size = (upl_size_t) super_size_64;
4280		assert(super_size == super_size_64);
4281
4282		if (offset > (base_offset + super_size)) {
4283		        panic("vm_object_super_upl_request: Missed target pageout"
4284			      " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
4285			      offset, base_offset, super_size, super_cluster,
4286			      size, object->paging_offset);
4287		}
4288		/*
4289		 * apparently there is a case where the vm requests a
4290		 * page to be written out whose offset is beyond the
4291		 * object size
4292		 */
4293		if ((offset + size) > (base_offset + super_size)) {
4294		        super_size_64 = (offset + size) - base_offset;
4295			super_size = (upl_size_t) super_size_64;
4296			assert(super_size == super_size_64);
4297		}
4298
4299		offset = base_offset;
4300		size = super_size;
4301	}
4302	return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
4303}
4304
4305
4306kern_return_t
4307vm_map_create_upl(
4308	vm_map_t		map,
4309	vm_map_address_t	offset,
4310	upl_size_t		*upl_size,
4311	upl_t			*upl,
4312	upl_page_info_array_t	page_list,
4313	unsigned int		*count,
4314	int			*flags)
4315{
4316	vm_map_entry_t	entry;
4317	int		caller_flags;
4318	int		force_data_sync;
4319	int		sync_cow_data;
4320	vm_object_t	local_object;
4321	vm_map_offset_t	local_offset;
4322	vm_map_offset_t	local_start;
4323	kern_return_t	ret;
4324
4325	caller_flags = *flags;
4326
4327	if (caller_flags & ~UPL_VALID_FLAGS) {
4328		/*
4329		 * For forward compatibility's sake,
4330		 * reject any unknown flag.
4331		 */
4332		return KERN_INVALID_VALUE;
4333	}
4334	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
4335	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
4336
4337	if (upl == NULL)
4338		return KERN_INVALID_ARGUMENT;
4339
4340REDISCOVER_ENTRY:
4341	vm_map_lock_read(map);
4342
4343	if (vm_map_lookup_entry(map, offset, &entry)) {
4344
4345		if ((entry->vme_end - offset) < *upl_size) {
4346			*upl_size = (upl_size_t) (entry->vme_end - offset);
4347			assert(*upl_size == entry->vme_end - offset);
4348		}
4349
4350		if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
4351		        *flags = 0;
4352
4353			if ( !entry->is_sub_map && entry->object.vm_object != VM_OBJECT_NULL) {
4354			        if (entry->object.vm_object->private)
4355				        *flags = UPL_DEV_MEMORY;
4356
4357				if (entry->object.vm_object->phys_contiguous)
4358					*flags |= UPL_PHYS_CONTIG;
4359			}
4360			vm_map_unlock_read(map);
4361
4362			return KERN_SUCCESS;
4363		}
4364
4365		if (entry->is_sub_map) {
4366			vm_map_t	submap;
4367
4368			submap = entry->object.sub_map;
4369			local_start = entry->vme_start;
4370			local_offset = entry->offset;
4371
4372			vm_map_reference(submap);
4373			vm_map_unlock_read(map);
4374
4375			ret = vm_map_create_upl(submap,
4376						local_offset + (offset - local_start),
4377						upl_size, upl, page_list, count, flags);
4378			vm_map_deallocate(submap);
4379
4380			return ret;
4381		}
4382
4383	        if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
4384			if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE)
4385				*upl_size = MAX_UPL_SIZE * PAGE_SIZE;
4386		}
4387		/*
4388		 *      Create an object if necessary.
4389		 */
4390		if (entry->object.vm_object == VM_OBJECT_NULL) {
4391
4392			if (vm_map_lock_read_to_write(map))
4393				goto REDISCOVER_ENTRY;
4394
4395			entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
4396			entry->offset = 0;
4397
4398			vm_map_lock_write_to_read(map);
4399		}
4400		if (!(caller_flags & UPL_COPYOUT_FROM)) {
4401			if (!(entry->protection & VM_PROT_WRITE)) {
4402				vm_map_unlock_read(map);
4403				return KERN_PROTECTION_FAILURE;
4404			}
4405
4406#if !CONFIG_EMBEDDED
4407			local_object = entry->object.vm_object;
4408			if (vm_map_entry_should_cow_for_true_share(entry) &&
4409			    local_object->vo_size > *upl_size &&
4410			    *upl_size != 0) {
4411				vm_prot_t	prot;
4412
4413				/*
4414				 * Set up the targeted range for copy-on-write to avoid
4415				 * applying true_share/copy_delay to the entire object.
4416				 */
4417
4418				if (vm_map_lock_read_to_write(map)) {
4419					goto REDISCOVER_ENTRY;
4420				}
4421
4422				vm_map_clip_start(map, entry, vm_map_trunc_page(offset));
4423				vm_map_clip_end(map, entry, vm_map_round_page(offset + *upl_size));
4424				prot = entry->protection & ~VM_PROT_WRITE;
4425				if (override_nx(map, entry->alias) && prot)
4426					prot |= VM_PROT_EXECUTE;
4427				vm_object_pmap_protect(local_object,
4428						       entry->offset,
4429						       entry->vme_end - entry->vme_start,
4430						       ((entry->is_shared || map->mapped_in_other_pmaps)
4431							? PMAP_NULL
4432							: map->pmap),
4433						       entry->vme_start,
4434						       prot);
4435				entry->needs_copy = TRUE;
4436
4437				vm_map_lock_write_to_read(map);
4438			}
4439#endif /* !CONFIG_EMBEDDED */
4440
4441			if (entry->needs_copy)  {
4442				/*
4443				 * Honor copy-on-write for COPY_SYMMETRIC
4444				 * strategy.
4445				 */
4446				vm_map_t		local_map;
4447				vm_object_t		object;
4448				vm_object_offset_t	new_offset;
4449				vm_prot_t		prot;
4450				boolean_t		wired;
4451				vm_map_version_t	version;
4452				vm_map_t		real_map;
4453
4454				local_map = map;
4455
4456				if (vm_map_lookup_locked(&local_map,
4457							 offset, VM_PROT_WRITE,
4458							 OBJECT_LOCK_EXCLUSIVE,
4459							 &version, &object,
4460							 &new_offset, &prot, &wired,
4461							 NULL,
4462							 &real_map) != KERN_SUCCESS) {
4463				        vm_map_unlock_read(local_map);
4464					return KERN_FAILURE;
4465				}
4466				if (real_map != map)
4467					vm_map_unlock(real_map);
4468				vm_map_unlock_read(local_map);
4469
4470				vm_object_unlock(object);
4471
4472				goto REDISCOVER_ENTRY;
4473			}
4474		}
4475		if (sync_cow_data) {
4476			if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
4477				local_object = entry->object.vm_object;
4478				local_start = entry->vme_start;
4479				local_offset = entry->offset;
4480
4481				vm_object_reference(local_object);
4482				vm_map_unlock_read(map);
4483
4484				if (local_object->shadow && local_object->copy) {
4485				        vm_object_lock_request(
4486							       local_object->shadow,
4487							       (vm_object_offset_t)
4488							       ((offset - local_start) +
4489								local_offset) +
4490							       local_object->vo_shadow_offset,
4491							       *upl_size, FALSE,
4492							       MEMORY_OBJECT_DATA_SYNC,
4493							       VM_PROT_NO_CHANGE);
4494				}
4495				sync_cow_data = FALSE;
4496				vm_object_deallocate(local_object);
4497
4498				goto REDISCOVER_ENTRY;
4499			}
4500		}
4501		if (force_data_sync) {
4502			local_object = entry->object.vm_object;
4503			local_start = entry->vme_start;
4504			local_offset = entry->offset;
4505
4506			vm_object_reference(local_object);
4507		        vm_map_unlock_read(map);
4508
4509			vm_object_lock_request(
4510					       local_object,
4511					       (vm_object_offset_t)
4512					       ((offset - local_start) + local_offset),
4513					       (vm_object_size_t)*upl_size, FALSE,
4514					       MEMORY_OBJECT_DATA_SYNC,
4515					       VM_PROT_NO_CHANGE);
4516
4517			force_data_sync = FALSE;
4518			vm_object_deallocate(local_object);
4519
4520			goto REDISCOVER_ENTRY;
4521		}
4522		if (entry->object.vm_object->private)
4523		        *flags = UPL_DEV_MEMORY;
4524		else
4525		        *flags = 0;
4526
4527		if (entry->object.vm_object->phys_contiguous)
4528		        *flags |= UPL_PHYS_CONTIG;
4529
4530		local_object = entry->object.vm_object;
4531		local_offset = entry->offset;
4532		local_start = entry->vme_start;
4533
4534		vm_object_reference(local_object);
4535		vm_map_unlock_read(map);
4536
4537		ret = vm_object_iopl_request(local_object,
4538					      (vm_object_offset_t) ((offset - local_start) + local_offset),
4539					      *upl_size,
4540					      upl,
4541					      page_list,
4542					      count,
4543					      caller_flags);
4544		vm_object_deallocate(local_object);
4545
4546		return(ret);
4547	}
4548	vm_map_unlock_read(map);
4549
4550	return(KERN_FAILURE);
4551}
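/*
 * Illustrative sketch only (hypothetical caller): creating a UPL straight
 * from a map range, e.g. to get at the physical pages backing a kernel
 * virtual range.  "map", "addr" and "size" are assumptions of the sketch;
 * note that *flags is in/out -- on return it reports UPL_DEV_MEMORY and/or
 * UPL_PHYS_CONTIG for the underlying object.
 *
 *	upl_t			upl;
 *	upl_page_info_t		*pl;
 *	unsigned int		count = 0;
 *	upl_size_t		upl_size = size;
 *	int			flags = UPL_SET_INTERNAL | UPL_SET_LITE |
 *					UPL_COPYOUT_FROM;
 *	boolean_t		empty;
 *
 *	if (vm_map_create_upl(map, (vm_map_address_t)addr, &upl_size,
 *			      &upl, NULL, &count, &flags) == KERN_SUCCESS) {
 *		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 *		... use pl[i].phys_addr for the backing pages ...
 *		upl_commit_range(upl, 0, upl_size, 0, pl,
 *				 upl_size / PAGE_SIZE, &empty);
 *		upl_deallocate(upl);
 *	}
 */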
4552
4553/*
4554 * Internal routine to enter a UPL into a VM map.
4555 *
4556 * JMM - This should just be doable through the standard
4557 * vm_map_enter() API.
4558 */
4559kern_return_t
4560vm_map_enter_upl(
4561	vm_map_t		map,
4562	upl_t			upl,
4563	vm_map_offset_t		*dst_addr)
4564{
4565	vm_map_size_t	 	size;
4566	vm_object_offset_t 	offset;
4567	vm_map_offset_t		addr;
4568	vm_page_t		m;
4569	kern_return_t		kr;
4570	int			isVectorUPL = 0, curr_upl=0;
4571	upl_t			vector_upl = NULL;
4572	vm_offset_t		vector_upl_dst_addr = 0;
4573	vm_map_t		vector_upl_submap = NULL;
4574	upl_offset_t 		subupl_offset = 0;
4575	upl_size_t		subupl_size = 0;
4576
4577	if (upl == UPL_NULL)
4578		return KERN_INVALID_ARGUMENT;
4579
4580	if((isVectorUPL = vector_upl_is_valid(upl))) {
4581		int mapped=0,valid_upls=0;
4582		vector_upl = upl;
4583
4584		upl_lock(vector_upl);
4585		for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4586			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
4587			if(upl == NULL)
4588				continue;
4589			valid_upls++;
4590			if (UPL_PAGE_LIST_MAPPED & upl->flags)
4591				mapped++;
4592		}
4593
4594		if(mapped) {
4595			if(mapped != valid_upls)
4596				panic("Only %d of the %d sub-upls within the Vector UPL are already mapped\n", mapped, valid_upls);
4597			else {
4598				upl_unlock(vector_upl);
4599				return KERN_FAILURE;
4600			}
4601		}
4602
4603		kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, VM_FLAGS_ANYWHERE, &vector_upl_submap);
4604		if( kr != KERN_SUCCESS )
4605			panic("Vector UPL submap allocation failed\n");
4606		map = vector_upl_submap;
4607		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
4608		curr_upl=0;
4609	}
4610	else
4611		upl_lock(upl);
4612
4613process_upl_to_enter:
4614	if(isVectorUPL){
4615		if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4616			*dst_addr = vector_upl_dst_addr;
4617			upl_unlock(vector_upl);
4618			return KERN_SUCCESS;
4619		}
4620		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4621		if(upl == NULL)
4622			goto process_upl_to_enter;
4623
4624		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
4625		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
4626	} else {
4627		/*
4628		 * check to see if already mapped
4629		 */
4630		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
4631			upl_unlock(upl);
4632			return KERN_FAILURE;
4633		}
4634	}
4635	if ((!(upl->flags & UPL_SHADOWED)) &&
4636	    ((upl->flags & UPL_HAS_BUSY) ||
4637	     !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
4638
4639		vm_object_t 		object;
4640		vm_page_t		alias_page;
4641		vm_object_offset_t	new_offset;
4642		unsigned int		pg_num;
4643		wpl_array_t 		lite_list;
4644
4645		if (upl->flags & UPL_INTERNAL) {
4646			lite_list = (wpl_array_t)
4647				((((uintptr_t)upl) + sizeof(struct upl))
4648				 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4649		} else {
4650		        lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
4651		}
4652		object = upl->map_object;
4653		upl->map_object = vm_object_allocate(upl->size);
4654
4655		vm_object_lock(upl->map_object);
4656
4657		upl->map_object->shadow = object;
4658		upl->map_object->pageout = TRUE;
4659		upl->map_object->can_persist = FALSE;
4660		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
4661		upl->map_object->vo_shadow_offset = upl->offset - object->paging_offset;
4662		upl->map_object->wimg_bits = object->wimg_bits;
4663		offset = upl->map_object->vo_shadow_offset;
4664		new_offset = 0;
4665		size = upl->size;
4666
4667		upl->flags |= UPL_SHADOWED;
4668
4669		while (size) {
4670			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
4671			assert(pg_num == new_offset / PAGE_SIZE);
4672
4673			if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4674
4675				VM_PAGE_GRAB_FICTITIOUS(alias_page);
4676
4677				vm_object_lock(object);
4678
4679				m = vm_page_lookup(object, offset);
4680				if (m == VM_PAGE_NULL) {
4681				        panic("vm_upl_map: page missing\n");
4682				}
4683
4684				/*
4685				 * Convert the fictitious page to a private
4686				 * shadow of the real page.
4687				 */
4688				assert(alias_page->fictitious);
4689				alias_page->fictitious = FALSE;
4690				alias_page->private = TRUE;
4691				alias_page->pageout = TRUE;
4692				/*
4693				 * since m is a page in the upl it must
4694				 * already be wired or BUSY, so it's
4695				 * safe to assign the underlying physical
4696				 * page to the alias
4697				 */
4698				alias_page->phys_page = m->phys_page;
4699
4700			        vm_object_unlock(object);
4701
4702				vm_page_lockspin_queues();
4703				vm_page_wire(alias_page);
4704				vm_page_unlock_queues();
4705
4706				/*
4707				 * ENCRYPTED SWAP:
4708				 * The virtual page ("m") has to be wired in some way
4709				 * here or its physical page ("m->phys_page") could
4710				 * be recycled at any time.
4711				 * Assuming this is enforced by the caller, we can't
4712				 * get an encrypted page here.  Since the encryption
4713				 * key depends on the VM page's "pager" object and
4714				 * the "paging_offset", we couldn't handle 2 pageable
4715				 * VM pages (with different pagers and paging_offsets)
4716				 * sharing the same physical page:  we could end up
4717				 * encrypting with one key (via one VM page) and
4718				 * decrypting with another key (via the alias VM page).
4719				 */
4720				ASSERT_PAGE_DECRYPTED(m);
4721
4722				vm_page_insert(alias_page, upl->map_object, new_offset);
4723
4724				assert(!alias_page->wanted);
4725				alias_page->busy = FALSE;
4726				alias_page->absent = FALSE;
4727			}
4728			size -= PAGE_SIZE;
4729			offset += PAGE_SIZE_64;
4730			new_offset += PAGE_SIZE_64;
4731		}
4732		vm_object_unlock(upl->map_object);
4733	}
4734	if (upl->flags & UPL_SHADOWED)
4735	        offset = 0;
4736	else
4737	        offset = upl->offset - upl->map_object->paging_offset;
4738
4739	size = upl->size;
4740
4741	vm_object_reference(upl->map_object);
4742
4743	if(!isVectorUPL) {
4744		*dst_addr = 0;
4745		/*
4746	 	* NEED A UPL_MAP ALIAS
4747	 	*/
4748		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4749				  VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
4750				  VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
4751
4752		if (kr != KERN_SUCCESS) {
4753			upl_unlock(upl);
4754			return(kr);
4755		}
4756	}
4757	else {
4758		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4759				  VM_FLAGS_FIXED, upl->map_object, offset, FALSE,
4760				  VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
4761		if(kr)
4762			panic("vm_map_enter failed for a Vector UPL\n");
4763	}
4764	vm_object_lock(upl->map_object);
4765
4766	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
4767		m = vm_page_lookup(upl->map_object, offset);
4768
4769		if (m) {
4770			m->pmapped = TRUE;
4771
4772			/* CODE SIGNING ENFORCEMENT: page has been wpmapped,
4773			 * but only in kernel space. If this was on a user map,
4774			 * we'd have to set the wpmapped bit. */
4775			/* m->wpmapped = TRUE; */
4776			assert(map==kernel_map);
4777
4778			PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, VM_PROT_NONE, 0, TRUE);
4779		}
4780		offset += PAGE_SIZE_64;
4781	}
4782	vm_object_unlock(upl->map_object);
4783
4784	/*
4785	 * hold a reference for the mapping
4786	 */
4787	upl->ref_count++;
4788	upl->flags |= UPL_PAGE_LIST_MAPPED;
4789	upl->kaddr = (vm_offset_t) *dst_addr;
4790	assert(upl->kaddr == *dst_addr);
4791
4792	if(isVectorUPL)
4793		goto process_upl_to_enter;
4794
4795	upl_unlock(upl);
4796
4797	return KERN_SUCCESS;
4798}
4799
4800/*
4801 * Internal routine to remove a UPL mapping from a VM map.
4802 *
4803 * XXX - This should just be doable through a standard
4804 * vm_map_remove() operation.  Otherwise, implicit clean-up
4805 * of the target map won't be able to correctly remove
4806 * these (and release the reference on the UPL).  Having
4807 * to do this means we can't map these into user-space
4808 * maps yet.
4809 */
4810kern_return_t
4811vm_map_remove_upl(
4812	vm_map_t	map,
4813	upl_t		upl)
4814{
4815	vm_address_t	addr;
4816	upl_size_t	size;
4817	int		isVectorUPL = 0, curr_upl = 0;
4818	upl_t		vector_upl = NULL;
4819
4820	if (upl == UPL_NULL)
4821		return KERN_INVALID_ARGUMENT;
4822
4823	if((isVectorUPL = vector_upl_is_valid(upl))) {
4824		int 	unmapped=0, valid_upls=0;
4825		vector_upl = upl;
4826		upl_lock(vector_upl);
4827		for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4828			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
4829			if(upl == NULL)
4830				continue;
4831			valid_upls++;
4832			if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
4833				unmapped++;
4834		}
4835
4836		if(unmapped) {
4837			if(unmapped != valid_upls)
4838				panic("%d of the %d sub-upls within the Vector UPL are not mapped\n", unmapped, valid_upls);
4839			else {
4840				upl_unlock(vector_upl);
4841				return KERN_FAILURE;
4842			}
4843		}
4844		curr_upl=0;
4845	}
4846	else
4847		upl_lock(upl);
4848
4849process_upl_to_remove:
4850	if(isVectorUPL) {
4851		if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4852			vm_map_t v_upl_submap;
4853			vm_offset_t v_upl_submap_dst_addr;
4854			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
4855
4856			vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
4857			vm_map_deallocate(v_upl_submap);
4858			upl_unlock(vector_upl);
4859			return KERN_SUCCESS;
4860		}
4861
4862		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4863		if(upl == NULL)
4864			goto process_upl_to_remove;
4865	}
4866
4867	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
4868		addr = upl->kaddr;
4869		size = upl->size;
4870
4871		assert(upl->ref_count > 1);
4872		upl->ref_count--;		/* removing mapping ref */
4873
4874		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
4875		upl->kaddr = (vm_offset_t) 0;
4876
4877		if(!isVectorUPL) {
4878			upl_unlock(upl);
4879
4880			vm_map_remove(map,
4881				vm_map_trunc_page(addr),
4882				vm_map_round_page(addr + size),
4883				VM_MAP_NO_FLAGS);
4884
4885			return KERN_SUCCESS;
4886		}
4887		else {
4888			/*
4889			* If it's a Vectored UPL, we'll be removing the entire
4890			* submap anyway, so no need to remove individual UPL
4891			* element mappings from within the submap
4892			*/
4893			goto process_upl_to_remove;
4894		}
4895	}
4896	upl_unlock(upl);
4897
4898	return KERN_FAILURE;
4899}
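/*
 * Illustrative sketch only (hypothetical caller): pairing the two routines
 * above to touch a UPL's pages through a kernel virtual address.
 * vm_map_enter_upl() takes an extra reference on the UPL for the mapping;
 * vm_map_remove_upl() drops it when the mapping goes away.
 *
 *	vm_map_offset_t		kaddr;
 *
 *	if (vm_map_enter_upl(kernel_map, upl, &kaddr) == KERN_SUCCESS) {
 *		... read or write the pages through kaddr ...
 *		vm_map_remove_upl(kernel_map, upl);
 *	}
 */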
4900
4901
4902kern_return_t
4903upl_commit_range(
4904	upl_t			upl,
4905	upl_offset_t		offset,
4906	upl_size_t		size,
4907	int			flags,
4908	upl_page_info_t		*page_list,
4909	mach_msg_type_number_t	count,
4910	boolean_t		*empty)
4911{
4912	upl_size_t		xfer_size, subupl_size = size;
4913	vm_object_t		shadow_object;
4914	vm_object_t		object;
4915	vm_object_offset_t	target_offset;
4916	upl_offset_t		subupl_offset = offset;
4917	int			entry;
4918	wpl_array_t 		lite_list;
4919	int			occupied;
4920	int			clear_refmod = 0;
4921	int			pgpgout_count = 0;
4922	struct	vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
4923	struct	vm_page_delayed_work	*dwp;
4924	int			dw_count;
4925	int			dw_limit;
4926	int			isVectorUPL = 0;
4927	upl_t			vector_upl = NULL;
4928	boolean_t		should_be_throttled = FALSE;
4929
4930	*empty = FALSE;
4931
4932	if (upl == UPL_NULL)
4933		return KERN_INVALID_ARGUMENT;
4934
4935	if (count == 0)
4936		page_list = NULL;
4937
4938	if((isVectorUPL = vector_upl_is_valid(upl))) {
4939		vector_upl = upl;
4940		upl_lock(vector_upl);
4941	}
4942	else
4943		upl_lock(upl);
4944
4945process_upl_to_commit:
4946
4947	if(isVectorUPL) {
4948		size = subupl_size;
4949		offset = subupl_offset;
4950		if(size == 0) {
4951			upl_unlock(vector_upl);
4952			return KERN_SUCCESS;
4953		}
4954		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
4955		if(upl == NULL) {
4956			upl_unlock(vector_upl);
4957			return KERN_FAILURE;
4958		}
4959		page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
4960		subupl_size -= size;
4961		subupl_offset += size;
4962	}
4963
4964#if UPL_DEBUG
4965	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
4966		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
4967
4968		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
4969		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
4970
4971		upl->upl_commit_index++;
4972	}
4973#endif
4974	if (upl->flags & UPL_DEVICE_MEMORY)
4975		xfer_size = 0;
4976	else if ((offset + size) <= upl->size)
4977	        xfer_size = size;
4978	else {
4979		if(!isVectorUPL)
4980			upl_unlock(upl);
4981		else {
4982			upl_unlock(vector_upl);
4983		}
4984		return KERN_FAILURE;
4985	}
4986	if (upl->flags & UPL_SET_DIRTY)
4987		flags |= UPL_COMMIT_SET_DIRTY;
4988	if (upl->flags & UPL_CLEAR_DIRTY)
4989	        flags |= UPL_COMMIT_CLEAR_DIRTY;
4990
4991	if (upl->flags & UPL_INTERNAL)
4992		lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
4993					   + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4994	else
4995		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4996
4997	object = upl->map_object;
4998
4999	if (upl->flags & UPL_SHADOWED) {
5000	        vm_object_lock(object);
5001		shadow_object = object->shadow;
5002	} else {
5003		shadow_object = object;
5004	}
5005	entry = offset/PAGE_SIZE;
5006	target_offset = (vm_object_offset_t)offset;
5007
5008	if (upl->flags & UPL_KERNEL_OBJECT)
5009		vm_object_lock_shared(shadow_object);
5010	else
5011		vm_object_lock(shadow_object);
5012
5013	if (upl->flags & UPL_ACCESS_BLOCKED) {
5014		assert(shadow_object->blocked_access);
5015		shadow_object->blocked_access = FALSE;
5016		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
5017	}
5018
5019	if (shadow_object->code_signed) {
5020		/*
5021		 * CODE SIGNING:
5022		 * If the object is code-signed, do not let this UPL tell
5023		 * us if the pages are valid or not.  Let the pages be
5024		 * validated by VM the normal way (when they get mapped or
5025		 * copied).
5026		 */
5027		flags &= ~UPL_COMMIT_CS_VALIDATED;
5028	}
5029	if (! page_list) {
5030		/*
5031		 * No page list to get the code-signing info from !?
5032		 */
5033		flags &= ~UPL_COMMIT_CS_VALIDATED;
5034	}
5035	if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && shadow_object->internal)
5036		should_be_throttled = TRUE;
5037
5038	dwp = &dw_array[0];
5039	dw_count = 0;
5040	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5041
5042	while (xfer_size) {
5043		vm_page_t	t, m;
5044
5045		dwp->dw_mask = 0;
5046		clear_refmod = 0;
5047
5048		m = VM_PAGE_NULL;
5049
5050		if (upl->flags & UPL_LITE) {
5051			unsigned int	pg_num;
5052
5053			pg_num = (unsigned int) (target_offset/PAGE_SIZE);
5054			assert(pg_num == target_offset/PAGE_SIZE);
5055
5056			if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
5057			        lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
5058
5059				if (!(upl->flags & UPL_KERNEL_OBJECT))
5060					m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
5061			}
5062		}
5063		if (upl->flags & UPL_SHADOWED) {
5064			if ((t = vm_page_lookup(object, target_offset))	!= VM_PAGE_NULL) {
5065
5066				t->pageout = FALSE;
5067
5068				VM_PAGE_FREE(t);
5069
5070				if (m == VM_PAGE_NULL)
5071					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
5072			}
5073		}
5074		if ((upl->flags & UPL_KERNEL_OBJECT) || m == VM_PAGE_NULL)
5075			goto commit_next_page;
5076
5077		if (flags & UPL_COMMIT_CS_VALIDATED) {
5078			/*
5079			 * CODE SIGNING:
5080			 * Set the code signing bits according to
5081			 * what the UPL says they should be.
5082			 */
5083			m->cs_validated = page_list[entry].cs_validated;
5084			m->cs_tainted = page_list[entry].cs_tainted;
5085		}
5086		if (upl->flags & UPL_IO_WIRE) {
5087
5088			if (page_list)
5089				page_list[entry].phys_addr = 0;
5090
5091			if (flags & UPL_COMMIT_SET_DIRTY) {
5092				SET_PAGE_DIRTY(m, FALSE);
5093			} else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
5094				m->dirty = FALSE;
5095
5096				if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
5097				    m->cs_validated && !m->cs_tainted) {
5098					/*
5099					 * CODE SIGNING:
5100					 * This page is no longer dirty
5101					 * but could have been modified,
5102					 * so it will need to be
5103					 * re-validated.
5104					 */
5105					m->cs_validated = FALSE;
5106#if DEVELOPMENT || DEBUG
5107					vm_cs_validated_resets++;
5108#endif
5109					pmap_disconnect(m->phys_page);
5110				}
5111				clear_refmod |= VM_MEM_MODIFIED;
5112			}
5113			if (flags & UPL_COMMIT_INACTIVATE) {
5114				dwp->dw_mask |= DW_vm_page_deactivate_internal;
5115				clear_refmod |= VM_MEM_REFERENCED;
5116			}
5117			if (upl->flags & UPL_ACCESS_BLOCKED) {
5118				/*
5119				 * We blocked access to the pages in this UPL.
5120				 * Clear the "busy" bit and wake up any waiter
5121				 * for this page.
5122				 */
5123				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5124			}
5125			if (m->absent) {
5126				if (flags & UPL_COMMIT_FREE_ABSENT)
5127					dwp->dw_mask |= DW_vm_page_free;
5128				else {
5129					m->absent = FALSE;
5130					dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5131
5132					if ( !(dwp->dw_mask & DW_vm_page_deactivate_internal))
5133						dwp->dw_mask |= DW_vm_page_activate;
5134				}
5135			} else
5136				dwp->dw_mask |= DW_vm_page_unwire;
5137
5138			goto commit_next_page;
5139		}
5140		if (page_list)
5141			page_list[entry].phys_addr = 0;
5142
5143		/*
5144		 * make sure to clear the hardware
5145		 * modify or reference bits before
5146		 * releasing the BUSY bit on this page
5147		 * otherwise we risk losing a legitimate
5148		 * change of state
5149		 */
5150		if (flags & UPL_COMMIT_CLEAR_DIRTY) {
5151			m->dirty = FALSE;
5152
5153			clear_refmod |= VM_MEM_MODIFIED;
5154		}
5155		if (m->laundry)
5156			dwp->dw_mask |= DW_vm_pageout_throttle_up;
5157
5158		if (VM_PAGE_WIRED(m))
5159			m->pageout = FALSE;
5160
5161		if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
5162		    m->cs_validated && !m->cs_tainted) {
5163			/*
5164			 * CODE SIGNING:
5165			 * This page is no longer dirty
5166			 * but could have been modified,
5167			 * so it will need to be
5168			 * re-validated.
5169			 */
5170			m->cs_validated = FALSE;
5171#if DEVELOPMENT || DEBUG
5172			vm_cs_validated_resets++;
5173#endif
5174			pmap_disconnect(m->phys_page);
5175		}
5176		if (m->overwriting) {
5177			/*
5178			 * the (COPY_OUT_FROM == FALSE) request_page_list case
5179			 */
5180			if (m->busy) {
5181				m->absent = FALSE;
5182
5183				dwp->dw_mask |= DW_clear_busy;
5184			} else {
5185				/*
5186				 * alternate (COPY_OUT_FROM == FALSE) page_list case
5187				 * Occurs when the original page was wired
5188				 * at the time of the list request
5189				 */
5190				assert(VM_PAGE_WIRED(m));
5191
5192				dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
5193			}
5194			m->overwriting = FALSE;
5195		}
5196		if (m->encrypted_cleaning == TRUE) {
5197			m->encrypted_cleaning = FALSE;
5198
5199			dwp->dw_mask |= DW_clear_busy | DW_PAGE_WAKEUP;
5200		}
5201		m->cleaning = FALSE;
5202
5203		if (m->pageout) {
5204			/*
5205			 * With the clean queue enabled, UPL_PAGEOUT should
5206			 * no longer set the pageout bit. Its pages now go
5207			 * to the clean queue.
5208			 */
5209			assert(!(flags & UPL_PAGEOUT));
5210
5211			m->pageout = FALSE;
5212#if MACH_CLUSTER_STATS
5213			if (m->wanted) vm_pageout_target_collisions++;
5214#endif
5215			if ((flags & UPL_COMMIT_SET_DIRTY) ||
5216			    (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))) {
5217				/*
5218				 * page was re-dirtied after we started
5219				 * the pageout... reactivate it since
5220				 * we don't know whether the on-disk
5221				 * copy matches what is now in memory
5222				 */
5223				SET_PAGE_DIRTY(m, FALSE);
5224
5225				dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
5226
5227				if (upl->flags & UPL_PAGEOUT) {
5228					CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
5229					VM_STAT_INCR(reactivations);
5230					DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
5231				}
5232			} else {
5233				/*
5234				 * page has been successfully cleaned
5235				 * go ahead and free it for other use
5236				 */
5237				if (m->object->internal) {
5238					DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
5239				} else {
5240					DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
5241				}
5242				m->dirty = FALSE;
5243				m->busy = TRUE;
5244
5245				dwp->dw_mask |= DW_vm_page_free;
5246			}
5247			goto commit_next_page;
5248		}
5249#if MACH_CLUSTER_STATS
5250		if (m->wpmapped)
5251			m->dirty = pmap_is_modified(m->phys_page);
5252
5253		if (m->dirty)   vm_pageout_cluster_dirtied++;
5254		else            vm_pageout_cluster_cleaned++;
5255		if (m->wanted)  vm_pageout_cluster_collisions++;
5256#endif
5257		/*
5258		 * It is a part of the semantics of COPYOUT_FROM
5259		 * UPLs that a commit implies cache sync
5260		 * between the vm page and the backing store;
5261		 * this can be used to strip the precious bit
5262		 * as well as clean.
5263		 */
5264		if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
5265			m->precious = FALSE;
5266
5267		if (flags & UPL_COMMIT_SET_DIRTY) {
5268			SET_PAGE_DIRTY(m, FALSE);
5269		} else {
5270			m->dirty = FALSE;
5271		}
5272
5273		/* with the clean queue on, move *all* cleaned pages to the clean queue */
5274		if (hibernate_cleaning_in_progress == FALSE && !m->dirty && (upl->flags & UPL_PAGEOUT)) {
5275			pgpgout_count++;
5276
5277			/* this page used to be dirty; now it's on the clean queue. */
5278			m->was_dirty = TRUE;
5279
5280			dwp->dw_mask |= DW_enqueue_cleaned;
5281			vm_pageout_enqueued_cleaned_from_inactive_dirty++;
5282		} else if (should_be_throttled == TRUE && !m->active && !m->inactive && !m->speculative && !m->throttled) {
5283			/*
5284			 * page coming back in from being 'frozen'...
5285			 * it was dirty before it was frozen, so keep it dirty
5286			 * so that vm_page_activate will notice that it really belongs
5287			 * on the throttle queue and put it there
5288			 */
5289			SET_PAGE_DIRTY(m, FALSE);
5290			dwp->dw_mask |= DW_vm_page_activate;
5291
5292		} else {
5293			if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
5294				dwp->dw_mask |= DW_vm_page_deactivate_internal;
5295				clear_refmod |= VM_MEM_REFERENCED;
5296			} else if (!m->active && !m->inactive && !m->speculative) {
5297
5298				if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
5299					dwp->dw_mask |= DW_vm_page_speculate;
5300				else if (m->reference)
5301					dwp->dw_mask |= DW_vm_page_activate;
5302				else {
5303					dwp->dw_mask |= DW_vm_page_deactivate_internal;
5304					clear_refmod |= VM_MEM_REFERENCED;
5305				}
5306			}
5307		}
5308		if (upl->flags & UPL_ACCESS_BLOCKED) {
5309			/*
5310			 * We blocked access to the pages in this UPL.
5311			 * Clear the "busy" bit on this page before we
5312			 * wake up any waiter.
5313			 */
5314			dwp->dw_mask |= DW_clear_busy;
5315		}
5316		/*
5317		 * Wake up any thread waiting for this page to finish 'cleaning'.
5318		 */
5319		dwp->dw_mask |= DW_PAGE_WAKEUP;
5320
5321commit_next_page:
5322		if (clear_refmod)
5323			pmap_clear_refmod(m->phys_page, clear_refmod);
5324
5325		target_offset += PAGE_SIZE_64;
5326		xfer_size -= PAGE_SIZE;
5327		entry++;
5328
5329		if (dwp->dw_mask) {
5330			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
5331				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
5332
5333				if (dw_count >= dw_limit) {
5334					vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count);
5335
5336					dwp = &dw_array[0];
5337					dw_count = 0;
5338				}
5339			} else {
5340				if (dwp->dw_mask & DW_clear_busy)
5341					m->busy = FALSE;
5342
5343				if (dwp->dw_mask & DW_PAGE_WAKEUP)
5344					PAGE_WAKEUP(m);
5345			}
5346		}
5347	}
5348	if (dw_count)
5349		vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count);
5350
5351	occupied = 1;
5352
5353	if (upl->flags & UPL_DEVICE_MEMORY)  {
5354		occupied = 0;
5355	} else if (upl->flags & UPL_LITE) {
5356		int	pg_num;
5357		int	i;
5358
5359		pg_num = upl->size/PAGE_SIZE;
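		/* lite_list has one bit per page; round up to the number of 32-bit words to scan */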
5360		pg_num = (pg_num + 31) >> 5;
5361		occupied = 0;
5362
5363		for (i = 0; i < pg_num; i++) {
5364			if (lite_list[i] != 0) {
5365				occupied = 1;
5366				break;
5367			}
5368		}
5369	} else {
5370		if (queue_empty(&upl->map_object->memq))
5371			occupied = 0;
5372	}
5373	if (occupied == 0) {
5374		/*
5375		 * If this UPL element belongs to a Vector UPL and is
5376		 * empty, then this is the right function to deallocate
5377		 * it. So go ahead and set the *empty variable. The flag
5378		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view,
5379		 * should be considered relevant for the Vector UPL and not
5380		 * the internal UPLs.
5381		 */
5382		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
5383			*empty = TRUE;
5384
5385		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
5386		        /*
5387			 * this is not a paging object
5388			 * so we need to drop the paging reference
5389			 * that was taken when we created the UPL
5390			 * against this object
5391			 */
5392			vm_object_activity_end(shadow_object);
5393			vm_object_collapse(shadow_object, 0, TRUE);
5394		} else {
5395		         /*
5396			  * we donated the paging reference to
5397			  * the map object... vm_pageout_object_terminate
5398			  * will drop this reference
5399			  */
5400		}
5401	}
5402	vm_object_unlock(shadow_object);
5403	if (object != shadow_object)
5404	        vm_object_unlock(object);
5405
5406	if(!isVectorUPL)
5407		upl_unlock(upl);
5408	else {
5409		/*
5410		 * If we completed our operations on an UPL that is
5411		 * part of a Vectored UPL and if empty is TRUE, then
5412		 * we should go ahead and deallocate this UPL element.
5413		 * Then we check if this was the last of the UPL elements
5414		 * within that Vectored UPL. If so, set empty to TRUE
5415		 * so that in ubc_upl_commit_range or ubc_upl_commit, we
5416		 * can go ahead and deallocate the Vector UPL too.
5417		 */
5418		if(*empty==TRUE) {
5419			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
5420			upl_deallocate(upl);
5421		}
5422		goto process_upl_to_commit;
5423	}
5424
5425	if (pgpgout_count) {
5426		DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
5427	}
5428
5429	return KERN_SUCCESS;
5430}
5431
5432kern_return_t
5433upl_abort_range(
5434	upl_t			upl,
5435	upl_offset_t		offset,
5436	upl_size_t		size,
5437	int			error,
5438	boolean_t		*empty)
5439{
5440	upl_page_info_t		*user_page_list = NULL;
5441	upl_size_t		xfer_size, subupl_size = size;
5442	vm_object_t		shadow_object;
5443	vm_object_t		object;
5444	vm_object_offset_t	target_offset;
5445	upl_offset_t		subupl_offset = offset;
5446	int			entry;
5447	wpl_array_t 	 	lite_list;
5448	int			occupied;
5449	struct	vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
5450	struct	vm_page_delayed_work	*dwp;
5451	int			dw_count;
5452	int			dw_limit;
5453	int			isVectorUPL = 0;
5454	upl_t			vector_upl = NULL;
5455
5456	*empty = FALSE;
5457
5458	if (upl == UPL_NULL)
5459		return KERN_INVALID_ARGUMENT;
5460
5461	if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
5462		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
5463
5464	if((isVectorUPL = vector_upl_is_valid(upl))) {
5465		vector_upl = upl;
5466		upl_lock(vector_upl);
5467	}
5468	else
5469		upl_lock(upl);
5470
5471process_upl_to_abort:
5472	if(isVectorUPL) {
5473		size = subupl_size;
5474		offset = subupl_offset;
5475		if(size == 0) {
5476			upl_unlock(vector_upl);
5477			return KERN_SUCCESS;
5478		}
5479		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
5480		if(upl == NULL) {
5481			upl_unlock(vector_upl);
5482			return KERN_FAILURE;
5483		}
5484		subupl_size -= size;
5485		subupl_offset += size;
5486	}
5487
5488	*empty = FALSE;
5489
5490#if UPL_DEBUG
5491	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
5492		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5493
5494		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
5495		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
5496		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
5497
5498		upl->upl_commit_index++;
5499	}
5500#endif
5501	if (upl->flags & UPL_DEVICE_MEMORY)
5502		xfer_size = 0;
5503	else if ((offset + size) <= upl->size)
5504	        xfer_size = size;
5505	else {
5506		if(!isVectorUPL)
5507			upl_unlock(upl);
5508		else {
5509			upl_unlock(vector_upl);
5510		}
5511
5512		return KERN_FAILURE;
5513	}
5514	if (upl->flags & UPL_INTERNAL) {
5515		lite_list = (wpl_array_t)
5516			((((uintptr_t)upl) + sizeof(struct upl))
5517			+ ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5518
5519		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5520	} else {
5521		lite_list = (wpl_array_t)
5522			(((uintptr_t)upl) + sizeof(struct upl));
5523	}
5524	object = upl->map_object;
5525
5526	if (upl->flags & UPL_SHADOWED) {
5527	        vm_object_lock(object);
5528		shadow_object = object->shadow;
5529	} else
5530		shadow_object = object;
5531
5532	entry = offset/PAGE_SIZE;
5533	target_offset = (vm_object_offset_t)offset;
5534
5535	if (upl->flags & UPL_KERNEL_OBJECT)
5536		vm_object_lock_shared(shadow_object);
5537	else
5538		vm_object_lock(shadow_object);
5539
5540	if (upl->flags & UPL_ACCESS_BLOCKED) {
5541		assert(shadow_object->blocked_access);
5542		shadow_object->blocked_access = FALSE;
5543		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
5544	}
5545
5546	dwp = &dw_array[0];
5547	dw_count = 0;
5548	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5549
5550	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
5551		panic("upl_abort_range: kernel_object being DUMPED");
5552
5553	while (xfer_size) {
5554		vm_page_t	t, m;
5555		unsigned int	pg_num;
5556		boolean_t	needed;
5557
5558		pg_num = (unsigned int) (target_offset/PAGE_SIZE);
5559		assert(pg_num == target_offset/PAGE_SIZE);
5560
5561		needed = FALSE;
5562
5563		if (user_page_list)
5564			needed = user_page_list[pg_num].needed;
5565
5566		dwp->dw_mask = 0;
5567		m = VM_PAGE_NULL;
5568
5569		if (upl->flags & UPL_LITE) {
5570
5571			if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
5572				lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
5573
5574				if ( !(upl->flags & UPL_KERNEL_OBJECT))
5575					m = vm_page_lookup(shadow_object, target_offset +
5576							   (upl->offset - shadow_object->paging_offset));
5577			}
5578		}
5579		if (upl->flags & UPL_SHADOWED) {
5580		        if ((t = vm_page_lookup(object, target_offset))	!= VM_PAGE_NULL) {
5581			        t->pageout = FALSE;
5582
5583				VM_PAGE_FREE(t);
5584
5585				if (m == VM_PAGE_NULL)
5586					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
5587			}
5588		}
5589		if ((upl->flags & UPL_KERNEL_OBJECT))
5590			goto abort_next_page;
5591
5592		if (m != VM_PAGE_NULL) {
5593
5594			if (m->absent) {
5595			        boolean_t must_free = TRUE;
5596
5597				/*
5598				 * COPYOUT = FALSE case
5599				 * check for error conditions which must
5600				 * be passed back to the page's customer
5601				 */
5602				if (error & UPL_ABORT_RESTART) {
5603					m->restart = TRUE;
5604					m->absent = FALSE;
5605					m->unusual = TRUE;
5606					must_free = FALSE;
5607				} else if (error & UPL_ABORT_UNAVAILABLE) {
5608					m->restart = FALSE;
5609					m->unusual = TRUE;
5610					must_free = FALSE;
5611				} else if (error & UPL_ABORT_ERROR) {
5612					m->restart = FALSE;
5613					m->absent = FALSE;
5614					m->error = TRUE;
5615					m->unusual = TRUE;
5616					must_free = FALSE;
5617				}
5618				if (m->clustered && needed == FALSE) {
5619					/*
5620					 * This page was a part of a speculative
5621					 * read-ahead initiated by the kernel
5622					 * itself.  No one is expecting this
5623					 * page and no one will clean up its
5624					 * error state if it ever becomes valid
5625					 * in the future.
5626					 * We have to free it here.
5627					 */
5628					must_free = TRUE;
5629				}
5630
5631				/*
5632				 * ENCRYPTED SWAP:
5633				 * If the page was already encrypted,
5634				 * we don't really need to decrypt it
5635				 * now.  It will get decrypted later,
5636				 * on demand, as soon as someone needs
5637				 * to access its contents.
5638				 */
5639
5640				m->cleaning = FALSE;
5641				m->encrypted_cleaning = FALSE;
5642
5643				if (m->overwriting && !m->busy) {
5644					/*
5645					 * this shouldn't happen since
5646					 * this is an 'absent' page, but
5647					 * it doesn't hurt to check for
5648					 * the 'alternate' method of
5649					 * stabilizing the page...
5650					 * we will mark 'busy' to be cleared
5651					 * in the following code which will
5652					 * take care of the primary stabilization
5653					 * method (i.e. setting 'busy' to TRUE)
5654					 */
5655					dwp->dw_mask |= DW_vm_page_unwire;
5656				}
5657				m->overwriting = FALSE;
5658
5659				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5660
5661				if (must_free == TRUE)
5662					dwp->dw_mask |= DW_vm_page_free;
5663				else
5664					dwp->dw_mask |= DW_vm_page_activate;
5665			} else {
5666			        /*
5667				 * Handle the trusted pager throttle.
5668				 */
5669			        if (m->laundry)
5670					dwp->dw_mask |= DW_vm_pageout_throttle_up;
5671
5672				if (upl->flags & UPL_ACCESS_BLOCKED) {
5673					/*
5674					 * We blocked access to the pages in this UPL.
5675					 * Clear the "busy" bit and wake up any waiter
5676					 * for this page.
5677					 */
5678					dwp->dw_mask |= DW_clear_busy;
5679				}
5680				if (m->overwriting) {
5681					if (m->busy)
5682						dwp->dw_mask |= DW_clear_busy;
5683					else {
5684						/*
5685						 * deal with the 'alternate' method
5686						 * of stabilizing the page...
5687						 * we will either free the page
5688						 * or mark 'busy' to be cleared
5689						 * in the following code which will
5690						 * take care of the primary stabilization
5691						 * method (i.e. setting 'busy' to TRUE)
5692						 */
5693						dwp->dw_mask |= DW_vm_page_unwire;
5694					}
5695					m->overwriting = FALSE;
5696				}
5697				if (m->encrypted_cleaning == TRUE) {
5698					m->encrypted_cleaning = FALSE;
5699
5700					dwp->dw_mask |= DW_clear_busy;
5701				}
5702				m->pageout = FALSE;
5703				m->cleaning = FALSE;
5704#if	MACH_PAGEMAP
5705				vm_external_state_clr(m->object->existence_map, m->offset);
5706#endif	/* MACH_PAGEMAP */
5707				if (error & UPL_ABORT_DUMP_PAGES) {
5708					pmap_disconnect(m->phys_page);
5709
5710					dwp->dw_mask |= DW_vm_page_free;
5711				} else {
5712					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
5713						if (error & UPL_ABORT_REFERENCE) {
5714							/*
5715							 * we've been told to explicitly
5716							 * reference this page... for
5717							 * file I/O, this is done by
5718							 * implementing an LRU on the inactive q
5719							 */
5720							dwp->dw_mask |= DW_vm_page_lru;
5721
5722						} else if (!m->active && !m->inactive && !m->speculative)
5723							dwp->dw_mask |= DW_vm_page_deactivate_internal;
5724					}
5725					dwp->dw_mask |= DW_PAGE_WAKEUP;
5726				}
5727			}
5728		}
5729abort_next_page:
5730		target_offset += PAGE_SIZE_64;
5731		xfer_size -= PAGE_SIZE;
5732		entry++;
5733
5734		if (dwp->dw_mask) {
5735			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
5736				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
5737
5738				if (dw_count >= dw_limit) {
5739					vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count);
5740
5741					dwp = &dw_array[0];
5742					dw_count = 0;
5743				}
5744			} else {
5745				if (dwp->dw_mask & DW_clear_busy)
5746					m->busy = FALSE;
5747
5748				if (dwp->dw_mask & DW_PAGE_WAKEUP)
5749					PAGE_WAKEUP(m);
5750			}
5751		}
5752	}
5753	if (dw_count)
5754		vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count);
5755
5756	occupied = 1;
5757
5758	if (upl->flags & UPL_DEVICE_MEMORY)  {
5759		occupied = 0;
5760	} else if (upl->flags & UPL_LITE) {
5761		int	pg_num;
5762		int	i;
5763
5764		pg_num = upl->size/PAGE_SIZE;
5765		pg_num = (pg_num + 31) >> 5;
5766		occupied = 0;
5767
5768		for (i = 0; i < pg_num; i++) {
5769			if (lite_list[i] != 0) {
5770				occupied = 1;
5771				break;
5772			}
5773		}
5774	} else {
5775		if (queue_empty(&upl->map_object->memq))
5776			occupied = 0;
5777	}
5778	if (occupied == 0) {
5779		/*
5780		 * If this UPL element belongs to a Vector UPL and is
5781		 * empty, then this is the right function to deallocate
5782		 * it. So go ahead and set the *empty variable. The flag
5783		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view,
5784		 * should be considered relevant for the Vector UPL and
5785		 * not the internal UPLs.
5786		 */
5787		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
5788			*empty = TRUE;
5789
5790		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
5791		        /*
5792			 * this is not a paging object
5793			 * so we need to drop the paging reference
5794			 * that was taken when we created the UPL
5795			 * against this object
5796			 */
5797			vm_object_activity_end(shadow_object);
5798			vm_object_collapse(shadow_object, 0, TRUE);
5799		} else {
5800		         /*
5801			  * we donated the paging reference to
5802			  * the map object... vm_pageout_object_terminate
5803			  * will drop this reference
5804			  */
5805		}
5806	}
5807	vm_object_unlock(shadow_object);
5808	if (object != shadow_object)
5809	        vm_object_unlock(object);
5810
5811	if(!isVectorUPL)
5812		upl_unlock(upl);
5813	else {
5814		/*
5815		 * If we completed our operations on an UPL that is
5816		 * part of a Vectored UPL and if empty is TRUE, then
5817		 * we should go ahead and deallocate this UPL element.
5818		 * Then we check if this was the last of the UPL elements
5819		 * within that Vectored UPL. If so, set empty to TRUE
5820		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
5821		 * can go ahead and deallocate the Vector UPL too.
5822		 */
5823		if(*empty == TRUE) {
5824			*empty = vector_upl_set_subupl(vector_upl, upl,0);
5825			upl_deallocate(upl);
5826		}
5827		goto process_upl_to_abort;
5828	}
5829
5830	return KERN_SUCCESS;
5831}
5832
5833
5834kern_return_t
5835upl_abort(
5836	upl_t	upl,
5837	int	error)
5838{
5839	boolean_t	empty;
5840
5841	return upl_abort_range(upl, 0, upl->size, error, &empty);
5842}
5843
5844
5845/* an option on commit should be wire */
5846kern_return_t
5847upl_commit(
5848	upl_t			upl,
5849	upl_page_info_t		*page_list,
5850	mach_msg_type_number_t	count)
5851{
5852	boolean_t	empty;
5853
5854	return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
5855}
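
/*
 * Illustrative sketch (not part of the original source, not compiled):
 * how an I/O-completion path might drive the commit/abort entry points
 * above.  The helper name and error convention are hypothetical; real
 * callers typically go through the ubc_upl_* wrappers.  On success the
 * pages are committed (unwired and cleaned up); on failure they are
 * aborted and dumped.  In both cases the UPL itself must still be
 * deallocated by the caller.
 */
#if 0
static void
upl_io_done_example(upl_t upl, int io_error)
{
	if (io_error == 0)
		(void) upl_commit(upl, NULL, 0);
	else
		(void) upl_abort(upl, UPL_ABORT_ERROR | UPL_ABORT_DUMP_PAGES);

	upl_deallocate(upl);
}
#endif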
5856
5857void
5858vm_object_set_pmap_cache_attr(
5859		vm_object_t		object,
5860		upl_page_info_array_t	user_page_list,
5861		unsigned int		num_pages,
5862		boolean_t		batch_pmap_op)
5863{
5864	unsigned int    cache_attr = 0;
5865
5866	cache_attr = object->wimg_bits & VM_WIMG_MASK;
5867	assert(user_page_list);
5868	if (cache_attr != VM_WIMG_USE_DEFAULT) {
5869		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
5870	}
5871}
5872
5873unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
5874
5875kern_return_t
5876vm_object_iopl_request(
5877	vm_object_t		object,
5878	vm_object_offset_t	offset,
5879	upl_size_t		size,
5880	upl_t			*upl_ptr,
5881	upl_page_info_array_t	user_page_list,
5882	unsigned int		*page_list_count,
5883	int			cntrl_flags)
5884{
5885	vm_page_t		dst_page;
5886	vm_object_offset_t	dst_offset;
5887	upl_size_t		xfer_size;
5888	upl_t			upl = NULL;
5889	unsigned int		entry;
5890	wpl_array_t 		lite_list = NULL;
5891	int			no_zero_fill = FALSE;
5892	unsigned int		size_in_pages;
5893	u_int32_t		psize;
5894	kern_return_t		ret;
5895	vm_prot_t		prot;
5896	struct vm_object_fault_info fault_info;
5897	struct	vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
5898	struct	vm_page_delayed_work	*dwp;
5899	int			dw_count;
5900	int			dw_limit;
5901	int			dw_index;
5902
5903	if (cntrl_flags & ~UPL_VALID_FLAGS) {
5904		/*
5905		 * For forward compatibility's sake,
5906		 * reject any unknown flag.
5907		 */
5908		return KERN_INVALID_VALUE;
5909	}
5910	if (vm_lopage_needed == FALSE)
5911	        cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
5912
5913	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
5914	        if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
5915		        return KERN_INVALID_VALUE;
5916
5917		if (object->phys_contiguous) {
5918		        if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
5919			        return KERN_INVALID_ADDRESS;
5920
5921			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
5922			        return KERN_INVALID_ADDRESS;
5923		}
5924	}
5925
5926	if (cntrl_flags & UPL_ENCRYPT) {
5927		/*
5928		 * ENCRYPTED SWAP:
5929		 * The paging path doesn't use this interface,
5930		 * so we don't support the UPL_ENCRYPT flag
5931		 * here.  We won't encrypt the pages.
5932		 */
5933		assert(! (cntrl_flags & UPL_ENCRYPT));
5934	}
5935	if (cntrl_flags & UPL_NOZEROFILL)
5936	        no_zero_fill = TRUE;
5937
5938	if (cntrl_flags & UPL_COPYOUT_FROM)
5939		prot = VM_PROT_READ;
5940	else
5941		prot = VM_PROT_READ | VM_PROT_WRITE;
5942
5943	if (((size/PAGE_SIZE) > MAX_UPL_SIZE) && !object->phys_contiguous)
5944		size = MAX_UPL_SIZE * PAGE_SIZE;
5945
5946	if (cntrl_flags & UPL_SET_INTERNAL) {
5947		if (page_list_count != NULL)
5948			*page_list_count = MAX_UPL_SIZE;
5949	}
5950	if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
5951	    ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
5952	        return KERN_INVALID_ARGUMENT;
5953
5954	if ((!object->internal) && (object->paging_offset != 0))
5955		panic("vm_object_iopl_request: external object with non-zero paging offset\n");
5956
5957
5958	if (object->phys_contiguous)
5959	        psize = PAGE_SIZE;
5960	else
5961	        psize = size;
5962
5963	if (cntrl_flags & UPL_SET_INTERNAL) {
5964	        upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
5965
5966		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5967		lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
5968					   ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
5969		if (size == 0) {
5970			user_page_list = NULL;
5971			lite_list = NULL;
5972		}
5973	} else {
5974	        upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
5975
5976		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5977		if (size == 0) {
5978			lite_list = NULL;
5979		}
5980	}
5981	if (user_page_list)
5982	        user_page_list[0].device = FALSE;
5983	*upl_ptr = upl;
5984
5985	upl->map_object = object;
5986	upl->size = size;
5987
5988	size_in_pages = size / PAGE_SIZE;
5989
5990	if (object == kernel_object &&
5991	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
5992		upl->flags |= UPL_KERNEL_OBJECT;
5993#if UPL_DEBUG
5994		vm_object_lock(object);
5995#else
5996		vm_object_lock_shared(object);
5997#endif
5998	} else {
5999		vm_object_lock(object);
6000		vm_object_activity_begin(object);
6001	}
6002	/*
6003	 * paging in progress also protects the paging_offset
6004	 */
6005	upl->offset = offset + object->paging_offset;
6006
6007	if (cntrl_flags & UPL_BLOCK_ACCESS) {
6008		/*
6009		 * The user requested that access to the pages in this UPL
6010		 * be blocked until the UPL is committed or aborted.
6011		 */
6012		upl->flags |= UPL_ACCESS_BLOCKED;
6013	}
6014
6015	if (object->phys_contiguous) {
6016#if UPL_DEBUG
6017		vm_object_activity_begin(object);
6018		queue_enter(&object->uplq, upl, upl_t, uplq);
6019#endif /* UPL_DEBUG */
6020
6021		if (upl->flags & UPL_ACCESS_BLOCKED) {
6022			assert(!object->blocked_access);
6023			object->blocked_access = TRUE;
6024		}
6025
6026		vm_object_unlock(object);
6027
6028		/*
6029		 * don't need any shadow mappings for this one
6030		 * since it is already I/O memory
6031		 */
6032		upl->flags |= UPL_DEVICE_MEMORY;
6033
6034		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1)>>PAGE_SHIFT);
6035
6036		if (user_page_list) {
6037		        user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset)>>PAGE_SHIFT);
6038			user_page_list[0].device = TRUE;
6039		}
6040		if (page_list_count != NULL) {
6041		        if (upl->flags & UPL_INTERNAL)
6042			        *page_list_count = 0;
6043			else
6044			        *page_list_count = 1;
6045		}
6046		return KERN_SUCCESS;
6047	}
6048	if (object != kernel_object) {
6049		/*
6050		 * Protect user space from future COW operations
6051		 */
6052		object->true_share = TRUE;
6053
6054		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
6055			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6056	}
6057
6058#if UPL_DEBUG
6059	vm_object_activity_begin(object);
6060	queue_enter(&object->uplq, upl, upl_t, uplq);
6061#endif /* UPL_DEBUG */
6062
6063	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
6064	    object->copy != VM_OBJECT_NULL) {
6065		/*
6066		 * Honor copy-on-write obligations
6067		 *
6068		 * The caller is gathering these pages and
6069		 * might modify their contents.  We need to
6070		 * make sure that the copy object has its own
6071		 * private copies of these pages before we let
6072		 * the caller modify them.
6073		 *
6074		 * NOTE: someone else could map the original object
6075		 * after we've done this copy-on-write here, and they
6076		 * could then see an inconsistent picture of the memory
6077		 * while it's being modified via the UPL.  To prevent this,
6078		 * we would have to block access to these pages until the
6079		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
6080		 * code path for that...
6081		 */
6082		vm_object_update(object,
6083				 offset,
6084				 size,
6085				 NULL,
6086				 NULL,
6087				 FALSE,	/* should_return */
6088				 MEMORY_OBJECT_COPY_SYNC,
6089				 VM_PROT_NO_CHANGE);
6090#if DEVELOPMENT || DEBUG
6091		iopl_cow++;
6092		iopl_cow_pages += size >> PAGE_SHIFT;
6093#endif
6094	}
6095
6096
6097	entry = 0;
6098
6099	xfer_size = size;
6100	dst_offset = offset;
6101
6102	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
6103	fault_info.user_tag  = 0;
6104	fault_info.lo_offset = offset;
6105	fault_info.hi_offset = offset + xfer_size;
6106	fault_info.no_cache  = FALSE;
6107	fault_info.stealth = FALSE;
6108	fault_info.io_sync = FALSE;
6109	fault_info.cs_bypass = FALSE;
6110	fault_info.mark_zf_absent = TRUE;
6111
6112	dwp = &dw_array[0];
6113	dw_count = 0;
6114	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
6115
6116	while (xfer_size) {
6117	        vm_fault_return_t	result;
6118		unsigned int		pg_num;
6119
6120		dwp->dw_mask = 0;
6121
6122		dst_page = vm_page_lookup(object, dst_offset);
6123
6124		/*
6125		 * ENCRYPTED SWAP:
6126		 * If the page is encrypted, we need to decrypt it,
6127		 * so force a soft page fault.
6128		 */
6129		if (dst_page == VM_PAGE_NULL ||
6130		    dst_page->busy ||
6131		    dst_page->encrypted ||
6132		    dst_page->error ||
6133		    dst_page->restart ||
6134		    dst_page->absent ||
6135		    dst_page->fictitious) {
6136
6137		   if (object == kernel_object)
6138			   panic("vm_object_iopl_request: missing/bad page in kernel object\n");
6139
6140		   do {
6141			vm_page_t	top_page;
6142			kern_return_t	error_code;
6143			int		interruptible;
6144
6145			if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
6146				interruptible = THREAD_ABORTSAFE;
6147			else
6148				interruptible = THREAD_UNINT;
6149
6150			fault_info.interruptible = interruptible;
6151			fault_info.cluster_size = xfer_size;
6152			fault_info.batch_pmap_op = TRUE;
6153
6154			vm_object_paging_begin(object);
6155
6156			result = vm_fault_page(object, dst_offset,
6157					       prot | VM_PROT_WRITE, FALSE,
6158					       &prot, &dst_page, &top_page,
6159					       (int *)0,
6160					       &error_code, no_zero_fill,
6161					       FALSE, &fault_info);
6162
6163			switch (result) {
6164
6165			case VM_FAULT_SUCCESS:
6166
6167				if ( !dst_page->absent) {
6168					PAGE_WAKEUP_DONE(dst_page);
6169				} else {
6170					/*
6171					 * we only get back an absent page if we
6172					 * requested that it not be zero-filled
6173					 * because we are about to fill it via I/O
6174					 *
6175					 * absent pages should be left BUSY
6176					 * to prevent them from being faulted
6177					 * into an address space before we've
6178					 * had a chance to complete the I/O on
6179					 * them since they may contain info that
6180					 * shouldn't be seen by the faulting task
6181					 */
6182				}
6183				/*
6184				 *	Release paging references and
6185				 *	top-level placeholder page, if any.
6186				 */
6187				if (top_page != VM_PAGE_NULL) {
6188					vm_object_t local_object;
6189
6190					local_object = top_page->object;
6191
6192					if (top_page->object != dst_page->object) {
6193						vm_object_lock(local_object);
6194						VM_PAGE_FREE(top_page);
6195						vm_object_paging_end(local_object);
6196						vm_object_unlock(local_object);
6197					} else {
6198						VM_PAGE_FREE(top_page);
6199						vm_object_paging_end(local_object);
6200					}
6201				}
6202				vm_object_paging_end(object);
6203				break;
6204
6205			case VM_FAULT_RETRY:
6206				vm_object_lock(object);
6207				break;
6208
6209			case VM_FAULT_MEMORY_SHORTAGE:
6210				OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6211
6212				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6213
6214				if (vm_page_wait(interruptible)) {
6215					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6216
6217					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6218					vm_object_lock(object);
6219
6220					break;
6221				}
6222				OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6223
6224				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
6225
6226				/* fall thru */
6227
6228			case VM_FAULT_INTERRUPTED:
6229				error_code = MACH_SEND_INTERRUPTED;
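				/* fall through to the memory-error handling below */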
6230			case VM_FAULT_MEMORY_ERROR:
6231			memory_error:
6232				ret = (error_code ? error_code:	KERN_MEMORY_ERROR);
6233
6234				vm_object_lock(object);
6235				goto return_err;
6236
6237			case VM_FAULT_SUCCESS_NO_VM_PAGE:
6238				/* success but no page: fail */
6239				vm_object_paging_end(object);
6240				vm_object_unlock(object);
6241				goto memory_error;
6242
6243			default:
6244				panic("vm_object_iopl_request: unexpected error"
6245				      " 0x%x from vm_fault_page()\n", result);
6246			}
6247		   } while (result != VM_FAULT_SUCCESS);
6248
6249		}
6250		if (upl->flags & UPL_KERNEL_OBJECT)
6251			goto record_phys_addr;
6252
6253		if (dst_page->cleaning) {
6254			/*
6255			 * Someone else is cleaning this page in place.
6256			 * In theory, we should be able to proceed and use this
6257			 * page, but the cleaner will probably clear the "busy"
6258			 * bit in upl_commit_range() even though it didn't set it,
6259			 * which would clear our "busy" bit and open us up to
6260			 * race conditions.
6261			 * We'd better wait for the cleaning to complete and
6262			 * then try again.
6263			 */
6264			vm_object_iopl_request_sleep_for_cleaning++;
6265			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6266			continue;
6267		}
6268		if (dst_page->laundry) {
6269			dst_page->pageout = FALSE;
6270
6271			vm_pageout_steal_laundry(dst_page, FALSE);
6272		}
6273		if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
6274		     dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
6275		        vm_page_t	low_page;
6276			int 		refmod;
6277
6278			/*
6279			 * support devices that can't DMA above 32 bits
6280			 * by substituting pages from a pool of low address
6281			 * memory for any pages we find above the 4G mark...
6282			 * we can't substitute if the page is already wired because
6283			 * we don't know whether that physical address has been
6284			 * handed out to some other 64 bit capable DMA device to use
6285			 */
6286			if (VM_PAGE_WIRED(dst_page)) {
6287			        ret = KERN_PROTECTION_FAILURE;
6288				goto return_err;
6289			}
6290			low_page = vm_page_grablo();
6291
6292			if (low_page == VM_PAGE_NULL) {
6293			        ret = KERN_RESOURCE_SHORTAGE;
6294				goto return_err;
6295			}
6296			/*
6297			 * from here until the vm_page_replace completes
6298			 * we mustn't drop the object lock... we don't
6299			 * want anyone refaulting this page in and using
6300			 * it after we disconnect it... we want the fault
6301			 * to find the new page being substituted.
6302			 */
6303			if (dst_page->pmapped)
6304			        refmod = pmap_disconnect(dst_page->phys_page);
6305			else
6306			        refmod = 0;
6307
6308			if (!dst_page->absent)
6309				vm_page_copy(dst_page, low_page);
6310
6311			low_page->reference = dst_page->reference;
6312			low_page->dirty     = dst_page->dirty;
6313			low_page->absent    = dst_page->absent;
6314
6315			if (refmod & VM_MEM_REFERENCED)
6316			        low_page->reference = TRUE;
6317			if (refmod & VM_MEM_MODIFIED) {
6318			        SET_PAGE_DIRTY(low_page, FALSE);
6319			}
6320
6321			vm_page_replace(low_page, object, dst_offset);
6322
6323			dst_page = low_page;
6324			/*
6325			 * vm_page_grablo returned the page marked
6326			 * BUSY... we don't need a PAGE_WAKEUP_DONE
6327			 * here, because we've never dropped the object lock
6328			 */
6329			if ( !dst_page->absent)
6330				dst_page->busy = FALSE;
6331		}
6332		if ( !dst_page->busy)
6333			dwp->dw_mask |= DW_vm_page_wire;
6334
6335		if (cntrl_flags & UPL_BLOCK_ACCESS) {
6336			/*
6337			 * Mark the page "busy" to block any future page fault
6338			 * on this page in addition to wiring it.
6339			 * We'll also remove the mapping
6340			 * of all these pages before leaving this routine.
6341			 */
6342			assert(!dst_page->fictitious);
6343			dst_page->busy = TRUE;
6344		}
6345		/*
6346		 * expect the page to be used
6347		 * page queues lock must be held to set 'reference'
6348		 */
6349		dwp->dw_mask |= DW_set_reference;
6350
6351   		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
6352			SET_PAGE_DIRTY(dst_page, TRUE);
6353		}
6354record_phys_addr:
6355		if (dst_page->busy)
6356			upl->flags |= UPL_HAS_BUSY;
6357
6358		pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
6359		assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
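		/* mark this page in the UPL's lite bitmap: one bit per page, 32 bits per word */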
6360		lite_list[pg_num>>5] |= 1 << (pg_num & 31);
6361
6362		if (dst_page->phys_page > upl->highest_page)
6363		        upl->highest_page = dst_page->phys_page;
6364
6365		if (user_page_list) {
6366			user_page_list[entry].phys_addr	= dst_page->phys_page;
6367			user_page_list[entry].pageout	= dst_page->pageout;
6368			user_page_list[entry].absent	= dst_page->absent;
6369			user_page_list[entry].dirty 	= dst_page->dirty;
6370			user_page_list[entry].precious	= dst_page->precious;
6371			user_page_list[entry].device 	= FALSE;
6372			user_page_list[entry].needed    = FALSE;
6373			if (dst_page->clustered == TRUE)
6374			        user_page_list[entry].speculative = dst_page->speculative;
6375			else
6376			        user_page_list[entry].speculative = FALSE;
6377			user_page_list[entry].cs_validated = dst_page->cs_validated;
6378			user_page_list[entry].cs_tainted = dst_page->cs_tainted;
6379		}
6380		if (object != kernel_object) {
6381			/*
6382			 * someone is explicitly grabbing this page...
6383			 * update clustered and speculative state
6384			 *
6385			 */
6386			VM_PAGE_CONSUME_CLUSTERED(dst_page);
6387		}
6388		entry++;
6389		dst_offset += PAGE_SIZE_64;
6390		xfer_size -= PAGE_SIZE;
6391
6392		if (dwp->dw_mask) {
6393			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6394
6395			if (dw_count >= dw_limit) {
6396				vm_page_do_delayed_work(object, &dw_array[0], dw_count);
6397
6398				dwp = &dw_array[0];
6399				dw_count = 0;
6400			}
6401		}
6402	}
6403	if (dw_count)
6404		vm_page_do_delayed_work(object, &dw_array[0], dw_count);
6405
6406	vm_object_set_pmap_cache_attr(object, user_page_list, entry, TRUE);
6407
6408	if (page_list_count != NULL) {
6409	        if (upl->flags & UPL_INTERNAL)
6410			*page_list_count = 0;
6411		else if (*page_list_count > entry)
6412			*page_list_count = entry;
6413	}
6414	vm_object_unlock(object);
6415
6416	if (cntrl_flags & UPL_BLOCK_ACCESS) {
6417		/*
6418		 * We've marked all the pages "busy" so that future
6419		 * page faults will block.
6420		 * Now remove the mapping for these pages, so that they
6421		 * can't be accessed without causing a page fault.
6422		 */
6423		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
6424				       PMAP_NULL, 0, VM_PROT_NONE);
6425		assert(!object->blocked_access);
6426		object->blocked_access = TRUE;
6427	}
6428	return KERN_SUCCESS;
6429
6430return_err:
6431	dw_index = 0;
6432
6433	for (; offset < dst_offset; offset += PAGE_SIZE) {
6434		boolean_t need_unwire;
6435
6436	        dst_page = vm_page_lookup(object, offset);
6437
6438		if (dst_page == VM_PAGE_NULL)
6439		        panic("vm_object_iopl_request: Wired page missing. \n");
6440
6441		/*
6442		 * if we've already processed this page in an earlier
6443		 * vm_page_do_delayed_work, we need to undo the wiring... we will
6444		 * leave the dirty and reference bits on if they
6445		 * were set, since we don't have a good way of knowing
6446		 * what the previous state was and we won't get here
6447		 * under any normal circumstances...  we will always
6448		 * clear BUSY and wakeup any waiters via vm_page_free
6449		 * or PAGE_WAKEUP_DONE
6450		 */
6451		need_unwire = TRUE;
6452
6453		if (dw_count) {
6454			if (dw_array[dw_index].dw_m == dst_page) {
6455				/*
6456				 * still in the deferred work list
6457				 * which means we haven't yet called
6458				 * vm_page_wire on this page
6459				 */
6460				need_unwire = FALSE;
6461
6462				dw_index++;
6463				dw_count--;
6464			}
6465		}
6466		vm_page_lock_queues();
6467
6468		if (dst_page->absent) {
6469			vm_page_free(dst_page);
6470
6471			need_unwire = FALSE;
6472		} else {
6473			if (need_unwire == TRUE)
6474				vm_page_unwire(dst_page, TRUE);
6475
6476			PAGE_WAKEUP_DONE(dst_page);
6477		}
6478		vm_page_unlock_queues();
6479
6480		if (need_unwire == TRUE)
6481			VM_STAT_INCR(reactivations);
6482	}
6483#if UPL_DEBUG
6484	upl->upl_state = 2;
6485#endif
6486	if (! (upl->flags & UPL_KERNEL_OBJECT)) {
6487		vm_object_activity_end(object);
6488		vm_object_collapse(object, 0, TRUE);
6489	}
6490	vm_object_unlock(object);
6491	upl_destroy(upl);
6492
6493	return ret;
6494}
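
/*
 * Illustrative sketch (not part of the original source, not compiled):
 * a minimal caller of vm_object_iopl_request() above, wiring a range of
 * an object for device I/O and then releasing it.  The helper name is
 * hypothetical and the caller is assumed to already hold a reference on
 * 'object'; real callers normally reach this code through higher-level
 * wrappers such as memory_object_iopl_request() or vm_map_create_upl()
 * rather than directly.
 */
#if 0
static kern_return_t
iopl_wire_example(vm_object_t object, vm_object_offset_t offset, upl_size_t size)
{
	upl_t		upl = UPL_NULL;
	kern_return_t	kr;
	boolean_t	empty;

	kr = vm_object_iopl_request(object, offset, size,
				    &upl, NULL, NULL,
				    UPL_SET_INTERNAL | UPL_SET_LITE |
				    UPL_SET_IO_WIRE | UPL_COPYOUT_FROM);
	if (kr != KERN_SUCCESS)
		return kr;

	/*
	 * For an internal UPL the page list lives inside the upl
	 * structure; a driver would typically fetch it with
	 * UPL_GET_INTERNAL_PAGE_LIST(upl) and hand the physical
	 * page numbers to its DMA engine here.
	 */

	kr = upl_commit_range(upl, 0, upl->size, 0, NULL, 0, &empty);
	upl_deallocate(upl);

	return kr;
}
#endif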
6495
6496kern_return_t
6497upl_transpose(
6498	upl_t		upl1,
6499	upl_t		upl2)
6500{
6501	kern_return_t		retval;
6502	boolean_t		upls_locked;
6503	vm_object_t		object1, object2;
6504
6505	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2  || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR)  || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
6506		return KERN_INVALID_ARGUMENT;
6507	}
6508
6509	upls_locked = FALSE;
6510
6511	/*
6512	 * Since we need to lock both UPLs at the same time,
6513	 * avoid deadlocks by always taking locks in the same order.
6514	 */
6515	if (upl1 < upl2) {
6516		upl_lock(upl1);
6517		upl_lock(upl2);
6518	} else {
6519		upl_lock(upl2);
6520		upl_lock(upl1);
6521	}
6522	upls_locked = TRUE;	/* the UPLs will need to be unlocked */
6523
6524	object1 = upl1->map_object;
6525	object2 = upl2->map_object;
6526
6527	if (upl1->offset != 0 || upl2->offset != 0 ||
6528	    upl1->size != upl2->size) {
6529		/*
6530		 * We deal only with full objects, not subsets.
6531		 * That's because we exchange the entire backing store info
6532		 * for the objects: pager, resident pages, etc...  We can't do
6533		 * only part of it.
6534		 */
6535		retval = KERN_INVALID_VALUE;
6536		goto done;
6537	}
6538
6539	/*
6540	 * Transpose the VM objects' backing store.
6541	 */
6542	retval = vm_object_transpose(object1, object2,
6543				     (vm_object_size_t) upl1->size);
6544
6545	if (retval == KERN_SUCCESS) {
6546		/*
6547		 * Make each UPL point to the correct VM object, i.e. the
6548		 * object holding the pages that the UPL refers to...
6549		 */
6550#if UPL_DEBUG
6551		queue_remove(&object1->uplq, upl1, upl_t, uplq);
6552		queue_remove(&object2->uplq, upl2, upl_t, uplq);
6553#endif
6554		upl1->map_object = object2;
6555		upl2->map_object = object1;
6556#if UPL_DEBUG
6557		queue_enter(&object1->uplq, upl2, upl_t, uplq);
6558		queue_enter(&object2->uplq, upl1, upl_t, uplq);
6559#endif
6560	}
6561
6562done:
6563	/*
6564	 * Cleanup.
6565	 */
6566	if (upls_locked) {
6567		upl_unlock(upl1);
6568		upl_unlock(upl2);
6569		upls_locked = FALSE;
6570	}
6571
6572	return retval;
6573}
6574
6575void
6576upl_range_needed(
6577	upl_t		upl,
6578	int		index,
6579	int		count)
6580{
6581	upl_page_info_t	*user_page_list;
6582	int		size_in_pages;
6583
6584	if ( !(upl->flags & UPL_INTERNAL) || count <= 0)
6585		return;
6586
6587	size_in_pages = upl->size / PAGE_SIZE;
6588
6589	user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
6590
6591	while (count-- && index < size_in_pages)
6592		user_page_list[index++].needed = TRUE;
6593}
6594
6595
6596/*
6597 * ENCRYPTED SWAP:
6598 *
6599 * Rationale:  the user might have some encrypted data on disk (via
6600 * FileVault or any other mechanism).  That data is then decrypted in
6601 * memory, which is safe as long as the machine is secure.  But that
6602 * decrypted data in memory could be paged out to disk by the default
6603 * pager.  The data would then be stored on disk in clear (not encrypted)
6604 * and it could be accessed by anyone who gets physical access to the
6605 * disk (if the laptop or the disk gets stolen for example).  This weakens
6606 * the security offered by FileVault.
6607 *
6608 * Solution:  the default pager will optionally request that all the
6609 * pages it gathers for pageout be encrypted, via the UPL interfaces,
6610 * before it sends this UPL to disk via the vnode_pageout() path.
6611 *
6612 * Notes:
6613 *
6614 * To avoid disrupting the VM LRU algorithms, we want to keep the
6615 * clean-in-place mechanisms, which allow us to send some extra pages to
6616 * swap (clustering) without actually removing them from the user's
6617 * address space.  We don't want the user to unknowingly access encrypted
6618 * data, so we have to actually remove the encrypted pages from the page
6619 * table.  When the user accesses the data, the hardware will fail to
6620 * locate the virtual page in its page table and will trigger a page
6621 * fault.  We can then decrypt the page and enter it in the page table
6622 * again.  Whenever we allow the user to access the contents of a page,
6623 * we have to make sure it's not encrypted.
6624 *
6625 *
6626 */
6627/*
6628 * ENCRYPTED SWAP:
6629 * Reserve of virtual addresses in the kernel address space.
6630 * We need to map the physical pages in the kernel, so that we
6631 * can call the encryption/decryption routines with a kernel
6632 * virtual address.  We keep this pool of pre-allocated kernel
6633 * virtual addresses so that we don't have to scan the kernel's
6634 * virtual address space each time we need to encrypt or decrypt
6635 * a physical page.
6636 * It would be nice to be able to encrypt and decrypt in physical
6637 * mode but that might not always be more efficient...
6638 */
6639decl_simple_lock_data(,vm_paging_lock)
6640#define VM_PAGING_NUM_PAGES	64
6641vm_map_offset_t vm_paging_base_address = 0;
6642boolean_t	vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
6643int		vm_paging_max_index = 0;
6644int		vm_paging_page_waiter = 0;
6645int		vm_paging_page_waiter_total = 0;
6646unsigned long	vm_paging_no_kernel_page = 0;
6647unsigned long	vm_paging_objects_mapped = 0;
6648unsigned long	vm_paging_pages_mapped = 0;
6649unsigned long	vm_paging_objects_mapped_slow = 0;
6650unsigned long	vm_paging_pages_mapped_slow = 0;
6651
6652void
6653vm_paging_map_init(void)
6654{
6655	kern_return_t	kr;
6656	vm_map_offset_t	page_map_offset;
6657	vm_map_entry_t	map_entry;
6658
6659	assert(vm_paging_base_address == 0);
6660
6661	/*
6662	 * Initialize our pool of pre-allocated kernel
6663	 * virtual addresses.
6664	 */
6665	page_map_offset = 0;
6666	kr = vm_map_find_space(kernel_map,
6667			       &page_map_offset,
6668			       VM_PAGING_NUM_PAGES * PAGE_SIZE,
6669			       0,
6670			       0,
6671			       &map_entry);
6672	if (kr != KERN_SUCCESS) {
6673		panic("vm_paging_map_init: kernel_map full\n");
6674	}
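	/*
	 * The entry only reserves the virtual range: it is permanent and
	 * grants no access, so nothing is ever faulted in through it and
	 * pages are instead entered explicitly with PMAP_ENTER() as needed.
	 */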
6675	map_entry->object.vm_object = kernel_object;
6676	map_entry->offset = page_map_offset;
6677	map_entry->protection = VM_PROT_NONE;
6678	map_entry->max_protection = VM_PROT_NONE;
6679	map_entry->permanent = TRUE;
6680	vm_object_reference(kernel_object);
6681	vm_map_unlock(kernel_map);
6682
6683	assert(vm_paging_base_address == 0);
6684	vm_paging_base_address = page_map_offset;
6685}
6686
6687/*
6688 * ENCRYPTED SWAP:
6689 * vm_paging_map_object:
6690 *	Maps part of a VM object's pages in the kernel
6691 * 	virtual address space, using the pre-allocated
6692 *	kernel virtual addresses, if possible.
6693 * Context:
6694 * 	The VM object is locked.  This lock will get
6695 * 	dropped and re-acquired though, so the caller
6696 * 	must make sure the VM object is kept alive
6697 *	(by holding a VM map that has a reference
6698 * 	on it, for example, or taking an extra reference).
6699 * 	The page should also be kept busy to prevent
6700 *	it from being reclaimed.
6701 */
6702kern_return_t
6703vm_paging_map_object(
6704	vm_map_offset_t		*address,
6705	vm_page_t		page,
6706	vm_object_t		object,
6707	vm_object_offset_t	offset,
6708	vm_map_size_t		*size,
6709	vm_prot_t		protection,
6710	boolean_t		can_unlock_object)
6711{
6712	kern_return_t		kr;
6713	vm_map_offset_t		page_map_offset;
6714	vm_map_size_t		map_size;
6715	vm_object_offset_t	object_offset;
6716	int			i;
6717
6718
6719	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
6720		assert(page->busy);
6721		/*
6722		 * Use one of the pre-allocated kernel virtual addresses
6723		 * and just enter the VM page in the kernel address space
6724		 * at that virtual address.
6725		 */
6726		simple_lock(&vm_paging_lock);
6727
6728		/*
6729		 * Try and find an available kernel virtual address
6730		 * from our pre-allocated pool.
6731		 */
6732		page_map_offset = 0;
6733		for (;;) {
6734			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
6735				if (vm_paging_page_inuse[i] == FALSE) {
6736					page_map_offset =
6737						vm_paging_base_address +
6738						(i * PAGE_SIZE);
6739					break;
6740				}
6741			}
6742			if (page_map_offset != 0) {
6743				/* found a space to map our page ! */
6744				break;
6745			}
6746
6747			if (can_unlock_object) {
6748				/*
6749				 * If we can afford to unlock the VM object,
6750				 * let's take the slow path now...
6751				 */
6752				break;
6753			}
6754			/*
6755			 * We can't afford to unlock the VM object, so
6756			 * let's wait for a space to become available...
6757			 */
6758			vm_paging_page_waiter_total++;
6759			vm_paging_page_waiter++;
6760			thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
6761						       &vm_paging_lock,
6762						       THREAD_UNINT);
6763			vm_paging_page_waiter--;
6764			/* ... and try again */
6765		}
6766
6767		if (page_map_offset != 0) {
6768			/*
6769			 * We found a kernel virtual address;
6770			 * map the physical page to that virtual address.
6771			 */
6772			if (i > vm_paging_max_index) {
6773				vm_paging_max_index = i;
6774			}
6775			vm_paging_page_inuse[i] = TRUE;
6776			simple_unlock(&vm_paging_lock);
6777
6778			page->pmapped = TRUE;
6779
6780			/*
6781			 * Keep the VM object locked over the PMAP_ENTER
6782			 * and the actual use of the page by the kernel,
6783			 * or this pmap mapping might get undone by a
6784			 * vm_object_pmap_protect() call...
6785			 */
6786			PMAP_ENTER(kernel_pmap,
6787				   page_map_offset,
6788				   page,
6789				   protection,
6790				   VM_PROT_NONE,
6791				   0,
6792				   TRUE);
6793			vm_paging_objects_mapped++;
6794			vm_paging_pages_mapped++;
6795			*address = page_map_offset;
6796
6797			/* all done and mapped, ready to use ! */
6798			return KERN_SUCCESS;
6799		}
6800
6801		/*
6802		 * We ran out of pre-allocated kernel virtual
6803		 * addresses.  Just map the page in the kernel
6804		 * the slow and regular way.
6805		 */
6806		vm_paging_no_kernel_page++;
6807		simple_unlock(&vm_paging_lock);
6808	}
6809
6810	if (! can_unlock_object) {
6811		return KERN_NOT_SUPPORTED;
6812	}
6813
6814	object_offset = vm_object_trunc_page(offset);
6815	map_size = vm_map_round_page(*size);
6816
6817	/*
6818	 * Try and map the required range of the object
6819	 * in the kernel_map
6820	 */
6821
6822	vm_object_reference_locked(object);	/* for the map entry */
6823	vm_object_unlock(object);
6824
6825	kr = vm_map_enter(kernel_map,
6826			  address,
6827			  map_size,
6828			  0,
6829			  VM_FLAGS_ANYWHERE,
6830			  object,
6831			  object_offset,
6832			  FALSE,
6833			  protection,
6834			  VM_PROT_ALL,
6835			  VM_INHERIT_NONE);
6836	if (kr != KERN_SUCCESS) {
6837		*address = 0;
6838		*size = 0;
6839		vm_object_deallocate(object);	/* for the map entry */
6840		vm_object_lock(object);
6841		return kr;
6842	}
6843
6844	*size = map_size;
6845
6846	/*
6847	 * Enter the mapped pages in the page table now.
6848	 */
6849	vm_object_lock(object);
6850	/*
6851	 * VM object must be kept locked from before PMAP_ENTER()
6852	 * until after the kernel is done accessing the page(s).
6853	 * Otherwise, the pmap mappings in the kernel could be
6854	 * undone by a call to vm_object_pmap_protect().
6855	 */
6856
6857	for (page_map_offset = 0;
6858	     map_size != 0;
6859	     map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
6860
6861		page = vm_page_lookup(object, offset + page_map_offset);
6862		if (page == VM_PAGE_NULL) {
6863			printf("vm_paging_map_object: no page !?");
6864			vm_object_unlock(object);
6865			kr = vm_map_remove(kernel_map, *address, *size,
6866					   VM_MAP_NO_FLAGS);
6867			assert(kr == KERN_SUCCESS);
6868			*address = 0;
6869			*size = 0;
6870			vm_object_lock(object);
6871			return KERN_MEMORY_ERROR;
6872		}
6873		page->pmapped = TRUE;
6874
6875		//assert(pmap_verify_free(page->phys_page));
6876		PMAP_ENTER(kernel_pmap,
6877			   *address + page_map_offset,
6878			   page,
6879			   protection,
6880			   VM_PROT_NONE,
6881			   0,
6882			   TRUE);
6883	}
6884
6885	vm_paging_objects_mapped_slow++;
6886	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
6887
6888	return KERN_SUCCESS;
6889}
6890
6891/*
6892 * ENCRYPTED SWAP:
6893 * vm_paging_unmap_object:
6894 *	Unmaps part of a VM object's pages from the kernel
6895 * 	virtual address space.
6896 * Context:
6897 * 	The VM object is locked.  This lock will get
6898 * 	dropped and re-acquired though.
6899 */
6900void
6901vm_paging_unmap_object(
6902	vm_object_t	object,
6903	vm_map_offset_t	start,
6904	vm_map_offset_t	end)
6905{
6906	kern_return_t	kr;
6907	int		i;
6908
6909	if ((vm_paging_base_address == 0) ||
6910	    (start < vm_paging_base_address) ||
6911	    (end > (vm_paging_base_address
6912		     + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
6913		/*
6914		 * We didn't use our pre-allocated pool of
6915		 * kernel virtual addresses.  Deallocate the
6916		 * virtual memory.
6917		 */
6918		if (object != VM_OBJECT_NULL) {
6919			vm_object_unlock(object);
6920		}
6921		kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
6922		if (object != VM_OBJECT_NULL) {
6923			vm_object_lock(object);
6924		}
6925		assert(kr == KERN_SUCCESS);
6926	} else {
6927		/*
6928		 * We used a kernel virtual address from our
6929		 * pre-allocated pool.  Put it back in the pool
6930		 * for next time.
6931		 */
6932		assert(end - start == PAGE_SIZE);
6933		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
6934		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
6935
6936		/* undo the pmap mapping */
6937		pmap_remove(kernel_pmap, start, end);
6938
6939		simple_lock(&vm_paging_lock);
6940		vm_paging_page_inuse[i] = FALSE;
6941		if (vm_paging_page_waiter) {
6942			thread_wakeup(&vm_paging_page_waiter);
6943		}
6944		simple_unlock(&vm_paging_lock);
6945	}
6946}
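
/*
 * Illustrative sketch (not part of the original source, not compiled):
 * how a caller might pair vm_paging_map_object() and
 * vm_paging_unmap_object() for a single page, mirroring the pattern
 * used by vm_page_encrypt()/vm_page_decrypt() below.  The helper name
 * is hypothetical; the caller is assumed to hold the object lock and
 * to have marked the page busy.
 */
#if 0
static void
vm_paging_touch_page_example(vm_page_t page)
{
	vm_map_offset_t	kernel_mapping_offset = 0;
	vm_map_size_t	kernel_mapping_size = PAGE_SIZE;
	kern_return_t	kr;

	assert(page->busy);

	kr = vm_paging_map_object(&kernel_mapping_offset,
				  page,
				  page->object,
				  page->offset,
				  &kernel_mapping_size,
				  VM_PROT_READ | VM_PROT_WRITE,
				  FALSE);		/* can't unlock the object */
	if (kr != KERN_SUCCESS)
		return;

	/* ... access the page through kernel_mapping_offset ... */

	vm_paging_unmap_object(page->object,
			       kernel_mapping_offset,
			       kernel_mapping_offset + kernel_mapping_size);
}
#endif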
6947
6948#if CRYPTO
6949/*
6950 * Encryption data.
6951 * "iv" is the "initial vector".  Ideally, we want to
6952 * have a different one for each page we encrypt, so that
6953 * crackers can't find encryption patterns too easily.
6954 */
6955#define SWAP_CRYPT_AES_KEY_SIZE	128	/* XXX 192 and 256 don't work ! */
6956boolean_t		swap_crypt_ctx_initialized = FALSE;
6957uint32_t 		swap_crypt_key[8]; /* big enough for a 256 key */
6958aes_ctx			swap_crypt_ctx;
6959const unsigned char	swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
6960
6961#if DEBUG
6962boolean_t		swap_crypt_ctx_tested = FALSE;
6963unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
6964unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
6965unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
6966#endif /* DEBUG */
6967
6968/*
6969 * Initialize the encryption context: key and key size.
6970 */
6971void swap_crypt_ctx_initialize(void); /* forward */
6972void
6973swap_crypt_ctx_initialize(void)
6974{
6975	unsigned int	i;
6976
6977	/*
6978	 * No need for locking to protect swap_crypt_ctx_initialized
6979	 * because the first use of encryption will come from the
6980	 * pageout thread (we won't pagein before there's been a pageout)
6981	 * and there's only one pageout thread.
6982	 */
6983	if (swap_crypt_ctx_initialized == FALSE) {
6984		for (i = 0;
6985		     i < (sizeof (swap_crypt_key) /
6986			  sizeof (swap_crypt_key[0]));
6987		     i++) {
6988			swap_crypt_key[i] = random();
6989		}
6990		aes_encrypt_key((const unsigned char *) swap_crypt_key,
6991				SWAP_CRYPT_AES_KEY_SIZE,
6992				&swap_crypt_ctx.encrypt);
6993		aes_decrypt_key((const unsigned char *) swap_crypt_key,
6994				SWAP_CRYPT_AES_KEY_SIZE,
6995				&swap_crypt_ctx.decrypt);
6996		swap_crypt_ctx_initialized = TRUE;
6997	}
6998
6999#if DEBUG
7000	/*
7001	 * Validate the encryption algorithms.
7002	 */
7003	if (swap_crypt_ctx_tested == FALSE) {
7004		/* initialize */
7005		for (i = 0; i < 4096; i++) {
7006			swap_crypt_test_page_ref[i] = (char) i;
7007		}
7008		/* encrypt */
7009		aes_encrypt_cbc(swap_crypt_test_page_ref,
7010				swap_crypt_null_iv,
7011				PAGE_SIZE / AES_BLOCK_SIZE,
7012				swap_crypt_test_page_encrypt,
7013				&swap_crypt_ctx.encrypt);
7014		/* decrypt */
7015		aes_decrypt_cbc(swap_crypt_test_page_encrypt,
7016				swap_crypt_null_iv,
7017				PAGE_SIZE / AES_BLOCK_SIZE,
7018				swap_crypt_test_page_decrypt,
7019				&swap_crypt_ctx.decrypt);
7020		/* compare result with original */
7021		for (i = 0; i < 4096; i ++) {
7022			if (swap_crypt_test_page_decrypt[i] !=
7023			    swap_crypt_test_page_ref[i]) {
7024				panic("encryption test failed");
7025			}
7026		}
7027
7028		/* encrypt again */
7029		aes_encrypt_cbc(swap_crypt_test_page_decrypt,
7030				swap_crypt_null_iv,
7031				PAGE_SIZE / AES_BLOCK_SIZE,
7032				swap_crypt_test_page_decrypt,
7033				&swap_crypt_ctx.encrypt);
7034		/* decrypt in place */
7035		aes_decrypt_cbc(swap_crypt_test_page_decrypt,
7036				swap_crypt_null_iv,
7037				PAGE_SIZE / AES_BLOCK_SIZE,
7038				swap_crypt_test_page_decrypt,
7039				&swap_crypt_ctx.decrypt);
7040		for (i = 0; i < 4096; i ++) {
7041			if (swap_crypt_test_page_decrypt[i] !=
7042			    swap_crypt_test_page_ref[i]) {
7043				panic("in place encryption test failed");
7044			}
7045		}
7046
7047		swap_crypt_ctx_tested = TRUE;
7048	}
7049#endif /* DEBUG */
7050}
7051
7052/*
7053 * ENCRYPTED SWAP:
7054 * vm_page_encrypt:
7055 * 	Encrypt the given page, for secure paging.
7056 * 	The page might already be mapped at kernel virtual
7057 * 	address "kernel_mapping_offset".  Otherwise, we need
7058 * 	to map it.
7059 *
7060 * Context:
7061 * 	The page's object is locked, but this lock will be released
7062 * 	and re-acquired.
7063 * 	The page is busy and not accessible by users (not entered in any pmap).
7064 */
7065void
7066vm_page_encrypt(
7067	vm_page_t	page,
7068	vm_map_offset_t	kernel_mapping_offset)
7069{
7070	kern_return_t		kr;
7071	vm_map_size_t		kernel_mapping_size;
7072	vm_offset_t		kernel_vaddr;
7073	union {
7074		unsigned char	aes_iv[AES_BLOCK_SIZE];
7075		struct {
7076			memory_object_t		pager_object;
7077			vm_object_offset_t	paging_offset;
7078		} vm;
7079	} encrypt_iv;
7080
7081	if (! vm_pages_encrypted) {
7082		vm_pages_encrypted = TRUE;
7083	}
7084
7085	assert(page->busy);
7086
7087	if (page->encrypted) {
7088		/*
7089		 * Already encrypted: no need to do it again.
7090		 */
7091		vm_page_encrypt_already_encrypted_counter++;
7092		return;
7093	}
7094	assert(page->dirty || page->precious);
7095
7096	ASSERT_PAGE_DECRYPTED(page);
7097
7098	/*
7099	 * Take a paging-in-progress reference to keep the object
7100	 * alive even if we have to unlock it (in vm_paging_map_object()
7101	 * for example)...
7102	 */
7103	vm_object_paging_begin(page->object);
7104
7105	if (kernel_mapping_offset == 0) {
7106		/*
7107		 * The page hasn't already been mapped in kernel space
7108		 * by the caller.  Map it now, so that we can access
7109		 * its contents and encrypt them.
7110		 */
7111		kernel_mapping_size = PAGE_SIZE;
7112		kr = vm_paging_map_object(&kernel_mapping_offset,
7113					  page,
7114					  page->object,
7115					  page->offset,
7116					  &kernel_mapping_size,
7117					  VM_PROT_READ | VM_PROT_WRITE,
7118					  FALSE);
7119		if (kr != KERN_SUCCESS) {
7120			panic("vm_page_encrypt: "
7121			      "could not map page in kernel: 0x%x\n",
7122			      kr);
7123		}
7124	} else {
7125		kernel_mapping_size = 0;
7126	}
7127	kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
7128
7129	if (swap_crypt_ctx_initialized == FALSE) {
7130		swap_crypt_ctx_initialize();
7131	}
7132	assert(swap_crypt_ctx_initialized);
7133
	/*
	 * Prepare an "initial vector" for the encryption.
	 * We derive it from the page's "pager" and "paging_offset"
	 * so that identical pages don't produce identical ciphertext,
	 * which would otherwise hand an attacker recognizable patterns
	 * in the swap file.
	 */
7141	bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
7142	encrypt_iv.vm.pager_object = page->object->pager;
7143	encrypt_iv.vm.paging_offset =
7144		page->object->paging_offset + page->offset;
7145
7146	/* encrypt the "initial vector" */
7147	aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
7148			swap_crypt_null_iv,
7149			1,
7150			&encrypt_iv.aes_iv[0],
7151			&swap_crypt_ctx.encrypt);
7152
7153	/*
7154	 * Encrypt the page.
7155	 */
7156	aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
7157			&encrypt_iv.aes_iv[0],
7158			PAGE_SIZE / AES_BLOCK_SIZE,
7159			(unsigned char *) kernel_vaddr,
7160			&swap_crypt_ctx.encrypt);
7161
7162	vm_page_encrypt_counter++;
7163
7164	/*
7165	 * Unmap the page from the kernel's address space,
7166	 * if we had to map it ourselves.  Otherwise, let
7167	 * the caller undo the mapping if needed.
7168	 */
7169	if (kernel_mapping_size != 0) {
7170		vm_paging_unmap_object(page->object,
7171				       kernel_mapping_offset,
7172				       kernel_mapping_offset + kernel_mapping_size);
7173	}
7174
7175	/*
7176	 * Clear the "reference" and "modified" bits.
7177	 * This should clean up any impact the encryption had
7178	 * on them.
7179	 * The page was kept busy and disconnected from all pmaps,
7180	 * so it can't have been referenced or modified from user
7181	 * space.
7182	 * The software bits will be reset later after the I/O
7183	 * has completed (in upl_commit_range()).
7184	 */
7185	pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
7186
7187	page->encrypted = TRUE;
7188
7189	vm_object_paging_end(page->object);
7190}
7191
7192/*
7193 * ENCRYPTED SWAP:
7194 * vm_page_decrypt:
7195 * 	Decrypt the given page.
7196 * 	The page might already be mapped at kernel virtual
7197 * 	address "kernel_mapping_offset".  Otherwise, we need
7198 * 	to map it.
7199 *
7200 * Context:
7201 *	The page's VM object is locked but will be unlocked and relocked.
7202 * 	The page is busy and not accessible by users (not entered in any pmap).
7203 */
7204void
7205vm_page_decrypt(
7206	vm_page_t	page,
7207	vm_map_offset_t	kernel_mapping_offset)
7208{
7209	kern_return_t		kr;
7210	vm_map_size_t		kernel_mapping_size;
7211	vm_offset_t		kernel_vaddr;
7212	union {
7213		unsigned char	aes_iv[AES_BLOCK_SIZE];
7214		struct {
7215			memory_object_t		pager_object;
7216			vm_object_offset_t	paging_offset;
7217		} vm;
7218	} decrypt_iv;
7219	boolean_t		was_dirty;
7220
7221	assert(page->busy);
7222	assert(page->encrypted);
7223
7224	was_dirty = page->dirty;
7225
7226	/*
7227	 * Take a paging-in-progress reference to keep the object
7228	 * alive even if we have to unlock it (in vm_paging_map_object()
7229	 * for example)...
7230	 */
7231	vm_object_paging_begin(page->object);
7232
7233	if (kernel_mapping_offset == 0) {
7234		/*
7235		 * The page hasn't already been mapped in kernel space
7236		 * by the caller.  Map it now, so that we can access
7237		 * its contents and decrypt them.
7238		 */
7239		kernel_mapping_size = PAGE_SIZE;
7240		kr = vm_paging_map_object(&kernel_mapping_offset,
7241					  page,
7242					  page->object,
7243					  page->offset,
7244					  &kernel_mapping_size,
7245					  VM_PROT_READ | VM_PROT_WRITE,
7246					  FALSE);
7247		if (kr != KERN_SUCCESS) {
7248			panic("vm_page_decrypt: "
7249			      "could not map page in kernel: 0x%x\n",
7250			      kr);
7251		}
7252	} else {
7253		kernel_mapping_size = 0;
7254	}
7255	kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
7256
7257	assert(swap_crypt_ctx_initialized);
7258
7259	/*
7260	 * Prepare an "initial vector" for the decryption.
7261	 * It has to be the same as the "initial vector" we
7262	 * used to encrypt that page.
7263	 */
7264	bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
7265	decrypt_iv.vm.pager_object = page->object->pager;
7266	decrypt_iv.vm.paging_offset =
7267		page->object->paging_offset + page->offset;
7268
	/* encrypt the "initial vector", exactly as was done at encryption time */
7270	aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
7271			swap_crypt_null_iv,
7272			1,
7273			&decrypt_iv.aes_iv[0],
7274			&swap_crypt_ctx.encrypt);
7275
7276	/*
7277	 * Decrypt the page.
7278	 */
7279	aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
7280			&decrypt_iv.aes_iv[0],
7281			PAGE_SIZE / AES_BLOCK_SIZE,
7282			(unsigned char *) kernel_vaddr,
7283			&swap_crypt_ctx.decrypt);
7284	vm_page_decrypt_counter++;
7285
7286	/*
7287	 * Unmap the page from the kernel's address space,
7288	 * if we had to map it ourselves.  Otherwise, let
7289	 * the caller undo the mapping if needed.
7290	 */
7291	if (kernel_mapping_size != 0) {
7292		vm_paging_unmap_object(page->object,
7293				       kernel_vaddr,
7294				       kernel_vaddr + PAGE_SIZE);
7295	}
7296
7297	if (was_dirty) {
7298		/*
7299		 * The pager did not specify that the page would be
7300		 * clean when it got paged in, so let's not clean it here
7301		 * either.
7302		 */
7303	} else {
7304		/*
7305		 * After decryption, the page is actually still clean.
7306		 * It was encrypted as part of paging, which "cleans"
7307		 * the "dirty" pages.
		 * No one could have accessed it after it was encrypted,
		 * and the decryption itself doesn't count as a modification.
7310		 */
7311		page->dirty = FALSE;
7312		assert (page->cs_validated == FALSE);
7313		pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
7314	}
7315	page->encrypted = FALSE;
7316
7317	/*
7318	 * We've just modified the page's contents via the data cache and part
7319	 * of the new contents might still be in the cache and not yet in RAM.
7320	 * Since the page is now available and might get gathered in a UPL to
7321	 * be part of a DMA transfer from a driver that expects the memory to
7322	 * be coherent at this point, we have to flush the data cache.
7323	 */
7324	pmap_sync_page_attributes_phys(page->phys_page);
7325	/*
7326	 * Since the page is not mapped yet, some code might assume that it
7327	 * doesn't need to invalidate the instruction cache when writing to
7328	 * that page.  That code relies on "pmapped" being FALSE, so that the
7329	 * caches get synchronized when the page is first mapped.
7330	 */
7331	assert(pmap_verify_free(page->phys_page));
7332	page->pmapped = FALSE;
7333	page->wpmapped = FALSE;
7334
7335	vm_object_paging_end(page->object);
7336}
7337
7338#if DEVELOPMENT || DEBUG
7339unsigned long upl_encrypt_upls = 0;
7340unsigned long upl_encrypt_pages = 0;
7341#endif
7342
7343/*
7344 * ENCRYPTED SWAP:
7345 *
7346 * upl_encrypt:
7347 * 	Encrypts all the pages in the UPL, within the specified range.
7348 *
7349 */
7350void
7351upl_encrypt(
7352	upl_t			upl,
7353	upl_offset_t		crypt_offset,
7354	upl_size_t		crypt_size)
7355{
7356	upl_size_t		upl_size, subupl_size=crypt_size;
7357	upl_offset_t		offset_in_upl, subupl_offset=crypt_offset;
7358	vm_object_t		upl_object;
7359	vm_object_offset_t	upl_offset;
7360	vm_page_t		page;
7361	vm_object_t		shadow_object;
7362	vm_object_offset_t	shadow_offset;
7363	vm_object_offset_t	paging_offset;
7364	vm_object_offset_t	base_offset;
7365	int	 		isVectorUPL = 0;
7366	upl_t			vector_upl = NULL;
7367
7368	if((isVectorUPL = vector_upl_is_valid(upl)))
7369		vector_upl = upl;
7370
7371process_upl_to_encrypt:
7372	if(isVectorUPL) {
7373		crypt_size = subupl_size;
7374		crypt_offset = subupl_offset;
7375		upl =  vector_upl_subupl_byoffset(vector_upl, &crypt_offset, &crypt_size);
7376		if(upl == NULL)
7377			panic("upl_encrypt: Accessing a sub-upl that doesn't exist\n");
7378		subupl_size -= crypt_size;
7379		subupl_offset += crypt_size;
7380	}
7381
7382#if DEVELOPMENT || DEBUG
7383	upl_encrypt_upls++;
7384	upl_encrypt_pages += crypt_size / PAGE_SIZE;
7385#endif
7386	upl_object = upl->map_object;
7387	upl_offset = upl->offset;
7388	upl_size = upl->size;
7389
7390	vm_object_lock(upl_object);
7391
7392	/*
7393	 * Find the VM object that contains the actual pages.
7394	 */
7395	if (upl_object->pageout) {
7396		shadow_object = upl_object->shadow;
7397		/*
7398		 * The offset in the shadow object is actually also
7399		 * accounted for in upl->offset.  It possibly shouldn't be
7400		 * this way, but for now don't account for it twice.
7401		 */
7402		shadow_offset = 0;
7403		assert(upl_object->paging_offset == 0);	/* XXX ? */
7404		vm_object_lock(shadow_object);
7405	} else {
7406		shadow_object = upl_object;
7407		shadow_offset = 0;
7408	}
7409
7410	paging_offset = shadow_object->paging_offset;
7411	vm_object_paging_begin(shadow_object);
7412
7413	if (shadow_object != upl_object)
7414	        vm_object_unlock(upl_object);
7415
7416
7417	base_offset = shadow_offset;
7418	base_offset += upl_offset;
7419	base_offset += crypt_offset;
7420	base_offset -= paging_offset;
7421
7422	assert(crypt_offset + crypt_size <= upl_size);
7423
7424	for (offset_in_upl = 0;
7425	     offset_in_upl < crypt_size;
7426	     offset_in_upl += PAGE_SIZE) {
7427		page = vm_page_lookup(shadow_object,
7428				      base_offset + offset_in_upl);
7429		if (page == VM_PAGE_NULL) {
7430			panic("upl_encrypt: "
7431			      "no page for (obj=%p,off=0x%llx+0x%x)!\n",
7432			      shadow_object,
7433			      base_offset,
7434			      offset_in_upl);
7435		}
7436		/*
7437		 * Disconnect the page from all pmaps, so that nobody can
7438		 * access it while it's encrypted.  After that point, all
7439		 * accesses to this page will cause a page fault and block
7440		 * while the page is busy being encrypted.  After the
7441		 * encryption completes, any access will cause a
7442		 * page fault and the page gets decrypted at that time.
7443		 */
7444		pmap_disconnect(page->phys_page);
7445		vm_page_encrypt(page, 0);
7446
7447		if (vm_object_lock_avoid(shadow_object)) {
7448			/*
7449			 * Give vm_pageout_scan() a chance to convert more
7450			 * pages from "clean-in-place" to "clean-and-free",
7451			 * if it's interested in the same pages we selected
7452			 * in this cluster.
7453			 */
7454			vm_object_unlock(shadow_object);
7455			mutex_pause(2);
7456			vm_object_lock(shadow_object);
7457		}
7458	}
7459
7460	vm_object_paging_end(shadow_object);
7461	vm_object_unlock(shadow_object);
7462
7463	if(isVectorUPL && subupl_size)
7464		goto process_upl_to_encrypt;
7465}
7466
7467#else /* CRYPTO */
7468void
7469upl_encrypt(
7470	__unused upl_t			upl,
7471	__unused upl_offset_t	crypt_offset,
7472	__unused upl_size_t	crypt_size)
7473{
7474}
7475
7476void
7477vm_page_encrypt(
7478	__unused vm_page_t		page,
7479	__unused vm_map_offset_t	kernel_mapping_offset)
7480{
7481}
7482
7483void
7484vm_page_decrypt(
7485	__unused vm_page_t		page,
7486	__unused vm_map_offset_t	kernel_mapping_offset)
7487{
7488}
7489
7490#endif /* CRYPTO */
7491
/*
 * vm_pageout_steal_laundry:
 * 	Take a laundry page back from the pageout machinery.
 * 	The page's object must be locked.
 */
7495void
7496vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
7497{
7498	if (!queues_locked) {
7499		vm_page_lockspin_queues();
7500	}
7501
	/*
	 * We need to drop the laundry count and may also need to
	 * remove the page from the I/O paging queue;
	 * vm_pageout_throttle_up() handles both cases and clears
	 * the page's laundry and pageout_queue flags.
	 */
7510	vm_pageout_throttle_up(page);
7511
7512	vm_page_steal_pageout_page++;
7513
7514	if (!queues_locked) {
7515		vm_page_unlock_queues();
7516	}
7517}
7518
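/*
 * vector_upl_create:
 * 	Allocate and initialize a "vectored" UPL: a container UPL that
 * 	aggregates up to MAX_VECTOR_UPL_ELEMENTS sub-UPLs so they can
 * 	be mapped and completed as a single I/O.
 *
 * 	Typical (illustrative) usage by a caller assembling a vectored I/O:
 * 		upl = vector_upl_create(offset);
 * 		vector_upl_set_subupl(upl, subupl, io_size);
 * 		vector_upl_set_iostate(upl, subupl, offset, io_size);
 * 		... repeat for each sub-UPL ...
 * 		vector_upl_set_pagelist(upl);
 */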
7519upl_t
7520vector_upl_create(vm_offset_t upl_offset)
7521{
7522	int	vector_upl_size  = sizeof(struct _vector_upl);
7523	int i=0;
7524	upl_t	upl;
7525	vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
7526
7527	upl = upl_create(0,UPL_VECTOR,0);
7528	upl->vector_upl = vector_upl;
7529	upl->offset = upl_offset;
7530	vector_upl->size = 0;
7531	vector_upl->offset = upl_offset;
7532	vector_upl->invalid_upls=0;
7533	vector_upl->num_upls=0;
7534	vector_upl->pagelist = NULL;
7535
7536	for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
7537		vector_upl->upl_iostates[i].size = 0;
7538		vector_upl->upl_iostates[i].offset = 0;
7539
7540	}
7541	return upl;
7542}
7543
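/*
 * vector_upl_deallocate:
 * 	Free the vector UPL's aggregated pagelist and the vector
 * 	structure itself.  All sub-UPLs must already have been
 * 	invalidated (committed or aborted), otherwise we panic.
 */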
7544void
7545vector_upl_deallocate(upl_t upl)
7546{
7547	if(upl) {
7548		vector_upl_t vector_upl = upl->vector_upl;
7549		if(vector_upl) {
7550			if(vector_upl->invalid_upls != vector_upl->num_upls)
7551				panic("Deallocating non-empty Vectored UPL\n");
7552			kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
7553			vector_upl->invalid_upls=0;
7554			vector_upl->num_upls = 0;
7555			vector_upl->pagelist = NULL;
7556			vector_upl->size = 0;
7557			vector_upl->offset = 0;
			kfree(vector_upl, sizeof(struct _vector_upl));
			/* poison the pointer so vector_upl_is_valid() rejects any later use */
			upl->vector_upl = (vector_upl_t)0xfeedfeed;
7560		}
7561		else
7562			panic("vector_upl_deallocate was passed a non-vectored upl\n");
7563	}
7564	else
7565		panic("vector_upl_deallocate was passed a NULL upl\n");
7566}
7567
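/*
 * vector_upl_is_valid:
 * 	Return TRUE if "upl" is a vectored UPL whose vector_upl pointer
 * 	is neither NULL nor one of the poison values left behind after
 * 	deallocation.
 */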
7568boolean_t
7569vector_upl_is_valid(upl_t upl)
7570{
7571	if(upl &&  ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
7572		vector_upl_t vector_upl = upl->vector_upl;
7573		if(vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef)
7574			return FALSE;
7575		else
7576			return TRUE;
7577	}
7578	return FALSE;
7579}
7580
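/*
 * vector_upl_set_subupl:
 * 	With a non-zero "io_size", add "subupl" to the vector and grow
 * 	the vector's total size (io_size is rounded up to at least one
 * 	page).
 * 	With a zero "io_size", remove "subupl" from the vector; return
 * 	TRUE once every sub-UPL has been invalidated, so the caller
 * 	knows the vector UPL itself can be deallocated.
 */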
7581boolean_t
7582vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
7583{
7584	if(vector_upl_is_valid(upl)) {
7585		vector_upl_t vector_upl = upl->vector_upl;
7586
7587		if(vector_upl) {
7588			if(subupl) {
7589				if(io_size) {
7590					if(io_size < PAGE_SIZE)
7591						io_size = PAGE_SIZE;
7592					subupl->vector_upl = (void*)vector_upl;
7593					vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
7594					vector_upl->size += io_size;
7595					upl->size += io_size;
7596				}
7597				else {
7598					uint32_t i=0,invalid_upls=0;
7599					for(i = 0; i < vector_upl->num_upls; i++) {
7600						if(vector_upl->upl_elems[i] == subupl)
7601							break;
7602					}
7603					if(i == vector_upl->num_upls)
7604						panic("Trying to remove sub-upl when none exists");
7605
7606					vector_upl->upl_elems[i] = NULL;
7607					invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
7608					if(invalid_upls == vector_upl->num_upls)
7609						return TRUE;
7610					else
7611						return FALSE;
7612				}
7613			}
7614			else
7615				panic("vector_upl_set_subupl was passed a NULL upl element\n");
7616		}
7617		else
7618			panic("vector_upl_set_subupl was passed a non-vectored upl\n");
7619	}
7620	else
7621		panic("vector_upl_set_subupl was passed a NULL upl\n");
7622
7623	return FALSE;
7624}
7625
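/*
 * vector_upl_set_pagelist:
 * 	Build the vector UPL's aggregated upl_page_info array by
 * 	concatenating the page lists of all its sub-UPLs, and track the
 * 	highest physical page seen across them.
 */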
7626void
7627vector_upl_set_pagelist(upl_t upl)
7628{
7629	if(vector_upl_is_valid(upl)) {
7630		uint32_t i=0;
7631		vector_upl_t vector_upl = upl->vector_upl;
7632
7633		if(vector_upl) {
7634			vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
7635
7636			vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
7637
7638			for(i=0; i < vector_upl->num_upls; i++) {
7639				cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
7640				bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
7641				pagelist_size += cur_upl_pagelist_size;
7642				if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
7643					upl->highest_page = vector_upl->upl_elems[i]->highest_page;
7644			}
7645			assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
7646		}
7647		else
7648			panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
7649	}
7650	else
7651		panic("vector_upl_set_pagelist was passed a NULL upl\n");
7652
7653}
7654
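/*
 * vector_upl_subupl_byindex:
 * 	Return the Nth sub-UPL of a vectored UPL, or NULL if the index
 * 	is out of range.
 */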
7655upl_t
7656vector_upl_subupl_byindex(upl_t upl, uint32_t index)
7657{
7658	if(vector_upl_is_valid(upl)) {
7659		vector_upl_t vector_upl = upl->vector_upl;
7660		if(vector_upl) {
7661			if(index < vector_upl->num_upls)
7662				return vector_upl->upl_elems[index];
7663		}
7664		else
7665			panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
7666	}
7667	return NULL;
7668}
7669
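/*
 * vector_upl_subupl_byoffset:
 * 	Find the sub-UPL that covers the given offset in the vector UPL,
 * 	clip *upl_size to that sub-UPL's recorded I/O state and convert
 * 	*upl_offset into an offset relative to the sub-UPL.
 * 	Returns NULL if the matching sub-UPL has already been committed
 * 	or aborted.
 */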
7670upl_t
7671vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
7672{
7673	if(vector_upl_is_valid(upl)) {
7674		uint32_t i=0;
7675		vector_upl_t vector_upl = upl->vector_upl;
7676
7677		if(vector_upl) {
7678			upl_t subupl = NULL;
7679			vector_upl_iostates_t subupl_state;
7680
7681			for(i=0; i < vector_upl->num_upls; i++) {
7682				subupl = vector_upl->upl_elems[i];
7683				subupl_state = vector_upl->upl_iostates[i];
7684				if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
7685					/* We could have been passed an offset/size pair that belongs
					 * to a UPL element that has already been committed/aborted.
7687					 * If so, return NULL.
7688					 */
7689					if(subupl == NULL)
7690						return NULL;
7691					if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
7692						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
7693						if(*upl_size > subupl_state.size)
7694							*upl_size = subupl_state.size;
7695					}
7696					if(*upl_offset >= subupl_state.offset)
7697						*upl_offset -= subupl_state.offset;
7698					else if(i)
7699						panic("Vector UPL offset miscalculation\n");
7700					return subupl;
7701				}
7702			}
7703		}
7704		else
7705			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
7706	}
7707	return NULL;
7708}
7709
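/*
 * vector_upl_get_submap:
 * 	Return the submap and destination address previously recorded
 * 	for this vector UPL by vector_upl_set_submap().
 */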
7710void
7711vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
7712{
7713	*v_upl_submap = NULL;
7714
7715	if(vector_upl_is_valid(upl)) {
7716		vector_upl_t vector_upl = upl->vector_upl;
7717		if(vector_upl) {
7718			*v_upl_submap = vector_upl->submap;
7719			*submap_dst_addr = vector_upl->submap_dst_addr;
7720		}
7721		else
7722			panic("vector_upl_get_submap was passed a non-vectored UPL\n");
7723	}
7724	else
7725		panic("vector_upl_get_submap was passed a null UPL\n");
7726}
7727
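/*
 * vector_upl_set_submap:
 * 	Record the submap and destination address used to map this
 * 	vector UPL, for later retrieval by vector_upl_get_submap().
 */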
7728void
7729vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
7730{
7731	if(vector_upl_is_valid(upl)) {
7732		vector_upl_t vector_upl = upl->vector_upl;
7733		if(vector_upl) {
7734			vector_upl->submap = submap;
7735			vector_upl->submap_dst_addr = submap_dst_addr;
7736		}
		else
			panic("vector_upl_set_submap was passed a non-vectored UPL\n");
	}
	else
		panic("vector_upl_set_submap was passed a NULL UPL\n");
7742}
7743
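/*
 * vector_upl_set_iostate:
 * 	Record the offset and size of the I/O covered by "subupl" within
 * 	the vector UPL (size is rounded up to at least one page).
 */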
7744void
7745vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
7746{
7747	if(vector_upl_is_valid(upl)) {
7748		uint32_t i = 0;
7749		vector_upl_t vector_upl = upl->vector_upl;
7750
7751		if(vector_upl) {
7752			for(i = 0; i < vector_upl->num_upls; i++) {
7753				if(vector_upl->upl_elems[i] == subupl)
7754					break;
7755			}
7756
7757			if(i == vector_upl->num_upls)
7758				panic("setting sub-upl iostate when none exists");
7759
7760			vector_upl->upl_iostates[i].offset = offset;
7761			if(size < PAGE_SIZE)
7762				size = PAGE_SIZE;
7763			vector_upl->upl_iostates[i].size = size;
7764		}
7765		else
7766			panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
7767	}
7768	else
7769		panic("vector_upl_set_iostate was passed a NULL UPL\n");
7770}
7771
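/*
 * vector_upl_get_iostate:
 * 	Return the offset and size previously recorded for "subupl" by
 * 	vector_upl_set_iostate().
 */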
7772void
7773vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
7774{
7775	if(vector_upl_is_valid(upl)) {
7776		uint32_t i = 0;
7777		vector_upl_t vector_upl = upl->vector_upl;
7778
7779		if(vector_upl) {
7780			for(i = 0; i < vector_upl->num_upls; i++) {
7781				if(vector_upl->upl_elems[i] == subupl)
7782					break;
7783			}
7784
7785			if(i == vector_upl->num_upls)
7786				panic("getting sub-upl iostate when none exists");
7787
7788			*offset = vector_upl->upl_iostates[i].offset;
7789			*size = vector_upl->upl_iostates[i].size;
7790		}
7791		else
7792			panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
7793	}
7794	else
7795		panic("vector_upl_get_iostate was passed a NULL UPL\n");
7796}
7797
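/*
 * vector_upl_get_iostate_byindex:
 * 	Same as vector_upl_get_iostate(), but looks the sub-UPL up by
 * 	index; returns 0/0 if the index is out of range.
 */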
7798void
7799vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
7800{
7801	if(vector_upl_is_valid(upl)) {
7802		vector_upl_t vector_upl = upl->vector_upl;
7803		if(vector_upl) {
7804			if(index < vector_upl->num_upls) {
7805				*offset = vector_upl->upl_iostates[index].offset;
7806				*size = vector_upl->upl_iostates[index].size;
7807			}
7808			else
7809				*offset = *size = 0;
7810		}
7811		else
7812			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
7813	}
7814	else
7815		panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
7816}
7817
7818upl_page_info_t *
7819upl_get_internal_vectorupl_pagelist(upl_t upl)
7820{
7821	return ((vector_upl_t)(upl->vector_upl))->pagelist;
7822}
7823
7824void *
7825upl_get_internal_vectorupl(upl_t upl)
7826{
7827	return upl->vector_upl;
7828}
7829
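/*
 * upl_get_internal_pagelist_offset:
 * 	The internal upl_page_info array of a non-vectored UPL is laid
 * 	out immediately after the upl structure itself, so callers
 * 	locate it at an offset of sizeof(struct upl).
 */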
7830vm_size_t
7831upl_get_internal_pagelist_offset(void)
7832{
7833	return sizeof(struct upl);
7834}
7835
7836void
7837upl_clear_dirty(
7838	upl_t		upl,
7839	boolean_t 	value)
7840{
7841	if (value) {
7842		upl->flags |= UPL_CLEAR_DIRTY;
7843	} else {
7844		upl->flags &= ~UPL_CLEAR_DIRTY;
7845	}
7846}
7847
7848void
7849upl_set_referenced(
7850	upl_t		upl,
7851	boolean_t 	value)
7852{
7853	upl_lock(upl);
7854	if (value) {
7855		upl->ext_ref_count++;
7856	} else {
7857		if (!upl->ext_ref_count) {
			panic("upl_set_referenced: %p has no external reference to drop\n", upl);
7859		}
7860		upl->ext_ref_count--;
7861	}
7862	upl_unlock(upl);
7863}
7864
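/*
 * vm_page_is_slideable:
 * 	Return TRUE if this page belongs to the one object registered
 * 	for sliding (slide_info.slide_object), falls within the
 * 	registered [start, end) range, and hasn't been slid yet.
 */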
7865boolean_t
7866vm_page_is_slideable(vm_page_t m)
7867{
7868	boolean_t result = FALSE;
7869	vm_object_t slide_object = slide_info.slide_object;
7870	mach_vm_offset_t start = slide_info.start;
7871	mach_vm_offset_t end = slide_info.end;
7872
7873	/* make sure our page belongs to the one object allowed to do this */
7874	if (slide_object == VM_OBJECT_NULL) {
7875		return result;
7876	}
7877
	/* Should we traverse down the chain? */
7879	if (m->object != slide_object) {
7880		return result;
7881	}
7882
7883	if(!m->slid && (start <= m->offset && end > m->offset)) {
7884		result = TRUE;
7885	}
7886	return result;
7887}
7888
7889int vm_page_slide_counter = 0;
7890int vm_page_slide_errors = 0;
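/*
 * vm_page_slide:
 * 	Apply the shared-region slide to the pointers in the given page.
 * 	The page might already be mapped at kernel virtual address
 * 	"kernel_mapping_offset"; otherwise we map it here.
 * 	On failure the page is marked with an error so it won't be used.
 */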
7891kern_return_t
7892vm_page_slide(
7893	vm_page_t	page,
7894	vm_map_offset_t	kernel_mapping_offset)
7895{
7896	kern_return_t		kr;
7897	vm_map_size_t		kernel_mapping_size;
7898	vm_offset_t		kernel_vaddr;
7899	uint32_t		pageIndex = 0;
7900
7901	assert(!page->slid);
7902
7903	if (page->error)
7904		return KERN_FAILURE;
7905
7906	/*
7907	 * Take a paging-in-progress reference to keep the object
7908	 * alive even if we have to unlock it (in vm_paging_map_object()
7909	 * for example)...
7910	 */
7911	vm_object_paging_begin(page->object);
7912
7913	if (kernel_mapping_offset == 0) {
7914		/*
7915		 * The page hasn't already been mapped in kernel space
7916		 * by the caller.  Map it now, so that we can access
		 * its contents and slide its pointers.
7918		 */
7919		kernel_mapping_size = PAGE_SIZE;
7920		kr = vm_paging_map_object(&kernel_mapping_offset,
7921					  page,
7922					  page->object,
7923					  page->offset,
7924					  &kernel_mapping_size,
7925					  VM_PROT_READ | VM_PROT_WRITE,
7926					  FALSE);
7927		if (kr != KERN_SUCCESS) {
7928			panic("vm_page_slide: "
7929			      "could not map page in kernel: 0x%x\n",
7930			      kr);
7931		}
7932	} else {
7933		kernel_mapping_size = 0;
7934	}
7935	kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
7936
7937	/*
7938	 * Slide the pointers on the page.
7939	 */
7940
	/* assert that slide_info.start/end are page-aligned? */
7942
7943	pageIndex = (uint32_t)((page->offset - slide_info.start)/PAGE_SIZE);
7944	kr = vm_shared_region_slide(kernel_vaddr, pageIndex);
7945	vm_page_slide_counter++;
7946
7947	/*
7948	 * Unmap the page from the kernel's address space,
7949	 */
7950	if (kernel_mapping_size != 0) {
7951		vm_paging_unmap_object(page->object,
7952				       kernel_vaddr,
7953				       kernel_vaddr + PAGE_SIZE);
7954	}
7955
7956	page->dirty = FALSE;
7957	pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
7958
7959	if (kr != KERN_SUCCESS || cs_debug > 1) {
7960		printf("vm_page_slide(%p): "
7961		       "obj %p off 0x%llx mobj %p moff 0x%llx\n",
7962		       page,
7963		       page->object, page->offset,
7964		       page->object->pager,
7965		       page->offset + page->object->paging_offset);
7966	}
7967
7968	if (kr == KERN_SUCCESS) {
7969		page->slid = TRUE;
7970	} else {
7971		page->error = TRUE;
7972		vm_page_slide_errors++;
7973	}
7974
7975	vm_object_paging_end(page->object);
7976
7977	return kr;
7978}
7979
7980
7981#ifdef MACH_BSD
7982
7983boolean_t  upl_device_page(upl_page_info_t *upl)
7984{
7985	return(UPL_DEVICE_PAGE(upl));
7986}
7987boolean_t  upl_page_present(upl_page_info_t *upl, int index)
7988{
7989	return(UPL_PAGE_PRESENT(upl, index));
7990}
7991boolean_t  upl_speculative_page(upl_page_info_t *upl, int index)
7992{
7993	return(UPL_SPECULATIVE_PAGE(upl, index));
7994}
7995boolean_t  upl_dirty_page(upl_page_info_t *upl, int index)
7996{
7997	return(UPL_DIRTY_PAGE(upl, index));
7998}
7999boolean_t  upl_valid_page(upl_page_info_t *upl, int index)
8000{
8001	return(UPL_VALID_PAGE(upl, index));
8002}
8003ppnum_t  upl_phys_page(upl_page_info_t *upl, int index)
8004{
8005	return(UPL_PHYS_PAGE(upl, index));
8006}
8007
8008
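/*
 * vm_countdirtypages:
 * 	Debugging aid: walk the inactive, throttled, anonymous and
 * 	active page queues and print how many dirty, pageout and
 * 	precious pages each group contains.
 */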
8009void
8010vm_countdirtypages(void)
8011{
8012	vm_page_t m;
8013	int dpages;
8014	int pgopages;
8015	int precpages;
8016
8017
8018	dpages=0;
8019	pgopages=0;
8020	precpages=0;
8021
8022	vm_page_lock_queues();
8023	m = (vm_page_t) queue_first(&vm_page_queue_inactive);
8024	do {
8025		if (m ==(vm_page_t )0) break;
8026
8027		if(m->dirty) dpages++;
8028		if(m->pageout) pgopages++;
8029		if(m->precious) precpages++;
8030
8031		assert(m->object != kernel_object);
8032		m = (vm_page_t) queue_next(&m->pageq);
8033		if (m ==(vm_page_t )0) break;
8034
8035	} while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
8036	vm_page_unlock_queues();
8037
8038	vm_page_lock_queues();
8039	m = (vm_page_t) queue_first(&vm_page_queue_throttled);
8040	do {
8041		if (m ==(vm_page_t )0) break;
8042
8043		dpages++;
8044		assert(m->dirty);
8045		assert(!m->pageout);
8046		assert(m->object != kernel_object);
8047		m = (vm_page_t) queue_next(&m->pageq);
8048		if (m ==(vm_page_t )0) break;
8049
8050	} while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
8051	vm_page_unlock_queues();
8052
8053	vm_page_lock_queues();
8054	m = (vm_page_t) queue_first(&vm_page_queue_anonymous);
8055	do {
8056		if (m ==(vm_page_t )0) break;
8057
8058		if(m->dirty) dpages++;
8059		if(m->pageout) pgopages++;
8060		if(m->precious) precpages++;
8061
8062		assert(m->object != kernel_object);
8063		m = (vm_page_t) queue_next(&m->pageq);
8064		if (m ==(vm_page_t )0) break;
8065
8066	} while (!queue_end(&vm_page_queue_anonymous,(queue_entry_t) m));
8067	vm_page_unlock_queues();
8068
8069	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
8070
8071	dpages=0;
8072	pgopages=0;
8073	precpages=0;
8074
8075	vm_page_lock_queues();
8076	m = (vm_page_t) queue_first(&vm_page_queue_active);
8077
8078	do {
8079		if(m == (vm_page_t )0) break;
8080		if(m->dirty) dpages++;
8081		if(m->pageout) pgopages++;
8082		if(m->precious) precpages++;
8083
8084		assert(m->object != kernel_object);
8085		m = (vm_page_t) queue_next(&m->pageq);
8086		if(m == (vm_page_t )0) break;
8087
8088	} while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
8089	vm_page_unlock_queues();
8090
8091	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
8092
8093}
8094#endif /* MACH_BSD */
8095
8096ppnum_t upl_get_highest_page(
8097			     upl_t			upl)
8098{
8099        return upl->highest_page;
8100}
8101
8102upl_size_t upl_get_size(
8103			     upl_t			upl)
8104{
8105        return upl->size;
8106}
8107
8108#if UPL_DEBUG
8109kern_return_t  upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
8110{
8111	upl->ubc_alias1 = alias1;
8112	upl->ubc_alias2 = alias2;
8113	return KERN_SUCCESS;
8114}
8115int  upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
8116{
8117	if(al)
8118		*al = upl->ubc_alias1;
8119	if(al2)
8120		*al2 = upl->ubc_alias2;
8121	return KERN_SUCCESS;
8122}
8123#endif /* UPL_DEBUG */
8124