1/*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 *	File:	vm/vm_pageout.c
60 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61 *	Date:	1985
62 *
63 *	The proverbial page-out daemon.
64 */
65
66#include <stdint.h>
67
68#include <debug.h>
69#include <mach_pagemap.h>
70#include <mach_cluster_stats.h>
71#include <advisory_pageout.h>
72
73#include <mach/mach_types.h>
74#include <mach/memory_object.h>
75#include <mach/memory_object_default.h>
76#include <mach/memory_object_control_server.h>
77#include <mach/mach_host_server.h>
78#include <mach/upl.h>
79#include <mach/vm_map.h>
80#include <mach/vm_param.h>
81#include <mach/vm_statistics.h>
82#include <mach/sdt.h>
83
84#include <kern/kern_types.h>
85#include <kern/counters.h>
86#include <kern/host_statistics.h>
87#include <kern/machine.h>
88#include <kern/misc_protos.h>
89#include <kern/sched.h>
90#include <kern/thread.h>
91#include <kern/xpr.h>
92#include <kern/kalloc.h>
93
94#include <machine/vm_tuning.h>
95#include <machine/commpage.h>
96
97#include <vm/pmap.h>
98#include <vm/vm_compressor_pager.h>
99#include <vm/vm_fault.h>
100#include <vm/vm_map.h>
101#include <vm/vm_object.h>
102#include <vm/vm_page.h>
103#include <vm/vm_pageout.h>
104#include <vm/vm_protos.h> /* must be last */
105#include <vm/memory_object.h>
106#include <vm/vm_purgeable_internal.h>
107#include <vm/vm_shared_region.h>
108#include <vm/vm_compressor.h>
109
110/*
111 * ENCRYPTED SWAP:
112 */
113#include <libkern/crypto/aes.h>
114extern u_int32_t random(void);	/* from <libkern/libkern.h> */
115
116extern int cs_debug;
117
118#if UPL_DEBUG
119#include <libkern/OSDebug.h>
120#endif
121
122extern vm_pressure_level_t memorystatus_vm_pressure_level;
123int memorystatus_purge_on_warning = 2;
124int memorystatus_purge_on_urgent = 5;
125int memorystatus_purge_on_critical = 8;
126
127#if VM_PRESSURE_EVENTS
128void vm_pressure_response(void);
129boolean_t vm_pressure_thread_running = FALSE;
130extern void consider_vm_pressure_events(void);
131#endif
132boolean_t	vm_pressure_changed = FALSE;
133
134#ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE   /* maximum iterations of the active queue to move pages to inactive */
135#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  100
136#endif
137
138#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
139#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
140#endif
141
142#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
143#define VM_PAGEOUT_DEADLOCK_RELIEF 100	/* number of pages to move to break deadlock */
144#endif
145
146#ifndef VM_PAGEOUT_INACTIVE_RELIEF
147#define VM_PAGEOUT_INACTIVE_RELIEF 50	/* minimum number of pages to move to the inactive q */
148#endif
149
150#ifndef	VM_PAGE_LAUNDRY_MAX
151#define	VM_PAGE_LAUNDRY_MAX	128UL	/* maximum pageouts on a given pageout queue */
#endif	/* VM_PAGE_LAUNDRY_MAX */
153
154#ifndef	VM_PAGEOUT_BURST_WAIT
155#define	VM_PAGEOUT_BURST_WAIT	30	/* milliseconds */
156#endif	/* VM_PAGEOUT_BURST_WAIT */
157
158#ifndef	VM_PAGEOUT_EMPTY_WAIT
159#define VM_PAGEOUT_EMPTY_WAIT	200	/* milliseconds */
160#endif	/* VM_PAGEOUT_EMPTY_WAIT */
161
162#ifndef	VM_PAGEOUT_DEADLOCK_WAIT
163#define VM_PAGEOUT_DEADLOCK_WAIT	300	/* milliseconds */
164#endif	/* VM_PAGEOUT_DEADLOCK_WAIT */
165
166#ifndef	VM_PAGEOUT_IDLE_WAIT
167#define VM_PAGEOUT_IDLE_WAIT	10	/* milliseconds */
168#endif	/* VM_PAGEOUT_IDLE_WAIT */
169
170#ifndef	VM_PAGEOUT_SWAP_WAIT
171#define VM_PAGEOUT_SWAP_WAIT	50	/* milliseconds */
172#endif	/* VM_PAGEOUT_SWAP_WAIT */
173
174#ifndef VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED
175#define VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED		1000	/* maximum pages considered before we issue a pressure event */
176#endif /* VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED */
177
178#ifndef VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS
179#define VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS		5	/* seconds */
180#endif /* VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS */
181
182unsigned int	vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
183unsigned int	vm_page_speculative_percentage = 5;
184
185#ifndef VM_PAGE_SPECULATIVE_TARGET
186#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_page_speculative_percentage))
187#endif /* VM_PAGE_SPECULATIVE_TARGET */
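
/*
 * Illustrative sketch, not part of the build: how VM_PAGE_SPECULATIVE_TARGET
 * behaves under integer division.  The page counts below are made-up example
 * values; vm_page_speculative_percentage defaults to 5 and is clamped to the
 * range [1, 50] in vm_pageout_scan().
 */
#if 0
static unsigned int
speculative_target_example(unsigned int total, unsigned int percentage)
{
	/* mirrors VM_PAGE_SPECULATIVE_TARGET: (total) * 1 / (100 / percentage) */
	return total * 1 / (100 / percentage);
}
/*
 * speculative_target_example(100000, 5)  == 100000 / 20 == 5000   (5% of total)
 * speculative_target_example(100000, 3)  == 100000 / 33 == 3030   (~3%; 100 / 3 truncates to 33)
 * speculative_target_example(100000, 50) == 100000 / 2  == 50000  (upper bound of the clamp)
 */
#endif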
188
189
190#ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
191#define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
192#endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
193
194
195/*
196 *	To obtain a reasonable LRU approximation, the inactive queue
197 *	needs to be large enough to give pages on it a chance to be
198 *	referenced a second time.  This macro defines the fraction
199 *	of active+inactive pages that should be inactive.
200 *	The pageout daemon uses it to update vm_page_inactive_target.
201 *
202 *	If vm_page_free_count falls below vm_page_free_target and
203 *	vm_page_inactive_count is below vm_page_inactive_target,
204 *	then the pageout daemon starts running.
205 */
206
207#ifndef	VM_PAGE_INACTIVE_TARGET
208#define	VM_PAGE_INACTIVE_TARGET(avail)	((avail) * 1 / 2)
209#endif	/* VM_PAGE_INACTIVE_TARGET */
210
211/*
212 *	Once the pageout daemon starts running, it keeps going
213 *	until vm_page_free_count meets or exceeds vm_page_free_target.
214 */
215
216#ifndef	VM_PAGE_FREE_TARGET
217#define	VM_PAGE_FREE_TARGET(free)	(15 + (free) / 80)
218#endif	/* VM_PAGE_FREE_TARGET */
219
220
221/*
222 *	The pageout daemon always starts running once vm_page_free_count
223 *	falls below vm_page_free_min.
224 */
225
226#ifndef	VM_PAGE_FREE_MIN
227#define	VM_PAGE_FREE_MIN(free)		(10 + (free) / 100)
228#endif	/* VM_PAGE_FREE_MIN */
229
230#define VM_PAGE_FREE_RESERVED_LIMIT	100
231#define VM_PAGE_FREE_MIN_LIMIT		1500
232#define VM_PAGE_FREE_TARGET_LIMIT	2000
233
234
235/*
236 *	When vm_page_free_count falls below vm_page_free_reserved,
237 *	only vm-privileged threads can allocate pages.  vm-privilege
238 *	allows the pageout daemon and default pager (and any other
239 *	associated threads needed for default pageout) to continue
240 *	operation by dipping into the reserved pool of pages.
241 */
242
243#ifndef	VM_PAGE_FREE_RESERVED
244#define	VM_PAGE_FREE_RESERVED(n)	\
245	((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
246#endif	/* VM_PAGE_FREE_RESERVED */
247
248/*
249 *	When we dequeue pages from the inactive list, they are
 *	reactivated (i.e., put back on the active queue) if referenced.
251 *	However, it is possible to starve the free list if other
252 *	processors are referencing pages faster than we can turn off
253 *	the referenced bit.  So we limit the number of reactivations
254 *	we will make per call of vm_pageout_scan().
255 */
256#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
257#ifndef	VM_PAGE_REACTIVATE_LIMIT
258#define	VM_PAGE_REACTIVATE_LIMIT(avail)	(MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
259#endif	/* VM_PAGE_REACTIVATE_LIMIT */
260#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM	100
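
/*
 * Illustrative sketch, not part of the build: the tuning macros above,
 * evaluated for one hypothetical set of page counts.  The counts are
 * invented for the example; the real inputs are derived at boot and
 * recomputed by vm_pageout_scan().
 */
#if 0
static void
pageout_threshold_example(void)
{
	unsigned int avail = 200000;	/* hypothetical active + inactive + speculative pages */
	unsigned int free  = 80000;	/* hypothetical free-pool size used to size the targets */

	unsigned int inactive_target  = VM_PAGE_INACTIVE_TARGET(avail);	/* 200000 / 2        == 100000 */
	unsigned int free_target      = VM_PAGE_FREE_TARGET(free);		/* 15 + 80000 / 80   == 1015   */
	unsigned int free_min         = VM_PAGE_FREE_MIN(free);		/* 10 + 80000 / 100  == 810    */
	unsigned int free_reserved    = VM_PAGE_FREE_RESERVED(100);		/* 6 * 128 + 100     == 868    */
	unsigned int reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(avail);	/* MAX(10000, 20000) == 20000  */

	/*
	 * Per the comments above: the daemon starts when vm_page_free_count
	 * drops below free_target while vm_page_inactive_count is below
	 * inactive_target (and always below free_min), keeps running until
	 * vm_page_free_count reaches free_target again, and once the free
	 * count falls under free_reserved only vm-privileged threads may
	 * allocate pages.
	 */
	(void)inactive_target; (void)free_target; (void)free_min;
	(void)free_reserved; (void)reactivate_limit;
}
#endif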
261
262
263extern boolean_t hibernate_cleaning_in_progress;
264
265/*
266 * Exported variable used to broadcast the activation of the pageout scan
267 * Working Set uses this to throttle its use of pmap removes.  In this
268 * way, code which runs within memory in an uncontested context does
269 * not keep encountering soft faults.
270 */
271
272unsigned int	vm_pageout_scan_event_counter = 0;
273
274/*
275 * Forward declarations for internal routines.
276 */
277struct cq {
278	struct vm_pageout_queue *q;
279	void			*current_chead;
280	char			*scratch_buf;
281};
282
283
284#if VM_PRESSURE_EVENTS
285void vm_pressure_thread(void);
286#endif
287static void vm_pageout_garbage_collect(int);
288static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
289static void vm_pageout_iothread_external(void);
290static void vm_pageout_iothread_internal(struct cq *cq);
291static void vm_pageout_adjust_io_throttles(struct vm_pageout_queue *, struct vm_pageout_queue *, boolean_t);
292
293extern void vm_pageout_continue(void);
294extern void vm_pageout_scan(void);
295
296static thread_t	vm_pageout_external_iothread = THREAD_NULL;
297static thread_t	vm_pageout_internal_iothread = THREAD_NULL;
298
299unsigned int vm_pageout_reserved_internal = 0;
300unsigned int vm_pageout_reserved_really = 0;
301
302unsigned int vm_pageout_swap_wait = 0;
303unsigned int vm_pageout_idle_wait = 0;		/* milliseconds */
304unsigned int vm_pageout_empty_wait = 0;		/* milliseconds */
305unsigned int vm_pageout_burst_wait = 0;		/* milliseconds */
306unsigned int vm_pageout_deadlock_wait = 0;	/* milliseconds */
307unsigned int vm_pageout_deadlock_relief = 0;
308unsigned int vm_pageout_inactive_relief = 0;
309unsigned int vm_pageout_burst_active_throttle = 0;
310unsigned int vm_pageout_burst_inactive_throttle = 0;
311
312int	vm_upl_wait_for_pages = 0;
313
314
315/*
316 *	These variables record the pageout daemon's actions:
317 *	how many pages it looks at and what happens to those pages.
318 *	No locking needed because only one thread modifies the variables.
319 */
320
321unsigned int vm_pageout_active = 0;		/* debugging */
322unsigned int vm_pageout_active_busy = 0;	/* debugging */
323unsigned int vm_pageout_inactive = 0;		/* debugging */
324unsigned int vm_pageout_inactive_throttled = 0;	/* debugging */
325unsigned int vm_pageout_inactive_forced = 0;	/* debugging */
326unsigned int vm_pageout_inactive_nolock = 0;	/* debugging */
327unsigned int vm_pageout_inactive_avoid = 0;	/* debugging */
328unsigned int vm_pageout_inactive_busy = 0;	/* debugging */
329unsigned int vm_pageout_inactive_error = 0;	/* debugging */
330unsigned int vm_pageout_inactive_absent = 0;	/* debugging */
331unsigned int vm_pageout_inactive_notalive = 0;	/* debugging */
332unsigned int vm_pageout_inactive_used = 0;	/* debugging */
333unsigned int vm_pageout_cache_evicted = 0;	/* debugging */
334unsigned int vm_pageout_inactive_clean = 0;	/* debugging */
335unsigned int vm_pageout_speculative_clean = 0;	/* debugging */
336
337unsigned int vm_pageout_freed_from_cleaned = 0;
338unsigned int vm_pageout_freed_from_speculative = 0;
339unsigned int vm_pageout_freed_from_inactive_clean = 0;
340
341unsigned int vm_pageout_enqueued_cleaned_from_inactive_clean = 0;
342unsigned int vm_pageout_enqueued_cleaned_from_inactive_dirty = 0;
343
344unsigned int vm_pageout_cleaned_reclaimed = 0;		/* debugging; how many cleaned pages are reclaimed by the pageout scan */
345unsigned int vm_pageout_cleaned_reactivated = 0;	/* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */
346unsigned int vm_pageout_cleaned_reference_reactivated = 0;
347unsigned int vm_pageout_cleaned_volatile_reactivated = 0;
348unsigned int vm_pageout_cleaned_fault_reactivated = 0;
349unsigned int vm_pageout_cleaned_commit_reactivated = 0;	/* debugging; how many cleaned pages are found to be referenced on commit (and are therefore reactivated) */
350unsigned int vm_pageout_cleaned_busy = 0;
351unsigned int vm_pageout_cleaned_nolock = 0;
352
353unsigned int vm_pageout_inactive_dirty_internal = 0;	/* debugging */
354unsigned int vm_pageout_inactive_dirty_external = 0;	/* debugging */
355unsigned int vm_pageout_inactive_deactivated = 0;	/* debugging */
356unsigned int vm_pageout_inactive_anonymous = 0;	/* debugging */
357unsigned int vm_pageout_dirty_no_pager = 0;	/* debugging */
358unsigned int vm_pageout_purged_objects = 0;	/* debugging */
359unsigned int vm_stat_discard = 0;		/* debugging */
360unsigned int vm_stat_discard_sent = 0;		/* debugging */
361unsigned int vm_stat_discard_failure = 0;	/* debugging */
362unsigned int vm_stat_discard_throttle = 0;	/* debugging */
363unsigned int vm_pageout_reactivation_limit_exceeded = 0;	/* debugging */
364unsigned int vm_pageout_catch_ups = 0;				/* debugging */
365unsigned int vm_pageout_inactive_force_reclaim = 0;	/* debugging */
366
367unsigned int vm_pageout_scan_reclaimed_throttled = 0;
368unsigned int vm_pageout_scan_active_throttled = 0;
369unsigned int vm_pageout_scan_inactive_throttled_internal = 0;
370unsigned int vm_pageout_scan_inactive_throttled_external = 0;
371unsigned int vm_pageout_scan_throttle = 0;			/* debugging */
372unsigned int vm_pageout_scan_burst_throttle = 0;		/* debugging */
373unsigned int vm_pageout_scan_empty_throttle = 0;		/* debugging */
374unsigned int vm_pageout_scan_swap_throttle = 0;		/* debugging */
375unsigned int vm_pageout_scan_deadlock_detected = 0;		/* debugging */
376unsigned int vm_pageout_scan_active_throttle_success = 0;	/* debugging */
377unsigned int vm_pageout_scan_inactive_throttle_success = 0;	/* debugging */
378unsigned int vm_pageout_inactive_external_forced_jetsam_count = 0;	/* debugging */
379unsigned int vm_page_speculative_count_drifts = 0;
380unsigned int vm_page_speculative_count_drift_max = 0;
381
382
383/*
384 * Backing store throttle when BS is exhausted
385 */
386unsigned int	vm_backing_store_low = 0;
387
388unsigned int vm_pageout_out_of_line  = 0;
389unsigned int vm_pageout_in_place  = 0;
390
391unsigned int vm_page_steal_pageout_page = 0;
392
393/*
394 * ENCRYPTED SWAP:
395 * counters and statistics...
396 */
397unsigned long vm_page_decrypt_counter = 0;
398unsigned long vm_page_decrypt_for_upl_counter = 0;
399unsigned long vm_page_encrypt_counter = 0;
400unsigned long vm_page_encrypt_abort_counter = 0;
401unsigned long vm_page_encrypt_already_encrypted_counter = 0;
402boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
403
404struct	vm_pageout_queue vm_pageout_queue_internal;
405struct	vm_pageout_queue vm_pageout_queue_external;
406
407unsigned int vm_page_speculative_target = 0;
408
409vm_object_t 	vm_pageout_scan_wants_object = VM_OBJECT_NULL;
410
411boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;
412
413#if DEVELOPMENT || DEBUG
414unsigned long vm_cs_validated_resets = 0;
415#endif
416
417int	vm_debug_events	= 0;
418
419#if CONFIG_MEMORYSTATUS
420#if !CONFIG_JETSAM
421extern boolean_t memorystatus_idle_exit_from_VM(void);
422#endif
423extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
424extern void memorystatus_on_pageout_scan_end(void);
425#endif
426
427boolean_t	vm_page_compressions_failing = FALSE;
428
429/*
430 *	Routine:	vm_backing_store_disable
431 *	Purpose:
432 *		Suspend non-privileged threads wishing to extend
433 *		backing store when we are low on backing store
434 *		(Synchronized by caller)
435 */
436void
437vm_backing_store_disable(
438	boolean_t	disable)
439{
440	if(disable) {
441		vm_backing_store_low = 1;
442	} else {
443		if(vm_backing_store_low) {
444			vm_backing_store_low = 0;
445			thread_wakeup((event_t) &vm_backing_store_low);
446		}
447	}
448}
449
450
451#if MACH_CLUSTER_STATS
452unsigned long vm_pageout_cluster_dirtied = 0;
453unsigned long vm_pageout_cluster_cleaned = 0;
454unsigned long vm_pageout_cluster_collisions = 0;
455unsigned long vm_pageout_cluster_clusters = 0;
456unsigned long vm_pageout_cluster_conversions = 0;
457unsigned long vm_pageout_target_collisions = 0;
458unsigned long vm_pageout_target_page_dirtied = 0;
459unsigned long vm_pageout_target_page_freed = 0;
460#define CLUSTER_STAT(clause)	clause
461#else	/* MACH_CLUSTER_STATS */
462#define CLUSTER_STAT(clause)
463#endif	/* MACH_CLUSTER_STATS */
464
465/*
466 *	Routine:	vm_pageout_object_terminate
467 *	Purpose:
468 *		Destroy the pageout_object, and perform all of the
469 *		required cleanup actions.
470 *
471 *	In/Out conditions:
472 *		The object must be locked, and will be returned locked.
473 */
474void
475vm_pageout_object_terminate(
476	vm_object_t	object)
477{
478	vm_object_t	shadow_object;
479
480	/*
481	 * Deal with the deallocation (last reference) of a pageout object
482	 * (used for cleaning-in-place) by dropping the paging references/
483	 * freeing pages in the original object.
484	 */
485
486	assert(object->pageout);
487	shadow_object = object->shadow;
488	vm_object_lock(shadow_object);
489
490	while (!queue_empty(&object->memq)) {
491		vm_page_t 		p, m;
492		vm_object_offset_t	offset;
493
494		p = (vm_page_t) queue_first(&object->memq);
495
496		assert(p->private);
497		assert(p->pageout);
498		p->pageout = FALSE;
499		assert(!p->cleaning);
500		assert(!p->laundry);
501
502		offset = p->offset;
503		VM_PAGE_FREE(p);
504		p = VM_PAGE_NULL;
505
506		m = vm_page_lookup(shadow_object,
507			offset + object->vo_shadow_offset);
508
509		if(m == VM_PAGE_NULL)
510			continue;
511
512		assert((m->dirty) || (m->precious) ||
513				(m->busy && m->cleaning));
514
515		/*
516		 * Handle the trusted pager throttle.
517		 * Also decrement the burst throttle (if external).
518		 */
519		vm_page_lock_queues();
520		if (m->pageout_queue)
521			vm_pageout_throttle_up(m);
522
523		/*
524		 * Handle the "target" page(s). These pages are to be freed if
525		 * successfully cleaned. Target pages are always busy, and are
526		 * wired exactly once. The initial target pages are not mapped,
527		 * (so cannot be referenced or modified) but converted target
528		 * pages may have been modified between the selection as an
529		 * adjacent page and conversion to a target.
530		 */
531		if (m->pageout) {
532			assert(m->busy);
533			assert(m->wire_count == 1);
534			m->cleaning = FALSE;
535			m->encrypted_cleaning = FALSE;
536			m->pageout = FALSE;
537#if MACH_CLUSTER_STATS
538			if (m->wanted) vm_pageout_target_collisions++;
539#endif
540			/*
541			 * Revoke all access to the page. Since the object is
542			 * locked, and the page is busy, this prevents the page
543			 * from being dirtied after the pmap_disconnect() call
544			 * returns.
545			 *
			 * Since the page is left "dirty" but "not modified", we
547			 * can detect whether the page was redirtied during
548			 * pageout by checking the modify state.
549			 */
550			if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) {
551				SET_PAGE_DIRTY(m, FALSE);
552			} else {
553				m->dirty = FALSE;
554			}
555
556			if (m->dirty) {
557				CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
558				vm_page_unwire(m, TRUE);	/* reactivates */
559				VM_STAT_INCR(reactivations);
560				PAGE_WAKEUP_DONE(m);
561			} else {
562				CLUSTER_STAT(vm_pageout_target_page_freed++;)
563				vm_page_free(m);/* clears busy, etc. */
564			}
565			vm_page_unlock_queues();
566			continue;
567		}
568		/*
569		 * Handle the "adjacent" pages. These pages were cleaned in
570		 * place, and should be left alone.
571		 * If prep_pin_count is nonzero, then someone is using the
572		 * page, so make it active.
573		 */
574		if (!m->active && !m->inactive && !m->throttled && !m->private) {
575			if (m->reference)
576				vm_page_activate(m);
577			else
578				vm_page_deactivate(m);
579		}
580		if (m->overwriting) {
581			/*
582			 * the (COPY_OUT_FROM == FALSE) request_page_list case
583			 */
584			if (m->busy) {
585				/*
586				 * We do not re-set m->dirty !
587				 * The page was busy so no extraneous activity
588				 * could have occurred. COPY_INTO is a read into the
589				 * new pages. CLEAN_IN_PLACE does actually write
590				 * out the pages but handling outside of this code
591				 * will take care of resetting dirty. We clear the
592				 * modify however for the Programmed I/O case.
593				 */
594				pmap_clear_modify(m->phys_page);
595
596				m->busy = FALSE;
597				m->absent = FALSE;
598			} else {
599				/*
600				 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
601				 * Occurs when the original page was wired
602				 * at the time of the list request
603				 */
604				 assert(VM_PAGE_WIRED(m));
605				 vm_page_unwire(m, TRUE);	/* reactivates */
606			}
607			m->overwriting = FALSE;
608		} else {
609			/*
610			 * Set the dirty state according to whether or not the page was
611			 * modified during the pageout. Note that we purposefully do
612			 * NOT call pmap_clear_modify since the page is still mapped.
			 * If the page were to be dirtied between the two calls,
			 * this fact would be lost. This code is only necessary to
615			 * maintain statistics, since the pmap module is always
616			 * consulted if m->dirty is false.
617			 */
618#if MACH_CLUSTER_STATS
619			m->dirty = pmap_is_modified(m->phys_page);
620
621			if (m->dirty)	vm_pageout_cluster_dirtied++;
622			else		vm_pageout_cluster_cleaned++;
623			if (m->wanted)	vm_pageout_cluster_collisions++;
624#else
625			m->dirty = FALSE;
626#endif
627		}
628		if (m->encrypted_cleaning == TRUE) {
629			m->encrypted_cleaning = FALSE;
630			m->busy = FALSE;
631		}
632		m->cleaning = FALSE;
633
634		/*
		 * Wake up any thread waiting for the page to finish cleaning.
636		 */
637		PAGE_WAKEUP(m);
638		vm_page_unlock_queues();
639	}
640	/*
641	 * Account for the paging reference taken in vm_paging_object_allocate.
642	 */
643	vm_object_activity_end(shadow_object);
644	vm_object_unlock(shadow_object);
645
646	assert(object->ref_count == 0);
647	assert(object->paging_in_progress == 0);
648	assert(object->activity_in_progress == 0);
649	assert(object->resident_page_count == 0);
650	return;
651}
652
653/*
654 * Routine:	vm_pageclean_setup
655 *
 * Purpose:	set up a page to be cleaned (made non-dirty), but not
657 *		necessarily flushed from the VM page cache.
658 *		This is accomplished by cleaning in place.
659 *
660 *		The page must not be busy, and new_object
661 *		must be locked.
662 *
663 */
664void
665vm_pageclean_setup(
666	vm_page_t		m,
667	vm_page_t		new_m,
668	vm_object_t		new_object,
669	vm_object_offset_t	new_offset)
670{
671	assert(!m->busy);
672#if 0
673	assert(!m->cleaning);
674#endif
675
676	XPR(XPR_VM_PAGEOUT,
677    "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
678		m->object, m->offset, m,
679		new_m, new_offset);
680
681	pmap_clear_modify(m->phys_page);
682
683	/*
684	 * Mark original page as cleaning in place.
685	 */
686	m->cleaning = TRUE;
687	SET_PAGE_DIRTY(m, FALSE);
688	m->precious = FALSE;
689
690	/*
691	 * Convert the fictitious page to a private shadow of
692	 * the real page.
693	 */
694	assert(new_m->fictitious);
695	assert(new_m->phys_page == vm_page_fictitious_addr);
696	new_m->fictitious = FALSE;
697	new_m->private = TRUE;
698	new_m->pageout = TRUE;
699	new_m->phys_page = m->phys_page;
700
701	vm_page_lockspin_queues();
702	vm_page_wire(new_m);
703	vm_page_unlock_queues();
704
705	vm_page_insert(new_m, new_object, new_offset);
706	assert(!new_m->wanted);
707	new_m->busy = FALSE;
708}
709
710/*
711 *	Routine:	vm_pageout_initialize_page
712 *	Purpose:
713 *		Causes the specified page to be initialized in
714 *		the appropriate memory object. This routine is used to push
715 *		pages into a copy-object when they are modified in the
716 *		permanent object.
717 *
718 *		The page is moved to a temporary object and paged out.
719 *
720 *	In/out conditions:
721 *		The page in question must not be on any pageout queues.
722 *		The object to which it belongs must be locked.
723 *		The page must be busy, but not hold a paging reference.
724 *
725 *	Implementation:
726 *		Move this page to a completely new object.
727 */
728void
729vm_pageout_initialize_page(
730	vm_page_t	m)
731{
732	vm_object_t		object;
733	vm_object_offset_t	paging_offset;
734	memory_object_t		pager;
735
736	XPR(XPR_VM_PAGEOUT,
737		"vm_pageout_initialize_page, page 0x%X\n",
738		m, 0, 0, 0, 0);
739	assert(m->busy);
740
741	/*
742	 *	Verify that we really want to clean this page
743	 */
744	assert(!m->absent);
745	assert(!m->error);
746	assert(m->dirty);
747
748	/*
749	 *	Create a paging reference to let us play with the object.
750	 */
751	object = m->object;
752	paging_offset = m->offset + object->paging_offset;
753
754	if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
755		VM_PAGE_FREE(m);
756		panic("reservation without pageout?"); /* alan */
757		vm_object_unlock(object);
758
759		return;
760	}
761
762	/*
763	 * If there's no pager, then we can't clean the page.  This should
764	 * never happen since this should be a copy object and therefore not
765	 * an external object, so the pager should always be there.
766	 */
767
768	pager = object->pager;
769
770	if (pager == MEMORY_OBJECT_NULL) {
771		VM_PAGE_FREE(m);
772		panic("missing pager for copy object");
773		return;
774	}
775
776	/*
777	 * set the page for future call to vm_fault_list_request
778	 */
779	pmap_clear_modify(m->phys_page);
780	SET_PAGE_DIRTY(m, FALSE);
781	m->pageout = TRUE;
782
783	/*
784	 * keep the object from collapsing or terminating
785	 */
786	vm_object_paging_begin(object);
787	vm_object_unlock(object);
788
789	/*
790	 *	Write the data to its pager.
791	 *	Note that the data is passed by naming the new object,
792	 *	not a virtual address; the pager interface has been
793	 *	manipulated to use the "internal memory" data type.
794	 *	[The object reference from its allocation is donated
795	 *	to the eventual recipient.]
796	 */
797	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
798
799	vm_object_lock(object);
800	vm_object_paging_end(object);
801}
802
803#if	MACH_CLUSTER_STATS
804#define MAXCLUSTERPAGES	16
805struct {
806	unsigned long pages_in_cluster;
807	unsigned long pages_at_higher_offsets;
808	unsigned long pages_at_lower_offsets;
809} cluster_stats[MAXCLUSTERPAGES];
810#endif	/* MACH_CLUSTER_STATS */
811
812
813/*
814 * vm_pageout_cluster:
815 *
816 * Given a page, queue it to the appropriate I/O thread,
817 * which will page it out and attempt to clean adjacent pages
818 * in the same operation.
819 *
820 * The object and queues must be locked. We will take a
821 * paging reference to prevent deallocation or collapse when we
822 * release the object lock back at the call site.  The I/O thread
823 * is responsible for consuming this reference
824 *
825 * The page must not be on any pageout queue.
826 */
827
828void
829vm_pageout_cluster(vm_page_t m, boolean_t pageout)
830{
831	vm_object_t	object = m->object;
832        struct		vm_pageout_queue *q;
833
834
835	XPR(XPR_VM_PAGEOUT,
836		"vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
837		object, m->offset, m, 0, 0);
838
839	VM_PAGE_CHECK(m);
840#if DEBUG
841	lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
842#endif
843	vm_object_lock_assert_exclusive(object);
844
845	/*
846	 * Only a certain kind of page is appreciated here.
847	 */
848	assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
849	assert(!m->cleaning && !m->pageout && !m->laundry);
850#ifndef CONFIG_FREEZE
851	assert(!m->inactive && !m->active);
852	assert(!m->throttled);
853#endif
854
855	/*
856	 * protect the object from collapse or termination
857	 */
858	vm_object_activity_begin(object);
859
860	m->pageout = pageout;
861
862	if (object->internal == TRUE) {
863		if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)
864			m->busy = TRUE;
865
866	        q = &vm_pageout_queue_internal;
867	} else
868	        q = &vm_pageout_queue_external;
869
870	/*
871	 * pgo_laundry count is tied to the laundry bit
872	 */
873	m->laundry = TRUE;
874	q->pgo_laundry++;
875
876	m->pageout_queue = TRUE;
877	queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
878
879	if (q->pgo_idle == TRUE) {
880		q->pgo_idle = FALSE;
881		thread_wakeup((event_t) &q->pgo_pending);
882	}
883	VM_PAGE_CHECK(m);
884}
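
/*
 * Illustrative caller sketch, not part of the build: the locking contract
 * described in the block comment above vm_pageout_cluster().  "some_page"
 * is a placeholder; real callers such as vm_pageout_scan() pick the page
 * off the paging queues.
 */
#if 0
static void
vm_pageout_cluster_usage_example(vm_page_t some_page)
{
	vm_object_t object = some_page->object;

	vm_object_lock(object);		/* exclusive object lock ... */
	vm_page_lockspin_queues();	/* ... then the page queues lock */

	/*
	 * The page must be dirty or precious, not wired, not already in
	 * the laundry, and not on any pageout queue (asserted above).
	 */
	vm_pageout_cluster(some_page, TRUE);

	/*
	 * vm_pageout_cluster() took a paging reference on the object and
	 * the I/O thread consumes it, so the caller only drops its locks.
	 */
	vm_page_unlock_queues();
	vm_object_unlock(object);
}
#endif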
885
886
887unsigned long vm_pageout_throttle_up_count = 0;
888
889/*
890 * A page is back from laundry or we are stealing it back from
891 * the laundering state.  See if there are some pages waiting to
892 * go to laundry and if we can let some of them go now.
893 *
894 * Object and page queues must be locked.
895 */
896void
897vm_pageout_throttle_up(
898       vm_page_t       m)
899{
900       struct vm_pageout_queue *q;
901
902       assert(m->object != VM_OBJECT_NULL);
903       assert(m->object != kernel_object);
904
905#if DEBUG
906       lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
907       vm_object_lock_assert_exclusive(m->object);
908#endif
909
910       vm_pageout_throttle_up_count++;
911
912       if (m->object->internal == TRUE)
913               q = &vm_pageout_queue_internal;
914       else
915               q = &vm_pageout_queue_external;
916
917       if (m->pageout_queue == TRUE) {
918
919	       queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
920	       m->pageout_queue = FALSE;
921
922	       m->pageq.next = NULL;
923	       m->pageq.prev = NULL;
924
925	       vm_object_activity_end(m->object);
926       }
927       if (m->laundry == TRUE) {
928
929	       m->laundry = FALSE;
930	       q->pgo_laundry--;
931
932	       if (q->pgo_throttled == TRUE) {
933		       q->pgo_throttled = FALSE;
934                       thread_wakeup((event_t) &q->pgo_laundry);
935               }
936	       if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
937		       q->pgo_draining = FALSE;
938		       thread_wakeup((event_t) (&q->pgo_laundry+1));
939	       }
940	}
941}
942
943
944static void
945vm_pageout_throttle_up_batch(
946	struct vm_pageout_queue *q,
947	int		batch_cnt)
948{
949#if DEBUG
950       lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
951#endif
952
953       vm_pageout_throttle_up_count += batch_cnt;
954
955       q->pgo_laundry -= batch_cnt;
956
957       if (q->pgo_throttled == TRUE) {
958	       q->pgo_throttled = FALSE;
959	       thread_wakeup((event_t) &q->pgo_laundry);
960       }
961       if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
962	       q->pgo_draining = FALSE;
963	       thread_wakeup((event_t) (&q->pgo_laundry+1));
964       }
965}
966
967
968
969/*
970 * VM memory pressure monitoring.
971 *
972 * vm_pageout_scan() keeps track of the number of pages it considers and
973 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
974 *
975 * compute_memory_pressure() is called every second from compute_averages()
976 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
 * of reclaimed pages in a new vm_pageout_stat[] bucket.
978 *
979 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
980 * The caller provides the number of seconds ("nsecs") worth of statistics
981 * it wants, up to 30 seconds.
982 * It computes the number of pages reclaimed in the past "nsecs" seconds and
983 * also returns the number of pages the system still needs to reclaim at this
984 * moment in time.
985 */
986#define VM_PAGEOUT_STAT_SIZE	31
987struct vm_pageout_stat {
988	unsigned int considered;
989	unsigned int reclaimed;
990} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0}, };
991unsigned int vm_pageout_stat_now = 0;
992unsigned int vm_memory_pressure = 0;
993
994#define VM_PAGEOUT_STAT_BEFORE(i) \
995	(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
996#define VM_PAGEOUT_STAT_AFTER(i) \
997	(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
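
/*
 * Illustrative sketch, not part of the build: the 31-entry array above is a
 * ring buffer with one bucket per second, and VM_PAGEOUT_STAT_BEFORE()/
 * VM_PAGEOUT_STAT_AFTER() wrap the index.  Walking "before" from
 * vm_pageout_stat_now visits the most recent seconds first, which is how
 * mach_vm_pressure_monitor() below totals the pages reclaimed over the last
 * "nsecs" seconds (the real routine also retries if vm_pageout_stat_now
 * advances while it is walking).
 */
#if 0
static unsigned int
pages_reclaimed_last_nsecs_example(unsigned int nsecs)
{
	unsigned int i = vm_pageout_stat_now;
	unsigned int total = 0;

	while (nsecs-- != 0) {
		i = VM_PAGEOUT_STAT_BEFORE(i);	/* step back one second, wrapping 0 -> 30 */
		if (i == vm_pageout_stat_now)	/* don't wrap into the bucket being filled */
			break;
		total += vm_pageout_stats[i].reclaimed;
	}
	return total;
}
#endif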
998
999#if VM_PAGE_BUCKETS_CHECK
1000int vm_page_buckets_check_interval = 10; /* in seconds */
1001#endif /* VM_PAGE_BUCKETS_CHECK */
1002
1003/*
1004 * Called from compute_averages().
1005 */
1006void
1007compute_memory_pressure(
1008	__unused void *arg)
1009{
1010	unsigned int vm_pageout_next;
1011
1012#if VM_PAGE_BUCKETS_CHECK
1013	/* check the consistency of VM page buckets at regular interval */
1014	static int counter = 0;
1015	if ((++counter % vm_page_buckets_check_interval) == 0) {
1016		vm_page_buckets_check();
1017	}
1018#endif /* VM_PAGE_BUCKETS_CHECK */
1019
1020	vm_memory_pressure =
1021		vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed;
1022
1023	commpage_set_memory_pressure( vm_memory_pressure );
1024
1025	/* move "now" forward */
1026	vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
1027	vm_pageout_stats[vm_pageout_next].considered = 0;
1028	vm_pageout_stats[vm_pageout_next].reclaimed = 0;
1029	vm_pageout_stat_now = vm_pageout_next;
1030}
1031
1032
1033/*
1034 * IMPORTANT
1035 * mach_vm_ctl_page_free_wanted() is called indirectly, via
1036 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
1037 * it must be safe in the restricted stackshot context. Locks and/or
1038 * blocking are not allowable.
1039 */
1040unsigned int
1041mach_vm_ctl_page_free_wanted(void)
1042{
1043	unsigned int page_free_target, page_free_count, page_free_wanted;
1044
1045	page_free_target = vm_page_free_target;
1046	page_free_count = vm_page_free_count;
1047	if (page_free_target > page_free_count) {
1048		page_free_wanted = page_free_target - page_free_count;
1049	} else {
1050		page_free_wanted = 0;
1051	}
1052
1053	return page_free_wanted;
1054}
1055
1056
1057/*
1058 * IMPORTANT:
1059 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1060 * wait_for_pressure FALSE, so that code path must remain safe in the
 * restricted stackshot context.  No blocking or locks are allowed
 * on that code path.
1063 */
1064
1065kern_return_t
1066mach_vm_pressure_monitor(
1067	boolean_t	wait_for_pressure,
1068	unsigned int	nsecs_monitored,
1069	unsigned int	*pages_reclaimed_p,
1070	unsigned int	*pages_wanted_p)
1071{
1072	wait_result_t	wr;
1073	unsigned int	vm_pageout_then, vm_pageout_now;
1074	unsigned int	pages_reclaimed;
1075
1076	/*
1077	 * We don't take the vm_page_queue_lock here because we don't want
1078	 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1079	 * thread when it's trying to reclaim memory.  We don't need fully
1080	 * accurate monitoring anyway...
1081	 */
1082
1083	if (wait_for_pressure) {
1084		/* wait until there's memory pressure */
1085		while (vm_page_free_count >= vm_page_free_target) {
1086			wr = assert_wait((event_t) &vm_page_free_wanted,
1087					 THREAD_INTERRUPTIBLE);
1088			if (wr == THREAD_WAITING) {
1089				wr = thread_block(THREAD_CONTINUE_NULL);
1090			}
1091			if (wr == THREAD_INTERRUPTED) {
1092				return KERN_ABORTED;
1093			}
1094			if (wr == THREAD_AWAKENED) {
1095				/*
1096				 * The memory pressure might have already
1097				 * been relieved but let's not block again
1098				 * and let's report that there was memory
1099				 * pressure at some point.
1100				 */
1101				break;
1102			}
1103		}
1104	}
1105
1106	/* provide the number of pages the system wants to reclaim */
1107	if (pages_wanted_p != NULL) {
1108		*pages_wanted_p = mach_vm_ctl_page_free_wanted();
1109	}
1110
1111	if (pages_reclaimed_p == NULL) {
1112		return KERN_SUCCESS;
1113	}
1114
1115	/* provide number of pages reclaimed in the last "nsecs_monitored" */
1116	do {
1117		vm_pageout_now = vm_pageout_stat_now;
1118		pages_reclaimed = 0;
1119		for (vm_pageout_then =
1120			     VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1121		     vm_pageout_then != vm_pageout_now &&
1122			     nsecs_monitored-- != 0;
1123		     vm_pageout_then =
1124			     VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1125			pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed;
1126		}
1127	} while (vm_pageout_now != vm_pageout_stat_now);
1128	*pages_reclaimed_p = pages_reclaimed;
1129
1130	return KERN_SUCCESS;
1131}
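
/*
 * Illustrative caller sketch, not part of the build: a non-blocking query
 * for the pages reclaimed over the last 10 seconds and the current
 * shortfall, per the "VM memory pressure monitoring" comment above.
 */
#if 0
static void
pressure_monitor_usage_example(void)
{
	unsigned int reclaimed = 0;
	unsigned int wanted = 0;

	if (mach_vm_pressure_monitor(FALSE,	/* don't block waiting for pressure */
				     10,	/* last 10 seconds of statistics */
				     &reclaimed,
				     &wanted) == KERN_SUCCESS) {
		/*
		 * "reclaimed" is the number of pages reclaimed over that
		 * window; "wanted" is how far vm_page_free_count currently
		 * sits below vm_page_free_target (0 if it is not below).
		 */
	}
}
#endif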
1132
1133
1134
1135/*
1136 * function in BSD to apply I/O throttle to the pageout thread
1137 */
1138extern void vm_pageout_io_throttle(void);
1139
1140
1141#if LATENCY_JETSAM
1142boolean_t	jlp_init = FALSE;
1143uint64_t	jlp_time = 0, jlp_current = 0;
1144struct vm_page	jetsam_latency_page[NUM_OF_JETSAM_LATENCY_TOKENS];
1145unsigned int	latency_jetsam_wakeup = 0;
1146#endif /* LATENCY_JETSAM */
1147
1148/*
1149 * Page States: Used below to maintain the page state
 * before it is removed from its queue.  This saved state
 * helps us do the right accounting in certain cases.
1152 */
1153#define PAGE_STATE_SPECULATIVE		1
1154#define PAGE_STATE_ANONYMOUS		2
1155#define PAGE_STATE_INACTIVE		3
1156#define PAGE_STATE_INACTIVE_FIRST	4
1157#define PAGE_STATE_CLEAN      5
1158
1159
1160#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m)                         \
1161        MACRO_BEGIN                                                     \
1162        /*                                                              \
1163         * If a "reusable" page somehow made it back into               \
1164         * the active queue, it's been re-used and is not               \
1165         * quite re-usable.                                             \
1166         * If the VM object was "all_reusable", consider it             \
1167         * as "all re-used" instead of converting it to                 \
1168         * "partially re-used", which could be expensive.               \
1169         */                                                             \
1170        if ((m)->reusable ||                                            \
1171            (m)->object->all_reusable) {                                \
1172                vm_object_reuse_pages((m)->object,                      \
1173                                      (m)->offset,                      \
1174                                      (m)->offset + PAGE_SIZE_64,       \
1175                                      FALSE);                           \
1176        }                                                               \
1177        MACRO_END
1178
1179
1180#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT  	64
1181#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX	1024
1182
1183#define	FCS_IDLE		0
1184#define FCS_DELAYED		1
1185#define FCS_DEADLOCK_DETECTED	2
1186
1187struct flow_control {
1188        int		state;
1189        mach_timespec_t	ts;
1190};
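
/*
 * Rough sketch, not part of the build and partly inferred from the defines
 * above (VM_PAGEOUT_DEADLOCK_WAIT / VM_PAGEOUT_DEADLOCK_RELIEF): the general
 * shape of the flow-control state machine used while the pageout queues are
 * backed up.  The exact conditions live in vm_pageout_scan().
 */
#if 0
static void
flow_control_sketch(struct flow_control *fc, boolean_t queue_throttled,
		    boolean_t laundry_progressing, mach_timespec_t now)
{
	switch (fc->state) {
	case FCS_IDLE:
		if (queue_throttled) {
			fc->ts = now;		/* plus ~VM_PAGEOUT_DEADLOCK_WAIT ms, omitted here */
			fc->state = FCS_DELAYED;
		}
		break;
	case FCS_DELAYED:
		if (!laundry_progressing && CMP_MACH_TIMESPEC(&now, &fc->ts) >= 0)
			fc->state = FCS_DEADLOCK_DETECTED;	/* scan then pushes ~VM_PAGEOUT_DEADLOCK_RELIEF pages */
		else if (!queue_throttled)
			fc->state = FCS_IDLE;
		break;
	case FCS_DEADLOCK_DETECTED:
		fc->state = FCS_IDLE;		/* once relief has been queued */
		break;
	}
}
#endif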
1191
1192uint32_t vm_pageout_considered_page = 0;
1193uint32_t vm_page_filecache_min = 0;
1194
1195#define	VM_PAGE_FILECACHE_MIN	50000
1196#define ANONS_GRABBED_LIMIT	2
1197
1198/*
1199 *	vm_pageout_scan does the dirty work for the pageout daemon.
1200 *	It returns with both vm_page_queue_free_lock and vm_page_queue_lock
1201 *	held and vm_page_free_wanted == 0.
1202 */
1203void
1204vm_pageout_scan(void)
1205{
1206	unsigned int loop_count = 0;
1207	unsigned int inactive_burst_count = 0;
1208	unsigned int active_burst_count = 0;
1209	unsigned int reactivated_this_call;
1210	unsigned int reactivate_limit;
1211	vm_page_t   local_freeq = NULL;
1212	int         local_freed = 0;
1213	int         delayed_unlock;
1214	int	    delayed_unlock_limit = 0;
1215	int	    refmod_state = 0;
1216        int	vm_pageout_deadlock_target = 0;
1217	struct	vm_pageout_queue *iq;
1218	struct	vm_pageout_queue *eq;
1219        struct	vm_speculative_age_q *sq;
1220	struct  flow_control	flow_control = { 0, { 0, 0 } };
1221        boolean_t inactive_throttled = FALSE;
1222	boolean_t try_failed;
1223	mach_timespec_t	ts;
1224	unsigned	int msecs = 0;
1225	vm_object_t	object;
1226	vm_object_t	last_object_tried;
1227	uint32_t	catch_up_count = 0;
1228	uint32_t	inactive_reclaim_run;
1229	boolean_t	forced_reclaim;
1230	boolean_t	exceeded_burst_throttle;
1231	boolean_t	grab_anonymous = FALSE;
1232	boolean_t	force_anonymous = FALSE;
1233	int		anons_grabbed = 0;
1234	int		page_prev_state = 0;
1235	int		cache_evict_throttle = 0;
1236	uint32_t	vm_pageout_inactive_external_forced_reactivate_limit = 0;
1237	vm_pressure_level_t pressure_level;
1238
1239	VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
1240		       vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1241		       vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
1242
1243#if LATENCY_JETSAM
1244	if (jlp_init == FALSE) {
1245		int i=0;
1246		vm_page_t jlp;
1247		for(; i < NUM_OF_JETSAM_LATENCY_TOKENS; i++) {
1248			jlp = &jetsam_latency_page[i];
1249			jlp->fictitious = TRUE;
1250			jlp->offset = 0;
1251
1252		}
1253		jlp = &jetsam_latency_page[0];
1254		queue_enter(&vm_page_queue_active, jlp, vm_page_t, pageq);
1255		jlp->active = TRUE;
1256
1257		jlp->offset = mach_absolute_time();
1258		jlp_time = jlp->offset;
1259		jlp_current++;
1260		jlp_init = TRUE;
1261	}
1262#endif /* LATENCY_JETSAM */
1263
1264	flow_control.state = FCS_IDLE;
1265	iq = &vm_pageout_queue_internal;
1266	eq = &vm_pageout_queue_external;
1267	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1268
1269
1270        XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1271
1272
1273	vm_page_lock_queues();
1274	delayed_unlock = 1;	/* must be nonzero if Qs are locked, 0 if unlocked */
1275
1276	/*
1277	 *	Calculate the max number of referenced pages on the inactive
1278	 *	queue that we will reactivate.
1279	 */
1280	reactivated_this_call = 0;
1281	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
1282						    vm_page_inactive_count);
1283	inactive_reclaim_run = 0;
1284
1285	vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
1286
1287	/*
1288	 *	We want to gradually dribble pages from the active queue
1289	 *	to the inactive queue.  If we let the inactive queue get
1290	 *	very small, and then suddenly dump many pages into it,
1291	 *	those pages won't get a sufficient chance to be referenced
1292	 *	before we start taking them from the inactive queue.
1293	 *
1294	 *	We must limit the rate at which we send pages to the pagers
1295	 *	so that we don't tie up too many pages in the I/O queues.
1296	 *	We implement a throttling mechanism using the laundry count
1297	 * 	to limit the number of pages outstanding to the default
1298	 *	and external pagers.  We can bypass the throttles and look
1299	 *	for clean pages if the pageout queues don't drain in a timely
1300	 *	fashion since this may indicate that the pageout paths are
1301	 *	stalled waiting for memory, which only we can provide.
1302	 */
1303
1304
1305Restart:
1306	assert(delayed_unlock!=0);
1307
1308	/*
	 *	Recalculate vm_page_inactive_target.
1310	 */
1311	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1312							  vm_page_inactive_count +
1313							  vm_page_speculative_count);
1314
1315	vm_page_anonymous_min = vm_page_inactive_target / 20;
1316
1317
1318	/*
	 * don't want to wake the pageout_scan thread up every time we fall below
1320	 * the targets... set a low water mark at 0.25% below the target
1321	 */
1322	vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
1323
1324	if (vm_page_speculative_percentage > 50)
1325		vm_page_speculative_percentage = 50;
1326	else if (vm_page_speculative_percentage <= 0)
1327		vm_page_speculative_percentage = 1;
1328
1329	vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1330								vm_page_inactive_count);
1331
1332	object = NULL;
1333	last_object_tried = NULL;
1334	try_failed = FALSE;
1335
1336	if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
1337	        catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1338	else
1339	        catch_up_count = 0;
1340
1341	for (;;) {
1342		vm_page_t m;
1343
1344		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1345
1346		if (delayed_unlock == 0) {
1347		        vm_page_lock_queues();
1348			delayed_unlock = 1;
1349		}
1350		if (vm_upl_wait_for_pages < 0)
1351			vm_upl_wait_for_pages = 0;
1352
1353		delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
1354
1355		if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX)
1356			delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
1357
1358		/*
1359		 * Move pages from active to inactive if we're below the target
1360		 */
1361		/* if we are trying to make clean, we need to make sure we actually have inactive - mj */
1362		if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1363			goto done_moving_active_pages;
1364
1365		if (object != NULL) {
1366			vm_object_unlock(object);
1367			object = NULL;
1368			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1369		}
1370		/*
1371		 * Don't sweep through active queue more than the throttle
1372		 * which should be kept relatively low
1373		 */
1374		active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count);
1375
1376		VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_START,
1377			       vm_pageout_inactive, vm_pageout_inactive_used, vm_page_free_count, local_freed);
1378
1379		VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_NONE,
1380			       vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1381			       vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
1382		memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_START);
1383
1384
1385		while (!queue_empty(&vm_page_queue_active) && active_burst_count--) {
1386
1387			vm_pageout_active++;
1388
1389			m = (vm_page_t) queue_first(&vm_page_queue_active);
1390
1391			assert(m->active && !m->inactive);
1392			assert(!m->laundry);
1393			assert(m->object != kernel_object);
1394			assert(m->phys_page != vm_page_guard_addr);
1395
1396			DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1397
1398#if LATENCY_JETSAM
1399			if (m->fictitious) {
1400				const uint32_t FREE_TARGET_MULTIPLIER = 2;
1401
1402				uint64_t now = mach_absolute_time();
1403				uint64_t delta = now - m->offset;
1404				clock_sec_t jl_secs = 0;
1405				clock_usec_t jl_usecs = 0;
1406				boolean_t issue_jetsam = FALSE;
1407
1408				absolutetime_to_microtime(delta, &jl_secs, &jl_usecs);
1409				jl_usecs += jl_secs * USEC_PER_SEC;
1410
1411				/* Jetsam only if the token hasn't aged sufficiently and the free count is close to the target (avoiding spurious triggers) */
1412				if ((jl_usecs <= JETSAM_AGE_NOTIFY_CRITICAL) && (vm_page_free_count < (FREE_TARGET_MULTIPLIER * vm_page_free_target))) {
1413					issue_jetsam = TRUE;
1414				}
1415
1416				VM_DEBUG_EVENT(vm_pageout_page_token, VM_PAGEOUT_PAGE_TOKEN, DBG_FUNC_NONE,
1417					       vm_page_active_count, vm_page_inactive_count, vm_page_free_count, jl_usecs);
1418
1419				m->offset = 0;
1420				queue_remove(&vm_page_queue_active, m, vm_page_t, pageq);
1421				queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
1422
1423				m->offset = now;
1424				jlp_time = now;
1425
1426				if (issue_jetsam) {
1427					vm_page_unlock_queues();
1428
1429					if (local_freeq) {
1430						vm_page_free_list(local_freeq, TRUE);
1431						local_freeq = NULL;
1432						local_freed = 0;
1433					}
1434
1435					VM_DEBUG_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
1436						       vm_page_active_count, vm_page_inactive_count, vm_page_free_count, 0);
1437
1438					assert_wait_timeout(&latency_jetsam_wakeup, THREAD_INTERRUPTIBLE, 10 /* msecs */, 1000*NSEC_PER_USEC);
1439					/* Kill the top process asynchronously */
1440					memorystatus_kill_on_VM_page_shortage(TRUE);
1441					thread_block(THREAD_CONTINUE_NULL);
1442
1443					VM_DEBUG_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, 0, 0, 0, 0);
1444
1445					vm_page_lock_queues();
1446				}
1447			} else {
1448#endif /* LATENCY_JETSAM */
1449				/*
1450				 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
1451				 *
1452				 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
1453				 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
				 * new reference happens. If no further references happen on the page after that remote TLB flushes
1455				 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
1456				 * by pageout_scan, which is just fine since the last reference would have happened quite far
1457				 * in the past (TLB caches don't hang around for very long), and of course could just as easily
1458				 * have happened before we moved the page
1459				 */
1460				pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1461
1462				/*
1463				 * The page might be absent or busy,
1464				 * but vm_page_deactivate can handle that.
1465				 * FALSE indicates that we don't want a H/W clear reference
1466				 */
1467				vm_page_deactivate_internal(m, FALSE);
1468
1469				if (delayed_unlock++ > delayed_unlock_limit) {
1470
1471					if (local_freeq) {
1472						vm_page_unlock_queues();
1473
1474						VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1475							       vm_page_free_count, local_freed, delayed_unlock_limit, 1);
1476
1477						vm_page_free_list(local_freeq, TRUE);
1478
1479						VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1480							       vm_page_free_count, 0, 0, 1);
1481
1482						local_freeq = NULL;
1483						local_freed = 0;
1484						vm_page_lock_queues();
1485					} else
1486						lck_mtx_yield(&vm_page_queue_lock);
1487
1488					delayed_unlock = 1;
1489
1490					/*
1491					 * continue the while loop processing
1492					 * the active queue... need to hold
1493					 * the page queues lock
1494					 */
1495				}
1496#if LATENCY_JETSAM
1497			}
1498#endif	/* LATENCY_JETSAM */
1499		}
1500
1501		VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_END,
1502			       vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_inactive_target);
1503		memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_END);
1504
1505		/**********************************************************************
1506		 * above this point we're playing with the active queue
1507		 * below this point we're playing with the throttling mechanisms
1508		 * and the inactive queue
1509		 **********************************************************************/
1510
1511done_moving_active_pages:
1512
1513		if (vm_page_free_count + local_freed >= vm_page_free_target) {
1514			if (object != NULL) {
1515			        vm_object_unlock(object);
1516				object = NULL;
1517			}
1518			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1519
1520			if (local_freeq) {
1521				vm_page_unlock_queues();
1522
1523				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1524					       vm_page_free_count, local_freed, delayed_unlock_limit, 2);
1525
1526				vm_page_free_list(local_freeq, TRUE);
1527
1528				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1529					       vm_page_free_count, local_freed, 0, 2);
1530
1531				local_freeq = NULL;
1532				local_freed = 0;
1533				vm_page_lock_queues();
1534			}
1535			/*
1536			 * make sure the pageout I/O threads are running
1537			 * throttled in case there are still requests
1538			 * in the laundry... since we have met our targets
1539			 * we don't need the laundry to be cleaned in a timely
1540			 * fashion... so let's avoid interfering with foreground
1541			 * activity
1542			 */
1543			vm_pageout_adjust_io_throttles(iq, eq, TRUE);
1544
1545			/*
			 * recalculate vm_page_inactive_target
1547			 */
1548			vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1549									  vm_page_inactive_count +
1550									  vm_page_speculative_count);
1551			if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1552			    !queue_empty(&vm_page_queue_active)) {
1553				/*
1554				 * inactive target still not met... keep going
1555				 * until we get the queues balanced...
1556				 */
1557			        continue;
1558			}
1559		        lck_mtx_lock(&vm_page_queue_free_lock);
1560
1561			if ((vm_page_free_count >= vm_page_free_target) &&
1562			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1563				/*
1564				 * done - we have met our target *and*
1565				 * there is no one waiting for a page.
1566				 */
1567return_from_scan:
1568				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1569
1570				VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
1571					       vm_pageout_inactive, vm_pageout_inactive_used, 0, 0);
1572				VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
1573					       vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1574					       vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
1575
1576				return;
1577			}
1578			lck_mtx_unlock(&vm_page_queue_free_lock);
1579		}
1580
1581		/*
1582		 * Before anything, we check if we have any ripe volatile
1583		 * objects around. If so, try to purge the first object.
1584		 * If the purge fails, fall through to reclaim a page instead.
		 * If the purge succeeds, go back to the top and re-evaluate
1586		 * the new memory situation.
1587		 */
1588		pressure_level = memorystatus_vm_pressure_level;
1589		assert (available_for_purge>=0);
1590
1591		if (available_for_purge
1592		    || pressure_level > kVMPressureNormal
1593		    ) {
1594			int force_purge;
1595
1596		        if (object != NULL) {
1597			        vm_object_unlock(object);
1598				object = NULL;
1599			}
1600
1601			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1602			memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1603
1604			force_purge = 0; /* no force-purging */
1605			if (pressure_level >= kVMPressureCritical) {
1606				force_purge = memorystatus_purge_on_critical;
1607			} else if (pressure_level >= kVMPressureUrgent) {
1608				force_purge = memorystatus_purge_on_urgent;
1609			} else if (pressure_level >= kVMPressureWarning) {
1610				force_purge = memorystatus_purge_on_warning;
1611			} else {
1612				force_purge = 0;
1613			}
1614			if (vm_purgeable_object_purge_one(force_purge)) {
1615
1616				VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1617				memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1618				continue;
1619			}
1620			VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1621			memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1622		}
1623		if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1624		        /*
1625			 * try to pull pages from the aging bins...
1626			 * see vm_page.h for an explanation of how
1627			 * this mechanism works
1628			 */
1629		        struct vm_speculative_age_q	*aq;
1630			mach_timespec_t	ts_fully_aged;
1631			boolean_t	can_steal = FALSE;
1632			int num_scanned_queues;
1633
1634			aq = &vm_page_queue_speculative[speculative_steal_index];
1635
1636			num_scanned_queues = 0;
1637			while (queue_empty(&aq->age_q) &&
1638			       num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1639
1640			        speculative_steal_index++;
1641
1642				if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1643				        speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1644
1645				aq = &vm_page_queue_speculative[speculative_steal_index];
1646			}
1647
1648			if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1649				/*
1650				 * XXX We've scanned all the speculative
1651				 * queues but still haven't found one
1652				 * that is not empty, even though
1653				 * vm_page_speculative_count is not 0.
1654				 *
1655				 * report the anomaly...
1656				 */
1657				printf("vm_pageout_scan: "
1658				       "all speculative queues empty "
1659				       "but count=%d.  Re-adjusting.\n",
1660				       vm_page_speculative_count);
1661				if (vm_page_speculative_count > vm_page_speculative_count_drift_max)
1662					vm_page_speculative_count_drift_max = vm_page_speculative_count;
1663				vm_page_speculative_count_drifts++;
1664#if 6553678
1665				Debugger("vm_pageout_scan: no speculative pages");
1666#endif
1667				/* readjust... */
1668				vm_page_speculative_count = 0;
1669				/* ... and continue */
1670				continue;
1671			}
1672
1673			if (vm_page_speculative_count > vm_page_speculative_target)
1674			        can_steal = TRUE;
1675			else {
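				/*
				 * compute the absolute time at which the queue we're
				 * eyeing becomes fully aged: its birth timestamp plus
				 * the full aging interval spanned by all the speculative
				 * bins... if 'now' is past that point, its pages have
				 * aged out and we're allowed to steal them.
				 */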
1676			        ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) / 1000;
1677				ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) % 1000)
1678				                      * 1000 * NSEC_PER_USEC;
1679
1680				ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1681
1682				clock_sec_t sec;
1683				clock_nsec_t nsec;
1684			        clock_get_system_nanotime(&sec, &nsec);
1685				ts.tv_sec = (unsigned int) sec;
1686				ts.tv_nsec = nsec;
1687
1688				if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1689				        can_steal = TRUE;
1690			}
1691			if (can_steal == TRUE)
1692			        vm_page_speculate_ageit(aq);
1693		}
1694		if (queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
1695			int 	pages_evicted;
1696
1697		        if (object != NULL) {
1698			        vm_object_unlock(object);
1699				object = NULL;
1700			}
1701			pages_evicted = vm_object_cache_evict(100, 10);
1702
1703			if (pages_evicted) {
1704
1705				vm_pageout_cache_evicted += pages_evicted;
1706
1707				VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
1708					       vm_page_free_count, pages_evicted, vm_pageout_cache_evicted, 0);
1709				memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
1710
1711				/*
1712				 * we just freed up to 100 pages,
1713				 * so go back to the top of the main loop
1714			 * and re-evaluate the memory situation
1715				 */
1716				continue;
1717			} else
1718				cache_evict_throttle = 100;
1719		}
1720		if (cache_evict_throttle)
1721			cache_evict_throttle--;
1722
1723
1724		exceeded_burst_throttle = FALSE;
1725		/*
1726		 * Sometimes we have to pause:
1727		 *	1) No inactive pages - nothing to do.
1728		 *	2) Loop control - no acceptable pages found on the inactive queue
1729		 *         within the last vm_pageout_burst_inactive_throttle iterations
1730		 *	3) Flow control - default pageout queue is full
1731		 */
1732		if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_anonymous) && queue_empty(&sq->age_q)) {
1733		        vm_pageout_scan_empty_throttle++;
1734			msecs = vm_pageout_empty_wait;
1735			goto vm_pageout_scan_delay;
1736
1737		} else if (inactive_burst_count >=
1738			   MIN(vm_pageout_burst_inactive_throttle,
1739			       (vm_page_inactive_count +
1740				vm_page_speculative_count))) {
1741		        vm_pageout_scan_burst_throttle++;
1742			msecs = vm_pageout_burst_wait;
1743
1744			exceeded_burst_throttle = TRUE;
1745			goto vm_pageout_scan_delay;
1746
1747		} else if (vm_page_free_count > (vm_page_free_reserved / 4) &&
1748			   VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()) {
1749		        vm_pageout_scan_swap_throttle++;
1750			msecs = vm_pageout_swap_wait;
1751			goto vm_pageout_scan_delay;
1752
1753		} else if (VM_PAGE_Q_THROTTLED(iq) &&
1754				  VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
1755			clock_sec_t sec;
1756			clock_nsec_t nsec;
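			/*
			 * the default pageout queue is throttled... drive the
			 * flow_control state machine:
			 *   FCS_IDLE - if it still looks like we can make progress
			 *	by scanning, keep going... otherwise arm the deadlock
			 *	timer and move to FCS_DELAYED
			 *   FCS_DELAYED - if the timer expires while the queue is
			 *	still throttled, assume a deadlock... set a relief
			 *	target and move to FCS_DEADLOCK_DETECTED... otherwise
			 *	just wait to be woken by a laundry completion
			 *   FCS_DEADLOCK_DETECTED - keep pushing pages until the
			 *	relief target is consumed, then re-arm the timer
			 */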
1757
1758		        switch (flow_control.state) {
1759
1760			case FCS_IDLE:
1761				if ((vm_page_free_count + local_freed) < vm_page_free_target) {
1762
1763					if (vm_page_pageable_external_count > vm_page_filecache_min && !queue_empty(&vm_page_queue_inactive)) {
1764						anons_grabbed = ANONS_GRABBED_LIMIT;
1765						goto consider_inactive;
1766					}
1767					if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && vm_page_active_count)
1768						continue;
1769				}
1770reset_deadlock_timer:
1771			        ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1772				ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1773			        clock_get_system_nanotime(&sec, &nsec);
1774				flow_control.ts.tv_sec = (unsigned int) sec;
1775				flow_control.ts.tv_nsec = nsec;
1776				ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1777
1778				flow_control.state = FCS_DELAYED;
1779				msecs = vm_pageout_deadlock_wait;
1780
1781				break;
1782
1783			case FCS_DELAYED:
1784			        clock_get_system_nanotime(&sec, &nsec);
1785				ts.tv_sec = (unsigned int) sec;
1786				ts.tv_nsec = nsec;
1787
1788				if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1789				        /*
1790					 * the pageout thread for the default pager is potentially
1791					 * deadlocked since the
1792					 * default pager queue has been throttled for more than the
1793					 * allowable time... we need to move some clean pages or dirty
1794					 * pages belonging to the external pagers if they aren't throttled
1795					 * vm_page_free_wanted represents the number of threads currently
1796					 * blocked waiting for pages... we'll move one page for each of
1797					 * these plus a fixed amount to break the logjam... once we're done
				 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1799					 * with a new timeout target since we have no way of knowing
1800					 * whether we've broken the deadlock except through observation
1801					 * of the queue associated with the default pager... we need to
1802					 * stop moving pages and allow the system to run to see what
1803					 * state it settles into.
1804					 */
1805				        vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1806					vm_pageout_scan_deadlock_detected++;
1807					flow_control.state = FCS_DEADLOCK_DETECTED;
1808					thread_wakeup((event_t) &vm_pageout_garbage_collect);
1809					goto consider_inactive;
1810				}
1811				/*
1812				 * just resniff instead of trying
1813				 * to compute a new delay time... we're going to be
1814				 * awakened immediately upon a laundry completion,
1815				 * so we won't wait any longer than necessary
1816				 */
1817				msecs = vm_pageout_idle_wait;
1818				break;
1819
1820			case FCS_DEADLOCK_DETECTED:
1821			        if (vm_pageout_deadlock_target)
1822				        goto consider_inactive;
1823				goto reset_deadlock_timer;
1824
1825			}
1826vm_pageout_scan_delay:
1827			if (object != NULL) {
1828			        vm_object_unlock(object);
1829				object = NULL;
1830			}
1831			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
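		/*
		 * flush any locally collected free pages to the global free
		 * list before we consider blocking, so other threads can get
		 * at them... if the default pageout queue is no longer
		 * throttled by the time we're done, skip the sleep and go
		 * back to scanning.
		 */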
1832
1833			if (local_freeq) {
1834				vm_page_unlock_queues();
1835
1836				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1837					       vm_page_free_count, local_freed, delayed_unlock_limit, 3);
1838
1839				vm_page_free_list(local_freeq, TRUE);
1840
1841				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1842					       vm_page_free_count, local_freed, 0, 3);
1843
1844				local_freeq = NULL;
1845				local_freed = 0;
1846				vm_page_lock_queues();
1847
1848				if (flow_control.state == FCS_DELAYED &&
1849				    !VM_PAGE_Q_THROTTLED(iq)) {
1850					flow_control.state = FCS_IDLE;
1851					goto consider_inactive;
1852				}
1853			}
1854
1855			if (vm_page_free_count >= vm_page_free_target) {
1856				/*
1857				 * we're here because
1858				 *  1) someone else freed up some pages while we had
1859				 *     the queues unlocked above
1860				 *  2) we've hit one of the 3 conditions that
1861				 *     cause us to pause the pageout scan thread
1862				 *
1863				 * since we already have enough free pages,
1864				 * let's avoid stalling and return normally
1865				 *
1866				 * before we return, make sure the pageout I/O threads
1867				 * are running throttled in case there are still requests
1868				 * in the laundry... since we have enough free pages
1869				 * we don't need the laundry to be cleaned in a timely
1870				 * fashion... so let's avoid interfering with foreground
1871				 * activity
1872				 *
1873				 * we don't want to hold vm_page_queue_free_lock when
1874				 * calling vm_pageout_adjust_io_throttles (since it
1875				 * may cause other locks to be taken), we do the initial
1876				 * check outside of the lock.  Once we take the lock,
1877				 * we recheck the condition since it may have changed.
1878				 * if it has, no problem, we will make the threads
1879				 * non-throttled before actually blocking
1880				 */
1881				vm_pageout_adjust_io_throttles(iq, eq, TRUE);
1882			}
1883			lck_mtx_lock(&vm_page_queue_free_lock);
1884
1885			if (vm_page_free_count >= vm_page_free_target &&
1886			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1887				goto return_from_scan;
1888			}
1889			lck_mtx_unlock(&vm_page_queue_free_lock);
1890
1891			if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
1892				/*
1893				 * we're most likely about to block due to one of
1894				 * the 3 conditions that cause vm_pageout_scan to
1895				 * not be able to make forward progress w/r
1896				 * to providing new pages to the free queue,
1897				 * so unthrottle the I/O threads in case we
1898				 * have laundry to be cleaned... it needs
1899				 * to be completed ASAP.
1900				 *
1901				 * even if we don't block, we want the io threads
1902				 * running unthrottled since the sum of free +
1903				 * clean pages is still under our free target
1904				 */
1905				vm_pageout_adjust_io_throttles(iq, eq, FALSE);
1906			}
1907			if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
1908				/*
1909				 * if we get here we're below our free target and
1910				 * we're stalling due to a full laundry queue or
1911				 * we don't have any inactive pages other than
1912				 * those in the clean queue...
1913				 * however, we have pages on the clean queue that
1914				 * can be moved to the free queue, so let's not
1915				 * stall the pageout scan
1916				 */
1917				flow_control.state = FCS_IDLE;
1918				goto consider_inactive;
1919			}
1920			VM_CHECK_MEMORYSTATUS;
1921
1922			if (flow_control.state != FCS_IDLE)
1923				vm_pageout_scan_throttle++;
1924			iq->pgo_throttled = TRUE;
1925
1926			if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)
1927				vm_consider_waking_compactor_swapper();
1928
1929			assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1930			counter(c_vm_pageout_scan_block++);
1931
1932			vm_page_unlock_queues();
1933
1934			assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1935
1936			VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
1937				       iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
1938			memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
1939
1940			thread_block(THREAD_CONTINUE_NULL);
1941
1942			VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
1943				       iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
1944			memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
1945
1946			vm_page_lock_queues();
1947			delayed_unlock = 1;
1948
1949			iq->pgo_throttled = FALSE;
1950
1951			if (loop_count >= vm_page_inactive_count)
1952				loop_count = 0;
1953			inactive_burst_count = 0;
1954
1955			goto Restart;
1956			/*NOTREACHED*/
1957		}
1958
1959
1960		flow_control.state = FCS_IDLE;
1961consider_inactive:
1962		vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
1963									    vm_pageout_inactive_external_forced_reactivate_limit);
1964		loop_count++;
1965		inactive_burst_count++;
1966		vm_pageout_inactive++;
1967
1968
1969		/*
1970		 * Choose a victim.
1971		 */
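		/*
		 * selection order: speculative pages that have aged out,
		 * then the cleaned queue, then the regular (file backed)
		 * inactive queue vs the anonymous queue... the latter two
		 * are balanced by grab_anonymous / anons_grabbed below.
		 */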
1972		while (1) {
1973			m = NULL;
1974
1975			if (VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
1976				assert(vm_page_throttled_count == 0);
1977				assert(queue_empty(&vm_page_queue_throttled));
1978			}
1979			/*
1980			 * The most eligible pages are ones we paged in speculatively,
1981			 * but which have not yet been touched.
1982			 */
1983			if (!queue_empty(&sq->age_q) ) {
1984				m = (vm_page_t) queue_first(&sq->age_q);
1985
1986				page_prev_state = PAGE_STATE_SPECULATIVE;
1987
1988				break;
1989			}
1990			/*
1991			 * Try a clean-queue inactive page.
1992			 */
1993			if (!queue_empty(&vm_page_queue_cleaned)) {
1994				m = (vm_page_t) queue_first(&vm_page_queue_cleaned);
1995
1996				page_prev_state = PAGE_STATE_CLEAN;
1997
1998				break;
1999			}
2000
2001			grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2002
2003			if (vm_page_pageable_external_count < vm_page_filecache_min || force_anonymous == TRUE) {
2004				grab_anonymous = TRUE;
2005				anons_grabbed = 0;
2006			}
2007
2008			if (grab_anonymous == TRUE && vm_compression_available() == FALSE)
2009				grab_anonymous = FALSE;
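			/*
			 * note that anonymous pages are only grabbed when the
			 * compressor can take them, and ANONS_GRABBED_LIMIT bounds
			 * how many we take in a row before going back for a file
			 * backed page, so the two queues stay interleaved.
			 */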
2010
2011			if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || queue_empty(&vm_page_queue_anonymous)) {
2012
2013				if ( !queue_empty(&vm_page_queue_inactive) ) {
2014					m = (vm_page_t) queue_first(&vm_page_queue_inactive);
2015
2016					page_prev_state = PAGE_STATE_INACTIVE;
2017					anons_grabbed = 0;
2018
2019					break;
2020				}
2021			}
2022			if ( !queue_empty(&vm_page_queue_anonymous) ) {
2023				m = (vm_page_t) queue_first(&vm_page_queue_anonymous);
2024
2025				page_prev_state = PAGE_STATE_ANONYMOUS;
2026				anons_grabbed++;
2027
2028				break;
2029			}
2030
2031			/*
2032			 * if we've gotten here, we have no victim page.
2033			 * if making clean, free the local freed list and return.
2034			 * if making free, check to see if we've finished balancing the queues
2035			 * yet... if we haven't, just continue, else panic
2036			 */
2037			vm_page_unlock_queues();
2038
2039			if (object != NULL) {
2040				vm_object_unlock(object);
2041				object = NULL;
2042			}
2043			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2044
2045			if (local_freeq) {
2046				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2047					       vm_page_free_count, local_freed, delayed_unlock_limit, 5);
2048
2049				vm_page_free_list(local_freeq, TRUE);
2050
2051				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2052					       vm_page_free_count, local_freed, 0, 5);
2053
2054				local_freeq = NULL;
2055				local_freed = 0;
2056			}
2057			vm_page_lock_queues();
2058			delayed_unlock = 1;
2059
2060			if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target)
2061				goto Restart;
2062
2063			panic("vm_pageout: no victim");
2064
2065			/* NOTREACHED */
2066		}
2067		force_anonymous = FALSE;
2068
2069		/*
2070		 * we just found this page on one of our queues...
2071		 * it can't also be on the pageout queue, so safe
2072		 * to call VM_PAGE_QUEUES_REMOVE
2073		 */
2074		assert(!m->pageout_queue);
2075
2076		VM_PAGE_QUEUES_REMOVE(m);
2077
2078		assert(!m->laundry);
2079		assert(!m->private);
2080		assert(!m->fictitious);
2081		assert(m->object != kernel_object);
2082		assert(m->phys_page != vm_page_guard_addr);
2083
2084
2085		if (page_prev_state != PAGE_STATE_SPECULATIVE)
2086			vm_pageout_stats[vm_pageout_stat_now].considered++;
2087
2088		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2089
2090		/*
2091		 * check to see if we currently are working
2092		 * with the same object... if so, we've
2093		 * already got the lock
2094		 */
2095		if (m->object != object) {
2096		        /*
2097			 * the object associated with candidate page is
2098			 * different from the one we were just working
2099			 * with... dump the lock if we still own it
2100			 */
2101		        if (object != NULL) {
2102			        vm_object_unlock(object);
2103				object = NULL;
2104				vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2105			}
2106			/*
2107			 * Try to lock object; since we've already got the
2108			 * page queues lock, we can only 'try' for this one.
2109			 * if the 'try' fails, we need to do a mutex_pause
2110			 * to allow the owner of the object lock a chance to
2111			 * run... otherwise, we're likely to trip over this
2112			 * object in the same state as we work our way through
2113			 * the queue... clumps of pages associated with the same
2114			 * object are fairly typical on the inactive and active queues
2115			 */
2116			if (!vm_object_lock_try_scan(m->object)) {
2117				vm_page_t m_want = NULL;
2118
2119				vm_pageout_inactive_nolock++;
2120
2121				if (page_prev_state == PAGE_STATE_CLEAN)
2122					vm_pageout_cleaned_nolock++;
2123
2124				if (page_prev_state == PAGE_STATE_SPECULATIVE)
2125					page_prev_state = PAGE_STATE_INACTIVE_FIRST;
2126
2127				pmap_clear_reference(m->phys_page);
2128				m->reference = FALSE;
2129
2130				/*
2131				 * m->object must be stable since we hold the page queues lock...
2132				 * we can update the scan_collisions field sans the object lock
2133				 * since it is a separate field and this is the only spot that does
2134				 * a read-modify-write operation and it is never executed concurrently...
2135				 * we can asynchronously set this field to 0 when creating a UPL, so it
2136				 * is possible for the value to be a bit non-deterministic, but that's ok
2137				 * since it's only used as a hint
2138				 */
2139				m->object->scan_collisions++;
2140
2141				if ( !queue_empty(&sq->age_q) )
2142					m_want = (vm_page_t) queue_first(&sq->age_q);
2143				else if ( !queue_empty(&vm_page_queue_cleaned))
2144					m_want = (vm_page_t) queue_first(&vm_page_queue_cleaned);
2145				else if (anons_grabbed >= ANONS_GRABBED_LIMIT || queue_empty(&vm_page_queue_anonymous))
2146					m_want = (vm_page_t) queue_first(&vm_page_queue_inactive);
2147				else if ( !queue_empty(&vm_page_queue_anonymous))
2148					m_want = (vm_page_t) queue_first(&vm_page_queue_anonymous);
2149
2150				/*
2151				 * this is the next object we're going to be interested in
2152				 * try to make sure it's available after the mutex_yield
2153				 * returns control
2154				 */
2155				if (m_want)
2156					vm_pageout_scan_wants_object = m_want->object;
2157
2158				/*
2159				 * force us to dump any collected free pages
2160				 * and to pause before moving on
2161				 */
2162				try_failed = TRUE;
2163
2164				goto requeue_page;
2165			}
2166			object = m->object;
2167			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2168
2169			try_failed = FALSE;
2170		}
2171		if (catch_up_count)
2172		        catch_up_count--;
2173
2174		if (m->busy) {
2175			if (m->encrypted_cleaning) {
2176				/*
2177				 * ENCRYPTED SWAP:
2178				 * if this page has already been picked up as
2179				 * part of a page-out cluster, it will be busy
2180				 * because it is being encrypted (see
2181				 * vm_object_upl_request()).  But we still
2182				 * want to demote it from "clean-in-place"
2183				 * (aka "adjacent") to "clean-and-free" (aka
2184				 * "target"), so let's ignore its "busy" bit
2185				 * here and proceed to check for "cleaning" a
2186				 * little bit below...
2187				 *
2188				 * CAUTION CAUTION:
2189				 * A "busy" page should still be left alone for
2190				 * most purposes, so we have to be very careful
2191				 * not to process that page too much.
2192				 */
2193				assert(m->cleaning);
2194				goto consider_inactive_page;
2195			}
2196
2197			/*
2198			 *	Somebody is already playing with this page.
2199			 *	Put it back on the appropriate queue
2200			 *
2201			 */
2202			vm_pageout_inactive_busy++;
2203
2204			if (page_prev_state == PAGE_STATE_CLEAN)
2205				vm_pageout_cleaned_busy++;
2206
2207requeue_page:
2208			switch (page_prev_state) {
2209
2210			case PAGE_STATE_SPECULATIVE:
2211				vm_page_speculate(m, FALSE);
2212				break;
2213
2214			case PAGE_STATE_ANONYMOUS:
2215			case PAGE_STATE_CLEAN:
2216			case PAGE_STATE_INACTIVE:
2217				VM_PAGE_ENQUEUE_INACTIVE(m, FALSE);
2218				break;
2219
2220			case PAGE_STATE_INACTIVE_FIRST:
2221				VM_PAGE_ENQUEUE_INACTIVE(m, TRUE);
2222				break;
2223			}
2224			goto done_with_inactivepage;
2225		}
2226
2227
2228		/*
2229		 *	If it's absent, in error or the object is no longer alive,
2230		 *	we can reclaim the page... in the no longer alive case,
2231		 *	there are 2 states the page can be in that preclude us
2232		 *	from reclaiming it - busy or cleaning - that we've already
2233		 *	dealt with
2234		 */
2235		if (m->absent || m->error || !object->alive) {
2236
2237			if (m->absent)
2238				vm_pageout_inactive_absent++;
2239			else if (!object->alive)
2240				vm_pageout_inactive_notalive++;
2241			else
2242				vm_pageout_inactive_error++;
2243reclaim_page:
2244			if (vm_pageout_deadlock_target) {
2245				vm_pageout_scan_inactive_throttle_success++;
2246			        vm_pageout_deadlock_target--;
2247			}
2248
2249			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
2250
2251			if (object->internal) {
2252				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
2253			} else {
2254				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
2255			}
2256			assert(!m->cleaning);
2257			assert(!m->laundry);
2258
2259			m->busy = TRUE;
2260
2261			/*
2262			 * remove page from object here since we're already
2263			 * behind the object lock... defer the rest of the work
2264			 * we'd normally do in vm_page_free_prepare_object
2265			 * until 'vm_page_free_list' is called
2266			 */
2267			if (m->tabled)
2268				vm_page_remove(m, TRUE);
2269
2270			assert(m->pageq.next == NULL &&
2271			       m->pageq.prev == NULL);
2272			m->pageq.next = (queue_entry_t)local_freeq;
2273			local_freeq = m;
2274			local_freed++;
2275
2276			if (page_prev_state == PAGE_STATE_SPECULATIVE)
2277				vm_pageout_freed_from_speculative++;
2278			else if (page_prev_state == PAGE_STATE_CLEAN)
2279				vm_pageout_freed_from_cleaned++;
2280			else
2281				vm_pageout_freed_from_inactive_clean++;
2282
2283			if (page_prev_state != PAGE_STATE_SPECULATIVE)
2284				vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
2285
2286			goto done_with_inactivepage;
2287		}
2288		/*
2289		 * If the object is empty, the page must be reclaimed even
2290		 * if dirty or used.
2291		 * If the page belongs to a volatile object, we stick it back
2292		 * on.
2293		 */
2294		if (object->copy == VM_OBJECT_NULL) {
2295			if (object->purgable == VM_PURGABLE_EMPTY) {
2296				if (m->pmapped == TRUE) {
2297					/* unmap the page */
2298					refmod_state = pmap_disconnect(m->phys_page);
2299					if (refmod_state & VM_MEM_MODIFIED) {
2300						SET_PAGE_DIRTY(m, FALSE);
2301					}
2302				}
2303				if (m->dirty || m->precious) {
2304					/* we saved the cost of cleaning this page ! */
2305					vm_page_purged_count++;
2306				}
2307				goto reclaim_page;
2308			}
2309
2310			if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
2311				/*
2312				 * With the VM compressor, the cost of
2313				 * reclaiming a page is much lower (no I/O),
2314				 * so if we find a "volatile" page, it's better
2315				 * to let it get compressed rather than letting
2316				 * it occupy a full page until it gets purged.
2317				 * So no need to check for "volatile" here.
2318				 */
2319			} else if (object->purgable == VM_PURGABLE_VOLATILE) {
2320				/*
2321				 * Avoid cleaning a "volatile" page which might
2322				 * be purged soon.
2323				 */
2324
2325				/* if it's wired, we can't put it on our queue */
2326				assert(!VM_PAGE_WIRED(m));
2327
2328				/* just stick it back on! */
2329				reactivated_this_call++;
2330
2331				if (page_prev_state == PAGE_STATE_CLEAN)
2332					vm_pageout_cleaned_volatile_reactivated++;
2333
2334				goto reactivate_page;
2335			}
2336		}
2337
2338consider_inactive_page:
2339		if (m->busy) {
2340			/*
2341			 * CAUTION CAUTION:
2342			 * A "busy" page should always be left alone, except...
2343			 */
2344			if (m->cleaning && m->encrypted_cleaning) {
2345				/*
2346				 * ENCRYPTED_SWAP:
2347				 * We could get here with a "busy" page
2348				 * if it's being encrypted during a
2349				 * "clean-in-place" operation.  We'll deal
2350				 * with it right away by testing if it has been
2351				 * referenced and either reactivating it or
2352				 * promoting it from "clean-in-place" to
2353				 * "clean-and-free".
2354				 */
2355			} else {
2356				panic("\"busy\" page considered for pageout\n");
2357			}
2358		}
2359
2360		/*
2361		 *	If it's being used, reactivate.
2362		 *	(Fictitious pages are either busy or absent.)
2363		 *	First, update the reference and dirty bits
2364		 *	to make sure the page is unreferenced.
2365		 */
2366		refmod_state = -1;
2367
2368		if (m->reference == FALSE && m->pmapped == TRUE) {
2369		        refmod_state = pmap_get_refmod(m->phys_page);
2370
2371		        if (refmod_state & VM_MEM_REFERENCED)
2372			        m->reference = TRUE;
2373		        if (refmod_state & VM_MEM_MODIFIED) {
2374				SET_PAGE_DIRTY(m, FALSE);
2375			}
2376		}
2377
2378		/*
2379		 *   if (m->cleaning && !m->pageout)
2380		 *	If already cleaning this page in place and it hasn't
2381		 *	been recently referenced, just pull off the queue.
2382		 *	We can leave the page mapped, and upl_commit_range
2383		 *	will put it on the clean queue.
2384		 *
2385		 *	note: if m->encrypted_cleaning == TRUE, then
2386		 *		m->cleaning == TRUE
2387		 *	and we'll handle it here
2388		 *
2389		 *   if (m->pageout && !m->cleaning)
2390		 *	an msync INVALIDATE is in progress...
2391		 *	this page has been marked for destruction
2392		 * 	after it has been cleaned,
2393		 * 	but not yet gathered into a UPL
2394		 *	where 'cleaning' will be set...
2395		 *	just leave it off the paging queues
2396		 *
2397		 *   if (m->pageout && m->cleaning)
2398		 *	an msync INVALIDATE is in progress
2399		 *	and the UPL has already gathered this page...
2400		 *	just leave it off the paging queues
2401		 */
2402
2403		/*
2404		 * page with m->pageout and still on the queues means that an
2405		 * MS_INVALIDATE is in progress on this page... leave it alone
2406		 */
2407		if (m->pageout) {
2408			goto done_with_inactivepage;
2409		}
2410
2411		/* if cleaning, reactivate if referenced.  otherwise, just pull off queue */
2412		if (m->cleaning) {
2413			if (m->reference == TRUE) {
2414				reactivated_this_call++;
2415				goto reactivate_page;
2416			} else {
2417				goto done_with_inactivepage;
2418			}
2419		}
2420
2421		if (m->reference || m->dirty) {
2422			/* deal with a rogue "reusable" page */
2423			VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);
2424		}
2425
2426		if (m->reference && !m->no_cache) {
2427			/*
2428			 * The page we pulled off the inactive list has
2429			 * been referenced.  It is possible for other
2430			 * processors to be touching pages faster than we
2431			 * can clear the referenced bit and traverse the
2432			 * inactive queue, so we limit the number of
2433			 * reactivations.
2434			 */
2435			if (++reactivated_this_call >= reactivate_limit) {
2436				vm_pageout_reactivation_limit_exceeded++;
2437			} else if (catch_up_count) {
2438				vm_pageout_catch_ups++;
2439			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
2440				vm_pageout_inactive_force_reclaim++;
2441			} else {
2442				uint32_t isinuse;
2443
2444				if (page_prev_state == PAGE_STATE_CLEAN)
2445					vm_pageout_cleaned_reference_reactivated++;
2446
2447reactivate_page:
2448				if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
2449				     vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
2450					/*
2451					 * no explicit mappings of this object exist
2452					 * and it's not open via the filesystem
2453					 */
2454					vm_page_deactivate(m);
2455					vm_pageout_inactive_deactivated++;
2456				} else {
2457					/*
2458					 * The page was/is being used, so put back on active list.
2459					 */
2460					vm_page_activate(m);
2461					VM_STAT_INCR(reactivations);
2462				}
2463
2464				if (page_prev_state == PAGE_STATE_CLEAN)
2465					vm_pageout_cleaned_reactivated++;
2466
2467				vm_pageout_inactive_used++;
2468
2469				goto done_with_inactivepage;
2470			}
2471			/*
2472			 * Make sure we call pmap_get_refmod() if it
2473			 * wasn't already called just above, to update
2474			 * the dirty bit.
2475			 */
2476			if ((refmod_state == -1) && !m->dirty && m->pmapped) {
2477				refmod_state = pmap_get_refmod(m->phys_page);
2478				if (refmod_state & VM_MEM_MODIFIED) {
2479					SET_PAGE_DIRTY(m, FALSE);
2480				}
2481			}
2482			forced_reclaim = TRUE;
2483		} else {
2484			forced_reclaim = FALSE;
2485		}
2486
2487		XPR(XPR_VM_PAGEOUT,
2488		    "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
2489		    object, m->offset, m, 0,0);
2490
2491		/*
2492		 * we've got a candidate page to steal...
2493		 *
2494		 * m->dirty is up to date courtesy of the
2495		 * preceding check for m->reference... if
2496		 * we get here, then m->reference had to be
2497		 * FALSE (or possibly "reactivate_limit" was
2498		 * exceeded), but in either case we called
2499		 * pmap_get_refmod() and updated both
2500		 * m->reference and m->dirty
2501		 *
2502		 * if it's dirty or precious we need to
2503		 * see if the target queue is throttled
2504		 * if it is, we need to skip over it by moving it back
2505		 * to the end of the inactive queue
2506		 */
2507
2508		inactive_throttled = FALSE;
2509
2510		if (m->dirty || m->precious) {
2511		        if (object->internal) {
2512				if (VM_PAGE_Q_THROTTLED(iq))
2513				        inactive_throttled = TRUE;
2514			} else if (VM_PAGE_Q_THROTTLED(eq)) {
2515				inactive_throttled = TRUE;
2516			}
2517		}
2518throttle_inactive:
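		/*
		 * if there's no dynamic pager to launder it to, a dirty page
		 * belonging to an internal object has nowhere to go... park it
		 * on the throttled queue so it stops churning through the
		 * inactive queue, instead of requeueing or reactivating it below.
		 */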
2519		if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
2520		    object->internal && m->dirty &&
2521		    (object->purgable == VM_PURGABLE_DENY ||
2522		     object->purgable == VM_PURGABLE_NONVOLATILE ||
2523		     object->purgable == VM_PURGABLE_VOLATILE)) {
2524			queue_enter(&vm_page_queue_throttled, m,
2525				    vm_page_t, pageq);
2526			m->throttled = TRUE;
2527			vm_page_throttled_count++;
2528
2529			vm_pageout_scan_reclaimed_throttled++;
2530
2531			goto done_with_inactivepage;
2532		}
2533		if (inactive_throttled == TRUE) {
2534
2535			if (object->internal == FALSE) {
2536				/*
2537				 * we need to break up the following potential deadlock case...
2538				 *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2539				 *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2540				 *  c) Most of the pages in the inactive queue belong to this file.
2541				 *
2542				 * we are potentially in this deadlock because...
2543				 *  a) the external pageout queue is throttled
2544				 *  b) we're done with the active queue and moved on to the inactive queue
2545				 *  c) we've got a dirty external page
2546				 *
2547				 * since we don't know the reason for the external pageout queue being throttled we
2548				 * must suspect that we are deadlocked, so move the current page onto the active queue
2549				 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2550				 *
2551				 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2552				 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2553				 * pool the next time we select a victim page... if we can make enough new free pages,
2554				 * the deadlock will break, the external pageout queue will empty and it will no longer
2555				 * be throttled
2556				 *
2557				 * if we have jetsam configured, keep a count of the pages reactivated this way so
2558				 * that we can try to find clean pages in the active/inactive queues before
2559				 * deciding to jetsam a process
2560				 */
2561				vm_pageout_scan_inactive_throttled_external++;
2562
2563				queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
2564				m->active = TRUE;
2565				vm_page_active_count++;
2566				if (m->object->internal) {
2567					vm_page_pageable_internal_count++;
2568				} else {
2569					vm_page_pageable_external_count++;
2570				}
2571
2572				vm_pageout_adjust_io_throttles(iq, eq, FALSE);
2573
2574#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2575				vm_pageout_inactive_external_forced_reactivate_limit--;
2576
2577				if (vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2578					vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2579					/*
2580					 * Possible deadlock scenario so request jetsam action
2581					 */
2582					assert(object);
2583					vm_object_unlock(object);
2584					object = VM_OBJECT_NULL;
2585					vm_page_unlock_queues();
2586
2587					VM_DEBUG_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2588    					       vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2589
2590                                        /* Kill first suitable process */
2591					if (memorystatus_kill_on_VM_page_shortage(FALSE) == FALSE) {
2592						panic("vm_pageout_scan: Jetsam request failed\n");
2593					}
2594
2595					VM_DEBUG_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, 0, 0, 0, 0);
2596
2597					vm_pageout_inactive_external_forced_jetsam_count++;
2598					vm_page_lock_queues();
2599					delayed_unlock = 1;
2600				}
2601#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2602				force_anonymous = TRUE;
2603#endif
2604				goto done_with_inactivepage;
2605			} else {
2606				if (page_prev_state == PAGE_STATE_SPECULATIVE)
2607					page_prev_state = PAGE_STATE_INACTIVE;
2608
2609				vm_pageout_scan_inactive_throttled_internal++;
2610
2611				goto requeue_page;
2612			}
2613		}
2614
2615		/*
2616		 * we've got a page that we can steal...
2617		 * eliminate all mappings and make sure
2618		 * we have the up-to-date modified state
2619		 *
2620		 * if we need to do a pmap_disconnect then we
2621		 * need to re-evaluate m->dirty since the pmap_disconnect
2622		 * provides the true state atomically... the
2623		 * page was still mapped up to the pmap_disconnect
2624		 * and may have been dirtied at the last microsecond
2625		 *
2626		 * Note that if 'pmapped' is FALSE then the page is not
2627		 * and has not been in any map, so there is no point calling
2628		 * pmap_disconnect().  m->dirty could have been set in anticipation
2629		 * of likely usage of the page.
2630		 */
2631		if (m->pmapped == TRUE) {
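			/*
			 * internal pages that will be handled by the compressor
			 * (rather than the default pager) are disconnected with
			 * PMAP_OPTIONS_COMPRESSOR... presumably so the pmap layer
			 * can account for the page as compressor bound; everything
			 * else takes the plain disconnect path.
			 */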
2632
2633			if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE || object->internal == FALSE) {
2634		        	refmod_state = pmap_disconnect_options(m->phys_page, 0, NULL);
2635			} else {
2636				refmod_state = pmap_disconnect_options(m->phys_page, PMAP_OPTIONS_COMPRESSOR, NULL);
2637			}
2638
2639			if (refmod_state & VM_MEM_MODIFIED) {
2640				SET_PAGE_DIRTY(m, FALSE);
2641			}
2642		}
2643		/*
2644		 * reset our count of pages that have been reclaimed
2645		 * since the last page was 'stolen'
2646		 */
2647		inactive_reclaim_run = 0;
2648
2649		/*
2650		 *	If it's clean and not precious, we can free the page.
2651		 */
2652		if (!m->dirty && !m->precious) {
2653
2654			if (page_prev_state == PAGE_STATE_SPECULATIVE)
2655				vm_pageout_speculative_clean++;
2656			else {
2657				if (page_prev_state == PAGE_STATE_ANONYMOUS)
2658					vm_pageout_inactive_anonymous++;
2659				else if (page_prev_state == PAGE_STATE_CLEAN)
2660					vm_pageout_cleaned_reclaimed++;
2661
2662				if (m->was_dirty) {
2663					/* page on clean queue used to be dirty; we should increment the vm_stat pageout count here */
2664					VM_STAT_INCR(pageouts);
2665					DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
2666				}
2667				vm_pageout_inactive_clean++;
2668			}
2669
2670			/*
2671			 * OK, at this point we have found a page we are going to free.
2672			 */
2673			goto reclaim_page;
2674		}
2675
2676		/*
2677		 * The page may have been dirtied since the last check
2678		 * for a throttled target queue (which may have been skipped
2679		 * if the page was clean then).  With the dirty page
2680		 * disconnected here, we can make one final check.
2681		 */
2682		if (object->internal) {
2683			if (VM_PAGE_Q_THROTTLED(iq))
2684				inactive_throttled = TRUE;
2685		} else if (VM_PAGE_Q_THROTTLED(eq)) {
2686			inactive_throttled = TRUE;
2687		}
2688
2689		if (inactive_throttled == TRUE)
2690			goto throttle_inactive;
2691
2692#if VM_PRESSURE_EVENTS
2693		vm_pressure_response();
2694#endif /* VM_PRESSURE_EVENTS */
2695
2696		/*
2697		 * do NOT set the pageout bit!
2698		 * sure, we might need free pages, but this page is going to take time to become free
2699		 * anyway, so we may as well put it on the clean queue first and take it from there later
2700		 * if necessary.  that way, we'll ensure we don't free up too much. -mj
2701		 */
2702		vm_pageout_cluster(m, FALSE);
2703
2704		if (page_prev_state == PAGE_STATE_ANONYMOUS)
2705			vm_pageout_inactive_anonymous++;
2706		if (object->internal)
2707			vm_pageout_inactive_dirty_internal++;
2708		else
2709			vm_pageout_inactive_dirty_external++;
2710
2711
2712done_with_inactivepage:
2713		inactive_burst_count = 0;
2714
2715		if (delayed_unlock++ > delayed_unlock_limit || try_failed == TRUE) {
2716
2717		        if (object != NULL) {
2718				vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2719			        vm_object_unlock(object);
2720				object = NULL;
2721			}
2722		        if (local_freeq) {
2723				vm_page_unlock_queues();
2724
2725				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2726					       vm_page_free_count, local_freed, delayed_unlock_limit, 4);
2727
2728				vm_page_free_list(local_freeq, TRUE);
2729
2730				VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2731					       vm_page_free_count, local_freed, 0, 4);
2732
2733				local_freeq = NULL;
2734				local_freed = 0;
2735				vm_page_lock_queues();
2736			} else
2737				lck_mtx_yield(&vm_page_queue_lock);
2738
2739			delayed_unlock = 1;
2740		}
2741		vm_pageout_considered_page++;
2742
2743		if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)
2744			vm_consider_waking_compactor_swapper();
2745
2746		/*
2747		 * back to top of pageout scan loop
2748		 */
2749	}
2750}
2751
2752
2753int vm_page_free_count_init;
2754
2755void
2756vm_page_free_reserve(
2757	int pages)
2758{
2759	int		free_after_reserve;
2760
2761	if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
2762
2763		if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT))
2764			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
2765		else
2766			vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
2767
2768	} else {
2769		if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT)
2770			vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
2771		else
2772			vm_page_free_reserved += pages;
2773	}
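	/*
	 * the remaining thresholds cascade from the reserve: free_min and
	 * free_target scale with whatever is left of the initial free pool
	 * after the reserve (each clamped to its own limit), and the
	 * throttle and creation-throttle marks are derived from free_target.
	 */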
2774	free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
2775
2776	vm_page_free_min = vm_page_free_reserved +
2777		VM_PAGE_FREE_MIN(free_after_reserve);
2778
2779	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
2780	        vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
2781
2782	vm_page_free_target = vm_page_free_reserved +
2783		VM_PAGE_FREE_TARGET(free_after_reserve);
2784
2785	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
2786	        vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
2787
2788	if (vm_page_free_target < vm_page_free_min + 5)
2789		vm_page_free_target = vm_page_free_min + 5;
2790
2791	vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3);
2792	vm_page_creation_throttle = vm_page_free_target * 3;
2793}
2794
2795/*
2796 *	vm_pageout is the high level pageout daemon.
2797 */
2798
2799void
2800vm_pageout_continue(void)
2801{
2802	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
2803	vm_pageout_scan_event_counter++;
2804
2805	vm_pageout_scan();
2806	/*
2807	 * we hold both the vm_page_queue_free_lock
2808	 * and the vm_page_queues_lock at this point
2809	 */
2810	assert(vm_page_free_wanted == 0);
2811	assert(vm_page_free_wanted_privileged == 0);
2812	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2813
2814	lck_mtx_unlock(&vm_page_queue_free_lock);
2815	vm_page_unlock_queues();
2816
2817	counter(c_vm_pageout_block++);
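	/*
	 * block with ourselves as the continuation so the kernel stack can
	 * be discarded while we sleep... a wakeup on &vm_page_free_wanted
	 * restarts us from the top of vm_pageout_continue()
	 */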
2818	thread_block((thread_continue_t)vm_pageout_continue);
2819	/*NOTREACHED*/
2820}
2821
2822
2823#ifdef FAKE_DEADLOCK
2824
2825#define FAKE_COUNT	5000
2826
2827int internal_count = 0;
2828int fake_deadlock = 0;
2829
2830#endif
2831
2832static void
2833vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2834{
2835	vm_page_t	m = NULL;
2836	vm_object_t	object;
2837	vm_object_offset_t offset;
2838	memory_object_t	pager;
2839	thread_t	self = current_thread();
2840
2841	if ((vm_pageout_internal_iothread != THREAD_NULL)
2842	    && (self == vm_pageout_external_iothread )
2843	    && (self->options & TH_OPT_VMPRIV))
2844		self->options &= ~TH_OPT_VMPRIV;
2845
2846	vm_page_lockspin_queues();
2847
2848        while ( !queue_empty(&q->pgo_pending) ) {
2849
2850		   q->pgo_busy = TRUE;
2851		   queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2852		   if (m->object->object_slid) {
2853			   panic("slid page %p not allowed on this path\n", m);
2854		   }
2855		   VM_PAGE_CHECK(m);
2856		   m->pageout_queue = FALSE;
2857		   m->pageq.next = NULL;
2858		   m->pageq.prev = NULL;
2859
2860		   /*
2861		    * grab a snapshot of the object and offset this
2862		    * page is tabled in so that we can relookup this
2863		    * page after we've taken the object lock - these
2864		    * fields are stable while we hold the page queues lock
2865		    * but as soon as we drop it, there is nothing to keep
2866		    * this page in this object... we hold an activity_in_progress
2867		    * on this object which will keep it from terminating
2868		    */
2869		   object = m->object;
2870		   offset = m->offset;
2871
2872		   vm_page_unlock_queues();
2873
2874#ifdef FAKE_DEADLOCK
2875		   if (q == &vm_pageout_queue_internal) {
2876		           vm_offset_t addr;
2877			   int	pg_count;
2878
2879			   internal_count++;
2880
2881			   if ((internal_count == FAKE_COUNT)) {
2882
2883				   pg_count = vm_page_free_count + vm_page_free_reserved;
2884
2885			           if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2886				           kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2887				   }
2888				   internal_count = 0;
2889				   fake_deadlock++;
2890			   }
2891		   }
2892#endif
2893		   vm_object_lock(object);
2894
2895		   m = vm_page_lookup(object, offset);
2896
2897		   if (m == NULL ||
2898		       m->busy || m->cleaning || m->pageout_queue || !m->laundry) {
2899			   /*
2900			    * it's either the same page that someone else has
2901			    * started cleaning (or it's finished cleaning or
2902			    * been put back on the pageout queue), or
2903			    * the page has been freed or we have found a
2904			    * new page at this offset... in all of these cases
2905			    * we merely need to release the activity_in_progress
2906			    * we took when we put the page on the pageout queue
2907			    */
2908			   vm_object_activity_end(object);
2909			   vm_object_unlock(object);
2910
2911			   vm_page_lockspin_queues();
2912			   continue;
2913		   }
2914		   if (!object->pager_initialized) {
2915
2916			   /*
2917			    *	If there is no memory object for the page, create
2918			    *	one and hand it to the default pager.
2919			    */
2920
2921			   if (!object->pager_initialized)
2922			           vm_object_collapse(object,
2923						      (vm_object_offset_t) 0,
2924						      TRUE);
2925			   if (!object->pager_initialized)
2926			           vm_object_pager_create(object);
2927			   if (!object->pager_initialized) {
2928			           /*
2929				    *	Still no pager for the object.
2930				    *	Reactivate the page.
2931				    *
2932				    *	Should only happen if there is no
2933				    *	default pager.
2934				    */
2935				   m->pageout = FALSE;
2936
2937			           vm_page_lockspin_queues();
2938
2939				   vm_pageout_throttle_up(m);
2940				   vm_page_activate(m);
2941				   vm_pageout_dirty_no_pager++;
2942
2943				   vm_page_unlock_queues();
2944
2945				   /*
2946				    *	And we are done with it.
2947				    */
2948			           vm_object_activity_end(object);
2949				   vm_object_unlock(object);
2950
2951				   vm_page_lockspin_queues();
2952				   continue;
2953			   }
2954		   }
2955		   pager = object->pager;
2956
2957	           if (pager == MEMORY_OBJECT_NULL) {
2958		           /*
2959			    * This pager has been destroyed by either
2960			    * memory_object_destroy or vm_object_destroy, and
2961			    * so there is nowhere for the page to go.
2962			    */
2963			   if (m->pageout) {
2964				   /*
2965				    * Just free the page... VM_PAGE_FREE takes
2966				    * care of cleaning up all the state...
2967				    * including doing the vm_pageout_throttle_up
2968				    */
2969				   VM_PAGE_FREE(m);
2970			   } else {
2971			           vm_page_lockspin_queues();
2972
2973				   vm_pageout_throttle_up(m);
2974				   vm_page_activate(m);
2975
2976				   vm_page_unlock_queues();
2977
2978				   /*
2979				    *	And we are done with it.
2980				    */
2981			   }
2982			   vm_object_activity_end(object);
2983			   vm_object_unlock(object);
2984
2985			   vm_page_lockspin_queues();
2986			   continue;
2987		   }
2988#if 0
2989		   /*
2990		    * we don't hold the page queue lock
2991		    * so this check isn't safe to make
2992		    */
2993		   VM_PAGE_CHECK(m);
2994#endif
2995		   /*
2996		    * give back the activity_in_progress reference we
2997		    * took when we queued up this page and replace it
2998		    * with a paging_in_progress reference that will
2999		    * also keep the paging offset from changing and
3000		    * prevent the object from terminating
3001		    */
3002		   vm_object_activity_end(object);
3003		   vm_object_paging_begin(object);
3004		   vm_object_unlock(object);
3005
3006                   /*
3007		    * Send the data to the pager.
3008		    * any pageout clustering happens there
3009		    */
3010		   memory_object_data_return(pager,
3011					     m->offset + object->paging_offset,
3012					     PAGE_SIZE,
3013					     NULL,
3014					     NULL,
3015					     FALSE,
3016					     FALSE,
3017					     0);
3018
3019		   vm_object_lock(object);
3020		   vm_object_paging_end(object);
3021		   vm_object_unlock(object);
3022
3023		   vm_pageout_io_throttle();
3024
3025		   vm_page_lockspin_queues();
3026	}
3027	q->pgo_busy = FALSE;
3028	q->pgo_idle = TRUE;
3029
3030	assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3031	vm_page_unlock_queues();
3032
3033	thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) q);
3034	/*NOTREACHED*/
3035}
3036
3037
3038static void
3039vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
3040{
3041	vm_page_t	m = NULL;
3042	vm_object_t	object;
3043	vm_object_offset_t offset;
3044	memory_object_t	pager;
3045
3046
3047	if (vm_pageout_internal_iothread != THREAD_NULL)
3048		current_thread()->options &= ~TH_OPT_VMPRIV;
3049
3050	vm_page_lockspin_queues();
3051
3052        while ( !queue_empty(&q->pgo_pending) ) {
3053
3054		   q->pgo_busy = TRUE;
3055		   queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
3056		   if (m->object->object_slid) {
3057			   panic("slid page %p not allowed on this path\n", m);
3058		   }
3059		   VM_PAGE_CHECK(m);
3060		   m->pageout_queue = FALSE;
3061		   m->pageq.next = NULL;
3062		   m->pageq.prev = NULL;
3063
3064		   /*
3065		    * grab a snapshot of the object and offset this
3066		    * page is tabled in so that we can relookup this
3067		    * page after we've taken the object lock - these
3068		    * fields are stable while we hold the page queues lock
3069		    * but as soon as we drop it, there is nothing to keep
3070		    * this page in this object... we hold an activity_in_progress
3071		    * on this object which will keep it from terminating
3072		    */
3073		   object = m->object;
3074		   offset = m->offset;
3075
3076		   vm_page_unlock_queues();
3077
3078		   vm_object_lock(object);
3079
3080		   m = vm_page_lookup(object, offset);
3081
3082		   if (m == NULL ||
3083		       m->busy || m->cleaning || m->pageout_queue || !m->laundry) {
3084			   /*
3085			    * it's either the same page that someone else has
3086			    * started cleaning (or it's finished cleaning or
3087			    * been put back on the pageout queue), or
3088			    * the page has been freed or we have found a
3089			    * new page at this offset... in all of these cases
3090			    * we merely need to release the activity_in_progress
3091			    * we took when we put the page on the pageout queue
3092			    */
3093			   vm_object_activity_end(object);
3094			   vm_object_unlock(object);
3095
3096			   vm_page_lockspin_queues();
3097			   continue;
3098		   }
3099		   pager = object->pager;
3100
3101	           if (pager == MEMORY_OBJECT_NULL) {
3102		           /*
3103			    * This pager has been destroyed by either
3104			    * memory_object_destroy or vm_object_destroy, and
3105			    * so there is nowhere for the page to go.
3106			    */
3107			   if (m->pageout) {
3108				   /*
3109				    * Just free the page... VM_PAGE_FREE takes
3110				    * care of cleaning up all the state...
3111				    * including doing the vm_pageout_throttle_up
3112				    */
3113				   VM_PAGE_FREE(m);
3114			   } else {
3115			           vm_page_lockspin_queues();
3116
3117				   vm_pageout_throttle_up(m);
3118				   vm_page_activate(m);
3119
3120				   vm_page_unlock_queues();
3121
3122				   /*
3123				    *	And we are done with it.
3124				    */
3125			   }
3126			   vm_object_activity_end(object);
3127			   vm_object_unlock(object);
3128
3129			   vm_page_lockspin_queues();
3130			   continue;
3131		   }
3132#if 0
3133		   /*
3134		    * we don't hold the page queue lock
3135		    * so this check isn't safe to make
3136		    */
3137		   VM_PAGE_CHECK(m);
3138#endif
3139		   /*
3140		    * give back the activity_in_progress reference we
3141		    * took when we queued up this page and replace it
3142		    * with a paging_in_progress reference that will
3143		    * also keep the paging offset from changing and
3144		    * prevent the object from terminating
3145		    */
3146		   vm_object_activity_end(object);
3147		   vm_object_paging_begin(object);
3148		   vm_object_unlock(object);
3149
3150                   /*
3151		    * Send the data to the pager.
3152		    * any pageout clustering happens there
3153		    */
3154		   memory_object_data_return(pager,
3155					     m->offset + object->paging_offset,
3156					     PAGE_SIZE,
3157					     NULL,
3158					     NULL,
3159					     FALSE,
3160					     FALSE,
3161					     0);
3162
3163		   vm_object_lock(object);
3164		   vm_object_paging_end(object);
3165		   vm_object_unlock(object);
3166
3167		   vm_pageout_io_throttle();
3168
3169		   vm_page_lockspin_queues();
3170	}
3171	q->pgo_busy = FALSE;
3172	q->pgo_idle = TRUE;
3173
3174	assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3175	vm_page_unlock_queues();
3176
3177	thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
3178	/*NOTREACHED*/
3179}
3180
3181
3182uint32_t	vm_compressor_failed;
3183
3184static void
3185vm_pageout_iothread_internal_continue(struct cq *cq)
3186{
3187	struct vm_pageout_queue *q;
3188	vm_page_t	m = NULL;
3189	vm_object_t	object;
3190	memory_object_t	pager;
3191	boolean_t	pgo_draining;
3192	vm_page_t   local_q;
3193	int	    local_cnt;
3194	vm_page_t   local_freeq = NULL;
3195	int         local_freed = 0;
3196	int	    local_batch_size;
3197	kern_return_t	retval;
3198
3199
3200	KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
3201
3202	q = cq->q;
3203	local_batch_size = q->pgo_maxlaundry / (vm_compressor_thread_count * 4);
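	/*
	 * each pass pulls at most local_batch_size pages off the queue:
	 * the queue's max laundry divided among the compressor threads and
	 * then quartered again, presumably to keep batches small enough
	 * that laundry accounting and wakeups stay responsive.
	 */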
3204
3205	while (TRUE) {
3206
3207		local_cnt = 0;
3208		local_q = NULL;
3209
3210		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3211
3212		vm_page_lock_queues();
3213
3214		KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3215
3216		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3217
3218		while ( !queue_empty(&q->pgo_pending) && local_cnt <  local_batch_size) {
3219
3220			queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
3221
3222			VM_PAGE_CHECK(m);
3223
3224			m->pageout_queue = FALSE;
3225			m->pageq.prev = NULL;
3226
3227			m->pageq.next = (queue_entry_t)local_q;
3228			local_q = m;
3229			local_cnt++;
3230		}
3231		if (local_q == NULL)
3232			break;
3233
3234		q->pgo_busy = TRUE;
3235
3236		if ((pgo_draining = q->pgo_draining) == FALSE)
3237			vm_pageout_throttle_up_batch(q, local_cnt);
3238
3239		vm_page_unlock_queues();
3240
3241		KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3242
3243		while (local_q) {
3244
3245			m = local_q;
3246			local_q = (vm_page_t)m->pageq.next;
3247			m->pageq.next = NULL;
3248
3249			if (m->object->object_slid) {
3250				panic("slid page %p not allowed on this path\n", m);
3251			}
3252
3253			object = m->object;
3254			pager = object->pager;
3255
3256			if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL)  {
3257
3258				KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
3259
3260				vm_object_lock(object);
3261
3262				/*
3263				 * If there is no memory object for the page, create
3264				 * one and hand it to the compression pager.
3265				 */
3266
3267				if (!object->pager_initialized)
3268					vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
3269				if (!object->pager_initialized)
3270					vm_object_compressor_pager_create(object);
3271
3272				if (!object->pager_initialized) {
3273					/*
3274					 * Still no pager for the object.
3275					 * Reactivate the page.
3276					 *
3277					 * Should only happen if there is no
3278					 * compression pager
3279					 */
3280					m->pageout = FALSE;
3281					m->laundry = FALSE;
3282					PAGE_WAKEUP_DONE(m);
3283
3284					vm_page_lockspin_queues();
3285					vm_page_activate(m);
3286					vm_pageout_dirty_no_pager++;
3287					vm_page_unlock_queues();
3288
3289					/*
3290					 *	And we are done with it.
3291					 */
3292					vm_object_activity_end(object);
3293					vm_object_unlock(object);
3294
3295					continue;
3296				}
3297				pager = object->pager;
3298
3299				if (pager == MEMORY_OBJECT_NULL) {
3300					/*
3301					 * This pager has been destroyed by either
3302					 * memory_object_destroy or vm_object_destroy, and
3303					 * so there is nowhere for the page to go.
3304					 */
3305					if (m->pageout) {
3306						/*
3307						 * Just free the page... VM_PAGE_FREE takes
3308						 * care of cleaning up all the state...
3309						 * including doing the vm_pageout_throttle_up
3310						 */
3311						VM_PAGE_FREE(m);
3312					} else {
3313						m->laundry = FALSE;
3314						PAGE_WAKEUP_DONE(m);
3315
3316						vm_page_lockspin_queues();
3317						vm_page_activate(m);
3318						vm_page_unlock_queues();
3319
3320						/*
3321						 *	And we are done with it.
3322						 */
3323					}
3324					vm_object_activity_end(object);
3325					vm_object_unlock(object);
3326
3327					continue;
3328				}
3329				vm_object_unlock(object);
3330
3331				KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
3332			}
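			/*
			 * don't dig too deep into the free page reserve... if the
			 * free count has dropped below the portion the compressor
			 * is allowed to consume, first hand back anything on our
			 * local free list; failing that, wait here as a privileged
			 * waiter, waking the pageout daemon if we're the first one
			 * queued up.
			 */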
3333			while (vm_page_free_count < (vm_page_free_reserved - COMPRESSOR_FREE_RESERVED_LIMIT)) {
3334				kern_return_t	wait_result;
3335				int		need_wakeup = 0;
3336
3337				if (local_freeq) {
3338					vm_page_free_list(local_freeq, TRUE);
3339
3340					local_freeq = NULL;
3341					local_freed = 0;
3342
3343					continue;
3344				}
3345				lck_mtx_lock_spin(&vm_page_queue_free_lock);
3346
3347				if (vm_page_free_count < (vm_page_free_reserved - COMPRESSOR_FREE_RESERVED_LIMIT)) {
3348
3349					if (vm_page_free_wanted_privileged++ == 0)
3350						need_wakeup = 1;
3351					wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
3352
3353					lck_mtx_unlock(&vm_page_queue_free_lock);
3354
3355					if (need_wakeup)
3356						thread_wakeup((event_t)&vm_page_free_wanted);
3357
3358					if (wait_result == THREAD_WAITING)
3359						thread_block(THREAD_CONTINUE_NULL);
3360				} else
3361					lck_mtx_unlock(&vm_page_queue_free_lock);
3362			}
3363			retval = vm_compressor_pager_put(pager, m->offset + object->paging_offset, m->phys_page, &cq->current_chead, cq->scratch_buf);
3364
3365			vm_object_lock(object);
3366			m->laundry = FALSE;
3367			m->pageout = FALSE;
3368
3369			if (retval == KERN_SUCCESS) {
3370
3371				vm_page_compressions_failing = FALSE;
3372
3373				VM_STAT_INCR(compressions);
3374
3375				if (m->tabled)
3376					vm_page_remove(m, TRUE);
3377				vm_object_activity_end(object);
3378				vm_object_unlock(object);
3379
3380				m->pageq.next = (queue_entry_t)local_freeq;
3381				local_freeq = m;
3382				local_freed++;
3383
3384			} else {
3385				PAGE_WAKEUP_DONE(m);
3386
3387				vm_page_lockspin_queues();
3388
3389				vm_page_activate(m);
3390				vm_compressor_failed++;
3391
3392				vm_page_compressions_failing = TRUE;
3393
3394				vm_page_unlock_queues();
3395
3396				vm_object_activity_end(object);
3397				vm_object_unlock(object);
3398			}
3399		}
3400		if (local_freeq) {
3401			vm_page_free_list(local_freeq, TRUE);
3402
3403			local_freeq = NULL;
3404			local_freed = 0;
3405		}
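		/*
		 * if the queue was draining, the throttle_up was deferred
		 * at the top of this pass... credit the laundry count for
		 * this batch now
		 */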
3406		if (pgo_draining == TRUE) {
3407			vm_page_lockspin_queues();
3408			vm_pageout_throttle_up_batch(q, local_cnt);
3409			vm_page_unlock_queues();
3410		}
3411	}
3412	KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
3413
3414	/*
3415	 * queue lock is held and our q is empty
3416	 */
3417	q->pgo_busy = FALSE;
3418	q->pgo_idle = TRUE;
3419
3420	assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3421	vm_page_unlock_queues();
3422
3423	KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3424
3425	thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
3426	/*NOTREACHED*/
3427}
3428
3429
3430
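/*
 * Switch the internal (iq) and external (eq) pageout threads between
 * throttled and unthrottled I/O policy.  Called and returns with the
 * vm_page queues lock held; the lock is dropped around the calls to
 * proc_set_task_policy_thread().  While hibernate cleaning is in
 * progress, requests to lower the priority are ignored.
 */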
3431static void
3432vm_pageout_adjust_io_throttles(struct vm_pageout_queue *iq, struct vm_pageout_queue *eq, boolean_t req_lowpriority)
3433{
3434	uint32_t 	policy;
3435	boolean_t	set_iq = FALSE;
3436	boolean_t	set_eq = FALSE;
3437
3438	if (hibernate_cleaning_in_progress == TRUE)
3439		req_lowpriority = FALSE;
3440
3441	if ((DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE) && iq->pgo_inited == TRUE && iq->pgo_lowpriority != req_lowpriority)
3442		set_iq = TRUE;
3443
3444	if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority)
3445		set_eq = TRUE;
3446
3447	if (set_iq == TRUE || set_eq == TRUE) {
3448
3449		vm_page_unlock_queues();
3450
3451		if (req_lowpriority == TRUE) {
3452			policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
3453			DTRACE_VM(laundrythrottle);
3454		} else {
3455			policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
3456			DTRACE_VM(laundryunthrottle);
3457		}
3458		if (set_iq == TRUE) {
3459			proc_set_task_policy_thread(kernel_task, iq->pgo_tid, TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
3460
3461			iq->pgo_lowpriority = req_lowpriority;
3462		}
3463		if (set_eq == TRUE) {
3464			proc_set_task_policy_thread(kernel_task, eq->pgo_tid, TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
3465
3466			eq->pgo_lowpriority = req_lowpriority;
3467		}
3468		vm_page_lock_queues();
3469	}
3470}
3471
3472
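/*
 * Startup function for the external pageout thread: mark the thread
 * VM-privileged, default its I/O policy to throttled pageout, register
 * its tid with the external pageout queue, then enter the continuation
 * appropriate to the active pager configuration.
 */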
3473static void
3474vm_pageout_iothread_external(void)
3475{
3476	thread_t	self = current_thread();
3477
3478	self->options |= TH_OPT_VMPRIV;
3479
3480	DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
3481
3482	proc_set_task_policy_thread(kernel_task, self->thread_id, TASK_POLICY_EXTERNAL,
3483	                            TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
3484
3485	vm_page_lock_queues();
3486
3487	vm_pageout_queue_external.pgo_tid = self->thread_id;
3488	vm_pageout_queue_external.pgo_lowpriority = TRUE;
3489	vm_pageout_queue_external.pgo_inited = TRUE;
3490
3491	vm_page_unlock_queues();
3492
3493	if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)
3494		vm_pageout_iothread_external_continue(&vm_pageout_queue_external);
3495	else
3496		vm_pageout_iothread_continue(&vm_pageout_queue_external);
3497
3498	/*NOTREACHED*/
3499}
3500
3501
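/*
 * Startup function for an internal pageout thread.  When the compressor
 * pager is active, set up this thread's compressor state (current
 * compressed-segment head and scratch buffer) before entering the
 * compressor continuation; otherwise fall back to the default-pager
 * continuation.
 */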
3502static void
3503vm_pageout_iothread_internal(struct cq *cq)
3504{
3505	thread_t	self = current_thread();
3506
3507	self->options |= TH_OPT_VMPRIV;
3508
3509	if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE) {
3510		DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
3511
3512		proc_set_task_policy_thread(kernel_task, self->thread_id, TASK_POLICY_EXTERNAL,
3513		                            TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
3514	}
3515	vm_page_lock_queues();
3516
3517	vm_pageout_queue_internal.pgo_tid = self->thread_id;
3518	vm_pageout_queue_internal.pgo_lowpriority = TRUE;
3519	vm_pageout_queue_internal.pgo_inited = TRUE;
3520
3521	vm_page_unlock_queues();
3522
3523	if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
3524		cq->q = &vm_pageout_queue_internal;
3525		cq->current_chead = NULL;
3526		cq->scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE);
3527
3528		vm_pageout_iothread_internal_continue(cq);
3529	} else
3530		vm_pageout_iothread_continue(&vm_pageout_queue_internal);
3531
3532	/*NOTREACHED*/
3533}
3534
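/*
 * Register the buffer cache cleanup callout used by
 * vm_pageout_garbage_collect().  The callout can only be installed
 * once; a second registration fails with KERN_FAILURE.
 */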
3535kern_return_t
3536vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
3537{
3538	if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
3539		return KERN_SUCCESS;
3540	} else {
3541		return KERN_FAILURE; /* Already set */
3542	}
3543}
3544
3545
3546extern boolean_t	memorystatus_manual_testing_on;
3547extern unsigned int 	memorystatus_level;
3548
3549
3550
3551#if VM_PRESSURE_EVENTS
3552
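/*
 * Recompute memorystatus_level from the amount of non-compressed memory
 * still available, then walk the pressure-level state machine.  If the
 * level changes, wake the pressure thread (if it isn't already running)
 * and anyone waiting on vm_pressure_changed.  No transitions are made
 * while manual memorystatus testing is in effect.
 */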
3553void
3554vm_pressure_response(void)
3555{
3556
3557
3558	vm_pressure_level_t	old_level = kVMPressureNormal;
3559	int			new_level = -1;
3560
3561	uint64_t		available_memory = (((uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY) * 100);
3562
3563	memorystatus_level = (unsigned int) (available_memory / atop_64(max_mem));
3564
3565	if (memorystatus_manual_testing_on) {
3566		return;
3567	}
3568
3569	old_level = memorystatus_vm_pressure_level;
3570
3571	switch (memorystatus_vm_pressure_level) {
3572
3573		case kVMPressureNormal:
3574		{
3575			if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
3576				new_level = kVMPressureCritical;
3577			}  else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
3578				new_level = kVMPressureWarning;
3579			}
3580			break;
3581		}
3582
3583		case kVMPressureWarning:
3584		case kVMPressureUrgent:
3585		{
3586			if (VM_PRESSURE_WARNING_TO_NORMAL()) {
3587				new_level = kVMPressureNormal;
3588			}  else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
3589				new_level = kVMPressureCritical;
3590			}
3591			break;
3592		}
3593
3594		case kVMPressureCritical:
3595		{
3596			if (VM_PRESSURE_WARNING_TO_NORMAL()) {
3597				new_level = kVMPressureNormal;
3598			}  else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
3599				new_level = kVMPressureWarning;
3600			}
3601			break;
3602		}
3603
3604		default:
3605			return;
3606	}
3607
3608	if (new_level != -1) {
3609		memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
3610
3611		if (old_level != new_level) {
3612			if (vm_pressure_thread_running == FALSE) {
3613				thread_wakeup(&vm_pressure_thread);
3614			}
3615			thread_wakeup(&vm_pressure_changed);
3616		}
3617	}
3618
3619}
3620#endif /* VM_PRESSURE_EVENTS */
3621
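/*
 * Return the current VM pressure level in *pressure_level.  If
 * wait_for_pressure is TRUE, block (interruptibly) until the level
 * differs from the value passed in by the caller.  Fails outright when
 * VM_PRESSURE_EVENTS is not configured.
 */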
3622kern_return_t
3623mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) {
3624
3625#if   !VM_PRESSURE_EVENTS
3626
3627	return KERN_FAILURE;
3628
3629#else /* VM_PRESSURE_EVENTS */
3630
3631	kern_return_t	kr = KERN_SUCCESS;
3632
3633	if (pressure_level != NULL) {
3634
3635		vm_pressure_level_t	old_level = memorystatus_vm_pressure_level;
3636
3637		if (wait_for_pressure == TRUE) {
3638			wait_result_t		wr = 0;
3639
3640			while (old_level == *pressure_level) {
3641				wr = assert_wait((event_t) &vm_pressure_changed,
3642						 THREAD_INTERRUPTIBLE);
3643				if (wr == THREAD_WAITING) {
3644					wr = thread_block(THREAD_CONTINUE_NULL);
3645				}
3646				if (wr == THREAD_INTERRUPTED) {
3647					return KERN_ABORTED;
3648				}
3649				if (wr == THREAD_AWAKENED) {
3650
3651					old_level = memorystatus_vm_pressure_level;
3652
3653					if (old_level != *pressure_level) {
3654						break;
3655					}
3656				}
3657			}
3658		}
3659
3660		*pressure_level = old_level;
3661		kr = KERN_SUCCESS;
3662	} else {
3663		kr = KERN_INVALID_ARGUMENT;
3664	}
3665
3666	return kr;
3667#endif /* VM_PRESSURE_EVENTS */
3668}
3669
3670#if VM_PRESSURE_EVENTS
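/*
 * Continuation for the VM pressure thread: the first entry is setup
 * only; on every subsequent wakeup (from vm_pressure_response) it runs
 * consider_vm_pressure_events() and then blocks again on its own event.
 */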
3671void
3672vm_pressure_thread(void) {
3673	static boolean_t set_up_thread = FALSE;
3674
3675	if (set_up_thread) {
3676		vm_pressure_thread_running = TRUE;
3677		consider_vm_pressure_events();
3678		vm_pressure_thread_running = FALSE;
3679	}
3680
3681	set_up_thread = TRUE;
3682	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
3683	thread_block((thread_continue_t)vm_pressure_thread);
3684}
3685#endif /* VM_PRESSURE_EVENTS */
3686
3687
3688uint32_t vm_pageout_considered_page_last = 0;
3689
3690/*
3691 * called once per-second via "compute_averages"
3692 */
3693void
3694compute_pageout_gc_throttle(void)
3695{
3696	if (vm_pageout_considered_page != vm_pageout_considered_page_last) {
3697
3698		vm_pageout_considered_page_last = vm_pageout_considered_page;
3699
3700		thread_wakeup((event_t) &vm_pageout_garbage_collect);
3701	}
3702}
3703
3704
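/*
 * Garbage-collection thread continuation: reclaim kernel stacks, let the
 * machine layer and the buffer cache return memory, then run the zone
 * garbage collector.  Repeat while the buffer cache keeps freeing large
 * zone elements and the free page count remains below target, then block
 * until compute_pageout_gc_throttle() wakes us again.
 */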
3705static void
3706vm_pageout_garbage_collect(int collect)
3707{
3708
3709	if (collect) {
3710		boolean_t buf_large_zfree = FALSE;
3711		boolean_t first_try = TRUE;
3712
3713		stack_collect();
3714
3715		consider_machine_collect();
3716
3717		do {
3718			if (consider_buffer_cache_collect != NULL) {
3719				buf_large_zfree = (*consider_buffer_cache_collect)(0);
3720			}
3721			if (first_try == TRUE || buf_large_zfree == TRUE) {
3722				/*
3723				 * consider_zone_gc should be last, because the other operations
3724				 * might return memory to zones.
3725				 */
3726				consider_zone_gc(buf_large_zfree);
3727			}
3728			first_try = FALSE;
3729
3730		} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
3731
3732		consider_machine_adjust();
3733	}
3734	assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
3735
3736	thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
3737	/*NOTREACHED*/
3738}
3739
3740
3741#if VM_PAGE_BUCKETS_CHECK
3742#if VM_PAGE_FAKE_BUCKETS
3743extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
3744#endif /* VM_PAGE_FAKE_BUCKETS */
3745#endif /* VM_PAGE_BUCKETS_CHECK */
3746
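/*
 * Main entry point for the pageout daemon: raise our scheduling
 * priority, initialize the pageout tunables and the internal/external
 * pageout queues, start the external iothread, the garbage collector
 * and (if configured) the pressure thread, then become
 * vm_pageout_scan() by way of vm_pageout_continue().
 */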
3747void
3748vm_pageout(void)
3749{
3750	thread_t	self = current_thread();
3751	thread_t	thread;
3752	kern_return_t	result;
3753	spl_t		s;
3754
3755	/*
3756	 * Set thread privileges.
3757	 */
3758	s = splsched();
3759	thread_lock(self);
3760	self->priority = BASEPRI_PREEMPT - 1;
3761	set_sched_pri(self, self->priority);
3762	thread_unlock(self);
3763
3764	if (!self->reserved_stack)
3765		self->reserved_stack = self->kernel_stack;
3766
3767	splx(s);
3768
3769	/*
3770	 *	Initialize some paging parameters.
3771	 */
3772
3773	if (vm_pageout_swap_wait == 0)
3774		vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
3775
3776	if (vm_pageout_idle_wait == 0)
3777		vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
3778
3779	if (vm_pageout_burst_wait == 0)
3780		vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
3781
3782	if (vm_pageout_empty_wait == 0)
3783		vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
3784
3785	if (vm_pageout_deadlock_wait == 0)
3786		vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
3787
3788	if (vm_pageout_deadlock_relief == 0)
3789		vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
3790
3791	if (vm_pageout_inactive_relief == 0)
3792		vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
3793
3794	if (vm_pageout_burst_active_throttle == 0)
3795	        vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
3796
3797	if (vm_pageout_burst_inactive_throttle == 0)
3798	        vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
3799
3800#if !CONFIG_JETSAM
3801	vm_page_filecache_min = (uint32_t) (max_mem / PAGE_SIZE) / 20;
3802	if (vm_page_filecache_min < VM_PAGE_FILECACHE_MIN)
3803		vm_page_filecache_min = VM_PAGE_FILECACHE_MIN;
3804#endif
3805
3806	/*
3807	 * Set kernel task to low backing store privileged
3808	 * status
3809	 */
3810	task_lock(kernel_task);
3811	kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
3812	task_unlock(kernel_task);
3813
3814	vm_page_free_count_init = vm_page_free_count;
3815
3816	/*
3817	 * even if we've already called vm_page_free_reserve
3818	 * call it again here to ensure that the targets are
3819	 * accurately calculated (it uses vm_page_free_count_init)
3820	 * calling it with an arg of 0 will not change the reserve
3821	 * but will re-calculate free_min and free_target
3822	 */
3823	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
3824		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
3825	} else
3826		vm_page_free_reserve(0);
3827
3828
3829	queue_init(&vm_pageout_queue_external.pgo_pending);
3830	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
3831	vm_pageout_queue_external.pgo_laundry = 0;
3832	vm_pageout_queue_external.pgo_idle = FALSE;
3833	vm_pageout_queue_external.pgo_busy = FALSE;
3834	vm_pageout_queue_external.pgo_throttled = FALSE;
3835	vm_pageout_queue_external.pgo_draining = FALSE;
3836	vm_pageout_queue_external.pgo_lowpriority = FALSE;
3837	vm_pageout_queue_external.pgo_tid = -1;
3838	vm_pageout_queue_external.pgo_inited = FALSE;
3839
3840
3841	queue_init(&vm_pageout_queue_internal.pgo_pending);
3842	vm_pageout_queue_internal.pgo_maxlaundry = 0;
3843	vm_pageout_queue_internal.pgo_laundry = 0;
3844	vm_pageout_queue_internal.pgo_idle = FALSE;
3845	vm_pageout_queue_internal.pgo_busy = FALSE;
3846	vm_pageout_queue_internal.pgo_throttled = FALSE;
3847	vm_pageout_queue_internal.pgo_draining = FALSE;
3848	vm_pageout_queue_internal.pgo_lowpriority = FALSE;
3849	vm_pageout_queue_internal.pgo_tid = -1;
3850	vm_pageout_queue_internal.pgo_inited = FALSE;
3851
3852	/* internal pageout thread started when default pager registered first time */
3853	/* external pageout and garbage collection threads started here */
3854
3855	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
3856					      BASEPRI_PREEMPT - 1,
3857					      &vm_pageout_external_iothread);
3858	if (result != KERN_SUCCESS)
3859		panic("vm_pageout_iothread_external: create failed");
3860
3861	thread_deallocate(vm_pageout_external_iothread);
3862
3863	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
3864					      BASEPRI_DEFAULT,
3865					      &thread);
3866	if (result != KERN_SUCCESS)
3867		panic("vm_pageout_garbage_collect: create failed");
3868
3869	thread_deallocate(thread);
3870
3871#if VM_PRESSURE_EVENTS
3872	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
3873						BASEPRI_DEFAULT,
3874						&thread);
3875
3876	if (result != KERN_SUCCESS)
3877		panic("vm_pressure_thread: create failed");
3878
3879	thread_deallocate(thread);
3880#endif
3881
3882	vm_object_reaper_init();
3883
3884	if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)
3885		vm_compressor_pager_init();
3886
3887#if VM_PAGE_BUCKETS_CHECK
3888#if VM_PAGE_FAKE_BUCKETS
3889	printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
3890	       vm_page_fake_buckets_start, vm_page_fake_buckets_end);
3891	pmap_protect(kernel_pmap,
3892		     vm_page_fake_buckets_start,
3893		     vm_page_fake_buckets_end,
3894		     VM_PROT_READ);
3895//	*(char *) vm_page_fake_buckets_start = 'x';	/* panic! */
3896#endif /* VM_PAGE_FAKE_BUCKETS */
3897#endif /* VM_PAGE_BUCKETS_CHECK */
3898
3899	vm_pageout_continue();
3900
3901	/*
3902	 * Unreached code!
3903	 *
3904	 * The vm_pageout_continue() call above never returns, so the code below is never
3905	 * executed.  We take advantage of this to declare several DTrace VM related probe
3906	 * points that our kernel doesn't have an analog for.  These are probe points that
3907	 * exist in Solaris and are in the DTrace documentation, so people may have written
3908	 * scripts that use them.  Declaring the probe points here means their scripts will
3909	 * compile and execute which we want for portability of the scripts, but since this
3910	 * section of code is never reached, the probe points will simply never fire.  Yes,
3911	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
3912	 * Solaris specific VM events in mind, not portability to different VM implementations.
3913	 */
3914
3915	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
3916	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
3917	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
3918	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
3919	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
3920	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
3921	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
3922	/*NOTREACHED*/
3923}
3924
3925
3926
3927#define MAX_COMPRESSOR_THREAD_COUNT	8
3928
3929struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
3930
3931int vm_compressor_thread_count = 2;
3932
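/*
 * Start the internal pageout (compressor) iothreads.  With the
 * compressor pager active, the thread count is sized from the number of
 * CPUs (capped at the compile-time maximum) and pgo_maxlaundry is scaled
 * to match; otherwise a single thread and the default laundry limit are
 * used.
 */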
3933kern_return_t
3934vm_pageout_internal_start(void)
3935{
3936	kern_return_t	result;
3937	int		i;
3938	host_basic_info_data_t hinfo;
3939
3940	if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
3941		mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
3942#define BSD_HOST 1
3943		host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
3944
3945		assert(hinfo.max_cpus > 0);
3946
3947		if (vm_compressor_thread_count >= hinfo.max_cpus)
3948			vm_compressor_thread_count = hinfo.max_cpus - 1;
3949		if (vm_compressor_thread_count <= 0)
3950			vm_compressor_thread_count = 1;
3951		else if (vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT)
3952			vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
3953
3954		vm_pageout_queue_internal.pgo_maxlaundry = (vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
3955	} else {
3956		vm_compressor_thread_count = 1;
3957		vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
3958	}
3959
3960	for (i = 0; i < vm_compressor_thread_count; i++) {
3961
3962		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
3963		if (result == KERN_SUCCESS)
3964			thread_deallocate(vm_pageout_internal_iothread);
3965		else
3966			break;
3967	}
3968	return result;
3969}
3970
3971
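/*
 * Allocate and initialize a upl.  For UPL_CREATE_INTERNAL the
 * upl_page_info array is allocated inline right after the upl structure,
 * and for UPL_CREATE_LITE a zeroed bitmap ("lite list") covering the
 * request is appended as well.
 */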
3972static upl_t
3973upl_create(int type, int flags, upl_size_t size)
3974{
3975	upl_t	upl;
3976	vm_size_t	page_field_size = 0;
3977	int	upl_flags = 0;
3978	vm_size_t	upl_size  = sizeof(struct upl);
3979
3980	size = round_page_32(size);
3981
3982	if (type & UPL_CREATE_LITE) {
3983		page_field_size = (atop(size) + 7) >> 3;
3984		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
3985
3986		upl_flags |= UPL_LITE;
3987	}
3988	if (type & UPL_CREATE_INTERNAL) {
3989		upl_size += sizeof(struct upl_page_info) * atop(size);
3990
3991		upl_flags |= UPL_INTERNAL;
3992	}
3993	upl = (upl_t)kalloc(upl_size + page_field_size);
3994
3995	if (page_field_size)
3996	        bzero((char *)upl + upl_size, page_field_size);
3997
3998	upl->flags = upl_flags | flags;
3999	upl->src_object = NULL;
4000	upl->kaddr = (vm_offset_t)0;
4001	upl->size = 0;
4002	upl->map_object = NULL;
4003	upl->ref_count = 1;
4004	upl->ext_ref_count = 0;
4005	upl->highest_page = 0;
4006	upl_lock_init(upl);
4007	upl->vector_upl = NULL;
4008#if UPL_DEBUG
4009	upl->ubc_alias1 = 0;
4010	upl->ubc_alias2 = 0;
4011
4012	upl->upl_creator = current_thread();
4013	upl->upl_state = 0;
4014	upl->upl_commit_index = 0;
4015	bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
4016
4017	upl->uplq.next = 0;
4018	upl->uplq.prev = 0;
4019
4020	(void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
4021#endif /* UPL_DEBUG */
4022
4023	return(upl);
4024}
4025
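/*
 * Tear down a upl once its last reference has been dropped: unhook it
 * from its object's upl queue (debug builds only), release the shadow
 * map_object if one was inserted, and free the upl along with any
 * inline page list and lite bitmap.
 */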
4026static void
4027upl_destroy(upl_t upl)
4028{
4029	int	page_field_size;  /* bit field in word size buf */
4030        int	size;
4031
4032	if (upl->ext_ref_count) {
4033		panic("upl(%p) ext_ref_count", upl);
4034	}
4035
4036#if UPL_DEBUG
4037	if ( !(upl->flags & UPL_VECTOR)) {
4038		vm_object_t	object;
4039
4040		if (upl->flags & UPL_SHADOWED) {
4041			object = upl->map_object->shadow;
4042		} else {
4043			object = upl->map_object;
4044		}
4045		vm_object_lock(object);
4046		queue_remove(&object->uplq, upl, upl_t, uplq);
4047		vm_object_activity_end(object);
4048		vm_object_collapse(object, 0, TRUE);
4049		vm_object_unlock(object);
4050	}
4051#endif /* UPL_DEBUG */
4052	/*
4053	 * drop a reference on the map_object whether or
4054	 * not a pageout object is inserted
4055	 */
4056	if (upl->flags & UPL_SHADOWED)
4057		vm_object_deallocate(upl->map_object);
4058
4059        if (upl->flags & UPL_DEVICE_MEMORY)
4060	        size = PAGE_SIZE;
4061	else
4062	        size = upl->size;
4063	page_field_size = 0;
4064
4065	if (upl->flags & UPL_LITE) {
4066		page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4067		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
4068	}
4069	upl_lock_destroy(upl);
4070	upl->vector_upl = (vector_upl_t) 0xfeedbeef;
4071
4072	if (upl->flags & UPL_INTERNAL) {
4073		kfree(upl,
4074		      sizeof(struct upl) +
4075		      (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
4076		      + page_field_size);
4077	} else {
4078		kfree(upl, sizeof(struct upl) + page_field_size);
4079	}
4080}
4081
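/*
 * Drop a reference on the upl; the upl (and any associated vector upl
 * state) is destroyed when the last reference goes away.
 */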
4082void
4083upl_deallocate(upl_t upl)
4084{
4085	if (--upl->ref_count == 0) {
4086		if (vector_upl_is_valid(upl))
4087			vector_upl_deallocate(upl);
4088		upl_destroy(upl);
4089	}
4090}
4091
4092#if DEVELOPMENT || DEBUG
4093/*
4094 * Statistics about UPL enforcement of copy-on-write obligations.
4095 */
4096unsigned long upl_cow = 0;
4097unsigned long upl_cow_again = 0;
4098unsigned long upl_cow_pages = 0;
4099unsigned long upl_cow_again_pages = 0;
4100
4101unsigned long iopl_cow = 0;
4102unsigned long iopl_cow_pages = 0;
4103#endif
4104
4105/*
4106 *	Routine:	vm_object_upl_request
4107 *	Purpose:
4108 *		Cause the population of a portion of a vm_object.
4109 *		Depending on the nature of the request, the pages
4110 *		returned may contain valid data or be uninitialized.
4111 *		A page list structure, listing the physical pages,
4112 *		will be returned upon request.
4113 *		This function is called by the file system or any other
4114 *		supplier of backing store to a pager.
4115 *		IMPORTANT NOTE: The caller must still respect the relationship
4116 *		between the vm_object and its backing memory object.  The
4117 *		caller MUST NOT substitute changes in the backing file
4118 *		without first doing a memory_object_lock_request on the
4119 *		target range unless it is known that the pages are not
4120 *		shared with another entity at the pager level.
4121 *		Copy_in_to:
4122 *			if a page list structure is present
4123 *			return the mapped physical pages, where a
4124 *			page is not present, return a non-initialized
4125 *			one.  If the no_sync bit is turned on, don't
4126 *			call the pager unlock to synchronize with other
4127 *			possible copies of the page. Leave pages busy
4128 *			in the original object, if a page list structure
4129 *			was specified.  When a commit of the page list
4130 *			pages is done, the dirty bit will be set for each one.
4131 *		Copy_out_from:
4132 *			If a page list structure is present, return
4133 *			all mapped pages.  Where a page does not exist
4134 *			map a zero filled one. Leave pages busy in
4135 *			the original object.  If a page list structure
4136 *			is not specified, this call is a no-op.
4137 *
4138 *		Note:  access of default pager objects has a rather interesting
4139 *		twist.  The caller of this routine, presumably the file system
4140 *		page cache handling code, will never actually make a request
4141 *		against a default pager backed object.  Only the default
4142 *		pager will make requests on backing store related vm_objects.
4143 *		In this way the default pager can maintain the relationship
4144 *		between backing store files (abstract memory objects) and
4145 *		the vm_objects (cache objects) they support.
4146 *
4147 */
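/*
 *	Illustrative sketch only (not part of the original interface notes,
 *	and the names object/offset/size stand in for the caller's own):
 *	a caller requesting an internal page list might look roughly like
 *
 *		upl_t			upl;
 *		upl_page_info_t		*pl;
 *		unsigned int		count = MAX_UPL_SIZE;
 *		kern_return_t		kr;
 *
 *		kr = vm_object_upl_request(object, offset, size, &upl,
 *					   NULL, &count,
 *					   UPL_SET_INTERNAL | UPL_SET_LITE);
 *		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 *		...operate on the busy pages described by pl...
 *		upl_commit(upl, pl, count);	(or upl_abort() on error)
 *		upl_deallocate(upl);
 *
 *	The exact flags and the commit/abort path depend on the caller.
 */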
4148
4149__private_extern__ kern_return_t
4150vm_object_upl_request(
4151	vm_object_t		object,
4152	vm_object_offset_t	offset,
4153	upl_size_t		size,
4154	upl_t			*upl_ptr,
4155	upl_page_info_array_t	user_page_list,
4156	unsigned int		*page_list_count,
4157	int			cntrl_flags)
4158{
4159	vm_page_t		dst_page = VM_PAGE_NULL;
4160	vm_object_offset_t	dst_offset;
4161	upl_size_t		xfer_size;
4162	unsigned int		size_in_pages;
4163	boolean_t		dirty;
4164	boolean_t		hw_dirty;
4165	upl_t			upl = NULL;
4166	unsigned int		entry;
4167#if MACH_CLUSTER_STATS
4168	boolean_t		encountered_lrp = FALSE;
4169#endif
4170	vm_page_t		alias_page = NULL;
4171        int			refmod_state = 0;
4172	wpl_array_t 		lite_list = NULL;
4173	vm_object_t		last_copy_object;
4174	struct	vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
4175	struct	vm_page_delayed_work	*dwp;
4176	int			dw_count;
4177	int			dw_limit;
4178
4179	if (cntrl_flags & ~UPL_VALID_FLAGS) {
4180		/*
4181		 * For forward compatibility's sake,
4182		 * reject any unknown flag.
4183		 */
4184		return KERN_INVALID_VALUE;
4185	}
4186	if ( (!object->internal) && (object->paging_offset != 0) )
4187		panic("vm_object_upl_request: external object with non-zero paging offset\n");
4188	if (object->phys_contiguous)
4189	        panic("vm_object_upl_request: contiguous object specified\n");
4190
4191
4192	if ((size / PAGE_SIZE) > MAX_UPL_SIZE)
4193		size = MAX_UPL_SIZE * PAGE_SIZE;
4194
4195	if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
4196	        *page_list_count = MAX_UPL_SIZE;
4197
4198	if (cntrl_flags & UPL_SET_INTERNAL) {
4199	        if (cntrl_flags & UPL_SET_LITE) {
4200
4201			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
4202
4203			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
4204			lite_list = (wpl_array_t)
4205					(((uintptr_t)user_page_list) +
4206					((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4207			if (size == 0) {
4208				user_page_list = NULL;
4209				lite_list = NULL;
4210			}
4211		} else {
4212		        upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
4213
4214			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
4215			if (size == 0) {
4216				user_page_list = NULL;
4217			}
4218		}
4219	} else {
4220	        if (cntrl_flags & UPL_SET_LITE) {
4221
4222			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
4223
4224			lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4225			if (size == 0) {
4226				lite_list = NULL;
4227			}
4228		} else {
4229		        upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
4230		}
4231	}
4232	*upl_ptr = upl;
4233
4234	if (user_page_list)
4235	        user_page_list[0].device = FALSE;
4236
4237	if (cntrl_flags & UPL_SET_LITE) {
4238	        upl->map_object = object;
4239	} else {
4240	        upl->map_object = vm_object_allocate(size);
4241		/*
4242		 * No need to lock the new object: nobody else knows
4243		 * about it yet, so it's all ours so far.
4244		 */
4245		upl->map_object->shadow = object;
4246		upl->map_object->pageout = TRUE;
4247		upl->map_object->can_persist = FALSE;
4248		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
4249		upl->map_object->vo_shadow_offset = offset;
4250		upl->map_object->wimg_bits = object->wimg_bits;
4251
4252		VM_PAGE_GRAB_FICTITIOUS(alias_page);
4253
4254		upl->flags |= UPL_SHADOWED;
4255	}
4256	/*
4257	 * ENCRYPTED SWAP:
4258	 * Just mark the UPL as "encrypted" here.
4259	 * We'll actually encrypt the pages later,
4260	 * in upl_encrypt(), when the caller has
4261	 * selected which pages need to go to swap.
4262	 */
4263	if (cntrl_flags & UPL_ENCRYPT)
4264		upl->flags |= UPL_ENCRYPTED;
4265
4266	if (cntrl_flags & UPL_FOR_PAGEOUT)
4267		upl->flags |= UPL_PAGEOUT;
4268
4269	vm_object_lock(object);
4270	vm_object_activity_begin(object);
4271
4272	/*
4273	 * we can lock in the paging_offset once paging_in_progress is set
4274	 */
4275	upl->size = size;
4276	upl->offset = offset + object->paging_offset;
4277
4278#if UPL_DEBUG
4279	vm_object_activity_begin(object);
4280	queue_enter(&object->uplq, upl, upl_t, uplq);
4281#endif /* UPL_DEBUG */
4282
4283	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
4284		/*
4285		 * Honor copy-on-write obligations
4286		 *
4287		 * The caller is gathering these pages and
4288		 * might modify their contents.  We need to
4289		 * make sure that the copy object has its own
4290		 * private copies of these pages before we let
4291		 * the caller modify them.
4292		 */
4293		vm_object_update(object,
4294				 offset,
4295				 size,
4296				 NULL,
4297				 NULL,
4298				 FALSE,	/* should_return */
4299				 MEMORY_OBJECT_COPY_SYNC,
4300				 VM_PROT_NO_CHANGE);
4301#if DEVELOPMENT || DEBUG
4302		upl_cow++;
4303		upl_cow_pages += size >> PAGE_SHIFT;
4304#endif
4305	}
4306	/*
4307	 * remember which copy object we synchronized with
4308	 */
4309	last_copy_object = object->copy;
4310	entry = 0;
4311
4312	xfer_size = size;
4313	dst_offset = offset;
4314	size_in_pages = size / PAGE_SIZE;
4315
4316	dwp = &dw_array[0];
4317	dw_count = 0;
4318	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
4319
4320	if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
4321	    object->resident_page_count < (MAX_UPL_SIZE * 2))
4322		object->scan_collisions = 0;
4323
4324	while (xfer_size) {
4325
4326		dwp->dw_mask = 0;
4327
4328		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4329			vm_object_unlock(object);
4330			VM_PAGE_GRAB_FICTITIOUS(alias_page);
4331			vm_object_lock(object);
4332		}
4333		if (cntrl_flags & UPL_COPYOUT_FROM) {
4334		        upl->flags |= UPL_PAGE_SYNC_DONE;
4335
4336			if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
4337				dst_page->fictitious ||
4338				dst_page->absent ||
4339				dst_page->error ||
4340			        dst_page->cleaning ||
4341			        (VM_PAGE_WIRED(dst_page))) {
4342
4343				if (user_page_list)
4344					user_page_list[entry].phys_addr = 0;
4345
4346				goto try_next_page;
4347			}
4348			/*
4349			 * grab this up front...
4350			 * a high percentage of the time we're going to
4351			 * need the hardware modification state a bit later
4352			 * anyway... so we can eliminate an extra call into
4353			 * the pmap layer by grabbing it here and recording it
4354			 */
4355			if (dst_page->pmapped)
4356			        refmod_state = pmap_get_refmod(dst_page->phys_page);
4357			else
4358			        refmod_state = 0;
4359
4360			if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
4361			        /*
4362				 * page is on inactive list and referenced...
4363				 * reactivate it now... this gets it out of the
4364				 * way of vm_pageout_scan which would have to
4365				 * reactivate it upon tripping over it
4366				 */
4367				dwp->dw_mask |= DW_vm_page_activate;
4368			}
4369			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
4370			        /*
4371				 * we're only asking for DIRTY pages to be returned
4372				 */
4373			        if (dst_page->laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
4374				        /*
4375					 * if we were the page stolen by vm_pageout_scan to be
4376					 * cleaned (as opposed to a buddy being clustered in),
4377					 * or this request is not being driven by a PAGEOUT cluster,
4378					 * then we only need to check for the page being dirty or
4379					 * precious to decide whether to return it
4380					 */
4381				        if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
4382					        goto check_busy;
4383					goto dont_return;
4384				}
4385				/*
4386				 * this is a request for a PAGEOUT cluster and this page
4387				 * is merely along for the ride as a 'buddy'... not only
4388				 * does it have to be dirty to be returned, but it also
4389				 * can't have been referenced recently...
4390				 */
4391				if ( (hibernate_cleaning_in_progress == TRUE ||
4392				      (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) || dst_page->throttled)) &&
4393				      ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
4394				        goto check_busy;
4395				}
4396dont_return:
4397				/*
4398				 * if we reach here, we're not to return
4399				 * the page... go on to the next one
4400				 */
4401				if (dst_page->laundry == TRUE) {
4402					/*
4403					 * if we get here, the page is not 'cleaning' (filtered out above).
4404					 * since it has been referenced, remove it from the laundry
4405					 * so we don't pay the cost of an I/O to clean a page
4406					 * we're just going to take back
4407					 */
4408					vm_page_lockspin_queues();
4409
4410					vm_pageout_steal_laundry(dst_page, TRUE);
4411					vm_page_activate(dst_page);
4412
4413					vm_page_unlock_queues();
4414				}
4415				if (user_page_list)
4416				        user_page_list[entry].phys_addr = 0;
4417
4418				goto try_next_page;
4419			}
4420check_busy:
4421			if (dst_page->busy) {
4422			        if (cntrl_flags & UPL_NOBLOCK) {
4423					if (user_page_list)
4424					        user_page_list[entry].phys_addr = 0;
4425
4426					goto try_next_page;
4427				}
4428				/*
4429				 * someone else is playing with the
4430				 * page.  We will have to wait.
4431				 */
4432				PAGE_SLEEP(object, dst_page, THREAD_UNINT);
4433
4434				continue;
4435			}
4436			/*
4437			 * ENCRYPTED SWAP:
4438			 * The caller is gathering this page and might
4439			 * access its contents later on.  Decrypt the
4440			 * page before adding it to the UPL, so that
4441			 * the caller never sees encrypted data.
4442			 */
4443			if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
4444			        int  was_busy;
4445
4446				/*
4447				 * save the current state of busy
4448				 * mark page as busy while decrypt
4449				 * is in progress since it will drop
4450				 * the object lock...
4451				 */
4452				was_busy = dst_page->busy;
4453				dst_page->busy = TRUE;
4454
4455				vm_page_decrypt(dst_page, 0);
4456				vm_page_decrypt_for_upl_counter++;
4457				/*
4458				 * restore to original busy state
4459				 */
4460				dst_page->busy = was_busy;
4461			}
4462			if (dst_page->pageout_queue == TRUE) {
4463
4464				vm_page_lockspin_queues();
4465
4466				if (dst_page->pageout_queue == TRUE) {
4467					/*
4468					 * we've buddied up a page for a clustered pageout
4469					 * that has already been moved to the pageout
4470					 * queue by pageout_scan... we need to remove
4471					 * it from the queue and drop the laundry count
4472					 * on that queue
4473					 */
4474					vm_pageout_throttle_up(dst_page);
4475				}
4476				vm_page_unlock_queues();
4477			}
4478#if MACH_CLUSTER_STATS
4479			/*
4480			 * pageout statistics gathering.  count
4481			 * all the pages we will page out that
4482			 * were not counted in the initial
4483			 * vm_pageout_scan work
4484			 */
4485			if (dst_page->pageout)
4486			        encountered_lrp = TRUE;
4487			if ((dst_page->dirty ||	(dst_page->object->internal && dst_page->precious))) {
4488			        if (encountered_lrp)
4489				        CLUSTER_STAT(pages_at_higher_offsets++;)
4490				else
4491				        CLUSTER_STAT(pages_at_lower_offsets++;)
4492			}
4493#endif
4494			hw_dirty = refmod_state & VM_MEM_MODIFIED;
4495			dirty = hw_dirty ? TRUE : dst_page->dirty;
4496
4497			if (dst_page->phys_page > upl->highest_page)
4498			        upl->highest_page = dst_page->phys_page;
4499
4500			if (cntrl_flags & UPL_SET_LITE) {
4501				unsigned int	pg_num;
4502
4503				pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
4504				assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
4505				lite_list[pg_num>>5] |= 1 << (pg_num & 31);
4506
4507				if (hw_dirty)
4508				        pmap_clear_modify(dst_page->phys_page);
4509
4510				/*
4511				 * Mark original page as cleaning
4512				 * in place.
4513				 */
4514				dst_page->cleaning = TRUE;
4515				dst_page->precious = FALSE;
4516			} else {
4517			        /*
4518				 * use pageclean setup, it is more
4519				 * convenient even for the pageout
4520				 * cases here
4521				 */
4522			        vm_object_lock(upl->map_object);
4523				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
4524				vm_object_unlock(upl->map_object);
4525
4526				alias_page->absent = FALSE;
4527				alias_page = NULL;
4528			}
4529#if     MACH_PAGEMAP
4530			/*
4531			 * Record that this page has been
4532			 * written out
4533			 */
4534			vm_external_state_set(object->existence_map, dst_page->offset);
4535#endif  /*MACH_PAGEMAP*/
4536			if (dirty) {
4537				SET_PAGE_DIRTY(dst_page, FALSE);
4538			} else {
4539				dst_page->dirty = FALSE;
4540			}
4541
4542			if (!dirty)
4543				dst_page->precious = TRUE;
4544
4545			if ( (cntrl_flags & UPL_ENCRYPT) ) {
4546			        /*
4547				 * ENCRYPTED SWAP:
4548				 * We want to deny access to the target page
4549				 * because its contents are about to be
4550				 * encrypted and the user would be very
4551				 * confused to see encrypted data instead
4552				 * of their data.
4553				 * We also set "encrypted_cleaning" to allow
4554				 * vm_pageout_scan() to demote that page
4555				 * from "adjacent/clean-in-place" to
4556				 * "target/clean-and-free" if it bumps into
4557				 * this page during its scanning while we're
4558				 * still processing this cluster.
4559				 */
4560			        dst_page->busy = TRUE;
4561				dst_page->encrypted_cleaning = TRUE;
4562			}
4563			if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
4564				if ( !VM_PAGE_WIRED(dst_page))
4565					dst_page->pageout = TRUE;
4566			}
4567		} else {
4568			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
4569				/*
4570				 * Honor copy-on-write obligations
4571				 *
4572				 * The copy object has changed since we
4573				 * last synchronized for copy-on-write.
4574				 * Another copy object might have been
4575				 * inserted while we released the object's
4576				 * lock.  Since someone could have seen the
4577				 * original contents of the remaining pages
4578				 * through that new object, we have to
4579				 * synchronize with it again for the remaining
4580				 * pages only.  The previous pages are "busy"
4581				 * so they can not be seen through the new
4582				 * mapping.  The new mapping will see our
4583				 * upcoming changes for those previous pages,
4584				 * but that's OK since they couldn't see what
4585				 * was there before.  It's just a race anyway
4586				 * and there's no guarantee of consistency or
4587				 * atomicity.  We just don't want new mappings
4588				 * to see both the *before* and *after* pages.
4589				 */
4590				if (object->copy != VM_OBJECT_NULL) {
4591					vm_object_update(
4592						object,
4593						dst_offset,/* current offset */
4594						xfer_size, /* remaining size */
4595						NULL,
4596						NULL,
4597						FALSE,	   /* should_return */
4598						MEMORY_OBJECT_COPY_SYNC,
4599						VM_PROT_NO_CHANGE);
4600
4601#if DEVELOPMENT || DEBUG
4602					upl_cow_again++;
4603					upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
4604#endif
4605				}
4606				/*
4607				 * remember the copy object we synced with
4608				 */
4609				last_copy_object = object->copy;
4610			}
4611			dst_page = vm_page_lookup(object, dst_offset);
4612
4613			if (dst_page != VM_PAGE_NULL) {
4614
4615				if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
4616					/*
4617					 * skip over pages already present in the cache
4618					 */
4619					if (user_page_list)
4620						user_page_list[entry].phys_addr = 0;
4621
4622					goto try_next_page;
4623				}
4624				if (dst_page->fictitious) {
4625					panic("need corner case for fictitious page");
4626				}
4627
4628				if (dst_page->busy || dst_page->cleaning) {
4629					/*
4630					 * someone else is playing with the
4631					 * page.  We will have to wait.
4632					 */
4633					PAGE_SLEEP(object, dst_page, THREAD_UNINT);
4634
4635					continue;
4636				}
4637				if (dst_page->laundry) {
4638					dst_page->pageout = FALSE;
4639
4640					vm_pageout_steal_laundry(dst_page, FALSE);
4641				}
4642			} else {
4643				if (object->private) {
4644					/*
4645					 * This is a nasty wrinkle for users
4646					 * of upl who encounter device or
4647					 * private memory; however, it is
4648					 * unavoidable: only a fault can
4649					 * resolve the actual backing
4650					 * physical page by asking the
4651					 * backing device.
4652					 */
4653					if (user_page_list)
4654						user_page_list[entry].phys_addr = 0;
4655
4656					goto try_next_page;
4657				}
4658				if (object->scan_collisions) {
4659					/*
4660					 * the pageout_scan thread is trying to steal
4661					 * pages from this object, but has run into our
4662					 * lock... grab 2 pages from the head of the object...
4663					 * the first is freed on behalf of pageout_scan, the
4664					 * 2nd is for our own use... we use vm_object_page_grab
4665					 * in both cases to avoid taking pages from the free
4666					 * list since we are under memory pressure and our
4667					 * lock on this object is getting in the way of
4668					 * relieving it
4669					 */
4670					dst_page = vm_object_page_grab(object);
4671
4672					if (dst_page != VM_PAGE_NULL)
4673						vm_page_release(dst_page);
4674
4675					dst_page = vm_object_page_grab(object);
4676				}
4677				if (dst_page == VM_PAGE_NULL) {
4678					/*
4679					 * need to allocate a page
4680					 */
4681					dst_page = vm_page_grab();
4682				}
4683				if (dst_page == VM_PAGE_NULL) {
4684				        if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
4685					       /*
4686						* we don't want to stall waiting for pages to come onto the free list
4687						* while we're already holding absent pages in this UPL
4688						* the caller will deal with the empty slots
4689						*/
4690					        if (user_page_list)
4691						        user_page_list[entry].phys_addr = 0;
4692
4693						goto try_next_page;
4694					}
4695				        /*
4696					 * no pages available... wait
4697					 * then try again for the same
4698					 * offset...
4699					 */
4700					vm_object_unlock(object);
4701
4702					OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
4703
4704					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
4705
4706					VM_PAGE_WAIT();
4707					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
4708
4709					VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
4710
4711					vm_object_lock(object);
4712
4713					continue;
4714				}
4715				vm_page_insert(dst_page, object, dst_offset);
4716
4717				dst_page->absent = TRUE;
4718				dst_page->busy = FALSE;
4719
4720				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
4721				        /*
4722					 * if UPL_RET_ONLY_ABSENT was specified,
4723					 * than we're definitely setting up a
4724					 * upl for a clustered read/pagein
4725					 * operation... mark the pages as clustered
4726					 * so upl_commit_range can put them on the
4727					 * speculative list
4728					 */
4729				        dst_page->clustered = TRUE;
4730				}
4731			}
4732			/*
4733			 * ENCRYPTED SWAP:
4734			 */
4735			if (cntrl_flags & UPL_ENCRYPT) {
4736				/*
4737				 * The page is going to be encrypted when we
4738				 * get it from the pager, so mark it so.
4739				 */
4740				dst_page->encrypted = TRUE;
4741			} else {
4742				/*
4743				 * Otherwise, the page will not contain
4744				 * encrypted data.
4745				 */
4746				dst_page->encrypted = FALSE;
4747			}
4748			dst_page->overwriting = TRUE;
4749
4750			if (dst_page->pmapped) {
4751			        if ( !(cntrl_flags & UPL_FILE_IO))
4752				        /*
4753					 * eliminate all mappings from the
4754					 * original object and its progeny
4755					 */
4756				        refmod_state = pmap_disconnect(dst_page->phys_page);
4757				else
4758				        refmod_state = pmap_get_refmod(dst_page->phys_page);
4759			} else
4760			        refmod_state = 0;
4761
4762			hw_dirty = refmod_state & VM_MEM_MODIFIED;
4763			dirty = hw_dirty ? TRUE : dst_page->dirty;
4764
4765			if (cntrl_flags & UPL_SET_LITE) {
4766				unsigned int	pg_num;
4767
4768				pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
4769				assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
4770				lite_list[pg_num>>5] |= 1 << (pg_num & 31);
4771
4772				if (hw_dirty)
4773				        pmap_clear_modify(dst_page->phys_page);
4774
4775				/*
4776				 * Mark original page as cleaning
4777				 * in place.
4778				 */
4779				dst_page->cleaning = TRUE;
4780				dst_page->precious = FALSE;
4781			} else {
4782				/*
4783				 * use pageclean setup, it is more
4784				 * convenient even for the pageout
4785				 * cases here
4786				 */
4787			        vm_object_lock(upl->map_object);
4788				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
4789			        vm_object_unlock(upl->map_object);
4790
4791				alias_page->absent = FALSE;
4792				alias_page = NULL;
4793			}
4794
4795			if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
4796				upl->flags &= ~UPL_CLEAR_DIRTY;
4797				upl->flags |= UPL_SET_DIRTY;
4798				dirty = TRUE;
4800			} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
4801				/*
4802				 * clean in place for read implies
4803				 * that a write will be done on all
4804				 * the pages that are dirty before
4805				 * a upl commit is done.  The caller
4806				 * is obligated to preserve the
4807				 * contents of all pages marked dirty
4808				 */
4809				upl->flags |= UPL_CLEAR_DIRTY;
4810			}
4811			dst_page->dirty = dirty;
4812
4813			if (!dirty)
4814				dst_page->precious = TRUE;
4815
4816			if ( !VM_PAGE_WIRED(dst_page)) {
4817			        /*
4818				 * deny access to the target page while
4819				 * it is being worked on
4820				 */
4821				dst_page->busy = TRUE;
4822			} else
4823				dwp->dw_mask |= DW_vm_page_wire;
4824
4825			/*
4826			 * We might be about to satisfy a fault which has been
4827			 * requested. So no need for the "restart" bit.
4828			 */
4829			dst_page->restart = FALSE;
4830			if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
4831			        /*
4832				 * expect the page to be used
4833				 */
4834				dwp->dw_mask |= DW_set_reference;
4835			}
4836			if (cntrl_flags & UPL_PRECIOUS) {
4837				if (dst_page->object->internal) {
4838					SET_PAGE_DIRTY(dst_page, FALSE);
4839					dst_page->precious = FALSE;
4840				} else {
4841					dst_page->precious = TRUE;
4842				}
4843			} else {
4844				dst_page->precious = FALSE;
4845			}
4846		}
4847		if (dst_page->busy)
4848			upl->flags |= UPL_HAS_BUSY;
4849
4850		if (dst_page->phys_page > upl->highest_page)
4851		        upl->highest_page = dst_page->phys_page;
4852		if (user_page_list) {
4853			user_page_list[entry].phys_addr = dst_page->phys_page;
4854			user_page_list[entry].pageout	= dst_page->pageout;
4855			user_page_list[entry].absent	= dst_page->absent;
4856			user_page_list[entry].dirty	= dst_page->dirty;
4857			user_page_list[entry].precious	= dst_page->precious;
4858			user_page_list[entry].device	= FALSE;
4859			user_page_list[entry].needed    = FALSE;
4860			if (dst_page->clustered == TRUE)
4861			        user_page_list[entry].speculative = dst_page->speculative;
4862			else
4863			        user_page_list[entry].speculative = FALSE;
4864			user_page_list[entry].cs_validated = dst_page->cs_validated;
4865			user_page_list[entry].cs_tainted = dst_page->cs_tainted;
4866		}
4867	        /*
4868		 * if UPL_RET_ONLY_ABSENT is set, then
4869		 * we are working with a fresh page and we've
4870		 * just set the clustered flag on it to
4871		 * indicate that it was dragged in as part of a
4872		 * speculative cluster... so leave it alone
4873		 */
4874		if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
4875		        /*
4876			 * someone is explicitly grabbing this page...
4877			 * update clustered and speculative state
4878			 *
4879			 */
4880		        VM_PAGE_CONSUME_CLUSTERED(dst_page);
4881		}
4882try_next_page:
4883		if (dwp->dw_mask) {
4884			if (dwp->dw_mask & DW_vm_page_activate)
4885				VM_STAT_INCR(reactivations);
4886
4887			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
4888
4889			if (dw_count >= dw_limit) {
4890				vm_page_do_delayed_work(object, &dw_array[0], dw_count);
4891
4892				dwp = &dw_array[0];
4893				dw_count = 0;
4894			}
4895		}
4896		entry++;
4897		dst_offset += PAGE_SIZE_64;
4898		xfer_size -= PAGE_SIZE;
4899	}
4900	if (dw_count)
4901		vm_page_do_delayed_work(object, &dw_array[0], dw_count);
4902
4903	if (alias_page != NULL) {
4904		VM_PAGE_FREE(alias_page);
4905	}
4906
4907	if (page_list_count != NULL) {
4908	        if (upl->flags & UPL_INTERNAL)
4909			*page_list_count = 0;
4910		else if (*page_list_count > entry)
4911			*page_list_count = entry;
4912	}
4913#if UPL_DEBUG
4914	upl->upl_state = 1;
4915#endif
4916	vm_object_unlock(object);
4917
4918	return KERN_SUCCESS;
4919}
4920
4921/* JMM - Backward compatibility for now */
4922kern_return_t
4923vm_fault_list_request(			/* forward */
4924	memory_object_control_t		control,
4925	vm_object_offset_t	offset,
4926	upl_size_t		size,
4927	upl_t			*upl_ptr,
4928	upl_page_info_t		**user_page_list_ptr,
4929	unsigned int		page_list_count,
4930	int			cntrl_flags);
4931kern_return_t
4932vm_fault_list_request(
4933	memory_object_control_t		control,
4934	vm_object_offset_t	offset,
4935	upl_size_t		size,
4936	upl_t			*upl_ptr,
4937	upl_page_info_t		**user_page_list_ptr,
4938	unsigned int		page_list_count,
4939	int			cntrl_flags)
4940{
4941	unsigned int		local_list_count;
4942	upl_page_info_t		*user_page_list;
4943	kern_return_t		kr;
4944
4945	if ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)
4946		return KERN_INVALID_ARGUMENT;
4947
4948	if (user_page_list_ptr != NULL) {
4949		local_list_count = page_list_count;
4950		user_page_list = *user_page_list_ptr;
4951	} else {
4952		local_list_count = 0;
4953		user_page_list = NULL;
4954	}
4955	kr =  memory_object_upl_request(control,
4956				offset,
4957				size,
4958				upl_ptr,
4959				user_page_list,
4960				&local_list_count,
4961				cntrl_flags);
4962
4963	if (kr != KERN_SUCCESS)
4964		return kr;
4965
4966	if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
4967		*user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
4968	}
4969
4970	return KERN_SUCCESS;
4971}
4972
4973
4974
4975/*
4976 *	Routine:	vm_object_super_upl_request
4977 *	Purpose:
4978 *		Cause the population of a portion of a vm_object
4979 *		in much the same way as memory_object_upl_request.
4980 *		Depending on the nature of the request, the pages
4981 *		returned may contain valid data or be uninitialized.
4982 *		However, the region may be expanded up to the super
4983 *		cluster size provided.
4984 */
4985
4986__private_extern__ kern_return_t
4987vm_object_super_upl_request(
4988	vm_object_t object,
4989	vm_object_offset_t	offset,
4990	upl_size_t		size,
4991	upl_size_t		super_cluster,
4992	upl_t			*upl,
4993	upl_page_info_t		*user_page_list,
4994	unsigned int		*page_list_count,
4995	int			cntrl_flags)
4996{
4997	if (object->paging_offset > offset  || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
4998		return KERN_FAILURE;
4999
5000	assert(object->paging_in_progress);
5001	offset = offset - object->paging_offset;
5002
5003	if (super_cluster > size) {
5004
5005		vm_object_offset_t	base_offset;
5006		upl_size_t		super_size;
5007		vm_object_size_t	super_size_64;
5008
5009		base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
5010		super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
5011		super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
5012		super_size = (upl_size_t) super_size_64;
5013		assert(super_size == super_size_64);
5014
5015		if (offset > (base_offset + super_size)) {
5016		        panic("vm_object_super_upl_request: Missed target pageout"
5017			      " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
5018			      offset, base_offset, super_size, super_cluster,
5019			      size, object->paging_offset);
5020		}
5021		/*
5022		 * apparently there is a case where the vm requests a
5023		 * page to be written out whose offset is beyond the
5024		 * object size
5025		 */
5026		if ((offset + size) > (base_offset + super_size)) {
5027		        super_size_64 = (offset + size) - base_offset;
5028			super_size = (upl_size_t) super_size_64;
5029			assert(super_size == super_size_64);
5030		}
5031
5032		offset = base_offset;
5033		size = super_size;
5034	}
5035	return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
5036}
5037
5038
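/*
 * Build a UPL covering a range of a vm_map: look up the map entry
 * (recursing through submaps), create or prepare the backing object as
 * needed (copy-on-write setup, data sync), then hand off to
 * vm_object_iopl_request() on the resolved object.
 */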
5039kern_return_t
5040vm_map_create_upl(
5041	vm_map_t		map,
5042	vm_map_address_t	offset,
5043	upl_size_t		*upl_size,
5044	upl_t			*upl,
5045	upl_page_info_array_t	page_list,
5046	unsigned int		*count,
5047	int			*flags)
5048{
5049	vm_map_entry_t	entry;
5050	int		caller_flags;
5051	int		force_data_sync;
5052	int		sync_cow_data;
5053	vm_object_t	local_object;
5054	vm_map_offset_t	local_offset;
5055	vm_map_offset_t	local_start;
5056	kern_return_t	ret;
5057
5058	caller_flags = *flags;
5059
5060	if (caller_flags & ~UPL_VALID_FLAGS) {
5061		/*
5062		 * For forward compatibility's sake,
5063		 * reject any unknown flag.
5064		 */
5065		return KERN_INVALID_VALUE;
5066	}
5067	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
5068	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
5069
5070	if (upl == NULL)
5071		return KERN_INVALID_ARGUMENT;
5072
5073REDISCOVER_ENTRY:
5074	vm_map_lock_read(map);
5075
5076	if (vm_map_lookup_entry(map, offset, &entry)) {
5077
5078		if ((entry->vme_end - offset) < *upl_size) {
5079			*upl_size = (upl_size_t) (entry->vme_end - offset);
5080			assert(*upl_size == entry->vme_end - offset);
5081		}
5082
5083		if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
5084		        *flags = 0;
5085
5086			if ( !entry->is_sub_map && entry->object.vm_object != VM_OBJECT_NULL) {
5087			        if (entry->object.vm_object->private)
5088				        *flags = UPL_DEV_MEMORY;
5089
5090				if (entry->object.vm_object->phys_contiguous)
5091					*flags |= UPL_PHYS_CONTIG;
5092			}
5093			vm_map_unlock_read(map);
5094
5095			return KERN_SUCCESS;
5096		}
5097
5098		if (entry->is_sub_map) {
5099			vm_map_t	submap;
5100
5101			submap = entry->object.sub_map;
5102			local_start = entry->vme_start;
5103			local_offset = entry->offset;
5104
5105			vm_map_reference(submap);
5106			vm_map_unlock_read(map);
5107
5108			ret = vm_map_create_upl(submap,
5109						local_offset + (offset - local_start),
5110						upl_size, upl, page_list, count, flags);
5111			vm_map_deallocate(submap);
5112
5113			return ret;
5114		}
5115
5116	        if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
5117			if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE)
5118				*upl_size = MAX_UPL_SIZE * PAGE_SIZE;
5119		}
5120		/*
5121		 *      Create an object if necessary.
5122		 */
5123		if (entry->object.vm_object == VM_OBJECT_NULL) {
5124
5125			if (vm_map_lock_read_to_write(map))
5126				goto REDISCOVER_ENTRY;
5127
5128			entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
5129			entry->offset = 0;
5130
5131			vm_map_lock_write_to_read(map);
5132		}
5133		if (!(caller_flags & UPL_COPYOUT_FROM)) {
5134			if (!(entry->protection & VM_PROT_WRITE)) {
5135				vm_map_unlock_read(map);
5136				return KERN_PROTECTION_FAILURE;
5137			}
5138
5139			local_object = entry->object.vm_object;
5140			if (vm_map_entry_should_cow_for_true_share(entry) &&
5141			    local_object->vo_size > *upl_size &&
5142			    *upl_size != 0) {
5143				vm_prot_t	prot;
5144
5145				/*
5146				 * Set up the targeted range for copy-on-write to avoid
5147				 * applying true_share/copy_delay to the entire object.
5148				 */
5149
5150				if (vm_map_lock_read_to_write(map)) {
5151					goto REDISCOVER_ENTRY;
5152				}
5153
5154				vm_map_clip_start(map,
5155						  entry,
5156						  vm_map_trunc_page(offset,
5157								    VM_MAP_PAGE_MASK(map)));
5158				vm_map_clip_end(map,
5159						entry,
5160						vm_map_round_page(offset + *upl_size,
5161								  VM_MAP_PAGE_MASK(map)));
5162				prot = entry->protection & ~VM_PROT_WRITE;
5163				if (override_nx(map, entry->alias) && prot)
5164					prot |= VM_PROT_EXECUTE;
5165				vm_object_pmap_protect(local_object,
5166						       entry->offset,
5167						       entry->vme_end - entry->vme_start,
5168						       ((entry->is_shared || map->mapped_in_other_pmaps)
5169							? PMAP_NULL
5170							: map->pmap),
5171						       entry->vme_start,
5172						       prot);
5173				entry->needs_copy = TRUE;
5174
5175				vm_map_lock_write_to_read(map);
5176			}
5177
5178			if (entry->needs_copy)  {
5179				/*
5180				 * Honor copy-on-write for COPY_SYMMETRIC
5181				 * strategy.
5182				 */
5183				vm_map_t		local_map;
5184				vm_object_t		object;
5185				vm_object_offset_t	new_offset;
5186				vm_prot_t		prot;
5187				boolean_t		wired;
5188				vm_map_version_t	version;
5189				vm_map_t		real_map;
5190
5191				local_map = map;
5192
5193				if (vm_map_lookup_locked(&local_map,
5194							 offset, VM_PROT_WRITE,
5195							 OBJECT_LOCK_EXCLUSIVE,
5196							 &version, &object,
5197							 &new_offset, &prot, &wired,
5198							 NULL,
5199							 &real_map) != KERN_SUCCESS) {
5200				        vm_map_unlock_read(local_map);
5201					return KERN_FAILURE;
5202				}
5203				if (real_map != map)
5204					vm_map_unlock(real_map);
5205				vm_map_unlock_read(local_map);
5206
5207				vm_object_unlock(object);
5208
5209				goto REDISCOVER_ENTRY;
5210			}
5211		}
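		/*
		 * If the object is part of a copy-on-write chain (it has a
		 * shadow or a copy object), sync it with its pager: when
		 * both a shadow and a copy are present, a DATA_SYNC lock
		 * request is issued against the shadow for this range.  The
		 * map lock is dropped here, so the entry is re-validated
		 * afterwards.
		 */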
5212		if (sync_cow_data) {
5213			if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
5214				local_object = entry->object.vm_object;
5215				local_start = entry->vme_start;
5216				local_offset = entry->offset;
5217
5218				vm_object_reference(local_object);
5219				vm_map_unlock_read(map);
5220
5221				if (local_object->shadow && local_object->copy) {
5222				        vm_object_lock_request(
5223							       local_object->shadow,
5224							       (vm_object_offset_t)
5225							       ((offset - local_start) +
5226								local_offset) +
5227							       local_object->vo_shadow_offset,
5228							       *upl_size, FALSE,
5229							       MEMORY_OBJECT_DATA_SYNC,
5230							       VM_PROT_NO_CHANGE);
5231				}
5232				sync_cow_data = FALSE;
5233				vm_object_deallocate(local_object);
5234
5235				goto REDISCOVER_ENTRY;
5236			}
5237		}
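		/*
		 * UPL_FORCE_DATA_SYNC: issue a DATA_SYNC lock request
		 * against the entry's own object before building the UPL,
		 * then re-validate the entry.
		 */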
5238		if (force_data_sync) {
5239			local_object = entry->object.vm_object;
5240			local_start = entry->vme_start;
5241			local_offset = entry->offset;
5242
5243			vm_object_reference(local_object);
5244		        vm_map_unlock_read(map);
5245
5246			vm_object_lock_request(
5247					       local_object,
5248					       (vm_object_offset_t)
5249					       ((offset - local_start) + local_offset),
5250					       (vm_object_size_t)*upl_size, FALSE,
5251					       MEMORY_OBJECT_DATA_SYNC,
5252					       VM_PROT_NO_CHANGE);
5253
5254			force_data_sync = FALSE;
5255			vm_object_deallocate(local_object);
5256
5257			goto REDISCOVER_ENTRY;
5258		}
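		/*
		 * Report the object type back to the caller, take an object
		 * reference so the object survives dropping the map lock,
		 * and let vm_object_iopl_request() build the UPL against it.
		 */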
5259		if (entry->object.vm_object->private)
5260		        *flags = UPL_DEV_MEMORY;
5261		else
5262		        *flags = 0;
5263
5264		if (entry->object.vm_object->phys_contiguous)
5265		        *flags |= UPL_PHYS_CONTIG;
5266
5267		local_object = entry->object.vm_object;
5268		local_offset = entry->offset;
5269		local_start = entry->vme_start;
5270
5271		vm_object_reference(local_object);
5272		vm_map_unlock_read(map);
5273
5274		ret = vm_object_iopl_request(local_object,
5275					      (vm_object_offset_t) ((offset - local_start) + local_offset),
5276					      *upl_size,
5277					      upl,
5278					      page_list,
5279					      count,
5280					      caller_flags);
5281		vm_object_deallocate(local_object);
5282
5283		return(ret);
5284	}
5285	vm_map_unlock_read(map);
5286
5287	return(KERN_FAILURE);
5288}
5289
5290/*
5291 * Internal routine to enter a UPL into a VM map.
5292 *
5293 * JMM - This should just be doable through the standard
5294 * vm_map_enter() API.
5295 */
5296kern_return_t
5297vm_map_enter_upl(
5298	vm_map_t		map,
5299	upl_t			upl,
5300	vm_map_offset_t		*dst_addr)
5301{
5302	vm_map_size_t	 	size;
5303	vm_object_offset_t 	offset;
5304	vm_map_offset_t		addr;
5305	vm_page_t		m;
5306	kern_return_t		kr;
5307	int			isVectorUPL = 0, curr_upl=0;
5308	upl_t			vector_upl = NULL;
5309	vm_offset_t		vector_upl_dst_addr = 0;
5310	vm_map_t		vector_upl_submap = NULL;
5311	upl_offset_t 		subupl_offset = 0;
5312	upl_size_t		subupl_size = 0;
5313
5314	if (upl == UPL_NULL)
5315		return KERN_INVALID_ARGUMENT;
5316
5317	if((isVectorUPL = vector_upl_is_valid(upl))) {
5318		int mapped=0,valid_upls=0;
5319		vector_upl = upl;
5320
5321		upl_lock(vector_upl);
5322		for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
5323			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
5324			if(upl == NULL)
5325				continue;
5326			valid_upls++;
5327			if (UPL_PAGE_LIST_MAPPED & upl->flags)
5328				mapped++;
5329		}
5330
5331		if(mapped) {
5332			if(mapped != valid_upls)
5333				panic("Only %d of the %d sub-upls within the Vector UPL are already mapped\n", mapped, valid_upls);
5334			else {
5335				upl_unlock(vector_upl);
5336				return KERN_FAILURE;
5337			}
5338		}
5339
5340		kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, VM_FLAGS_ANYWHERE, &vector_upl_submap);
5341		if( kr != KERN_SUCCESS )
5342			panic("Vector UPL submap allocation failed\n");
5343		map = vector_upl_submap;
5344		vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
5345		curr_upl=0;
5346	}
5347	else
5348		upl_lock(upl);
5349
5350process_upl_to_enter:
5351	if(isVectorUPL){
5352		if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
5353			*dst_addr = vector_upl_dst_addr;
5354			upl_unlock(vector_upl);
5355			return KERN_SUCCESS;
5356		}
5357		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
5358		if(upl == NULL)
5359			goto process_upl_to_enter;
5360
5361		vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
5362		*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
5363	} else {
5364		/*
5365		 * check to see if already mapped
5366		 */
5367		if (UPL_PAGE_LIST_MAPPED & upl->flags) {
5368			upl_unlock(upl);
5369			return KERN_FAILURE;
5370		}
5371	}
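	/*
	 * If this UPL hasn't been shadowed yet and either contains busy
	 * pages or isn't device memory / I/O-wired / physically contiguous,
	 * replace upl->map_object with a freshly allocated shadow object
	 * populated with fictitious "alias" pages that share the physical
	 * pages of the originals.
	 */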
5372	if ((!(upl->flags & UPL_SHADOWED)) &&
5373	    ((upl->flags & UPL_HAS_BUSY) ||
5374	     !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
5375
5376		vm_object_t 		object;
5377		vm_page_t		alias_page;
5378		vm_object_offset_t	new_offset;
5379		unsigned int		pg_num;
5380		wpl_array_t 		lite_list;
5381
5382		if (upl->flags & UPL_INTERNAL) {
5383			lite_list = (wpl_array_t)
5384				((((uintptr_t)upl) + sizeof(struct upl))
5385				 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5386		} else {
5387		        lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
5388		}
5389		object = upl->map_object;
5390		upl->map_object = vm_object_allocate(upl->size);
5391
5392		vm_object_lock(upl->map_object);
5393
5394		upl->map_object->shadow = object;
5395		upl->map_object->pageout = TRUE;
5396		upl->map_object->can_persist = FALSE;
5397		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5398		upl->map_object->vo_shadow_offset = upl->offset - object->paging_offset;
5399		upl->map_object->wimg_bits = object->wimg_bits;
5400		offset = upl->map_object->vo_shadow_offset;
5401		new_offset = 0;
5402		size = upl->size;
5403
5404		upl->flags |= UPL_SHADOWED;
5405
5406		while (size) {
5407			pg_num = (unsigned int) (new_offset / PAGE_SIZE);
5408			assert(pg_num == new_offset / PAGE_SIZE);
5409
5410			if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
5411
5412				VM_PAGE_GRAB_FICTITIOUS(alias_page);
5413
5414				vm_object_lock(object);
5415
5416				m = vm_page_lookup(object, offset);
5417				if (m == VM_PAGE_NULL) {
5418				        panic("vm_upl_map: page missing\n");
5419				}
5420
5421				/*
5422				 * Convert the fictitious page to a private
5423				 * shadow of the real page.
5424				 */
5425				assert(alias_page->fictitious);
5426				alias_page->fictitious = FALSE;
5427				alias_page->private = TRUE;
5428				alias_page->pageout = TRUE;
5429				/*
5430				 * since m is a page in the upl it must
5431				 * already be wired or BUSY, so it's
5432				 * safe to assign the underlying physical
5433				 * page to the alias
5434				 */
5435				alias_page->phys_page = m->phys_page;
5436
5437			        vm_object_unlock(object);
5438
5439				vm_page_lockspin_queues();
5440				vm_page_wire(alias_page);
5441				vm_page_unlock_queues();
5442
5443				/*
5444				 * ENCRYPTED SWAP:
5445				 * The virtual page ("m") has to be wired in some way
5446				 * here or its physical page ("m->phys_page") could
5447				 * be recycled at any time.
5448				 * Assuming this is enforced by the caller, we can't
5449				 * get an encrypted page here.  Since the encryption
5450				 * key depends on the VM page's "pager" object and
5451				 * the "paging_offset", we couldn't handle 2 pageable
5452				 * VM pages (with different pagers and paging_offsets)
5453				 * sharing the same physical page:  we could end up
5454				 * encrypting with one key (via one VM page) and
5455				 * decrypting with another key (via the alias VM page).
5456				 */
5457				ASSERT_PAGE_DECRYPTED(m);
5458
5459				vm_page_insert(alias_page, upl->map_object, new_offset);
5460
5461				assert(!alias_page->wanted);
5462				alias_page->busy = FALSE;
5463				alias_page->absent = FALSE;
5464			}
5465			size -= PAGE_SIZE;
5466			offset += PAGE_SIZE_64;
5467			new_offset += PAGE_SIZE_64;
5468		}
5469		vm_object_unlock(upl->map_object);
5470	}
5471	if (upl->flags & UPL_SHADOWED)
5472	        offset = 0;
5473	else
5474	        offset = upl->offset - upl->map_object->paging_offset;
5475
5476	size = upl->size;
5477
5478	vm_object_reference(upl->map_object);
5479
5480	if(!isVectorUPL) {
5481		*dst_addr = 0;
5482		/*
5483		 * NEED A UPL_MAP ALIAS
5484		 */
5485		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
5486				  VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
5487				  VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
5488
5489		if (kr != KERN_SUCCESS) {
5490			upl_unlock(upl);
5491			return(kr);
5492		}
5493	}
5494	else {
5495		kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
5496				  VM_FLAGS_FIXED, upl->map_object, offset, FALSE,
5497				  VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
5498		if(kr)
5499			panic("vm_map_enter failed for a Vector UPL\n");
5500	}
5501	vm_object_lock(upl->map_object);
5502
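	/*
	 * Pre-enter every resident page of the map object into the kernel
	 * pmap at its new address, so the mapping can be used without
	 * taking a fault on first access.
	 */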
5503	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
5504		m = vm_page_lookup(upl->map_object, offset);
5505
5506		if (m) {
5507			m->pmapped = TRUE;
5508
5509			/* CODE SIGNING ENFORCEMENT: page has been wpmapped,
5510			 * but only in kernel space. If this was on a user map,
5511			 * we'd have to set the wpmapped bit. */
5512			/* m->wpmapped = TRUE; */
5513			assert(map==kernel_map);
5514
5515			PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, VM_PROT_NONE, 0, TRUE);
5516		}
5517		offset += PAGE_SIZE_64;
5518	}
5519	vm_object_unlock(upl->map_object);
5520
5521	/*
5522	 * hold a reference for the mapping
5523	 */
5524	upl->ref_count++;
5525	upl->flags |= UPL_PAGE_LIST_MAPPED;
5526	upl->kaddr = (vm_offset_t) *dst_addr;
5527	assert(upl->kaddr == *dst_addr);
5528
5529	if(isVectorUPL)
5530		goto process_upl_to_enter;
5531
5532	upl_unlock(upl);
5533
5534	return KERN_SUCCESS;
5535}
5536
5537/*
5538 * Internal routine to remove a UPL mapping from a VM map.
5539 *
5540 * XXX - This should just be doable through a standard
5541 * vm_map_remove() operation.  Otherwise, implicit clean-up
5542 * of the target map won't be able to correctly remove
5543 * these (and release the reference on the UPL).  Having
5544 * to do this means we can't map these into user-space
5545 * maps yet.
5546 */
5547kern_return_t
5548vm_map_remove_upl(
5549	vm_map_t	map,
5550	upl_t		upl)
5551{
5552	vm_address_t	addr;
5553	upl_size_t	size;
5554	int		isVectorUPL = 0, curr_upl = 0;
5555	upl_t		vector_upl = NULL;
5556
5557	if (upl == UPL_NULL)
5558		return KERN_INVALID_ARGUMENT;
5559
5560	if((isVectorUPL = vector_upl_is_valid(upl))) {
5561		int 	unmapped=0, valid_upls=0;
5562		vector_upl = upl;
5563		upl_lock(vector_upl);
5564		for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
5565			upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
5566			if(upl == NULL)
5567				continue;
5568			valid_upls++;
5569			if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
5570				unmapped++;
5571		}
5572
5573		if(unmapped) {
5574			if(unmapped != valid_upls)
5575				panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
5576			else {
5577				upl_unlock(vector_upl);
5578				return KERN_FAILURE;
5579			}
5580		}
5581		curr_upl=0;
5582	}
5583	else
5584		upl_lock(upl);
5585
5586process_upl_to_remove:
5587	if(isVectorUPL) {
5588		if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
5589			vm_map_t v_upl_submap;
5590			vm_offset_t v_upl_submap_dst_addr;
5591			vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
5592
5593			vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
5594			vm_map_deallocate(v_upl_submap);
5595			upl_unlock(vector_upl);
5596			return KERN_SUCCESS;
5597		}
5598
5599		upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
5600		if(upl == NULL)
5601			goto process_upl_to_remove;
5602	}
5603
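	/*
	 * For a mapped UPL, drop the mapping reference and clear its
	 * mapping state.  A plain UPL's kernel mapping is removed here;
	 * a vector UPL's element mappings are torn down along with its
	 * submap above.
	 */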
5604	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
5605		addr = upl->kaddr;
5606		size = upl->size;
5607
5608		assert(upl->ref_count > 1);
5609		upl->ref_count--;		/* removing mapping ref */
5610
5611		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
5612		upl->kaddr = (vm_offset_t) 0;
5613
5614		if(!isVectorUPL) {
5615			upl_unlock(upl);
5616
5617			vm_map_remove(
5618				map,
5619				vm_map_trunc_page(addr,
5620						  VM_MAP_PAGE_MASK(map)),
5621				vm_map_round_page(addr + size,
5622						  VM_MAP_PAGE_MASK(map)),
5623				VM_MAP_NO_FLAGS);
5624
5625			return KERN_SUCCESS;
5626		}
5627		else {
5628			/*
5629			 * If it's a Vectored UPL, we'll be removing the entire
5630			 * submap anyway, so there is no need to remove individual
5631			 * UPL element mappings from within the submap.
5632			 */
5633			goto process_upl_to_remove;
5634		}
5635	}
5636	upl_unlock(upl);
5637
5638	return KERN_FAILURE;
5639}
5640
5641extern int panic_on_cs_killed;
5642kern_return_t
5643upl_commit_range(
5644	upl_t			upl,
5645	upl_offset_t		offset,
5646	upl_size_t		size,
5647	int			flags,
5648	upl_page_info_t		*page_list,
5649	mach_msg_type_number_t	count,
5650	boolean_t		*empty)
5651{
5652	upl_size_t		xfer_size, subupl_size = size;
5653	vm_object_t		shadow_object;
5654	vm_object_t		object;
5655	vm_object_offset_t	target_offset;
5656	upl_offset_t		subupl_offset = offset;
5657	int			entry;
5658	wpl_array_t 		lite_list;
5659	int			occupied;
5660	int			clear_refmod = 0;
5661	int			pgpgout_count = 0;
5662	struct	vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
5663	struct	vm_page_delayed_work	*dwp;
5664	int			dw_count;
5665	int			dw_limit;
5666	int			isVectorUPL = 0;
5667	upl_t			vector_upl = NULL;
5668	boolean_t		should_be_throttled = FALSE;
5669
5670	*empty = FALSE;
5671
5672	if (upl == UPL_NULL)
5673		return KERN_INVALID_ARGUMENT;
5674
5675	if (count == 0)
5676		page_list = NULL;
5677
5678	if((isVectorUPL = vector_upl_is_valid(upl))) {
5679		vector_upl = upl;
5680		upl_lock(vector_upl);
5681	}
5682	else
5683		upl_lock(upl);
5684
5685process_upl_to_commit:
5686
5687	if(isVectorUPL) {
5688		size = subupl_size;
5689		offset = subupl_offset;
5690		if(size == 0) {
5691			upl_unlock(vector_upl);
5692			return KERN_SUCCESS;
5693		}
5694		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
5695		if(upl == NULL) {
5696			upl_unlock(vector_upl);
5697			return KERN_FAILURE;
5698		}
5699		page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
5700		subupl_size -= size;
5701		subupl_offset += size;
5702	}
5703
5704#if UPL_DEBUG
5705	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
5706		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5707
5708		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
5709		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
5710
5711		upl->upl_commit_index++;
5712	}
5713#endif
5714	if (upl->flags & UPL_DEVICE_MEMORY)
5715		xfer_size = 0;
5716	else if ((offset + size) <= upl->size)
5717	        xfer_size = size;
5718	else {
5719		if(!isVectorUPL)
5720			upl_unlock(upl);
5721		else {
5722			upl_unlock(vector_upl);
5723		}
5724		return KERN_FAILURE;
5725	}
5726	if (upl->flags & UPL_SET_DIRTY)
5727		flags |= UPL_COMMIT_SET_DIRTY;
5728	if (upl->flags & UPL_CLEAR_DIRTY)
5729	        flags |= UPL_COMMIT_CLEAR_DIRTY;
5730
5731	if (upl->flags & UPL_INTERNAL)
5732		lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
5733					   + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5734	else
5735		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5736
5737	object = upl->map_object;
5738
5739	if (upl->flags & UPL_SHADOWED) {
5740	        vm_object_lock(object);
5741		shadow_object = object->shadow;
5742	} else {
5743		shadow_object = object;
5744	}
5745	entry = offset/PAGE_SIZE;
5746	target_offset = (vm_object_offset_t)offset;
5747
5748	if (upl->flags & UPL_KERNEL_OBJECT)
5749		vm_object_lock_shared(shadow_object);
5750	else
5751		vm_object_lock(shadow_object);
5752
5753	if (upl->flags & UPL_ACCESS_BLOCKED) {
5754		assert(shadow_object->blocked_access);
5755		shadow_object->blocked_access = FALSE;
5756		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
5757	}
5758
5759	if (shadow_object->code_signed) {
5760		/*
5761		 * CODE SIGNING:
5762		 * If the object is code-signed, do not let this UPL tell
5763		 * us if the pages are valid or not.  Let the pages be
5764		 * validated by VM the normal way (when they get mapped or
5765		 * copied).
5766		 */
5767		flags &= ~UPL_COMMIT_CS_VALIDATED;
5768	}
5769	if (! page_list) {
5770		/*
5771		 * No page list to get the code-signing info from !?
5772		 */
5773		flags &= ~UPL_COMMIT_CS_VALIDATED;
5774	}
5775	if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && shadow_object->internal)
5776		should_be_throttled = TRUE;
5777
5778	dwp = &dw_array[0];
5779	dw_count = 0;
5780	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5781
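	/*
	 * Walk the committed range one page at a time.  Per-page state
	 * changes are accumulated in the delayed-work array and flushed
	 * in batches of up to dw_limit entries.
	 */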
5782	while (xfer_size) {
5783		vm_page_t	t, m;
5784
5785		dwp->dw_mask = 0;
5786		clear_refmod = 0;
5787
5788		m = VM_PAGE_NULL;
5789
5790		if (upl->flags & UPL_LITE) {
5791			unsigned int	pg_num;
5792
5793			pg_num = (unsigned int) (target_offset/PAGE_SIZE);
5794			assert(pg_num == target_offset/PAGE_SIZE);
5795
5796			if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
5797			        lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
5798
5799				if (!(upl->flags & UPL_KERNEL_OBJECT))
5800					m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
5801			}
5802		}
5803		if (upl->flags & UPL_SHADOWED) {
5804			if ((t = vm_page_lookup(object, target_offset))	!= VM_PAGE_NULL) {
5805
5806				t->pageout = FALSE;
5807
5808				VM_PAGE_FREE(t);
5809
5810				if (m == VM_PAGE_NULL)
5811					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
5812			}
5813		}
5814		if ((upl->flags & UPL_KERNEL_OBJECT) || m == VM_PAGE_NULL)
5815			goto commit_next_page;
5816
5817		if (m->compressor) {
5818			assert(m->busy);
5819
5820			dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5821			goto commit_next_page;
5822		}
5823
5824		if (flags & UPL_COMMIT_CS_VALIDATED) {
5825			/*
5826			 * CODE SIGNING:
5827			 * Set the code signing bits according to
5828			 * what the UPL says they should be.
5829			 */
5830			m->cs_validated = page_list[entry].cs_validated;
5831			m->cs_tainted = page_list[entry].cs_tainted;
5832		}
5833		if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL)
5834		        m->written_by_kernel = TRUE;
5835
5836		if (upl->flags & UPL_IO_WIRE) {
5837
5838			if (page_list)
5839				page_list[entry].phys_addr = 0;
5840
5841			if (flags & UPL_COMMIT_SET_DIRTY) {
5842				SET_PAGE_DIRTY(m, FALSE);
5843			} else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
5844				m->dirty = FALSE;
5845
5846				if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
5847				    m->cs_validated && !m->cs_tainted) {
5848					/*
5849					 * CODE SIGNING:
5850					 * This page is no longer dirty
5851					 * but could have been modified,
5852					 * so it will need to be
5853					 * re-validated.
5854					 */
5855					if (panic_on_cs_killed &&
5856					    m->slid) {
5857						panic("upl_commit_range(%p): page %p was slid\n",
5858						      upl, m);
5859					}
5860					assert(!m->slid);
5861					m->cs_validated = FALSE;
5862#if DEVELOPMENT || DEBUG
5863					vm_cs_validated_resets++;
5864#endif
5865					pmap_disconnect(m->phys_page);
5866				}
5867				clear_refmod |= VM_MEM_MODIFIED;
5868			}
5869			if (flags & UPL_COMMIT_INACTIVATE) {
5870				dwp->dw_mask |= DW_vm_page_deactivate_internal;
5871				clear_refmod |= VM_MEM_REFERENCED;
5872			}
5873			if (upl->flags & UPL_ACCESS_BLOCKED) {
5874				/*
5875				 * We blocked access to the pages in this UPL.
5876				 * Clear the "busy" bit and wake up any waiter
5877				 * for this page.
5878				 */
5879				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5880			}
5881			if (m->absent) {
5882				if (flags & UPL_COMMIT_FREE_ABSENT)
5883					dwp->dw_mask |= DW_vm_page_free;
5884				else {
5885					m->absent = FALSE;
5886					dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5887
5888					if ( !(dwp->dw_mask & DW_vm_page_deactivate_internal))
5889						dwp->dw_mask |= DW_vm_page_activate;
5890				}
5891			} else
5892				dwp->dw_mask |= DW_vm_page_unwire;
5893
5894			goto commit_next_page;
5895		}
5896		assert(!m->compressor);
5897
5898		if (page_list)
5899			page_list[entry].phys_addr = 0;
5900
5901		/*
5902		 * make sure to clear the hardware
5903		 * modify or reference bits before
5904		 * releasing the BUSY bit on this page;
5905		 * otherwise we risk losing a legitimate
5906		 * change of state
5907		 */
5908		if (flags & UPL_COMMIT_CLEAR_DIRTY) {
5909			m->dirty = FALSE;
5910
5911			clear_refmod |= VM_MEM_MODIFIED;
5912		}
5913		if (m->laundry)
5914			dwp->dw_mask |= DW_vm_pageout_throttle_up;
5915
5916		if (VM_PAGE_WIRED(m))
5917			m->pageout = FALSE;
5918
5919		if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
5920		    m->cs_validated && !m->cs_tainted) {
5921			/*
5922			 * CODE SIGNING:
5923			 * This page is no longer dirty
5924			 * but could have been modified,
5925			 * so it will need to be
5926			 * re-validated.
5927			 */
5928			if (panic_on_cs_killed &&
5929			    m->slid) {
5930				panic("upl_commit_range(%p): page %p was slid\n",
5931				      upl, m);
5932			}
5933			assert(!m->slid);
5934			m->cs_validated = FALSE;
5935#if DEVELOPMENT || DEBUG
5936			vm_cs_validated_resets++;
5937#endif
5938			pmap_disconnect(m->phys_page);
5939		}
5940		if (m->overwriting) {
5941			/*
5942			 * the (COPY_OUT_FROM == FALSE) request_page_list case
5943			 */
5944			if (m->busy) {
5945				m->absent = FALSE;
5946
5947				dwp->dw_mask |= DW_clear_busy;
5948			} else {
5949				/*
5950				 * alternate (COPY_OUT_FROM == FALSE) page_list case
5951				 * Occurs when the original page was wired
5952				 * at the time of the list request
5953				 */
5954				assert(VM_PAGE_WIRED(m));
5955
5956				dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
5957			}
5958			m->overwriting = FALSE;
5959		}
5960		if (m->encrypted_cleaning == TRUE) {
5961			m->encrypted_cleaning = FALSE;
5962
5963			dwp->dw_mask |= DW_clear_busy | DW_PAGE_WAKEUP;
5964		}
5965		m->cleaning = FALSE;
5966
5967		if (m->pageout) {
5968			/*
5969			 * With the clean queue enabled, UPL_PAGEOUT should
5970			 * no longer set the pageout bit. Its pages now go
5971			 * to the clean queue.
5972			 */
5973			assert(!(flags & UPL_PAGEOUT));
5974
5975			m->pageout = FALSE;
5976#if MACH_CLUSTER_STATS
5977			if (m->wanted) vm_pageout_target_collisions++;
5978#endif
5979			if ((flags & UPL_COMMIT_SET_DIRTY) ||
5980			    (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))) {
5981				/*
5982				 * page was re-dirtied after we started
5983				 * the pageout... reactivate it since
5984				 * we don't know whether the on-disk
5985				 * copy matches what is now in memory
5986				 */
5987				SET_PAGE_DIRTY(m, FALSE);
5988
5989				dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
5990
5991				if (upl->flags & UPL_PAGEOUT) {
5992					CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
5993					VM_STAT_INCR(reactivations);
5994					DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
5995				}
5996			} else {
5997				/*
5998				 * page has been successfully cleaned
5999				 * go ahead and free it for other use
6000				 */
6001				if (m->object->internal) {
6002					DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
6003				} else {
6004					DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
6005				}
6006				m->dirty = FALSE;
6007				m->busy = TRUE;
6008
6009				dwp->dw_mask |= DW_vm_page_free;
6010			}
6011			goto commit_next_page;
6012		}
6013#if MACH_CLUSTER_STATS
6014		if (m->wpmapped)
6015			m->dirty = pmap_is_modified(m->phys_page);
6016
6017		if (m->dirty)   vm_pageout_cluster_dirtied++;
6018		else            vm_pageout_cluster_cleaned++;
6019		if (m->wanted)  vm_pageout_cluster_collisions++;
6020#endif
6021		/*
6022		 * It is part of the semantics of COPYOUT_FROM
6023		 * UPLs that a commit implies a cache sync
6024		 * between the vm page and the backing store;
6025		 * this can be used to strip the precious bit
6026		 * as well as to clean.
6027		 */
6028		if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
6029			m->precious = FALSE;
6030
6031		if (flags & UPL_COMMIT_SET_DIRTY) {
6032			SET_PAGE_DIRTY(m, FALSE);
6033		} else {
6034			m->dirty = FALSE;
6035		}
6036
6037		/* with the clean queue on, move *all* cleaned pages to the clean queue */
6038		if (hibernate_cleaning_in_progress == FALSE && !m->dirty && (upl->flags & UPL_PAGEOUT)) {
6039			pgpgout_count++;
6040
6041			/* this page used to be dirty; now it's on the clean queue. */
6042			m->was_dirty = TRUE;
6043
6044			dwp->dw_mask |= DW_enqueue_cleaned;
6045			vm_pageout_enqueued_cleaned_from_inactive_dirty++;
6046		} else if (should_be_throttled == TRUE && !m->active && !m->inactive && !m->speculative && !m->throttled) {
6047			/*
6048			 * page coming back in from being 'frozen'...
6049			 * it was dirty before it was frozen, so keep it dirty
6050			 * so that vm_page_activate will notice that it really
6051			 * belongs on the throttle queue and put it there
6052			 */
6053			SET_PAGE_DIRTY(m, FALSE);
6054			dwp->dw_mask |= DW_vm_page_activate;
6055
6056		} else {
6057			if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
6058				dwp->dw_mask |= DW_vm_page_deactivate_internal;
6059				clear_refmod |= VM_MEM_REFERENCED;
6060			} else if (!m->active && !m->inactive && !m->speculative) {
6061
6062				if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
6063					dwp->dw_mask |= DW_vm_page_speculate;
6064				else if (m->reference)
6065					dwp->dw_mask |= DW_vm_page_activate;
6066				else {
6067					dwp->dw_mask |= DW_vm_page_deactivate_internal;
6068					clear_refmod |= VM_MEM_REFERENCED;
6069				}
6070			}
6071		}
6072		if (upl->flags & UPL_ACCESS_BLOCKED) {
6073			/*
6074			 * We blocked access to the pages in this UPL.
6075			 * Clear the "busy" bit on this page before we
6076			 * wake up any waiter.
6077			 */
6078			dwp->dw_mask |= DW_clear_busy;
6079		}
6080
6081		/*
6082		 * Wake up any thread waiting for this page's cleaning to finish.
6083		 */
6084		dwp->dw_mask |= DW_PAGE_WAKEUP;
6085
6086commit_next_page:
6087		if (clear_refmod)
6088			pmap_clear_refmod(m->phys_page, clear_refmod);
6089
6090		target_offset += PAGE_SIZE_64;
6091		xfer_size -= PAGE_SIZE;
6092		entry++;
6093
6094		if (dwp->dw_mask) {
6095			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
6096				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
6097
6098				if (dw_count >= dw_limit) {
6099					vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count);
6100
6101					dwp = &dw_array[0];
6102					dw_count = 0;
6103				}
6104			} else {
6105				if (dwp->dw_mask & DW_clear_busy)
6106					m->busy = FALSE;
6107
6108				if (dwp->dw_mask & DW_PAGE_WAKEUP)
6109					PAGE_WAKEUP(m);
6110			}
6111		}
6112	}
6113	if (dw_count)
6114		vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count);
6115
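	/*
	 * Decide whether this UPL still represents any pages: device
	 * memory UPLs never do, lite UPLs are checked via their bitmap,
	 * and other UPLs via the map object's resident page queue.
	 */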
6116	occupied = 1;
6117
6118	if (upl->flags & UPL_DEVICE_MEMORY)  {
6119		occupied = 0;
6120	} else if (upl->flags & UPL_LITE) {
6121		int	pg_num;
6122		int	i;
6123
6124		pg_num = upl->size/PAGE_SIZE;
6125		pg_num = (pg_num + 31) >> 5;
6126		occupied = 0;
6127
6128		for (i = 0; i < pg_num; i++) {
6129			if (lite_list[i] != 0) {
6130				occupied = 1;
6131				break;
6132			}
6133		}
6134	} else {
6135		if (queue_empty(&upl->map_object->memq))
6136			occupied = 0;
6137	}
6138	if (occupied == 0) {
6139		/*
6140		 * If this UPL element belongs to a Vector UPL and is
6141		 * empty, then this is the right function to deallocate
6142		 * it, so go ahead and set the *empty variable.  From the
6143		 * caller's point of view, the UPL_COMMIT_NOTIFY_EMPTY flag
6144		 * should be considered relevant for the Vector UPL and not
6145		 * the internal UPLs.
6146		 */
6147		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
6148			*empty = TRUE;
6149
6150		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
6151		        /*
6152			 * this is not a paging object
6153			 * so we need to drop the paging reference
6154			 * that was taken when we created the UPL
6155			 * against this object
6156			 */
6157			vm_object_activity_end(shadow_object);
6158			vm_object_collapse(shadow_object, 0, TRUE);
6159		} else {
6160		         /*
6161			  * we donated the paging reference to
6162			  * the map object... vm_pageout_object_terminate
6163			  * will drop this reference
6164			  */
6165		}
6166	}
6167	vm_object_unlock(shadow_object);
6168	if (object != shadow_object)
6169	        vm_object_unlock(object);
6170
6171	if(!isVectorUPL)
6172		upl_unlock(upl);
6173	else {
6174		/*
6175		 * If we completed our operations on a UPL that is
6176		 * part of a Vectored UPL and if empty is TRUE, then
6177		 * we should go ahead and deallocate this UPL element.
6178		 * Then we check if this was the last of the UPL elements
6179		 * within that Vectored UPL. If so, set empty to TRUE
6180		 * so that in ubc_upl_commit_range or ubc_upl_commit, we
6181		 * can go ahead and deallocate the Vector UPL too.
6182		 */
6183		if(*empty==TRUE) {
6184			*empty = vector_upl_set_subupl(vector_upl, upl, 0);
6185			upl_deallocate(upl);
6186		}
6187		goto process_upl_to_commit;
6188	}
6189
6190	if (pgpgout_count) {
6191		DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
6192	}
6193
6194	return KERN_SUCCESS;
6195}
6196
6197kern_return_t
6198upl_abort_range(
6199	upl_t			upl,
6200	upl_offset_t		offset,
6201	upl_size_t		size,
6202	int			error,
6203	boolean_t		*empty)
6204{
6205	upl_page_info_t		*user_page_list = NULL;
6206	upl_size_t		xfer_size, subupl_size = size;
6207	vm_object_t		shadow_object;
6208	vm_object_t		object;
6209	vm_object_offset_t	target_offset;
6210	upl_offset_t		subupl_offset = offset;
6211	int			entry;
6212	wpl_array_t 	 	lite_list;
6213	int			occupied;
6214	struct	vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
6215	struct	vm_page_delayed_work	*dwp;
6216	int			dw_count;
6217	int			dw_limit;
6218	int			isVectorUPL = 0;
6219	upl_t			vector_upl = NULL;
6220
6221	*empty = FALSE;
6222
6223	if (upl == UPL_NULL)
6224		return KERN_INVALID_ARGUMENT;
6225
6226	if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
6227		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
6228
6229	if((isVectorUPL = vector_upl_is_valid(upl))) {
6230		vector_upl = upl;
6231		upl_lock(vector_upl);
6232	}
6233	else
6234		upl_lock(upl);
6235
6236process_upl_to_abort:
6237	if(isVectorUPL) {
6238		size = subupl_size;
6239		offset = subupl_offset;
6240		if(size == 0) {
6241			upl_unlock(vector_upl);
6242			return KERN_SUCCESS;
6243		}
6244		upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
6245		if(upl == NULL) {
6246			upl_unlock(vector_upl);
6247			return KERN_FAILURE;
6248		}
6249		subupl_size -= size;
6250		subupl_offset += size;
6251	}
6252
6253	*empty = FALSE;
6254
6255#if UPL_DEBUG
6256	if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
6257		(void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
6258
6259		upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
6260		upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
6261		upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
6262
6263		upl->upl_commit_index++;
6264	}
6265#endif
6266	if (upl->flags & UPL_DEVICE_MEMORY)
6267		xfer_size = 0;
6268	else if ((offset + size) <= upl->size)
6269	        xfer_size = size;
6270	else {
6271		if(!isVectorUPL)
6272			upl_unlock(upl);
6273		else {
6274			upl_unlock(vector_upl);
6275		}
6276
6277		return KERN_FAILURE;
6278	}
6279	if (upl->flags & UPL_INTERNAL) {
6280		lite_list = (wpl_array_t)
6281			((((uintptr_t)upl) + sizeof(struct upl))
6282			+ ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
6283
6284		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
6285	} else {
6286		lite_list = (wpl_array_t)
6287			(((uintptr_t)upl) + sizeof(struct upl));
6288	}
6289	object = upl->map_object;
6290
6291	if (upl->flags & UPL_SHADOWED) {
6292	        vm_object_lock(object);
6293		shadow_object = object->shadow;
6294	} else
6295		shadow_object = object;
6296
6297	entry = offset/PAGE_SIZE;
6298	target_offset = (vm_object_offset_t)offset;
6299
6300	if (upl->flags & UPL_KERNEL_OBJECT)
6301		vm_object_lock_shared(shadow_object);
6302	else
6303		vm_object_lock(shadow_object);
6304
6305	if (upl->flags & UPL_ACCESS_BLOCKED) {
6306		assert(shadow_object->blocked_access);
6307		shadow_object->blocked_access = FALSE;
6308		vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
6309	}
6310
6311	dwp = &dw_array[0];
6312	dw_count = 0;
6313	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
6314
6315	if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
6316		panic("upl_abort_range: kernel_object being DUMPED");
6317
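	/*
	 * Walk the aborted range one page at a time; as in
	 * upl_commit_range(), per-page work is batched through the
	 * delayed-work array.
	 */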
6318	while (xfer_size) {
6319		vm_page_t	t, m;
6320		unsigned int	pg_num;
6321		boolean_t	needed;
6322
6323		pg_num = (unsigned int) (target_offset/PAGE_SIZE);
6324		assert(pg_num == target_offset/PAGE_SIZE);
6325
6326		needed = FALSE;
6327
6328		if (user_page_list)
6329			needed = user_page_list[pg_num].needed;
6330
6331		dwp->dw_mask = 0;
6332		m = VM_PAGE_NULL;
6333
6334		if (upl->flags & UPL_LITE) {
6335
6336			if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
6337				lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
6338
6339				if ( !(upl->flags & UPL_KERNEL_OBJECT))
6340					m = vm_page_lookup(shadow_object, target_offset +
6341							   (upl->offset - shadow_object->paging_offset));
6342			}
6343		}
6344		if (upl->flags & UPL_SHADOWED) {
6345		        if ((t = vm_page_lookup(object, target_offset))	!= VM_PAGE_NULL) {
6346			        t->pageout = FALSE;
6347
6348				VM_PAGE_FREE(t);
6349
6350				if (m == VM_PAGE_NULL)
6351					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
6352			}
6353		}
6354		if ((upl->flags & UPL_KERNEL_OBJECT))
6355			goto abort_next_page;
6356
6357		if (m != VM_PAGE_NULL) {
6358
6359			assert(!m->compressor);
6360
6361			if (m->absent) {
6362			        boolean_t must_free = TRUE;
6363
6364				/*
6365				 * COPYOUT = FALSE case
6366				 * check for error conditions which must
6367				 * be passed back to the page's customer
6368				 */
6369				if (error & UPL_ABORT_RESTART) {
6370					m->restart = TRUE;
6371					m->absent = FALSE;
6372					m->unusual = TRUE;
6373					must_free = FALSE;
6374				} else if (error & UPL_ABORT_UNAVAILABLE) {
6375					m->restart = FALSE;
6376					m->unusual = TRUE;
6377					must_free = FALSE;
6378				} else if (error & UPL_ABORT_ERROR) {
6379					m->restart = FALSE;
6380					m->absent = FALSE;
6381					m->error = TRUE;
6382					m->unusual = TRUE;
6383					must_free = FALSE;
6384				}
6385				if (m->clustered && needed == FALSE) {
6386					/*
6387					 * This page was a part of a speculative
6388					 * read-ahead initiated by the kernel
6389					 * itself.  No one is expecting this
6390					 * page and no one will clean up its
6391					 * error state if it ever becomes valid
6392					 * in the future.
6393					 * We have to free it here.
6394					 */
6395					must_free = TRUE;
6396				}
6397
6398				/*
6399				 * ENCRYPTED SWAP:
6400				 * If the page was already encrypted,
6401				 * we don't really need to decrypt it
6402				 * now.  It will get decrypted later,
6403				 * on demand, as soon as someone needs
6404				 * to access its contents.
6405				 */
6406
6407				m->cleaning = FALSE;
6408				m->encrypted_cleaning = FALSE;
6409
6410				if (m->overwriting && !m->busy) {
6411					/*
6412					 * this shouldn't happen since
6413					 * this is an 'absent' page, but
6414					 * it doesn't hurt to check for
6415					 * the 'alternate' method of
6416					 * stabilizing the page...
6417					 * we will mark 'busy' to be cleared
6418					 * in the following code which will
6419					 * take care of the primary stabilization
6420					 * method (i.e. setting 'busy' to TRUE)
6421					 */
6422					dwp->dw_mask |= DW_vm_page_unwire;
6423				}
6424				m->overwriting = FALSE;
6425
6426				dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
6427
6428				if (must_free == TRUE)
6429					dwp->dw_mask |= DW_vm_page_free;
6430				else
6431					dwp->dw_mask |= DW_vm_page_activate;
6432			} else {
6433			        /*
6434				 * Handle the trusted pager throttle.
6435				 */
6436			        if (m->laundry)
6437					dwp->dw_mask |= DW_vm_pageout_throttle_up;
6438
6439				if (upl->flags & UPL_ACCESS_BLOCKED) {
6440					/*
6441					 * We blocked access to the pages in this UPL.
6442					 * Clear the "busy" bit and wake up any waiter
6443					 * for this page.
6444					 */
6445					dwp->dw_mask |= DW_clear_busy;
6446				}
6447				if (m->overwriting) {
6448					if (m->busy)
6449						dwp->dw_mask |= DW_clear_busy;
6450					else {
6451						/*
6452						 * deal with the 'alternate' method
6453						 * of stabilizing the page...
6454						 * we will either free the page
6455						 * or mark 'busy' to be cleared
6456						 * in the following code which will
6457						 * take care of the primary stabilization
6458						 * method (i.e. setting 'busy' to TRUE)
6459						 */
6460						dwp->dw_mask |= DW_vm_page_unwire;
6461					}
6462					m->overwriting = FALSE;
6463				}
6464				if (m->encrypted_cleaning == TRUE) {
6465					m->encrypted_cleaning = FALSE;
6466
6467					dwp->dw_mask |= DW_clear_busy;
6468				}
6469				m->pageout = FALSE;
6470				m->cleaning = FALSE;
6471#if	MACH_PAGEMAP
6472				vm_external_state_clr(m->object->existence_map, m->offset);
6473#endif	/* MACH_PAGEMAP */
6474				if (error & UPL_ABORT_DUMP_PAGES) {
6475					pmap_disconnect(m->phys_page);
6476
6477					dwp->dw_mask |= DW_vm_page_free;
6478				} else {
6479					if (!(dwp->dw_mask & DW_vm_page_unwire)) {
6480						if (error & UPL_ABORT_REFERENCE) {
6481							/*
6482							 * we've been told to explicitly
6483							 * reference this page... for
6484							 * file I/O, this is done by
6485							 * implementing an LRU on the inactive q
6486							 */
6487							dwp->dw_mask |= DW_vm_page_lru;
6488
6489						} else if (!m->active && !m->inactive && !m->speculative)
6490							dwp->dw_mask |= DW_vm_page_deactivate_internal;
6491					}
6492					dwp->dw_mask |= DW_PAGE_WAKEUP;
6493				}
6494			}
6495		}
6496abort_next_page:
6497		target_offset += PAGE_SIZE_64;
6498		xfer_size -= PAGE_SIZE;
6499		entry++;
6500
6501		if (dwp->dw_mask) {
6502			if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
6503				VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
6504
6505				if (dw_count >= dw_limit) {
6506					vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count);
6507
6508					dwp = &dw_array[0];
6509					dw_count = 0;
6510				}
6511			} else {
6512				if (dwp->dw_mask & DW_clear_busy)
6513					m->busy = FALSE;
6514
6515				if (dwp->dw_mask & DW_PAGE_WAKEUP)
6516					PAGE_WAKEUP(m);
6517			}
6518		}
6519	}
6520	if (dw_count)
6521		vm_page_do_delayed_work(shadow_object, &dw_array[0], dw_count);
6522
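	/*
	 * Same emptiness check as in upl_commit_range().
	 */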
6523	occupied = 1;
6524
6525	if (upl->flags & UPL_DEVICE_MEMORY)  {
6526		occupied = 0;
6527	} else if (upl->flags & UPL_LITE) {
6528		int	pg_num;
6529		int	i;
6530
6531		pg_num = upl->size/PAGE_SIZE;
6532		pg_num = (pg_num + 31) >> 5;
6533		occupied = 0;
6534
6535		for (i = 0; i < pg_num; i++) {
6536			if (lite_list[i] != 0) {
6537				occupied = 1;
6538				break;
6539			}
6540		}
6541	} else {
6542		if (queue_empty(&upl->map_object->memq))
6543			occupied = 0;
6544	}
6545	if (occupied == 0) {
6546		/*
6547		 * If this UPL element belongs to a Vector UPL and is
6548		 * empty, then this is the right function to deallocate
6549		 * it, so go ahead and set the *empty variable.  From the
6550		 * caller's point of view, the UPL_COMMIT_NOTIFY_EMPTY flag
6551		 * should be considered relevant for the Vector UPL and
6552		 * not the internal UPLs.
6553		 */
6554		if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
6555			*empty = TRUE;
6556
6557		if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
6558		        /*
6559			 * this is not a paging object
6560			 * so we need to drop the paging reference
6561			 * that was taken when we created the UPL
6562			 * against this object
6563			 */
6564			vm_object_activity_end(shadow_object);
6565			vm_object_collapse(shadow_object, 0, TRUE);
6566		} else {
6567		         /*
6568			  * we donated the paging reference to
6569			  * the map object... vm_pageout_object_terminate
6570			  * will drop this reference
6571			  */
6572		}
6573	}
6574	vm_object_unlock(shadow_object);
6575	if (object != shadow_object)
6576	        vm_object_unlock(object);
6577
6578	if(!isVectorUPL)
6579		upl_unlock(upl);
6580	else {
6581		/*
6582		 * If we completed our operations on a UPL that is
6583		 * part of a Vectored UPL and if empty is TRUE, then
6584		 * we should go ahead and deallocate this UPL element.
6585		 * Then we check if this was the last of the UPL elements
6586		 * within that Vectored UPL. If so, set empty to TRUE
6587		 * so that in ubc_upl_abort_range or ubc_upl_abort, we
6588		 * can go ahead and deallocate the Vector UPL too.
6589		 */
6590		if(*empty == TRUE) {
6591			*empty = vector_upl_set_subupl(vector_upl, upl,0);
6592			upl_deallocate(upl);
6593		}
6594		goto process_upl_to_abort;
6595	}
6596
6597	return KERN_SUCCESS;
6598}
6599
6600
6601kern_return_t
6602upl_abort(
6603	upl_t	upl,
6604	int	error)
6605{
6606	boolean_t	empty;
6607
6608	return upl_abort_range(upl, 0, upl->size, error, &empty);
6609}
6610
6611
6612/* an option on commit should be wire */
6613kern_return_t
6614upl_commit(
6615	upl_t			upl,
6616	upl_page_info_t		*page_list,
6617	mach_msg_type_number_t	count)
6618{
6619	boolean_t	empty;
6620
6621	return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
6622}
6623
6624void
6625vm_object_set_pmap_cache_attr(
6626		vm_object_t		object,
6627		upl_page_info_array_t	user_page_list,
6628		unsigned int		num_pages,
6629		boolean_t		batch_pmap_op)
6630{
6631	unsigned int    cache_attr = 0;
6632
6633	cache_attr = object->wimg_bits & VM_WIMG_MASK;
6634	assert(user_page_list);
6635	if (cache_attr != VM_WIMG_USE_DEFAULT) {
6636		PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
6637	}
6638}
6639
6640unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
6641
6642kern_return_t
6643vm_object_iopl_request(
6644	vm_object_t		object,
6645	vm_object_offset_t	offset,
6646	upl_size_t		size,
6647	upl_t			*upl_ptr,
6648	upl_page_info_array_t	user_page_list,
6649	unsigned int		*page_list_count,
6650	int			cntrl_flags)
6651{
6652	vm_page_t		dst_page;
6653	vm_object_offset_t	dst_offset;
6654	upl_size_t		xfer_size;
6655	upl_t			upl = NULL;
6656	unsigned int		entry;
6657	wpl_array_t 		lite_list = NULL;
6658	int			no_zero_fill = FALSE;
6659	unsigned int		size_in_pages;
6660	u_int32_t		psize;
6661	kern_return_t		ret;
6662	vm_prot_t		prot;
6663	struct vm_object_fault_info fault_info;
6664	struct	vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
6665	struct	vm_page_delayed_work	*dwp;
6666	int			dw_count;
6667	int			dw_limit;
6668	int			dw_index;
6669	boolean_t		caller_lookup;
6670
6671	if (cntrl_flags & ~UPL_VALID_FLAGS) {
6672		/*
6673		 * For forward compatibility's sake,
6674		 * reject any unknown flag.
6675		 */
6676		return KERN_INVALID_VALUE;
6677	}
6678	if (vm_lopage_needed == FALSE)
6679	        cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
6680
6681	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
6682	        if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
6683		        return KERN_INVALID_VALUE;
6684
6685		if (object->phys_contiguous) {
6686		        if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
6687			        return KERN_INVALID_ADDRESS;
6688
6689			if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
6690			        return KERN_INVALID_ADDRESS;
6691		}
6692	}
6693
6694	if (cntrl_flags & UPL_ENCRYPT) {
6695		/*
6696		 * ENCRYPTED SWAP:
6697		 * The paging path doesn't use this interface,
6698		 * so we don't support the UPL_ENCRYPT flag
6699		 * here.  We won't encrypt the pages.
6700		 */
6701		assert(! (cntrl_flags & UPL_ENCRYPT));
6702	}
6703	if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO))
6704	        no_zero_fill = TRUE;
6705
6706	if (cntrl_flags & UPL_COPYOUT_FROM)
6707		prot = VM_PROT_READ;
6708	else
6709		prot = VM_PROT_READ | VM_PROT_WRITE;
6710
6711	if (((size/PAGE_SIZE) > MAX_UPL_SIZE) && !object->phys_contiguous)
6712		size = MAX_UPL_SIZE * PAGE_SIZE;
6713
6714	if (cntrl_flags & UPL_SET_INTERNAL) {
6715		if (page_list_count != NULL)
6716			*page_list_count = MAX_UPL_SIZE;
6717	}
6718	if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
6719	    ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
6720	        return KERN_INVALID_ARGUMENT;
6721
6722	if ((!object->internal) && (object->paging_offset != 0))
6723		panic("vm_object_iopl_request: external object with non-zero paging offset\n");
6724
6725
6726	if (object->phys_contiguous)
6727	        psize = PAGE_SIZE;
6728	else
6729	        psize = size;
6730
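	/*
	 * Create the UPL itself.  For the internal flavor, the page-info
	 * array and the lite bitmap live inline, right after the upl
	 * structure.
	 */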
6731	if (cntrl_flags & UPL_SET_INTERNAL) {
6732	        upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
6733
6734		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
6735		lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
6736					   ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
6737		if (size == 0) {
6738			user_page_list = NULL;
6739			lite_list = NULL;
6740		}
6741	} else {
6742	        upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
6743
6744		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
6745		if (size == 0) {
6746			lite_list = NULL;
6747		}
6748	}
6749	if (user_page_list)
6750	        user_page_list[0].device = FALSE;
6751	*upl_ptr = upl;
6752
6753	upl->map_object = object;
6754	upl->size = size;
6755
6756	size_in_pages = size / PAGE_SIZE;
6757
6758	if (object == kernel_object &&
6759	    !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
6760		upl->flags |= UPL_KERNEL_OBJECT;
6761#if UPL_DEBUG
6762		vm_object_lock(object);
6763#else
6764		vm_object_lock_shared(object);
6765#endif
6766	} else {
6767		vm_object_lock(object);
6768		vm_object_activity_begin(object);
6769	}
6770	/*
6771	 * paging in progress also protects the paging_offset
6772	 */
6773	upl->offset = offset + object->paging_offset;
6774
6775	if (cntrl_flags & UPL_BLOCK_ACCESS) {
6776		/*
6777		 * The user requested that access to the pages in this UPL
6778		 * be blocked until the UPL is committed or aborted.
6779		 */
6780		upl->flags |= UPL_ACCESS_BLOCKED;
6781	}
6782
6783	if (object->phys_contiguous) {
6784#if UPL_DEBUG
6785		vm_object_activity_begin(object);
6786		queue_enter(&object->uplq, upl, upl_t, uplq);
6787#endif /* UPL_DEBUG */
6788
6789		if (upl->flags & UPL_ACCESS_BLOCKED) {
6790			assert(!object->blocked_access);
6791			object->blocked_access = TRUE;
6792		}
6793
6794		vm_object_unlock(object);
6795
6796		/*
6797		 * don't need any shadow mappings for this one
6798		 * since it is already I/O memory
6799		 */
6800		upl->flags |= UPL_DEVICE_MEMORY;
6801
6802		upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1)>>PAGE_SHIFT);
6803
6804		if (user_page_list) {
6805		        user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset)>>PAGE_SHIFT);
6806			user_page_list[0].device = TRUE;
6807		}
6808		if (page_list_count != NULL) {
6809		        if (upl->flags & UPL_INTERNAL)
6810			        *page_list_count = 0;
6811			else
6812			        *page_list_count = 1;
6813		}
6814		return KERN_SUCCESS;
6815	}
6816	if (object != kernel_object && object != compressor_object) {
6817		/*
6818		 * Protect user space from future COW operations
6819		 */
6820		object->true_share = TRUE;
6821
6822		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
6823			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6824	}
6825
6826#if UPL_DEBUG
6827	vm_object_activity_begin(object);
6828	queue_enter(&object->uplq, upl, upl_t, uplq);
6829#endif /* UPL_DEBUG */
6830
6831	if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
6832	    object->copy != VM_OBJECT_NULL) {
6833		/*
6834		 * Honor copy-on-write obligations
6835		 *
6836		 * The caller is gathering these pages and
6837		 * might modify their contents.  We need to
6838		 * make sure that the copy object has its own
6839		 * private copies of these pages before we let
6840		 * the caller modify them.
6841		 *
6842		 * NOTE: someone else could map the original object
6843		 * after we've done this copy-on-write here, and they
6844		 * could then see an inconsistent picture of the memory
6845		 * while it's being modified via the UPL.  To prevent this,
6846		 * we would have to block access to these pages until the
6847		 * UPL is released.  We could use the UPL_BLOCK_ACCESS
6848		 * code path for that...
6849		 */
6850		vm_object_update(object,
6851				 offset,
6852				 size,
6853				 NULL,
6854				 NULL,
6855				 FALSE,	/* should_return */
6856				 MEMORY_OBJECT_COPY_SYNC,
6857				 VM_PROT_NO_CHANGE);
6858#if DEVELOPMENT || DEBUG
6859		iopl_cow++;
6860		iopl_cow_pages += size >> PAGE_SHIFT;
6861#endif
6862	}
6863
6864
6865	entry = 0;
6866
6867	xfer_size = size;
6868	dst_offset = offset;
6869
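	/*
	 * Fault parameters used below for any page in the range that
	 * isn't already resident and usable.
	 */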
6870	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
6871	fault_info.user_tag  = 0;
6872	fault_info.lo_offset = offset;
6873	fault_info.hi_offset = offset + xfer_size;
6874	fault_info.no_cache  = FALSE;
6875	fault_info.stealth = FALSE;
6876	fault_info.io_sync = FALSE;
6877	fault_info.cs_bypass = FALSE;
6878	fault_info.mark_zf_absent = (0 == (cntrl_flags & UPL_NOZEROFILLIO));
6879
6880	dwp = &dw_array[0];
6881	dw_count = 0;
6882	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
6883
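	/*
	 * Main loop: make each page in the range resident and usable
	 * (faulting it in if necessary), wire it, and record its physical
	 * address in the lite bitmap and the caller's page list.
	 */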
6884	while (xfer_size) {
6885	        vm_fault_return_t	result;
6886		unsigned int		pg_num;
6887
6888		dwp->dw_mask = 0;
6889
6890		dst_page = vm_page_lookup(object, dst_offset);
6891
6892		/*
6893		 * ENCRYPTED SWAP:
6894		 * If the page is encrypted, we need to decrypt it,
6895		 * so force a soft page fault.
6896		 */
6897		if (dst_page == VM_PAGE_NULL ||
6898		    dst_page->busy ||
6899		    dst_page->encrypted ||
6900		    dst_page->error ||
6901		    dst_page->restart ||
6902		    dst_page->absent ||
6903		    dst_page->fictitious) {
6904
6905		   if (object == kernel_object)
6906			   panic("vm_object_iopl_request: missing/bad page in kernel object\n");
6907		   if (object == compressor_object)
6908			   panic("vm_object_iopl_request: missing/bad page in compressor object\n");
6909
6910		   if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
6911			   ret = KERN_MEMORY_ERROR;
6912			   goto return_err;
6913		   }
6914
6915		   /*
6916		    * We just looked up the page and the result remains valid
6917		    * until the object lock is released, so send it to
6918		    * vm_fault_page() (as "dst_page"), to avoid having to
6919		    * look it up again there.
6920		    */
6921		   caller_lookup = TRUE;
6922
6923		   do {
6924			vm_page_t	top_page;
6925			kern_return_t	error_code;
6926			int		interruptible;
6927
6928			if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
6929				interruptible = THREAD_ABORTSAFE;
6930			else
6931				interruptible = THREAD_UNINT;
6932
6933			fault_info.interruptible = interruptible;
6934			fault_info.cluster_size = xfer_size;
6935			fault_info.batch_pmap_op = TRUE;
6936
6937			vm_object_paging_begin(object);
6938
6939			result = vm_fault_page(object, dst_offset,
6940					       prot | VM_PROT_WRITE, FALSE,
6941					       caller_lookup,
6942					       &prot, &dst_page, &top_page,
6943					       (int *)0,
6944					       &error_code, no_zero_fill,
6945					       FALSE, &fault_info);
6946
6947			/* our lookup is no longer valid at this point */
6948			caller_lookup = FALSE;
6949
6950			switch (result) {
6951
6952			case VM_FAULT_SUCCESS:
6953
6954				if ( !dst_page->absent) {
6955					PAGE_WAKEUP_DONE(dst_page);
6956				} else {
6957					/*
6958					 * we only get back an absent page if we
6959					 * requested that it not be zero-filled
6960					 * because we are about to fill it via I/O
6961					 *
6962					 * absent pages should be left BUSY
6963					 * to prevent them from being faulted
6964					 * into an address space before we've
6965					 * had a chance to complete the I/O on
6966					 * them since they may contain info that
6967					 * shouldn't be seen by the faulting task
6968					 */
6969				}
6970				/*
6971				 *	Release paging references and
6972				 *	top-level placeholder page, if any.
6973				 */
6974				if (top_page != VM_PAGE_NULL) {
6975					vm_object_t local_object;
6976
6977					local_object = top_page->object;
6978
6979					if (top_page->object != dst_page->object) {
6980						vm_object_lock(local_object);
6981						VM_PAGE_FREE(top_page);
6982						vm_object_paging_end(local_object);
6983						vm_object_unlock(local_object);
6984					} else {
6985						VM_PAGE_FREE(top_page);
6986						vm_object_paging_end(local_object);
6987					}
6988				}
6989				vm_object_paging_end(object);
6990				break;
6991
6992			case VM_FAULT_RETRY:
6993				vm_object_lock(object);
6994				break;
6995
6996			case VM_FAULT_MEMORY_SHORTAGE:
6997				OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6998
6999				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
7000
7001				if (vm_page_wait(interruptible)) {
7002					OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
7003
7004					VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
7005					vm_object_lock(object);
7006
7007					break;
7008				}
7009				OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
7010
7011				VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
7012
7013				/* fall thru */
7014
7015			case VM_FAULT_INTERRUPTED:
7016				error_code = MACH_SEND_INTERRUPTED;
7017			case VM_FAULT_MEMORY_ERROR:
7018			memory_error:
7019				ret = (error_code ? error_code:	KERN_MEMORY_ERROR);
7020
7021				vm_object_lock(object);
7022				goto return_err;
7023
7024			case VM_FAULT_SUCCESS_NO_VM_PAGE:
7025				/* success but no page: fail */
7026				vm_object_paging_end(object);
7027				vm_object_unlock(object);
7028				goto memory_error;
7029
7030			default:
7031				panic("vm_object_iopl_request: unexpected error"
7032				      " 0x%x from vm_fault_page()\n", result);
7033			}
7034		   } while (result != VM_FAULT_SUCCESS);
7035
7036		}
7037		if (upl->flags & UPL_KERNEL_OBJECT)
7038			goto record_phys_addr;
7039
7040		if (dst_page->compressor) {
7041			dst_page->busy = TRUE;
7042			goto record_phys_addr;
7043		}
7044
7045		if (dst_page->cleaning) {
7046			/*
7047			 * Someone else is cleaning this page in place.
7048			 * In theory, we should be able to proceed and use this
7049			 * page, but they'll probably end up clearing the "busy"
7050			 * bit on it in upl_commit_range() even though they didn't
7051			 * set it; that would clear our "busy" bit and open
7052			 * us up to race conditions.
7053			 * We'd better wait for the cleaning to complete and
7054			 * then try again.
7055			 */
7056			vm_object_iopl_request_sleep_for_cleaning++;
7057			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
7058			continue;
7059		}
7060		if (dst_page->laundry) {
7061			dst_page->pageout = FALSE;
7062
7063			vm_pageout_steal_laundry(dst_page, FALSE);
7064		}
7065		if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
7066		     dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
7067		        vm_page_t	low_page;
7068			int 		refmod;
7069
			/*
			 * Support devices that can't DMA above 32 bits by
			 * substituting pages from a pool of low-address
			 * memory for any pages we find above the 4G mark.
			 * We can't substitute if the page is already wired,
			 * because we don't know whether that physical address
			 * has already been handed out to some other 64-bit
			 * capable DMA device to use.
			 */
7078			if (VM_PAGE_WIRED(dst_page)) {
7079			        ret = KERN_PROTECTION_FAILURE;
7080				goto return_err;
7081			}
7082			low_page = vm_page_grablo();
7083
7084			if (low_page == VM_PAGE_NULL) {
7085			        ret = KERN_RESOURCE_SHORTAGE;
7086				goto return_err;
7087			}
			/*
			 * From here until the vm_page_replace completes
			 * we mustn't drop the object lock... we don't
			 * want anyone refaulting this page in and using
			 * it after we disconnect it... we want the fault
			 * to find the new page being substituted.
			 */
7095			if (dst_page->pmapped)
7096			        refmod = pmap_disconnect(dst_page->phys_page);
7097			else
7098			        refmod = 0;
7099
7100			if (!dst_page->absent)
7101				vm_page_copy(dst_page, low_page);
7102
7103			low_page->reference = dst_page->reference;
7104			low_page->dirty     = dst_page->dirty;
7105			low_page->absent    = dst_page->absent;
7106
7107			if (refmod & VM_MEM_REFERENCED)
7108			        low_page->reference = TRUE;
7109			if (refmod & VM_MEM_MODIFIED) {
7110			        SET_PAGE_DIRTY(low_page, FALSE);
7111			}
7112
7113			vm_page_replace(low_page, object, dst_offset);
7114
7115			dst_page = low_page;
7116			/*
7117			 * vm_page_grablo returned the page marked
7118			 * BUSY... we don't need a PAGE_WAKEUP_DONE
7119			 * here, because we've never dropped the object lock
7120			 */
7121			if ( !dst_page->absent)
7122				dst_page->busy = FALSE;
7123		}
7124		if ( !dst_page->busy)
7125			dwp->dw_mask |= DW_vm_page_wire;
7126
7127		if (cntrl_flags & UPL_BLOCK_ACCESS) {
7128			/*
7129			 * Mark the page "busy" to block any future page fault
7130			 * on this page in addition to wiring it.
7131			 * We'll also remove the mapping
7132			 * of all these pages before leaving this routine.
7133			 */
7134			assert(!dst_page->fictitious);
7135			dst_page->busy = TRUE;
7136		}
7137		/*
7138		 * expect the page to be used
7139		 * page queues lock must be held to set 'reference'
7140		 */
7141		dwp->dw_mask |= DW_set_reference;
7142
7143   		if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7144			SET_PAGE_DIRTY(dst_page, TRUE);
7145		}
7146		if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
7147			pmap_sync_page_attributes_phys(dst_page->phys_page);
7148			dst_page->written_by_kernel = FALSE;
7149		}
7150
7151record_phys_addr:
7152		if (dst_page->busy)
7153			upl->flags |= UPL_HAS_BUSY;
7154
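		/*
		 * Record this page in the UPL's "lite" bitmap:
		 * one bit per page, packed 32 bits per word.
		 */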
7155		pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
7156		assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
7157		lite_list[pg_num>>5] |= 1 << (pg_num & 31);
7158
7159		if (dst_page->phys_page > upl->highest_page)
7160		        upl->highest_page = dst_page->phys_page;
7161
7162		if (user_page_list) {
7163			user_page_list[entry].phys_addr	= dst_page->phys_page;
7164			user_page_list[entry].pageout	= dst_page->pageout;
7165			user_page_list[entry].absent	= dst_page->absent;
7166			user_page_list[entry].dirty 	= dst_page->dirty;
7167			user_page_list[entry].precious	= dst_page->precious;
7168			user_page_list[entry].device 	= FALSE;
7169			user_page_list[entry].needed    = FALSE;
7170			if (dst_page->clustered == TRUE)
7171			        user_page_list[entry].speculative = dst_page->speculative;
7172			else
7173			        user_page_list[entry].speculative = FALSE;
7174			user_page_list[entry].cs_validated = dst_page->cs_validated;
7175			user_page_list[entry].cs_tainted = dst_page->cs_tainted;
7176		}
7177		if (object != kernel_object && object != compressor_object) {
7178			/*
7179			 * someone is explicitly grabbing this page...
7180			 * update clustered and speculative state
7181			 *
7182			 */
7183			VM_PAGE_CONSUME_CLUSTERED(dst_page);
7184		}
7185		entry++;
7186		dst_offset += PAGE_SIZE_64;
7187		xfer_size -= PAGE_SIZE;
7188
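		/*
		 * Queue any wire/reference work for this page and flush the
		 * batch through vm_page_do_delayed_work() once dw_limit
		 * entries have accumulated, so that work needing the page
		 * queues is applied in batches rather than page by page.
		 */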
7189		if (dwp->dw_mask) {
7190			VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
7191
7192			if (dw_count >= dw_limit) {
7193				vm_page_do_delayed_work(object, &dw_array[0], dw_count);
7194
7195				dwp = &dw_array[0];
7196				dw_count = 0;
7197			}
7198		}
7199	}
7200	if (dw_count)
7201		vm_page_do_delayed_work(object, &dw_array[0], dw_count);
7202
7203	vm_object_set_pmap_cache_attr(object, user_page_list, entry, TRUE);
7204
7205	if (page_list_count != NULL) {
7206	        if (upl->flags & UPL_INTERNAL)
7207			*page_list_count = 0;
7208		else if (*page_list_count > entry)
7209			*page_list_count = entry;
7210	}
7211	vm_object_unlock(object);
7212
7213	if (cntrl_flags & UPL_BLOCK_ACCESS) {
7214		/*
7215		 * We've marked all the pages "busy" so that future
7216		 * page faults will block.
7217		 * Now remove the mapping for these pages, so that they
7218		 * can't be accessed without causing a page fault.
7219		 */
7220		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
7221				       PMAP_NULL, 0, VM_PROT_NONE);
7222		assert(!object->blocked_access);
7223		object->blocked_access = TRUE;
7224	}
7225	return KERN_SUCCESS;
7226
7227return_err:
7228	dw_index = 0;
7229
7230	for (; offset < dst_offset; offset += PAGE_SIZE) {
7231		boolean_t need_unwire;
7232
7233	        dst_page = vm_page_lookup(object, offset);
7234
7235		if (dst_page == VM_PAGE_NULL)
		        panic("vm_object_iopl_request: Wired page missing.\n");
7237
		/*
		 * If we've already processed this page in an earlier
		 * dw_do_work, we need to undo the wiring... we will
		 * leave the dirty and reference bits on if they
		 * were set, since we don't have a good way of knowing
		 * what the previous state was and we won't get here
		 * under any normal circumstances...  we will always
		 * clear BUSY and wake up any waiters via vm_page_free
		 * or PAGE_WAKEUP_DONE.
		 */
7248		need_unwire = TRUE;
7249
7250		if (dw_count) {
7251			if (dw_array[dw_index].dw_m == dst_page) {
7252				/*
7253				 * still in the deferred work list
7254				 * which means we haven't yet called
7255				 * vm_page_wire on this page
7256				 */
7257				need_unwire = FALSE;
7258
7259				dw_index++;
7260				dw_count--;
7261			}
7262		}
7263		vm_page_lock_queues();
7264
7265		if (dst_page->absent) {
7266			vm_page_free(dst_page);
7267
7268			need_unwire = FALSE;
7269		} else {
7270			if (need_unwire == TRUE)
7271				vm_page_unwire(dst_page, TRUE);
7272
7273			PAGE_WAKEUP_DONE(dst_page);
7274		}
7275		vm_page_unlock_queues();
7276
7277		if (need_unwire == TRUE)
7278			VM_STAT_INCR(reactivations);
7279	}
7280#if UPL_DEBUG
7281	upl->upl_state = 2;
7282#endif
7283	if (! (upl->flags & UPL_KERNEL_OBJECT)) {
7284		vm_object_activity_end(object);
7285		vm_object_collapse(object, 0, TRUE);
7286	}
7287	vm_object_unlock(object);
7288	upl_destroy(upl);
7289
7290	return ret;
7291}
7292
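/*
 * upl_transpose:
 *	Exchange the backing VM objects of two equally-sized, whole-object,
 *	non-vectored UPLs by transposing the objects' backing store via
 *	vm_object_transpose(), then pointing each UPL at the object that
 *	now holds its pages.
 */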
7293kern_return_t
7294upl_transpose(
7295	upl_t		upl1,
7296	upl_t		upl2)
7297{
7298	kern_return_t		retval;
7299	boolean_t		upls_locked;
7300	vm_object_t		object1, object2;
7301
	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 ||
	    ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) ||
	    ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
7303		return KERN_INVALID_ARGUMENT;
7304	}
7305
7306	upls_locked = FALSE;
7307
7308	/*
7309	 * Since we need to lock both UPLs at the same time,
7310	 * avoid deadlocks by always taking locks in the same order.
7311	 */
7312	if (upl1 < upl2) {
7313		upl_lock(upl1);
7314		upl_lock(upl2);
7315	} else {
7316		upl_lock(upl2);
7317		upl_lock(upl1);
7318	}
7319	upls_locked = TRUE;	/* the UPLs will need to be unlocked */
7320
7321	object1 = upl1->map_object;
7322	object2 = upl2->map_object;
7323
7324	if (upl1->offset != 0 || upl2->offset != 0 ||
7325	    upl1->size != upl2->size) {
7326		/*
7327		 * We deal only with full objects, not subsets.
7328		 * That's because we exchange the entire backing store info
7329		 * for the objects: pager, resident pages, etc...  We can't do
7330		 * only part of it.
7331		 */
7332		retval = KERN_INVALID_VALUE;
7333		goto done;
7334	}
7335
	/*
	 * Transpose the VM objects' backing store.
	 */
7339	retval = vm_object_transpose(object1, object2,
7340				     (vm_object_size_t) upl1->size);
7341
7342	if (retval == KERN_SUCCESS) {
7343		/*
7344		 * Make each UPL point to the correct VM object, i.e. the
7345		 * object holding the pages that the UPL refers to...
7346		 */
7347#if UPL_DEBUG
7348		queue_remove(&object1->uplq, upl1, upl_t, uplq);
7349		queue_remove(&object2->uplq, upl2, upl_t, uplq);
7350#endif
7351		upl1->map_object = object2;
7352		upl2->map_object = object1;
7353#if UPL_DEBUG
7354		queue_enter(&object1->uplq, upl2, upl_t, uplq);
7355		queue_enter(&object2->uplq, upl1, upl_t, uplq);
7356#endif
7357	}
7358
7359done:
7360	/*
7361	 * Cleanup.
7362	 */
7363	if (upls_locked) {
7364		upl_unlock(upl1);
7365		upl_unlock(upl2);
7366		upls_locked = FALSE;
7367	}
7368
7369	return retval;
7370}
7371
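/*
 * upl_range_needed:
 *	Mark "count" pages of an internal UPL's page list, starting at
 *	"index", as "needed" by the caller.  No-op for external UPLs or
 *	non-positive counts.
 */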
7372void
7373upl_range_needed(
7374	upl_t		upl,
7375	int		index,
7376	int		count)
7377{
7378	upl_page_info_t	*user_page_list;
7379	int		size_in_pages;
7380
7381	if ( !(upl->flags & UPL_INTERNAL) || count <= 0)
7382		return;
7383
7384	size_in_pages = upl->size / PAGE_SIZE;
7385
7386	user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
7387
7388	while (count-- && index < size_in_pages)
7389		user_page_list[index++].needed = TRUE;
7390}
7391
7392
7393/*
7394 * ENCRYPTED SWAP:
7395 *
7396 * Rationale:  the user might have some encrypted data on disk (via
7397 * FileVault or any other mechanism).  That data is then decrypted in
7398 * memory, which is safe as long as the machine is secure.  But that
7399 * decrypted data in memory could be paged out to disk by the default
7400 * pager.  The data would then be stored on disk in clear (not encrypted)
7401 * and it could be accessed by anyone who gets physical access to the
7402 * disk (if the laptop or the disk gets stolen for example).  This weakens
7403 * the security offered by FileVault.
7404 *
7405 * Solution:  the default pager will optionally request that all the
7406 * pages it gathers for pageout be encrypted, via the UPL interfaces,
7407 * before it sends this UPL to disk via the vnode_pageout() path.
7408 *
7409 * Notes:
7410 *
7411 * To avoid disrupting the VM LRU algorithms, we want to keep the
7412 * clean-in-place mechanisms, which allow us to send some extra pages to
7413 * swap (clustering) without actually removing them from the user's
7414 * address space.  We don't want the user to unknowingly access encrypted
7415 * data, so we have to actually remove the encrypted pages from the page
7416 * table.  When the user accesses the data, the hardware will fail to
7417 * locate the virtual page in its page table and will trigger a page
7418 * fault.  We can then decrypt the page and enter it in the page table
7419 * again.  Whenever we allow the user to access the contents of a page,
7420 * we have to make sure it's not encrypted.
7421 *
7422 *
7423 */
/*
 * ENCRYPTED SWAP:
 * Reserve of virtual addresses in the kernel address space.
 * We need to map the physical pages in the kernel, so that we
 * can call the encryption/decryption routines with a kernel
 * virtual address.  We keep this pool of pre-allocated kernel
 * virtual addresses so that we don't have to scan the kernel's
 * virtual address space each time we need to encrypt or decrypt
 * a physical page.
 * It would be nice to be able to encrypt and decrypt in physical
 * mode, but that might not always be more efficient...
 */
7436decl_simple_lock_data(,vm_paging_lock)
7437#define VM_PAGING_NUM_PAGES	64
7438vm_map_offset_t vm_paging_base_address = 0;
7439boolean_t	vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
7440int		vm_paging_max_index = 0;
7441int		vm_paging_page_waiter = 0;
7442int		vm_paging_page_waiter_total = 0;
7443unsigned long	vm_paging_no_kernel_page = 0;
7444unsigned long	vm_paging_objects_mapped = 0;
7445unsigned long	vm_paging_pages_mapped = 0;
7446unsigned long	vm_paging_objects_mapped_slow = 0;
7447unsigned long	vm_paging_pages_mapped_slow = 0;
7448
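/*
 * vm_paging_map_init:
 *	Carve VM_PAGING_NUM_PAGES pages of kernel virtual address space
 *	out of kernel_map (backed by kernel_object, VM_PROT_NONE, marked
 *	permanent) to serve as the pre-allocated mapping pool described
 *	above.  Called once, before the pool is first used.
 */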
7449void
7450vm_paging_map_init(void)
7451{
7452	kern_return_t	kr;
7453	vm_map_offset_t	page_map_offset;
7454	vm_map_entry_t	map_entry;
7455
7456	assert(vm_paging_base_address == 0);
7457
7458	/*
7459	 * Initialize our pool of pre-allocated kernel
7460	 * virtual addresses.
7461	 */
7462	page_map_offset = 0;
7463	kr = vm_map_find_space(kernel_map,
7464			       &page_map_offset,
7465			       VM_PAGING_NUM_PAGES * PAGE_SIZE,
7466			       0,
7467			       0,
7468			       &map_entry);
7469	if (kr != KERN_SUCCESS) {
7470		panic("vm_paging_map_init: kernel_map full\n");
7471	}
7472	map_entry->object.vm_object = kernel_object;
7473	map_entry->offset = page_map_offset;
7474	map_entry->protection = VM_PROT_NONE;
7475	map_entry->max_protection = VM_PROT_NONE;
7476	map_entry->permanent = TRUE;
7477	vm_object_reference(kernel_object);
7478	vm_map_unlock(kernel_map);
7479
7480	assert(vm_paging_base_address == 0);
7481	vm_paging_base_address = page_map_offset;
7482}
7483
7484/*
7485 * ENCRYPTED SWAP:
7486 * vm_paging_map_object:
7487 *	Maps part of a VM object's pages in the kernel
7488 * 	virtual address space, using the pre-allocated
7489 *	kernel virtual addresses, if possible.
7490 * Context:
7491 * 	The VM object is locked.  This lock will get
7492 * 	dropped and re-acquired though, so the caller
7493 * 	must make sure the VM object is kept alive
7494 *	(by holding a VM map that has a reference
7495 * 	on it, for example, or taking an extra reference).
7496 * 	The page should also be kept busy to prevent
7497 *	it from being reclaimed.
7498 */
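/*
 * Illustrative usage sketch (local variable names are hypothetical),
 * mirroring how vm_page_encrypt()/vm_page_decrypt() use this routine:
 *
 *	vm_map_offset_t	kva;
 *	vm_map_size_t	ksize = PAGE_SIZE;
 *	boolean_t	needs_unmap;
 *
 *	kr = vm_paging_map_object(page, object, offset,
 *				  VM_PROT_READ | VM_PROT_WRITE,
 *				  FALSE,
 *				  &ksize, &kva, &needs_unmap);
 *	... access the page's contents through "kva" ...
 *	if (needs_unmap)
 *		vm_paging_unmap_object(object, kva, kva + ksize);
 */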
7499kern_return_t
7500vm_paging_map_object(
7501	vm_page_t		page,
7502	vm_object_t		object,
7503	vm_object_offset_t	offset,
7504	vm_prot_t		protection,
7505	boolean_t		can_unlock_object,
7506	vm_map_size_t		*size,		/* IN/OUT */
7507	vm_map_offset_t		*address,	/* OUT */
7508	boolean_t		*need_unmap)	/* OUT */
7509{
7510	kern_return_t		kr;
7511	vm_map_offset_t		page_map_offset;
7512	vm_map_size_t		map_size;
7513	vm_object_offset_t	object_offset;
7514	int			i;
7515
7516	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
7517		/* use permanent 1-to-1 kernel mapping of physical memory ? */
7518#if __x86_64__
7519		*address = (vm_map_offset_t)
7520			PHYSMAP_PTOV((pmap_paddr_t)page->phys_page <<
7521				     PAGE_SHIFT);
7522		*need_unmap = FALSE;
7523		return KERN_SUCCESS;
7524#else
#warning "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
7526#endif
7527
7528		assert(page->busy);
7529		/*
7530		 * Use one of the pre-allocated kernel virtual addresses
7531		 * and just enter the VM page in the kernel address space
7532		 * at that virtual address.
7533		 */
7534		simple_lock(&vm_paging_lock);
7535
7536		/*
7537		 * Try and find an available kernel virtual address
7538		 * from our pre-allocated pool.
7539		 */
7540		page_map_offset = 0;
7541		for (;;) {
7542			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
7543				if (vm_paging_page_inuse[i] == FALSE) {
7544					page_map_offset =
7545						vm_paging_base_address +
7546						(i * PAGE_SIZE);
7547					break;
7548				}
7549			}
7550			if (page_map_offset != 0) {
				/* found a space to map our page! */
7552				break;
7553			}
7554
7555			if (can_unlock_object) {
7556				/*
7557				 * If we can afford to unlock the VM object,
7558				 * let's take the slow path now...
7559				 */
7560				break;
7561			}
7562			/*
7563			 * We can't afford to unlock the VM object, so
7564			 * let's wait for a space to become available...
7565			 */
7566			vm_paging_page_waiter_total++;
7567			vm_paging_page_waiter++;
7568			thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
7569						       &vm_paging_lock,
7570						       THREAD_UNINT);
7571			vm_paging_page_waiter--;
7572			/* ... and try again */
7573		}
7574
7575		if (page_map_offset != 0) {
7576			/*
7577			 * We found a kernel virtual address;
7578			 * map the physical page to that virtual address.
7579			 */
7580			if (i > vm_paging_max_index) {
7581				vm_paging_max_index = i;
7582			}
7583			vm_paging_page_inuse[i] = TRUE;
7584			simple_unlock(&vm_paging_lock);
7585
7586			page->pmapped = TRUE;
7587
7588			/*
7589			 * Keep the VM object locked over the PMAP_ENTER
7590			 * and the actual use of the page by the kernel,
7591			 * or this pmap mapping might get undone by a
7592			 * vm_object_pmap_protect() call...
7593			 */
7594			PMAP_ENTER(kernel_pmap,
7595				   page_map_offset,
7596				   page,
7597				   protection,
7598				   VM_PROT_NONE,
7599				   0,
7600				   TRUE);
7601			vm_paging_objects_mapped++;
7602			vm_paging_pages_mapped++;
7603			*address = page_map_offset;
7604			*need_unmap = TRUE;
7605
			/* all done and mapped, ready to use! */
7607			return KERN_SUCCESS;
7608		}
7609
7610		/*
7611		 * We ran out of pre-allocated kernel virtual
7612		 * addresses.  Just map the page in the kernel
7613		 * the slow and regular way.
7614		 */
7615		vm_paging_no_kernel_page++;
7616		simple_unlock(&vm_paging_lock);
7617	}
7618
7619	if (! can_unlock_object) {
7620		*address = 0;
7621		*size = 0;
7622		*need_unmap = FALSE;
7623		return KERN_NOT_SUPPORTED;
7624	}
7625
7626	object_offset = vm_object_trunc_page(offset);
7627	map_size = vm_map_round_page(*size,
7628				     VM_MAP_PAGE_MASK(kernel_map));
7629
7630	/*
7631	 * Try and map the required range of the object
7632	 * in the kernel_map
7633	 */
7634
7635	vm_object_reference_locked(object);	/* for the map entry */
7636	vm_object_unlock(object);
7637
7638	kr = vm_map_enter(kernel_map,
7639			  address,
7640			  map_size,
7641			  0,
7642			  VM_FLAGS_ANYWHERE,
7643			  object,
7644			  object_offset,
7645			  FALSE,
7646			  protection,
7647			  VM_PROT_ALL,
7648			  VM_INHERIT_NONE);
7649	if (kr != KERN_SUCCESS) {
7650		*address = 0;
7651		*size = 0;
7652		*need_unmap = FALSE;
7653		vm_object_deallocate(object);	/* for the map entry */
7654		vm_object_lock(object);
7655		return kr;
7656	}
7657
7658	*size = map_size;
7659
7660	/*
7661	 * Enter the mapped pages in the page table now.
7662	 */
7663	vm_object_lock(object);
7664	/*
7665	 * VM object must be kept locked from before PMAP_ENTER()
7666	 * until after the kernel is done accessing the page(s).
7667	 * Otherwise, the pmap mappings in the kernel could be
7668	 * undone by a call to vm_object_pmap_protect().
7669	 */
7670
7671	for (page_map_offset = 0;
7672	     map_size != 0;
7673	     map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
7674
7675		page = vm_page_lookup(object, offset + page_map_offset);
7676		if (page == VM_PAGE_NULL) {
			printf("vm_paging_map_object: no page!?\n");
7678			vm_object_unlock(object);
7679			kr = vm_map_remove(kernel_map, *address, *size,
7680					   VM_MAP_NO_FLAGS);
7681			assert(kr == KERN_SUCCESS);
7682			*address = 0;
7683			*size = 0;
7684			*need_unmap = FALSE;
7685			vm_object_lock(object);
7686			return KERN_MEMORY_ERROR;
7687		}
7688		page->pmapped = TRUE;
7689
7690		//assert(pmap_verify_free(page->phys_page));
7691		PMAP_ENTER(kernel_pmap,
7692			   *address + page_map_offset,
7693			   page,
7694			   protection,
7695			   VM_PROT_NONE,
7696			   0,
7697			   TRUE);
7698	}
7699
7700	vm_paging_objects_mapped_slow++;
7701	vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
7702
7703	*need_unmap = TRUE;
7704
7705	return KERN_SUCCESS;
7706}
7707
7708/*
7709 * ENCRYPTED SWAP:
7710 * vm_paging_unmap_object:
7711 *	Unmaps part of a VM object's pages from the kernel
7712 * 	virtual address space.
7713 * Context:
7714 * 	The VM object is locked.  This lock will get
7715 * 	dropped and re-acquired though.
7716 */
7717void
7718vm_paging_unmap_object(
7719	vm_object_t	object,
7720	vm_map_offset_t	start,
7721	vm_map_offset_t	end)
7722{
7723	kern_return_t	kr;
7724	int		i;
7725
7726	if ((vm_paging_base_address == 0) ||
7727	    (start < vm_paging_base_address) ||
7728	    (end > (vm_paging_base_address
7729		     + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
		/*
		 * We didn't use our pre-allocated pool of
		 * kernel virtual addresses.  Deallocate the
		 * virtual memory.
		 */
7735		if (object != VM_OBJECT_NULL) {
7736			vm_object_unlock(object);
7737		}
7738		kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
7739		if (object != VM_OBJECT_NULL) {
7740			vm_object_lock(object);
7741		}
7742		assert(kr == KERN_SUCCESS);
7743	} else {
7744		/*
7745		 * We used a kernel virtual address from our
7746		 * pre-allocated pool.  Put it back in the pool
7747		 * for next time.
7748		 */
7749		assert(end - start == PAGE_SIZE);
7750		i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
7751		assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
7752
7753		/* undo the pmap mapping */
7754		pmap_remove(kernel_pmap, start, end);
7755
7756		simple_lock(&vm_paging_lock);
7757		vm_paging_page_inuse[i] = FALSE;
7758		if (vm_paging_page_waiter) {
7759			thread_wakeup(&vm_paging_page_waiter);
7760		}
7761		simple_unlock(&vm_paging_lock);
7762	}
7763}
7764
7765#if CRYPTO
7766/*
7767 * Encryption data.
7768 * "iv" is the "initial vector".  Ideally, we want to
7769 * have a different one for each page we encrypt, so that
7770 * crackers can't find encryption patterns too easily.
7771 */
#define SWAP_CRYPT_AES_KEY_SIZE	128	/* XXX 192 and 256 don't work! */
boolean_t		swap_crypt_ctx_initialized = FALSE;
uint32_t		swap_crypt_key[8]; /* big enough for a 256-bit key */
7775aes_ctx			swap_crypt_ctx;
7776const unsigned char	swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
7777
7778#if DEBUG
7779boolean_t		swap_crypt_ctx_tested = FALSE;
7780unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
7781unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
7782unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
7783#endif /* DEBUG */
7784
7785/*
7786 * Initialize the encryption context: key and key size.
7787 */
7788void swap_crypt_ctx_initialize(void); /* forward */
7789void
7790swap_crypt_ctx_initialize(void)
7791{
7792	unsigned int	i;
7793
7794	/*
7795	 * No need for locking to protect swap_crypt_ctx_initialized
7796	 * because the first use of encryption will come from the
7797	 * pageout thread (we won't pagein before there's been a pageout)
7798	 * and there's only one pageout thread.
7799	 */
7800	if (swap_crypt_ctx_initialized == FALSE) {
7801		for (i = 0;
7802		     i < (sizeof (swap_crypt_key) /
7803			  sizeof (swap_crypt_key[0]));
7804		     i++) {
7805			swap_crypt_key[i] = random();
7806		}
7807		aes_encrypt_key((const unsigned char *) swap_crypt_key,
7808				SWAP_CRYPT_AES_KEY_SIZE,
7809				&swap_crypt_ctx.encrypt);
7810		aes_decrypt_key((const unsigned char *) swap_crypt_key,
7811				SWAP_CRYPT_AES_KEY_SIZE,
7812				&swap_crypt_ctx.decrypt);
7813		swap_crypt_ctx_initialized = TRUE;
7814	}
7815
7816#if DEBUG
7817	/*
7818	 * Validate the encryption algorithms.
7819	 */
7820	if (swap_crypt_ctx_tested == FALSE) {
7821		/* initialize */
7822		for (i = 0; i < 4096; i++) {
7823			swap_crypt_test_page_ref[i] = (char) i;
7824		}
7825		/* encrypt */
7826		aes_encrypt_cbc(swap_crypt_test_page_ref,
7827				swap_crypt_null_iv,
7828				PAGE_SIZE / AES_BLOCK_SIZE,
7829				swap_crypt_test_page_encrypt,
7830				&swap_crypt_ctx.encrypt);
7831		/* decrypt */
7832		aes_decrypt_cbc(swap_crypt_test_page_encrypt,
7833				swap_crypt_null_iv,
7834				PAGE_SIZE / AES_BLOCK_SIZE,
7835				swap_crypt_test_page_decrypt,
7836				&swap_crypt_ctx.decrypt);
7837		/* compare result with original */
7838		for (i = 0; i < 4096; i ++) {
7839			if (swap_crypt_test_page_decrypt[i] !=
7840			    swap_crypt_test_page_ref[i]) {
7841				panic("encryption test failed");
7842			}
7843		}
7844
7845		/* encrypt again */
7846		aes_encrypt_cbc(swap_crypt_test_page_decrypt,
7847				swap_crypt_null_iv,
7848				PAGE_SIZE / AES_BLOCK_SIZE,
7849				swap_crypt_test_page_decrypt,
7850				&swap_crypt_ctx.encrypt);
7851		/* decrypt in place */
7852		aes_decrypt_cbc(swap_crypt_test_page_decrypt,
7853				swap_crypt_null_iv,
7854				PAGE_SIZE / AES_BLOCK_SIZE,
7855				swap_crypt_test_page_decrypt,
7856				&swap_crypt_ctx.decrypt);
7857		for (i = 0; i < 4096; i ++) {
7858			if (swap_crypt_test_page_decrypt[i] !=
7859			    swap_crypt_test_page_ref[i]) {
7860				panic("in place encryption test failed");
7861			}
7862		}
7863
7864		swap_crypt_ctx_tested = TRUE;
7865	}
7866#endif /* DEBUG */
7867}
7868
7869/*
7870 * ENCRYPTED SWAP:
7871 * vm_page_encrypt:
7872 * 	Encrypt the given page, for secure paging.
7873 * 	The page might already be mapped at kernel virtual
7874 * 	address "kernel_mapping_offset".  Otherwise, we need
7875 * 	to map it.
7876 *
7877 * Context:
7878 * 	The page's object is locked, but this lock will be released
7879 * 	and re-acquired.
7880 * 	The page is busy and not accessible by users (not entered in any pmap).
7881 */
7882void
7883vm_page_encrypt(
7884	vm_page_t	page,
7885	vm_map_offset_t	kernel_mapping_offset)
7886{
7887	kern_return_t		kr;
7888	vm_map_size_t		kernel_mapping_size;
7889	boolean_t		kernel_mapping_needs_unmap;
7890	vm_offset_t		kernel_vaddr;
7891	union {
7892		unsigned char	aes_iv[AES_BLOCK_SIZE];
7893		struct {
7894			memory_object_t		pager_object;
7895			vm_object_offset_t	paging_offset;
7896		} vm;
7897	} encrypt_iv;
7898
7899	if (! vm_pages_encrypted) {
7900		vm_pages_encrypted = TRUE;
7901	}
7902
7903	assert(page->busy);
7904
7905	if (page->encrypted) {
7906		/*
7907		 * Already encrypted: no need to do it again.
7908		 */
7909		vm_page_encrypt_already_encrypted_counter++;
7910		return;
7911	}
7912	assert(page->dirty || page->precious);
7913
7914	ASSERT_PAGE_DECRYPTED(page);
7915
7916	/*
7917	 * Take a paging-in-progress reference to keep the object
7918	 * alive even if we have to unlock it (in vm_paging_map_object()
7919	 * for example)...
7920	 */
7921	vm_object_paging_begin(page->object);
7922
7923	if (kernel_mapping_offset == 0) {
7924		/*
7925		 * The page hasn't already been mapped in kernel space
7926		 * by the caller.  Map it now, so that we can access
7927		 * its contents and encrypt them.
7928		 */
7929		kernel_mapping_size = PAGE_SIZE;
7930		kernel_mapping_needs_unmap = FALSE;
7931		kr = vm_paging_map_object(page,
7932					  page->object,
7933					  page->offset,
7934					  VM_PROT_READ | VM_PROT_WRITE,
7935					  FALSE,
7936					  &kernel_mapping_size,
7937					  &kernel_mapping_offset,
7938					  &kernel_mapping_needs_unmap);
7939		if (kr != KERN_SUCCESS) {
7940			panic("vm_page_encrypt: "
7941			      "could not map page in kernel: 0x%x\n",
7942			      kr);
7943		}
7944	} else {
7945		kernel_mapping_size = 0;
7946		kernel_mapping_needs_unmap = FALSE;
7947	}
7948	kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
7949
7950	if (swap_crypt_ctx_initialized == FALSE) {
7951		swap_crypt_ctx_initialize();
7952	}
7953	assert(swap_crypt_ctx_initialized);
7954
7955	/*
7956	 * Prepare an "initial vector" for the encryption.
7957	 * We use the "pager" and the "paging_offset" for that
7958	 * page to obfuscate the encrypted data a bit more and
7959	 * prevent crackers from finding patterns that they could
7960	 * use to break the key.
7961	 */
7962	bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
7963	encrypt_iv.vm.pager_object = page->object->pager;
7964	encrypt_iv.vm.paging_offset =
7965		page->object->paging_offset + page->offset;
7966
7967	/* encrypt the "initial vector" */
7968	aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
7969			swap_crypt_null_iv,
7970			1,
7971			&encrypt_iv.aes_iv[0],
7972			&swap_crypt_ctx.encrypt);
7973
7974	/*
7975	 * Encrypt the page.
7976	 */
7977	aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
7978			&encrypt_iv.aes_iv[0],
7979			PAGE_SIZE / AES_BLOCK_SIZE,
7980			(unsigned char *) kernel_vaddr,
7981			&swap_crypt_ctx.encrypt);
7982
7983	vm_page_encrypt_counter++;
7984
7985	/*
7986	 * Unmap the page from the kernel's address space,
7987	 * if we had to map it ourselves.  Otherwise, let
7988	 * the caller undo the mapping if needed.
7989	 */
7990	if (kernel_mapping_needs_unmap) {
7991		vm_paging_unmap_object(page->object,
7992				       kernel_mapping_offset,
7993				       kernel_mapping_offset + kernel_mapping_size);
7994	}
7995
7996	/*
7997	 * Clear the "reference" and "modified" bits.
7998	 * This should clean up any impact the encryption had
7999	 * on them.
8000	 * The page was kept busy and disconnected from all pmaps,
8001	 * so it can't have been referenced or modified from user
8002	 * space.
8003	 * The software bits will be reset later after the I/O
8004	 * has completed (in upl_commit_range()).
8005	 */
8006	pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
8007
8008	page->encrypted = TRUE;
8009
8010	vm_object_paging_end(page->object);
8011}
8012
8013/*
8014 * ENCRYPTED SWAP:
8015 * vm_page_decrypt:
8016 * 	Decrypt the given page.
8017 * 	The page might already be mapped at kernel virtual
8018 * 	address "kernel_mapping_offset".  Otherwise, we need
8019 * 	to map it.
8020 *
8021 * Context:
8022 *	The page's VM object is locked but will be unlocked and relocked.
8023 * 	The page is busy and not accessible by users (not entered in any pmap).
8024 */
8025void
8026vm_page_decrypt(
8027	vm_page_t	page,
8028	vm_map_offset_t	kernel_mapping_offset)
8029{
8030	kern_return_t		kr;
8031	vm_map_size_t		kernel_mapping_size;
8032	vm_offset_t		kernel_vaddr;
8033	boolean_t		kernel_mapping_needs_unmap;
8034	union {
8035		unsigned char	aes_iv[AES_BLOCK_SIZE];
8036		struct {
8037			memory_object_t		pager_object;
8038			vm_object_offset_t	paging_offset;
8039		} vm;
8040	} decrypt_iv;
8041	boolean_t		was_dirty;
8042
8043	assert(page->busy);
8044	assert(page->encrypted);
8045
8046	was_dirty = page->dirty;
8047
8048	/*
8049	 * Take a paging-in-progress reference to keep the object
8050	 * alive even if we have to unlock it (in vm_paging_map_object()
8051	 * for example)...
8052	 */
8053	vm_object_paging_begin(page->object);
8054
8055	if (kernel_mapping_offset == 0) {
8056		/*
8057		 * The page hasn't already been mapped in kernel space
8058		 * by the caller.  Map it now, so that we can access
8059		 * its contents and decrypt them.
8060		 */
8061		kernel_mapping_size = PAGE_SIZE;
8062		kernel_mapping_needs_unmap = FALSE;
8063		kr = vm_paging_map_object(page,
8064					  page->object,
8065					  page->offset,
8066					  VM_PROT_READ | VM_PROT_WRITE,
8067					  FALSE,
8068					  &kernel_mapping_size,
8069					  &kernel_mapping_offset,
8070					  &kernel_mapping_needs_unmap);
8071		if (kr != KERN_SUCCESS) {
8072			panic("vm_page_decrypt: "
8073			      "could not map page in kernel: 0x%x\n",
8074			      kr);
8075		}
8076	} else {
8077		kernel_mapping_size = 0;
8078		kernel_mapping_needs_unmap = FALSE;
8079	}
8080	kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
8081
8082	assert(swap_crypt_ctx_initialized);
8083
8084	/*
8085	 * Prepare an "initial vector" for the decryption.
8086	 * It has to be the same as the "initial vector" we
8087	 * used to encrypt that page.
8088	 */
8089	bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
8090	decrypt_iv.vm.pager_object = page->object->pager;
8091	decrypt_iv.vm.paging_offset =
8092		page->object->paging_offset + page->offset;
8093
8094	/* encrypt the "initial vector" */
8095	aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
8096			swap_crypt_null_iv,
8097			1,
8098			&decrypt_iv.aes_iv[0],
8099			&swap_crypt_ctx.encrypt);
8100
8101	/*
8102	 * Decrypt the page.
8103	 */
8104	aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
8105			&decrypt_iv.aes_iv[0],
8106			PAGE_SIZE / AES_BLOCK_SIZE,
8107			(unsigned char *) kernel_vaddr,
8108			&swap_crypt_ctx.decrypt);
8109	vm_page_decrypt_counter++;
8110
8111	/*
8112	 * Unmap the page from the kernel's address space,
8113	 * if we had to map it ourselves.  Otherwise, let
8114	 * the caller undo the mapping if needed.
8115	 */
8116	if (kernel_mapping_needs_unmap) {
8117		vm_paging_unmap_object(page->object,
8118				       kernel_vaddr,
8119				       kernel_vaddr + PAGE_SIZE);
8120	}
8121
8122	if (was_dirty) {
8123		/*
8124		 * The pager did not specify that the page would be
8125		 * clean when it got paged in, so let's not clean it here
8126		 * either.
8127		 */
8128	} else {
		/*
		 * After decryption, the page is actually still clean.
		 * It was encrypted as part of paging, which "cleans"
		 * the "dirty" pages.
		 * No one could have accessed it while it was encrypted,
		 * and the decryption itself doesn't count as a modification.
		 */
8136		page->dirty = FALSE;
8137		assert (page->cs_validated == FALSE);
8138		pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
8139	}
8140	page->encrypted = FALSE;
8141
8142	/*
8143	 * We've just modified the page's contents via the data cache and part
8144	 * of the new contents might still be in the cache and not yet in RAM.
8145	 * Since the page is now available and might get gathered in a UPL to
8146	 * be part of a DMA transfer from a driver that expects the memory to
8147	 * be coherent at this point, we have to flush the data cache.
8148	 */
8149	pmap_sync_page_attributes_phys(page->phys_page);
8150	/*
8151	 * Since the page is not mapped yet, some code might assume that it
8152	 * doesn't need to invalidate the instruction cache when writing to
8153	 * that page.  That code relies on "pmapped" being FALSE, so that the
8154	 * caches get synchronized when the page is first mapped.
8155	 */
8156	assert(pmap_verify_free(page->phys_page));
8157	page->pmapped = FALSE;
8158	page->wpmapped = FALSE;
8159
8160	vm_object_paging_end(page->object);
8161}
8162
8163#if DEVELOPMENT || DEBUG
8164unsigned long upl_encrypt_upls = 0;
8165unsigned long upl_encrypt_pages = 0;
8166#endif
8167
8168/*
8169 * ENCRYPTED SWAP:
8170 *
8171 * upl_encrypt:
8172 * 	Encrypts all the pages in the UPL, within the specified range.
8173 *
8174 */
8175void
8176upl_encrypt(
8177	upl_t			upl,
8178	upl_offset_t		crypt_offset,
8179	upl_size_t		crypt_size)
8180{
8181	upl_size_t		upl_size, subupl_size=crypt_size;
8182	upl_offset_t		offset_in_upl, subupl_offset=crypt_offset;
8183	vm_object_t		upl_object;
8184	vm_object_offset_t	upl_offset;
8185	vm_page_t		page;
8186	vm_object_t		shadow_object;
8187	vm_object_offset_t	shadow_offset;
8188	vm_object_offset_t	paging_offset;
8189	vm_object_offset_t	base_offset;
8190	int	 		isVectorUPL = 0;
8191	upl_t			vector_upl = NULL;
8192
8193	if((isVectorUPL = vector_upl_is_valid(upl)))
8194		vector_upl = upl;
8195
8196process_upl_to_encrypt:
8197	if(isVectorUPL) {
8198		crypt_size = subupl_size;
8199		crypt_offset = subupl_offset;
8200		upl =  vector_upl_subupl_byoffset(vector_upl, &crypt_offset, &crypt_size);
8201		if(upl == NULL)
8202			panic("upl_encrypt: Accessing a sub-upl that doesn't exist\n");
8203		subupl_size -= crypt_size;
8204		subupl_offset += crypt_size;
8205	}
8206
8207#if DEVELOPMENT || DEBUG
8208	upl_encrypt_upls++;
8209	upl_encrypt_pages += crypt_size / PAGE_SIZE;
8210#endif
8211	upl_object = upl->map_object;
8212	upl_offset = upl->offset;
8213	upl_size = upl->size;
8214
8215	vm_object_lock(upl_object);
8216
8217	/*
8218	 * Find the VM object that contains the actual pages.
8219	 */
8220	if (upl_object->pageout) {
8221		shadow_object = upl_object->shadow;
8222		/*
8223		 * The offset in the shadow object is actually also
8224		 * accounted for in upl->offset.  It possibly shouldn't be
8225		 * this way, but for now don't account for it twice.
8226		 */
8227		shadow_offset = 0;
8228		assert(upl_object->paging_offset == 0);	/* XXX ? */
8229		vm_object_lock(shadow_object);
8230	} else {
8231		shadow_object = upl_object;
8232		shadow_offset = 0;
8233	}
8234
8235	paging_offset = shadow_object->paging_offset;
8236	vm_object_paging_begin(shadow_object);
8237
8238	if (shadow_object != upl_object)
8239	        vm_object_unlock(upl_object);
8240
8241
8242	base_offset = shadow_offset;
8243	base_offset += upl_offset;
8244	base_offset += crypt_offset;
8245	base_offset -= paging_offset;
8246
8247	assert(crypt_offset + crypt_size <= upl_size);
8248
8249	for (offset_in_upl = 0;
8250	     offset_in_upl < crypt_size;
8251	     offset_in_upl += PAGE_SIZE) {
8252		page = vm_page_lookup(shadow_object,
8253				      base_offset + offset_in_upl);
8254		if (page == VM_PAGE_NULL) {
8255			panic("upl_encrypt: "
8256			      "no page for (obj=%p,off=0x%llx+0x%x)!\n",
8257			      shadow_object,
8258			      base_offset,
8259			      offset_in_upl);
8260		}
8261		/*
8262		 * Disconnect the page from all pmaps, so that nobody can
8263		 * access it while it's encrypted.  After that point, all
8264		 * accesses to this page will cause a page fault and block
8265		 * while the page is busy being encrypted.  After the
8266		 * encryption completes, any access will cause a
8267		 * page fault and the page gets decrypted at that time.
8268		 */
8269		pmap_disconnect(page->phys_page);
8270		vm_page_encrypt(page, 0);
8271
8272		if (vm_object_lock_avoid(shadow_object)) {
8273			/*
8274			 * Give vm_pageout_scan() a chance to convert more
8275			 * pages from "clean-in-place" to "clean-and-free",
8276			 * if it's interested in the same pages we selected
8277			 * in this cluster.
8278			 */
8279			vm_object_unlock(shadow_object);
8280			mutex_pause(2);
8281			vm_object_lock(shadow_object);
8282		}
8283	}
8284
8285	vm_object_paging_end(shadow_object);
8286	vm_object_unlock(shadow_object);
8287
8288	if(isVectorUPL && subupl_size)
8289		goto process_upl_to_encrypt;
8290}
8291
8292#else /* CRYPTO */
8293void
8294upl_encrypt(
8295	__unused upl_t			upl,
8296	__unused upl_offset_t	crypt_offset,
8297	__unused upl_size_t	crypt_size)
8298{
8299}
8300
8301void
8302vm_page_encrypt(
8303	__unused vm_page_t		page,
8304	__unused vm_map_offset_t	kernel_mapping_offset)
8305{
8306}
8307
8308void
8309vm_page_decrypt(
8310	__unused vm_page_t		page,
8311	__unused vm_map_offset_t	kernel_mapping_offset)
8312{
8313}
8314
8315#endif /* CRYPTO */
8316
/*
 * vm_pageout_steal_laundry:
 *	Pull the page out of the laundry so the caller can take it over.
 *	"queues_locked" indicates whether the caller already holds the
 *	page queues lock.
 * Context:
 *	page->object must be locked.
 */
8320void
8321vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
8322{
8323	if (!queues_locked) {
8324		vm_page_lockspin_queues();
8325	}
8326
8327	/*
8328	 * need to drop the laundry count...
8329	 * we may also need to remove it
8330	 * from the I/O paging queue...
8331	 * vm_pageout_throttle_up handles both cases
8332	 *
8333	 * the laundry and pageout_queue flags are cleared...
8334	 */
8335	vm_pageout_throttle_up(page);
8336
8337	vm_page_steal_pageout_page++;
8338
8339	if (!queues_locked) {
8340		vm_page_unlock_queues();
8341	}
8342}
8343
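/*
 * Vectored UPLs:
 *	A vectored UPL is a container UPL (flagged UPL_VECTOR) that
 *	aggregates up to MAX_VECTOR_UPL_ELEMENTS sub-UPLs.  The container
 *	records each sub-UPL's byte range within the vector (its
 *	"iostate"), a merged page list, and how many sub-UPLs have been
 *	invalidated (committed or aborted), so the container can be torn
 *	down once all of them are gone.
 */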
8344upl_t
8345vector_upl_create(vm_offset_t upl_offset)
8346{
8347	int	vector_upl_size  = sizeof(struct _vector_upl);
8348	int i=0;
8349	upl_t	upl;
8350	vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
8351
8352	upl = upl_create(0,UPL_VECTOR,0);
8353	upl->vector_upl = vector_upl;
8354	upl->offset = upl_offset;
8355	vector_upl->size = 0;
8356	vector_upl->offset = upl_offset;
8357	vector_upl->invalid_upls=0;
8358	vector_upl->num_upls=0;
8359	vector_upl->pagelist = NULL;
8360
8361	for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
8362		vector_upl->upl_iostates[i].size = 0;
8363		vector_upl->upl_iostates[i].offset = 0;
8364
8365	}
8366	return upl;
8367}
8368
8369void
8370vector_upl_deallocate(upl_t upl)
8371{
8372	if(upl) {
8373		vector_upl_t vector_upl = upl->vector_upl;
8374		if(vector_upl) {
8375			if(vector_upl->invalid_upls != vector_upl->num_upls)
8376				panic("Deallocating non-empty Vectored UPL\n");
8377			kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
8378			vector_upl->invalid_upls=0;
8379			vector_upl->num_upls = 0;
8380			vector_upl->pagelist = NULL;
8381			vector_upl->size = 0;
8382			vector_upl->offset = 0;
8383			kfree(vector_upl, sizeof(struct _vector_upl));
8384			vector_upl = (vector_upl_t)0xfeedfeed;
8385		}
8386		else
8387			panic("vector_upl_deallocate was passed a non-vectored upl\n");
8388	}
8389	else
8390		panic("vector_upl_deallocate was passed a NULL upl\n");
8391}
8392
8393boolean_t
8394vector_upl_is_valid(upl_t upl)
8395{
8396	if(upl &&  ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
8397		vector_upl_t vector_upl = upl->vector_upl;
8398		if(vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef)
8399			return FALSE;
8400		else
8401			return TRUE;
8402	}
8403	return FALSE;
8404}
8405
8406boolean_t
8407vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
8408{
8409	if(vector_upl_is_valid(upl)) {
8410		vector_upl_t vector_upl = upl->vector_upl;
8411
8412		if(vector_upl) {
8413			if(subupl) {
8414				if(io_size) {
8415					if(io_size < PAGE_SIZE)
8416						io_size = PAGE_SIZE;
8417					subupl->vector_upl = (void*)vector_upl;
8418					vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
8419					vector_upl->size += io_size;
8420					upl->size += io_size;
8421				}
8422				else {
8423					uint32_t i=0,invalid_upls=0;
8424					for(i = 0; i < vector_upl->num_upls; i++) {
8425						if(vector_upl->upl_elems[i] == subupl)
8426							break;
8427					}
8428					if(i == vector_upl->num_upls)
8429						panic("Trying to remove sub-upl when none exists");
8430
8431					vector_upl->upl_elems[i] = NULL;
8432					invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
8433					if(invalid_upls == vector_upl->num_upls)
8434						return TRUE;
8435					else
8436						return FALSE;
8437				}
8438			}
8439			else
8440				panic("vector_upl_set_subupl was passed a NULL upl element\n");
8441		}
8442		else
8443			panic("vector_upl_set_subupl was passed a non-vectored upl\n");
8444	}
8445	else
8446		panic("vector_upl_set_subupl was passed a NULL upl\n");
8447
8448	return FALSE;
8449}
8450
8451void
8452vector_upl_set_pagelist(upl_t upl)
8453{
8454	if(vector_upl_is_valid(upl)) {
8455		uint32_t i=0;
8456		vector_upl_t vector_upl = upl->vector_upl;
8457
8458		if(vector_upl) {
8459			vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
8460
8461			vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
8462
8463			for(i=0; i < vector_upl->num_upls; i++) {
8464				cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
8465				bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
8466				pagelist_size += cur_upl_pagelist_size;
8467				if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
8468					upl->highest_page = vector_upl->upl_elems[i]->highest_page;
8469			}
8470			assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
8471		}
8472		else
8473			panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
8474	}
8475	else
8476		panic("vector_upl_set_pagelist was passed a NULL upl\n");
8477
8478}
8479
8480upl_t
8481vector_upl_subupl_byindex(upl_t upl, uint32_t index)
8482{
8483	if(vector_upl_is_valid(upl)) {
8484		vector_upl_t vector_upl = upl->vector_upl;
8485		if(vector_upl) {
8486			if(index < vector_upl->num_upls)
8487				return vector_upl->upl_elems[index];
8488		}
8489		else
8490			panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
8491	}
8492	return NULL;
8493}
8494
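/*
 * vector_upl_subupl_byoffset:
 *	Given an offset/size pair within the vectored UPL, find the
 *	sub-UPL whose recorded iostate covers that offset, rebase
 *	*upl_offset and clip *upl_size to that sub-UPL, and return it.
 *	Returns NULL if the matching sub-UPL has already been
 *	committed/aborted, or if no match is found.
 */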
8495upl_t
8496vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
8497{
8498	if(vector_upl_is_valid(upl)) {
8499		uint32_t i=0;
8500		vector_upl_t vector_upl = upl->vector_upl;
8501
8502		if(vector_upl) {
8503			upl_t subupl = NULL;
8504			vector_upl_iostates_t subupl_state;
8505
8506			for(i=0; i < vector_upl->num_upls; i++) {
8507				subupl = vector_upl->upl_elems[i];
8508				subupl_state = vector_upl->upl_iostates[i];
8509				if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
					/* We could have been passed an offset/size pair that belongs
					 * to a UPL element that has already been committed/aborted.
					 * If so, return NULL.
					 */
8514					if(subupl == NULL)
8515						return NULL;
8516					if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
8517						*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
8518						if(*upl_size > subupl_state.size)
8519							*upl_size = subupl_state.size;
8520					}
8521					if(*upl_offset >= subupl_state.offset)
8522						*upl_offset -= subupl_state.offset;
8523					else if(i)
8524						panic("Vector UPL offset miscalculation\n");
8525					return subupl;
8526				}
8527			}
8528		}
8529		else
8530			panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
8531	}
8532	return NULL;
8533}
8534
8535void
8536vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
8537{
8538	*v_upl_submap = NULL;
8539
8540	if(vector_upl_is_valid(upl)) {
8541		vector_upl_t vector_upl = upl->vector_upl;
8542		if(vector_upl) {
8543			*v_upl_submap = vector_upl->submap;
8544			*submap_dst_addr = vector_upl->submap_dst_addr;
8545		}
8546		else
8547			panic("vector_upl_get_submap was passed a non-vectored UPL\n");
8548	}
8549	else
8550		panic("vector_upl_get_submap was passed a null UPL\n");
8551}
8552
8553void
8554vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
8555{
8556	if(vector_upl_is_valid(upl)) {
8557		vector_upl_t vector_upl = upl->vector_upl;
8558		if(vector_upl) {
8559			vector_upl->submap = submap;
8560			vector_upl->submap_dst_addr = submap_dst_addr;
8561		}
8562		else
8563			panic("vector_upl_get_submap was passed a non-vectored UPL\n");
8564	}
8565	else
8566		panic("vector_upl_get_submap was passed a NULL UPL\n");
8567}
8568
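/*
 * The "iostate" of a sub-UPL records the byte range it covers within
 * the vectored UPL, so that vector_upl_subupl_byoffset() can later map
 * a vector-relative offset back to the right sub-UPL.
 */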
8569void
8570vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
8571{
8572	if(vector_upl_is_valid(upl)) {
8573		uint32_t i = 0;
8574		vector_upl_t vector_upl = upl->vector_upl;
8575
8576		if(vector_upl) {
8577			for(i = 0; i < vector_upl->num_upls; i++) {
8578				if(vector_upl->upl_elems[i] == subupl)
8579					break;
8580			}
8581
8582			if(i == vector_upl->num_upls)
8583				panic("setting sub-upl iostate when none exists");
8584
8585			vector_upl->upl_iostates[i].offset = offset;
8586			if(size < PAGE_SIZE)
8587				size = PAGE_SIZE;
8588			vector_upl->upl_iostates[i].size = size;
8589		}
8590		else
8591			panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
8592	}
8593	else
8594		panic("vector_upl_set_iostate was passed a NULL UPL\n");
8595}
8596
8597void
8598vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
8599{
8600	if(vector_upl_is_valid(upl)) {
8601		uint32_t i = 0;
8602		vector_upl_t vector_upl = upl->vector_upl;
8603
8604		if(vector_upl) {
8605			for(i = 0; i < vector_upl->num_upls; i++) {
8606				if(vector_upl->upl_elems[i] == subupl)
8607					break;
8608			}
8609
8610			if(i == vector_upl->num_upls)
8611				panic("getting sub-upl iostate when none exists");
8612
8613			*offset = vector_upl->upl_iostates[i].offset;
8614			*size = vector_upl->upl_iostates[i].size;
8615		}
8616		else
8617			panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
8618	}
8619	else
8620		panic("vector_upl_get_iostate was passed a NULL UPL\n");
8621}
8622
8623void
8624vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
8625{
8626	if(vector_upl_is_valid(upl)) {
8627		vector_upl_t vector_upl = upl->vector_upl;
8628		if(vector_upl) {
8629			if(index < vector_upl->num_upls) {
8630				*offset = vector_upl->upl_iostates[index].offset;
8631				*size = vector_upl->upl_iostates[index].size;
8632			}
8633			else
8634				*offset = *size = 0;
8635		}
8636		else
8637			panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
8638	}
8639	else
8640		panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
8641}
8642
8643upl_page_info_t *
8644upl_get_internal_vectorupl_pagelist(upl_t upl)
8645{
8646	return ((vector_upl_t)(upl->vector_upl))->pagelist;
8647}
8648
8649void *
8650upl_get_internal_vectorupl(upl_t upl)
8651{
8652	return upl->vector_upl;
8653}
8654
8655vm_size_t
8656upl_get_internal_pagelist_offset(void)
8657{
8658	return sizeof(struct upl);
8659}
8660
8661void
8662upl_clear_dirty(
8663	upl_t		upl,
8664	boolean_t 	value)
8665{
8666	if (value) {
8667		upl->flags |= UPL_CLEAR_DIRTY;
8668	} else {
8669		upl->flags &= ~UPL_CLEAR_DIRTY;
8670	}
8671}
8672
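/*
 * upl_set_referenced:
 *	Take (value == TRUE) or drop (value == FALSE) an external
 *	reference on the UPL, under the UPL lock.  Dropping a reference
 *	that was never taken is a fatal error.
 */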
8673void
8674upl_set_referenced(
8675	upl_t		upl,
8676	boolean_t 	value)
8677{
8678	upl_lock(upl);
8679	if (value) {
8680		upl->ext_ref_count++;
8681	} else {
8682		if (!upl->ext_ref_count) {
			panic("upl_set_referenced: ext_ref_count underflow, upl %p\n", upl);
8684		}
8685		upl->ext_ref_count--;
8686	}
8687	upl_unlock(upl);
8688}
8689
8690boolean_t
8691vm_page_is_slideable(vm_page_t m)
8692{
8693	boolean_t result = FALSE;
8694	vm_shared_region_slide_info_t si;
8695
8696	vm_object_lock_assert_held(m->object);
8697
8698	/* make sure our page belongs to the one object allowed to do this */
8699	if (!m->object->object_slid) {
8700		goto done;
8701	}
8702
8703	si = m->object->vo_slide_info;
8704	if (si == NULL) {
8705		goto done;
8706	}
8707
8708	if(!m->slid && (si->start <= m->offset && si->end > m->offset)) {
8709		result = TRUE;
8710	}
8711
8712done:
8713	return result;
8714}
8715
8716int vm_page_slide_counter = 0;
8717int vm_page_slide_errors = 0;
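/*
 * vm_page_slide:
 *	Apply the shared-region slide to the page's contents via
 *	vm_shared_region_slide_page(), mapping the page into the kernel
 *	if the caller hasn't already done so.  On success the page is
 *	marked "slid"; on failure it is marked in error.
 */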
8718kern_return_t
8719vm_page_slide(
8720	vm_page_t	page,
8721	vm_map_offset_t	kernel_mapping_offset)
8722{
8723	kern_return_t		kr;
8724	vm_map_size_t		kernel_mapping_size;
8725	boolean_t		kernel_mapping_needs_unmap;
8726	vm_offset_t		kernel_vaddr;
8727	uint32_t		pageIndex = 0;
8728
8729	assert(!page->slid);
8730	assert(page->object->object_slid);
8731	vm_object_lock_assert_exclusive(page->object);
8732
8733	if (page->error)
8734		return KERN_FAILURE;
8735
8736	/*
8737	 * Take a paging-in-progress reference to keep the object
8738	 * alive even if we have to unlock it (in vm_paging_map_object()
8739	 * for example)...
8740	 */
8741	vm_object_paging_begin(page->object);
8742
8743	if (kernel_mapping_offset == 0) {
		/*
		 * The page hasn't already been mapped in kernel space
		 * by the caller.  Map it now, so that we can access
		 * its contents and slide them.
		 */
8749		kernel_mapping_size = PAGE_SIZE;
8750		kernel_mapping_needs_unmap = FALSE;
8751		kr = vm_paging_map_object(page,
8752					  page->object,
8753					  page->offset,
8754					  VM_PROT_READ | VM_PROT_WRITE,
8755					  FALSE,
8756					  &kernel_mapping_size,
8757					  &kernel_mapping_offset,
8758					  &kernel_mapping_needs_unmap);
8759		if (kr != KERN_SUCCESS) {
8760			panic("vm_page_slide: "
8761			      "could not map page in kernel: 0x%x\n",
8762			      kr);
8763		}
8764	} else {
8765		kernel_mapping_size = 0;
8766		kernel_mapping_needs_unmap = FALSE;
8767	}
8768	kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
8769
8770	/*
8771	 * Slide the pointers on the page.
8772	 */
8773
8774	/*assert that slide_file_info.start/end are page-aligned?*/
8775
8776	assert(!page->slid);
8777	assert(page->object->object_slid);
8778
8779	pageIndex = (uint32_t)((page->offset - page->object->vo_slide_info->start)/PAGE_SIZE);
8780	kr = vm_shared_region_slide_page(page->object->vo_slide_info, kernel_vaddr, pageIndex);
8781	vm_page_slide_counter++;
8782
8783	/*
8784	 * Unmap the page from the kernel's address space,
8785	 */
8786	if (kernel_mapping_needs_unmap) {
8787		vm_paging_unmap_object(page->object,
8788				       kernel_vaddr,
8789				       kernel_vaddr + PAGE_SIZE);
8790	}
8791
8792	page->dirty = FALSE;
8793	pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
8794
8795	if (kr != KERN_SUCCESS || cs_debug > 1) {
8796		printf("vm_page_slide(%p): "
8797		       "obj %p off 0x%llx mobj %p moff 0x%llx\n",
8798		       page,
8799		       page->object, page->offset,
8800		       page->object->pager,
8801		       page->offset + page->object->paging_offset);
8802	}
8803
8804	if (kr == KERN_SUCCESS) {
8805		page->slid = TRUE;
8806	} else {
8807		page->error = TRUE;
8808		vm_page_slide_errors++;
8809	}
8810
8811	vm_object_paging_end(page->object);
8812
8813	return kr;
8814}
8815
inline void memoryshot(unsigned int event, unsigned int control)
8817{
8818	if (vm_debug_events) {
8819		KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
8820					vm_page_active_count, vm_page_inactive_count,
8821					vm_page_free_count, vm_page_speculative_count,
8822					vm_page_throttled_count);
8823	} else {
8824		(void) event;
8825		(void) control;
8826	}
8827
8828}
8829
8830#ifdef MACH_BSD
8831
8832boolean_t  upl_device_page(upl_page_info_t *upl)
8833{
8834	return(UPL_DEVICE_PAGE(upl));
8835}
8836boolean_t  upl_page_present(upl_page_info_t *upl, int index)
8837{
8838	return(UPL_PAGE_PRESENT(upl, index));
8839}
8840boolean_t  upl_speculative_page(upl_page_info_t *upl, int index)
8841{
8842	return(UPL_SPECULATIVE_PAGE(upl, index));
8843}
8844boolean_t  upl_dirty_page(upl_page_info_t *upl, int index)
8845{
8846	return(UPL_DIRTY_PAGE(upl, index));
8847}
8848boolean_t  upl_valid_page(upl_page_info_t *upl, int index)
8849{
8850	return(UPL_VALID_PAGE(upl, index));
8851}
8852ppnum_t  upl_phys_page(upl_page_info_t *upl, int index)
8853{
8854	return(UPL_PHYS_PAGE(upl, index));
8855}
8856
8857
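/*
 * vm_countdirtypages:
 *	Debug helper: walk the inactive, throttled, anonymous and active
 *	page queues and print how many dirty, pageout and precious pages
 *	each group contains.
 */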
8858void
8859vm_countdirtypages(void)
8860{
8861	vm_page_t m;
8862	int dpages;
8863	int pgopages;
8864	int precpages;
8865
8866
8867	dpages=0;
8868	pgopages=0;
8869	precpages=0;
8870
8871	vm_page_lock_queues();
8872	m = (vm_page_t) queue_first(&vm_page_queue_inactive);
8873	do {
8874		if (m ==(vm_page_t )0) break;
8875
8876		if(m->dirty) dpages++;
8877		if(m->pageout) pgopages++;
8878		if(m->precious) precpages++;
8879
8880		assert(m->object != kernel_object);
8881		m = (vm_page_t) queue_next(&m->pageq);
8882		if (m ==(vm_page_t )0) break;
8883
8884	} while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
8885	vm_page_unlock_queues();
8886
8887	vm_page_lock_queues();
8888	m = (vm_page_t) queue_first(&vm_page_queue_throttled);
8889	do {
8890		if (m ==(vm_page_t )0) break;
8891
8892		dpages++;
8893		assert(m->dirty);
8894		assert(!m->pageout);
8895		assert(m->object != kernel_object);
8896		m = (vm_page_t) queue_next(&m->pageq);
8897		if (m ==(vm_page_t )0) break;
8898
8899	} while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
8900	vm_page_unlock_queues();
8901
8902	vm_page_lock_queues();
8903	m = (vm_page_t) queue_first(&vm_page_queue_anonymous);
8904	do {
8905		if (m ==(vm_page_t )0) break;
8906
8907		if(m->dirty) dpages++;
8908		if(m->pageout) pgopages++;
8909		if(m->precious) precpages++;
8910
8911		assert(m->object != kernel_object);
8912		m = (vm_page_t) queue_next(&m->pageq);
8913		if (m ==(vm_page_t )0) break;
8914
8915	} while (!queue_end(&vm_page_queue_anonymous,(queue_entry_t) m));
8916	vm_page_unlock_queues();
8917
8918	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
8919
8920	dpages=0;
8921	pgopages=0;
8922	precpages=0;
8923
8924	vm_page_lock_queues();
8925	m = (vm_page_t) queue_first(&vm_page_queue_active);
8926
8927	do {
8928		if(m == (vm_page_t )0) break;
8929		if(m->dirty) dpages++;
8930		if(m->pageout) pgopages++;
8931		if(m->precious) precpages++;
8932
8933		assert(m->object != kernel_object);
8934		m = (vm_page_t) queue_next(&m->pageq);
8935		if(m == (vm_page_t )0) break;
8936
8937	} while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
8938	vm_page_unlock_queues();
8939
8940	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
8941
8942}
8943#endif /* MACH_BSD */
8944
8945ppnum_t upl_get_highest_page(
8946			     upl_t			upl)
8947{
8948        return upl->highest_page;
8949}
8950
8951upl_size_t upl_get_size(
8952			     upl_t			upl)
8953{
8954        return upl->size;
8955}
8956
8957#if UPL_DEBUG
8958kern_return_t  upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
8959{
8960	upl->ubc_alias1 = alias1;
8961	upl->ubc_alias2 = alias2;
8962	return KERN_SUCCESS;
8963}
8964int  upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
8965{
8966	if(al)
8967		*al = upl->ubc_alias1;
8968	if(al2)
8969		*al2 = upl->ubc_alias2;
8970	return KERN_SUCCESS;
8971}
8972#endif /* UPL_DEBUG */
8973