1/*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 *	File:	vm/vm_pageout.c
60 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61 *	Date:	1985
62 *
63 *	The proverbial page-out daemon.
64 */
65
66#include <stdint.h>
67
68#include <debug.h>
69#include <mach_pagemap.h>
70#include <mach_cluster_stats.h>
71#include <mach_kdb.h>
72#include <advisory_pageout.h>
73
74#include <mach/mach_types.h>
75#include <mach/memory_object.h>
76#include <mach/memory_object_default.h>
77#include <mach/memory_object_control_server.h>
78#include <mach/mach_host_server.h>
79#include <mach/upl.h>
80#include <mach/vm_map.h>
81#include <mach/vm_param.h>
82#include <mach/vm_statistics.h>
83#include <mach/sdt.h>
84
85#include <kern/kern_types.h>
86#include <kern/counters.h>
87#include <kern/host_statistics.h>
88#include <kern/machine.h>
89#include <kern/misc_protos.h>
90#include <kern/thread.h>
91#include <kern/xpr.h>
92#include <kern/kalloc.h>
93
94#include <machine/vm_tuning.h>
95
96#if CONFIG_EMBEDDED
97#include <sys/kern_memorystatus.h>
98#endif
99
100#include <vm/pmap.h>
101#include <vm/vm_fault.h>
102#include <vm/vm_map.h>
103#include <vm/vm_object.h>
104#include <vm/vm_page.h>
105#include <vm/vm_pageout.h>
106#include <vm/vm_protos.h> /* must be last */
107#include <vm/memory_object.h>
108#include <vm/vm_purgeable_internal.h>
109
110/*
111 * ENCRYPTED SWAP:
112 */
113#include <../bsd/crypto/aes/aes.h>
114
115
116#ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE   /* maximum iterations of the active queue to move pages to inactive */
117#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  100
118#endif
119
120#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
121#ifdef	CONFIG_EMBEDDED
122#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
123#else
124#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
125#endif
126#endif
127
128#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
129#define VM_PAGEOUT_DEADLOCK_RELIEF 100	/* number of pages to move to break deadlock */
130#endif
131
132#ifndef VM_PAGEOUT_INACTIVE_RELIEF
133#define VM_PAGEOUT_INACTIVE_RELIEF 50	/* minimum number of pages to move to the inactive q */
134#endif
135
136#ifndef	VM_PAGE_LAUNDRY_MAX
137#define	VM_PAGE_LAUNDRY_MAX	16UL	/* maximum pageouts on a given pageout queue */
138#endif	/* VM_PAGEOUT_LAUNDRY_MAX */
139
140#ifndef	VM_PAGEOUT_BURST_WAIT
141#define	VM_PAGEOUT_BURST_WAIT	30	/* milliseconds per page */
142#endif	/* VM_PAGEOUT_BURST_WAIT */
143
144#ifndef	VM_PAGEOUT_EMPTY_WAIT
145#define VM_PAGEOUT_EMPTY_WAIT	200	/* milliseconds */
146#endif	/* VM_PAGEOUT_EMPTY_WAIT */
147
148#ifndef	VM_PAGEOUT_DEADLOCK_WAIT
149#define VM_PAGEOUT_DEADLOCK_WAIT	300	/* milliseconds */
150#endif	/* VM_PAGEOUT_DEADLOCK_WAIT */
151
152#ifndef	VM_PAGEOUT_IDLE_WAIT
153#define VM_PAGEOUT_IDLE_WAIT	10	/* milliseconds */
154#endif	/* VM_PAGEOUT_IDLE_WAIT */
155
156#ifndef VM_PAGE_SPECULATIVE_TARGET
157#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20)
158#endif /* VM_PAGE_SPECULATIVE_TARGET */
159
160#ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
161#define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
162#endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
163
164
165/*
166 *	To obtain a reasonable LRU approximation, the inactive queue
167 *	needs to be large enough to give pages on it a chance to be
168 *	referenced a second time.  This macro defines the fraction
169 *	of active+inactive pages that should be inactive.
170 *	The pageout daemon uses it to update vm_page_inactive_target.
171 *
172 *	If vm_page_free_count falls below vm_page_free_target and
173 *	vm_page_inactive_count is below vm_page_inactive_target,
174 *	then the pageout daemon starts running.
175 */
176
177#ifndef	VM_PAGE_INACTIVE_TARGET
178#define	VM_PAGE_INACTIVE_TARGET(avail)	((avail) * 1 / 3)
179#endif	/* VM_PAGE_INACTIVE_TARGET */
180
181/*
182 *	Once the pageout daemon starts running, it keeps going
183 *	until vm_page_free_count meets or exceeds vm_page_free_target.
184 */
185
186#ifndef	VM_PAGE_FREE_TARGET
187#ifdef	CONFIG_EMBEDDED
188#define	VM_PAGE_FREE_TARGET(free)	(15 + (free) / 100)
189#else
190#define	VM_PAGE_FREE_TARGET(free)	(15 + (free) / 80)
191#endif
192#endif	/* VM_PAGE_FREE_TARGET */
193
194/*
195 *	The pageout daemon always starts running once vm_page_free_count
196 *	falls below vm_page_free_min.
197 */
198
199#ifndef	VM_PAGE_FREE_MIN
200#ifdef	CONFIG_EMBEDDED
201#define	VM_PAGE_FREE_MIN(free)		(10 + (free) / 200)
202#else
203#define	VM_PAGE_FREE_MIN(free)		(10 + (free) / 100)
204#endif
205#endif	/* VM_PAGE_FREE_MIN */
206
207#define VM_PAGE_FREE_MIN_LIMIT		1500
208#define VM_PAGE_FREE_TARGET_LIMIT	2000
209
210
211/*
212 *	When vm_page_free_count falls below vm_page_free_reserved,
213 *	only vm-privileged threads can allocate pages.  vm-privilege
214 *	allows the pageout daemon and default pager (and any other
215 *	associated threads needed for default pageout) to continue
216 *	operation by dipping into the reserved pool of pages.
217 */
218
219#ifndef	VM_PAGE_FREE_RESERVED
220#define	VM_PAGE_FREE_RESERVED(n)	\
221	((6 * VM_PAGE_LAUNDRY_MAX) + (n))
222#endif	/* VM_PAGE_FREE_RESERVED */
223
224/*
225 *	When we dequeue pages from the inactive list, they are
226 *	reactivated (ie, put back on the active queue) if referenced.
227 *	However, it is possible to starve the free list if other
228 *	processors are referencing pages faster than we can turn off
229 *	the referenced bit.  So we limit the number of reactivations
230 *	we will make per call of vm_pageout_scan().
231 */
232#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
233#ifndef	VM_PAGE_REACTIVATE_LIMIT
234#ifdef	CONFIG_EMBEDDED
235#define	VM_PAGE_REACTIVATE_LIMIT(avail)	(VM_PAGE_INACTIVE_TARGET(avail) / 2)
236#else
237#define	VM_PAGE_REACTIVATE_LIMIT(avail)	(MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
238#endif
239#endif	/* VM_PAGE_REACTIVATE_LIMIT */
240#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM	100
241
242
243/*
244 * must hold the page queues lock to
245 * manipulate this structure
246 */
247struct vm_pageout_queue {
248        queue_head_t	pgo_pending;	/* laundry pages to be processed by pager's iothread */
249        unsigned int	pgo_laundry;	/* current count of laundry pages on queue or in flight */
250        unsigned int	pgo_maxlaundry;
251
252        unsigned int	pgo_idle:1,	/* iothread is blocked waiting for work to do */
253	                pgo_busy:1,     /* iothread is currently processing request from pgo_pending */
254			pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
255			:0;
256};
257
258#define VM_PAGE_Q_THROTTLED(q)		\
259        ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
260
261
262/*
263 * Exported variable used to broadcast the activation of the pageout scan
264 * Working Set uses this to throttle its use of pmap removes.  In this
265 * way, code which runs within memory in an uncontested context does
266 * not keep encountering soft faults.
267 */
268
269unsigned int	vm_pageout_scan_event_counter = 0;
270
271/*
272 * Forward declarations for internal routines.
273 */
274
275static void vm_pageout_garbage_collect(int);
276static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
277static void vm_pageout_iothread_external(void);
278static void vm_pageout_iothread_internal(void);
279static void vm_pageout_queue_steal(vm_page_t);
280
281extern void vm_pageout_continue(void);
282extern void vm_pageout_scan(void);
283
284static thread_t	vm_pageout_external_iothread = THREAD_NULL;
285static thread_t	vm_pageout_internal_iothread = THREAD_NULL;
286
287unsigned int vm_pageout_reserved_internal = 0;
288unsigned int vm_pageout_reserved_really = 0;
289
290unsigned int vm_pageout_idle_wait = 0;		/* milliseconds */
291unsigned int vm_pageout_empty_wait = 0;		/* milliseconds */
292unsigned int vm_pageout_burst_wait = 0;		/* milliseconds */
293unsigned int vm_pageout_deadlock_wait = 0;	/* milliseconds */
294unsigned int vm_pageout_deadlock_relief = 0;
295unsigned int vm_pageout_inactive_relief = 0;
296unsigned int vm_pageout_burst_active_throttle = 0;
297unsigned int vm_pageout_burst_inactive_throttle = 0;
298
299/*
300 *	Protection against zero fill flushing live working sets derived
301 *	from existing backing store and files
302 */
303unsigned int vm_accellerate_zf_pageout_trigger = 400;
304unsigned int zf_queue_min_count = 100;
305unsigned int vm_zf_count = 0;
306unsigned int vm_zf_queue_count = 0;
307
308/*
309 *	These variables record the pageout daemon's actions:
310 *	how many pages it looks at and what happens to those pages.
311 *	No locking needed because only one thread modifies the variables.
312 */
313
314unsigned int vm_pageout_active = 0;		/* debugging */
315unsigned int vm_pageout_inactive = 0;		/* debugging */
316unsigned int vm_pageout_inactive_throttled = 0;	/* debugging */
317unsigned int vm_pageout_inactive_forced = 0;	/* debugging */
318unsigned int vm_pageout_inactive_nolock = 0;	/* debugging */
319unsigned int vm_pageout_inactive_avoid = 0;	/* debugging */
320unsigned int vm_pageout_inactive_busy = 0;	/* debugging */
321unsigned int vm_pageout_inactive_absent = 0;	/* debugging */
322unsigned int vm_pageout_inactive_used = 0;	/* debugging */
323unsigned int vm_pageout_inactive_clean = 0;	/* debugging */
324unsigned int vm_pageout_inactive_dirty = 0;	/* debugging */
325unsigned int vm_pageout_dirty_no_pager = 0;	/* debugging */
326unsigned int vm_pageout_purged_objects = 0;	/* debugging */
327unsigned int vm_stat_discard = 0;		/* debugging */
328unsigned int vm_stat_discard_sent = 0;		/* debugging */
329unsigned int vm_stat_discard_failure = 0;	/* debugging */
330unsigned int vm_stat_discard_throttle = 0;	/* debugging */
331unsigned int vm_pageout_reactivation_limit_exceeded = 0;	/* debugging */
332unsigned int vm_pageout_catch_ups = 0;				/* debugging */
333unsigned int vm_pageout_inactive_force_reclaim = 0;	/* debugging */
334
335unsigned int vm_pageout_scan_active_throttled = 0;
336unsigned int vm_pageout_scan_inactive_throttled = 0;
337unsigned int vm_pageout_scan_throttle = 0;			/* debugging */
338unsigned int vm_pageout_scan_burst_throttle = 0;		/* debugging */
339unsigned int vm_pageout_scan_empty_throttle = 0;		/* debugging */
340unsigned int vm_pageout_scan_deadlock_detected = 0;		/* debugging */
341unsigned int vm_pageout_scan_active_throttle_success = 0;	/* debugging */
342unsigned int vm_pageout_scan_inactive_throttle_success = 0;	/* debugging */
343/*
344 * Backing store throttle when BS is exhausted
345 */
346unsigned int	vm_backing_store_low = 0;
347
348unsigned int vm_pageout_out_of_line  = 0;
349unsigned int vm_pageout_in_place  = 0;
350
351/*
352 * ENCRYPTED SWAP:
353 * counters and statistics...
354 */
355unsigned long vm_page_decrypt_counter = 0;
356unsigned long vm_page_decrypt_for_upl_counter = 0;
357unsigned long vm_page_encrypt_counter = 0;
358unsigned long vm_page_encrypt_abort_counter = 0;
359unsigned long vm_page_encrypt_already_encrypted_counter = 0;
360boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
361
362struct	vm_pageout_queue vm_pageout_queue_internal;
363struct	vm_pageout_queue vm_pageout_queue_external;
364
365unsigned int vm_page_speculative_target = 0;
366
367vm_object_t 	vm_pageout_scan_wants_object = VM_OBJECT_NULL;
368
369unsigned long vm_cs_validated_resets = 0;
370
371/*
372 *	Routine:	vm_backing_store_disable
373 *	Purpose:
374 *		Suspend non-privileged threads wishing to extend
375 *		backing store when we are low on backing store
376 *		(Synchronized by caller)
377 */
378void
379vm_backing_store_disable(
380	boolean_t	disable)
381{
382	if(disable) {
383		vm_backing_store_low = 1;
384	} else {
385		if(vm_backing_store_low) {
386			vm_backing_store_low = 0;
387			thread_wakeup((event_t) &vm_backing_store_low);
388		}
389	}
390}
391
392
393#if MACH_CLUSTER_STATS
394unsigned long vm_pageout_cluster_dirtied = 0;
395unsigned long vm_pageout_cluster_cleaned = 0;
396unsigned long vm_pageout_cluster_collisions = 0;
397unsigned long vm_pageout_cluster_clusters = 0;
398unsigned long vm_pageout_cluster_conversions = 0;
399unsigned long vm_pageout_target_collisions = 0;
400unsigned long vm_pageout_target_page_dirtied = 0;
401unsigned long vm_pageout_target_page_freed = 0;
402#define CLUSTER_STAT(clause)	clause
403#else	/* MACH_CLUSTER_STATS */
404#define CLUSTER_STAT(clause)
405#endif	/* MACH_CLUSTER_STATS */
406
407/*
408 *	Routine:	vm_pageout_object_terminate
409 *	Purpose:
410 *		Destroy the pageout_object, and perform all of the
411 *		required cleanup actions.
412 *
413 *	In/Out conditions:
414 *		The object must be locked, and will be returned locked.
415 */
416void
417vm_pageout_object_terminate(
418	vm_object_t	object)
419{
420	vm_object_t	shadow_object;
421
422	/*
423	 * Deal with the deallocation (last reference) of a pageout object
424	 * (used for cleaning-in-place) by dropping the paging references/
425	 * freeing pages in the original object.
426	 */
427
428	assert(object->pageout);
429	shadow_object = object->shadow;
430	vm_object_lock(shadow_object);
431
432	while (!queue_empty(&object->memq)) {
433		vm_page_t 		p, m;
434		vm_object_offset_t	offset;
435
436		p = (vm_page_t) queue_first(&object->memq);
437
438		assert(p->private);
439		assert(p->pageout);
440		p->pageout = FALSE;
441		assert(!p->cleaning);
442
443		offset = p->offset;
444		VM_PAGE_FREE(p);
445		p = VM_PAGE_NULL;
446
447		m = vm_page_lookup(shadow_object,
448			offset + object->shadow_offset);
449
450		if(m == VM_PAGE_NULL)
451			continue;
452		assert(m->cleaning);
453		/* used as a trigger on upl_commit etc to recognize the */
454		/* pageout daemon's subseqent desire to pageout a cleaning */
455		/* page.  When the bit is on the upl commit code will   */
456		/* respect the pageout bit in the target page over the  */
457		/* caller's page list indication */
458		m->dump_cleaning = FALSE;
459
460		assert((m->dirty) || (m->precious) ||
461				(m->busy && m->cleaning));
462
463		/*
464		 * Handle the trusted pager throttle.
465		 * Also decrement the burst throttle (if external).
466		 */
467		vm_page_lock_queues();
468		if (m->laundry) {
469			vm_pageout_throttle_up(m);
470		}
471
472		/*
473		 * Handle the "target" page(s). These pages are to be freed if
474		 * successfully cleaned. Target pages are always busy, and are
475		 * wired exactly once. The initial target pages are not mapped,
476		 * (so cannot be referenced or modified) but converted target
477		 * pages may have been modified between the selection as an
478		 * adjacent page and conversion to a target.
479		 */
480		if (m->pageout) {
481			assert(m->busy);
482			assert(m->wire_count == 1);
483			m->cleaning = FALSE;
484			m->encrypted_cleaning = FALSE;
485			m->pageout = FALSE;
486#if MACH_CLUSTER_STATS
487			if (m->wanted) vm_pageout_target_collisions++;
488#endif
489			/*
490			 * Revoke all access to the page. Since the object is
491			 * locked, and the page is busy, this prevents the page
492			 * from being dirtied after the pmap_disconnect() call
493			 * returns.
494			 *
495			 * Since the page is left "dirty" but "not modifed", we
496			 * can detect whether the page was redirtied during
497			 * pageout by checking the modify state.
498			 */
499			if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
500			      m->dirty = TRUE;
501			else
502			      m->dirty = FALSE;
503
504			if (m->dirty) {
505				CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
506				vm_page_unwire(m);/* reactivates */
507				VM_STAT_INCR(reactivations);
508				PAGE_WAKEUP_DONE(m);
509			} else {
510				CLUSTER_STAT(vm_pageout_target_page_freed++;)
511				vm_page_free(m);/* clears busy, etc. */
512			}
513			vm_page_unlock_queues();
514			continue;
515		}
516		/*
517		 * Handle the "adjacent" pages. These pages were cleaned in
518		 * place, and should be left alone.
519		 * If prep_pin_count is nonzero, then someone is using the
520		 * page, so make it active.
521		 */
522		if (!m->active && !m->inactive && !m->throttled && !m->private) {
523			if (m->reference)
524				vm_page_activate(m);
525			else
526				vm_page_deactivate(m);
527		}
528		if((m->busy) && (m->cleaning)) {
529
530			/* the request_page_list case, (COPY_OUT_FROM FALSE) */
531			m->busy = FALSE;
532
533			/* We do not re-set m->dirty ! */
534			/* The page was busy so no extraneous activity     */
535			/* could have occurred. COPY_INTO is a read into the */
536			/* new pages. CLEAN_IN_PLACE does actually write   */
537			/* out the pages but handling outside of this code */
538			/* will take care of resetting dirty. We clear the */
539			/* modify however for the Programmed I/O case.     */
540			pmap_clear_modify(m->phys_page);
541
542			m->absent = FALSE;
543			m->overwriting = FALSE;
544		} else if (m->overwriting) {
545			/* alternate request page list, write to page_list */
546			/* case.  Occurs when the original page was wired  */
547			/* at the time of the list request */
548			assert(m->wire_count != 0);
549			vm_page_unwire(m);/* reactivates */
550			m->overwriting = FALSE;
551		} else {
552		/*
553		 * Set the dirty state according to whether or not the page was
554		 * modified during the pageout. Note that we purposefully do
555		 * NOT call pmap_clear_modify since the page is still mapped.
556		 * If the page were to be dirtied between the 2 calls, this
557		 * this fact would be lost. This code is only necessary to
558		 * maintain statistics, since the pmap module is always
559		 * consulted if m->dirty is false.
560		 */
561#if MACH_CLUSTER_STATS
562			m->dirty = pmap_is_modified(m->phys_page);
563
564			if (m->dirty)	vm_pageout_cluster_dirtied++;
565			else		vm_pageout_cluster_cleaned++;
566			if (m->wanted)	vm_pageout_cluster_collisions++;
567#else
568			m->dirty = 0;
569#endif
570		}
571		m->cleaning = FALSE;
572		m->encrypted_cleaning = FALSE;
573
574		/*
575		 * Wakeup any thread waiting for the page to be un-cleaning.
576		 */
577		PAGE_WAKEUP(m);
578		vm_page_unlock_queues();
579	}
580	/*
581	 * Account for the paging reference taken in vm_paging_object_allocate.
582	 */
583	vm_object_paging_end(shadow_object);
584	vm_object_unlock(shadow_object);
585
586	assert(object->ref_count == 0);
587	assert(object->paging_in_progress == 0);
588	assert(object->resident_page_count == 0);
589	return;
590}
591
592/*
593 * Routine:	vm_pageclean_setup
594 *
595 * Purpose:	setup a page to be cleaned (made non-dirty), but not
596 *		necessarily flushed from the VM page cache.
597 *		This is accomplished by cleaning in place.
598 *
599 *		The page must not be busy, and the object and page
600 *		queues must be locked.
601 *
602 */
603void
604vm_pageclean_setup(
605	vm_page_t		m,
606	vm_page_t		new_m,
607	vm_object_t		new_object,
608	vm_object_offset_t	new_offset)
609{
610	assert(!m->busy);
611#if 0
612	assert(!m->cleaning);
613#endif
614
615	XPR(XPR_VM_PAGEOUT,
616    "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
617		(integer_t)m->object, m->offset, (integer_t)m,
618		(integer_t)new_m, new_offset);
619
620	pmap_clear_modify(m->phys_page);
621
622	/*
623	 * Mark original page as cleaning in place.
624	 */
625	m->cleaning = TRUE;
626	m->dirty = TRUE;
627	m->precious = FALSE;
628
629	/*
630	 * Convert the fictitious page to a private shadow of
631	 * the real page.
632	 */
633	assert(new_m->fictitious);
634	assert(new_m->phys_page == vm_page_fictitious_addr);
635	new_m->fictitious = FALSE;
636	new_m->private = TRUE;
637	new_m->pageout = TRUE;
638	new_m->phys_page = m->phys_page;
639	vm_page_wire(new_m);
640
641	vm_page_insert(new_m, new_object, new_offset);
642	assert(!new_m->wanted);
643	new_m->busy = FALSE;
644}
645
646/*
647 *	Routine:	vm_pageout_initialize_page
648 *	Purpose:
649 *		Causes the specified page to be initialized in
650 *		the appropriate memory object. This routine is used to push
651 *		pages into a copy-object when they are modified in the
652 *		permanent object.
653 *
654 *		The page is moved to a temporary object and paged out.
655 *
656 *	In/out conditions:
657 *		The page in question must not be on any pageout queues.
658 *		The object to which it belongs must be locked.
659 *		The page must be busy, but not hold a paging reference.
660 *
661 *	Implementation:
662 *		Move this page to a completely new object.
663 */
664void
665vm_pageout_initialize_page(
666	vm_page_t	m)
667{
668	vm_object_t		object;
669	vm_object_offset_t	paging_offset;
670	vm_page_t		holding_page;
671	memory_object_t		pager;
672
673	XPR(XPR_VM_PAGEOUT,
674		"vm_pageout_initialize_page, page 0x%X\n",
675		(integer_t)m, 0, 0, 0, 0);
676	assert(m->busy);
677
678	/*
679	 *	Verify that we really want to clean this page
680	 */
681	assert(!m->absent);
682	assert(!m->error);
683	assert(m->dirty);
684
685	/*
686	 *	Create a paging reference to let us play with the object.
687	 */
688	object = m->object;
689	paging_offset = m->offset + object->paging_offset;
690
691	if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
692		VM_PAGE_FREE(m);
693		panic("reservation without pageout?"); /* alan */
694		vm_object_unlock(object);
695
696		return;
697	}
698
699	/*
700	 * If there's no pager, then we can't clean the page.  This should
701	 * never happen since this should be a copy object and therefore not
702	 * an external object, so the pager should always be there.
703	 */
704
705	pager = object->pager;
706
707	if (pager == MEMORY_OBJECT_NULL) {
708		VM_PAGE_FREE(m);
709		panic("missing pager for copy object");
710		return;
711	}
712
713	/* set the page for future call to vm_fault_list_request */
714	vm_object_paging_begin(object);
715	holding_page = NULL;
716	vm_page_lock_queues();
717	pmap_clear_modify(m->phys_page);
718	m->dirty = TRUE;
719	m->busy = TRUE;
720	m->list_req_pending = TRUE;
721	m->cleaning = TRUE;
722	m->pageout = TRUE;
723	vm_page_wire(m);
724	vm_page_unlock_queues();
725	vm_object_unlock(object);
726
727	/*
728	 *	Write the data to its pager.
729	 *	Note that the data is passed by naming the new object,
730	 *	not a virtual address; the pager interface has been
731	 *	manipulated to use the "internal memory" data type.
732	 *	[The object reference from its allocation is donated
733	 *	to the eventual recipient.]
734	 */
735	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
736
737	vm_object_lock(object);
738	vm_object_paging_end(object);
739}
740
741#if	MACH_CLUSTER_STATS
742#define MAXCLUSTERPAGES	16
743struct {
744	unsigned long pages_in_cluster;
745	unsigned long pages_at_higher_offsets;
746	unsigned long pages_at_lower_offsets;
747} cluster_stats[MAXCLUSTERPAGES];
748#endif	/* MACH_CLUSTER_STATS */
749
750
751/*
752 * vm_pageout_cluster:
753 *
754 * Given a page, queue it to the appropriate I/O thread,
755 * which will page it out and attempt to clean adjacent pages
756 * in the same operation.
757 *
758 * The page must be busy, and the object and queues locked. We will take a
759 * paging reference to prevent deallocation or collapse when we
760 * release the object lock back at the call site.  The I/O thread
761 * is responsible for consuming this reference
762 *
763 * The page must not be on any pageout queue.
764 */
765
766void
767vm_pageout_cluster(vm_page_t m)
768{
769	vm_object_t	object = m->object;
770        struct		vm_pageout_queue *q;
771
772
773	XPR(XPR_VM_PAGEOUT,
774		"vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
775		(integer_t)object, m->offset, (integer_t)m, 0, 0);
776
777	/*
778	 * Only a certain kind of page is appreciated here.
779	 */
780	assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
781	assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
782	assert(!m->throttled);
783
784	/*
785	 * protect the object from collapse -
786	 * locking in the object's paging_offset.
787	 */
788	vm_object_paging_begin(object);
789
790	/*
791	 * set the page for future call to vm_fault_list_request
792	 * page should already be marked busy
793	 */
794	vm_page_wire(m);
795	m->list_req_pending = TRUE;
796	m->cleaning = TRUE;
797	m->pageout = TRUE;
798        m->laundry = TRUE;
799
800	if (object->internal == TRUE)
801	        q = &vm_pageout_queue_internal;
802	else
803	        q = &vm_pageout_queue_external;
804	q->pgo_laundry++;
805
806	m->pageout_queue = TRUE;
807	queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
808
809	if (q->pgo_idle == TRUE) {
810	        q->pgo_idle = FALSE;
811	        thread_wakeup((event_t) &q->pgo_pending);
812	}
813}
814
815
816unsigned long vm_pageout_throttle_up_count = 0;
817
818/*
819 * A page is back from laundry.  See if there are some pages waiting to
820 * go to laundry and if we can let some of them go now.
821 *
822 * Object and page queues must be locked.
823 */
824void
825vm_pageout_throttle_up(
826	vm_page_t	m)
827{
828        struct vm_pageout_queue *q;
829
830	vm_pageout_throttle_up_count++;
831
832	assert(m->laundry);
833	assert(m->object != VM_OBJECT_NULL);
834	assert(m->object != kernel_object);
835
836	if (m->object->internal == TRUE)
837	        q = &vm_pageout_queue_internal;
838	else
839	        q = &vm_pageout_queue_external;
840
841	m->laundry = FALSE;
842	q->pgo_laundry--;
843
844	if (q->pgo_throttled == TRUE) {
845	        q->pgo_throttled = FALSE;
846	        thread_wakeup((event_t) &q->pgo_laundry);
847	}
848}
849
850
851/*
852 *	vm_pageout_scan does the dirty work for the pageout daemon.
853 *	It returns with vm_page_queue_free_lock held and
854 *	vm_page_free_wanted == 0.
855 */
856
857#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT  (3 * MAX_UPL_TRANSFER)
858
859#define	FCS_IDLE		0
860#define FCS_DELAYED		1
861#define FCS_DEADLOCK_DETECTED	2
862
863struct flow_control {
864        int		state;
865        mach_timespec_t	ts;
866};
867
868void
869vm_pageout_scan(void)
870{
871	unsigned int loop_count = 0;
872	unsigned int inactive_burst_count = 0;
873	unsigned int active_burst_count = 0;
874	unsigned int reactivated_this_call;
875	unsigned int reactivate_limit;
876	vm_page_t   local_freeq = NULL;
877	int         local_freed = 0;
878	int         delayed_unlock;
879	int         need_internal_inactive = 0;
880	int	    refmod_state = 0;
881        int	vm_pageout_deadlock_target = 0;
882	struct	vm_pageout_queue *iq;
883	struct	vm_pageout_queue *eq;
884        struct	vm_speculative_age_q *sq;
885	struct  flow_control	flow_control;
886        boolean_t inactive_throttled = FALSE;
887	boolean_t try_failed;
888	mach_timespec_t		ts;
889	unsigned int msecs = 0;
890	vm_object_t	object;
891	vm_object_t	last_object_tried;
892	int	zf_ratio;
893	int	zf_run_count;
894	uint32_t	catch_up_count = 0;
895	uint32_t	inactive_reclaim_run;
896	boolean_t	forced_reclaim;
897
898	flow_control.state = FCS_IDLE;
899	iq = &vm_pageout_queue_internal;
900	eq = &vm_pageout_queue_external;
901	sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
902
903
904        XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
905
906
907	vm_page_lock_queues();
908	delayed_unlock = 1;	/* must be nonzero if Qs are locked, 0 if unlocked */
909
910	/*
911	 *	Calculate the max number of referenced pages on the inactive
912	 *	queue that we will reactivate.
913	 */
914	reactivated_this_call = 0;
915	reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
916						    vm_page_inactive_count);
917	inactive_reclaim_run = 0;
918
919
920/*???*/	/*
921	 *	We want to gradually dribble pages from the active queue
922	 *	to the inactive queue.  If we let the inactive queue get
923	 *	very small, and then suddenly dump many pages into it,
924	 *	those pages won't get a sufficient chance to be referenced
925	 *	before we start taking them from the inactive queue.
926	 *
927	 *	We must limit the rate at which we send pages to the pagers.
928	 *	data_write messages consume memory, for message buffers and
929	 *	for map-copy objects.  If we get too far ahead of the pagers,
930	 *	we can potentially run out of memory.
931	 *
932	 *	We can use the laundry count to limit directly the number
933	 *	of pages outstanding to the default pager.  A similar
934	 *	strategy for external pagers doesn't work, because
935	 *	external pagers don't have to deallocate the pages sent them,
936	 *	and because we might have to send pages to external pagers
937	 *	even if they aren't processing writes.  So we also
938	 *	use a burst count to limit writes to external pagers.
939	 *
940	 *	When memory is very tight, we can't rely on external pagers to
941	 *	clean pages.  They probably aren't running, because they
942	 *	aren't vm-privileged.  If we kept sending dirty pages to them,
943	 *	we could exhaust the free list.
944	 */
945
946
947Restart:
948	assert(delayed_unlock!=0);
949
950	/*
951	 *	A page is "zero-filled" if it was not paged in from somewhere,
952	 *	and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
953	 *	Recalculate the zero-filled page ratio.  We use this to apportion
954	 *	victimized pages between the normal and zero-filled inactive
955	 *	queues according to their relative abundance in memory.  Thus if a task
956	 *	is flooding memory with zf pages, we begin to hunt them down.
957	 *	It would be better to throttle greedy tasks at a higher level,
958	 *	but at the moment mach vm cannot do this.
959	 */
960	{
961		uint32_t  total  = vm_page_active_count + vm_page_inactive_count;
962		uint32_t  normal = total - vm_zf_count;
963
964		/* zf_ratio is the number of zf pages we victimize per normal page */
965
966		if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
967			zf_ratio = 0;
968		else if ((vm_zf_count <= normal) || (normal == 0))
969			zf_ratio = 1;
970		else
971			zf_ratio = vm_zf_count / normal;
972
973		zf_run_count = 0;
974	}
975
976	/*
977	 *	Recalculate vm_page_inactivate_target.
978	 */
979	vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
980							  vm_page_inactive_count +
981							  vm_page_speculative_count);
982	/*
983	 * don't want to wake the pageout_scan thread up everytime we fall below
984	 * the targets... set a low water mark at 0.25% below the target
985	 */
986	vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
987
988	vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
989								vm_page_inactive_count);
990	object = NULL;
991	last_object_tried = NULL;
992	try_failed = FALSE;
993
994	if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
995	        catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
996	else
997	        catch_up_count = 0;
998
999	for (;;) {
1000		vm_page_t m;
1001
1002		DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1003
1004		if (delayed_unlock == 0) {
1005		        vm_page_lock_queues();
1006			delayed_unlock = 1;
1007		}
1008
1009		/*
1010		 *	Don't sweep through active queue more than the throttle
1011		 *	which should be kept relatively low
1012		 */
1013		active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count);
1014
1015		/*
1016		 *	Move pages from active to inactive.
1017		 */
1018		if (need_internal_inactive == 0 && (vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1019		        goto done_moving_active_pages;
1020
1021		while (!queue_empty(&vm_page_queue_active) &&
1022		       (need_internal_inactive || active_burst_count)) {
1023
1024		        if (active_burst_count)
1025			       active_burst_count--;
1026
1027			vm_pageout_active++;
1028
1029			m = (vm_page_t) queue_first(&vm_page_queue_active);
1030
1031			assert(m->active && !m->inactive);
1032			assert(!m->laundry);
1033			assert(m->object != kernel_object);
1034			assert(m->phys_page != vm_page_guard_addr);
1035
1036			DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1037
1038			/*
1039			 * Try to lock object; since we've already got the
1040			 * page queues lock, we can only 'try' for this one.
1041			 * if the 'try' fails, we need to do a mutex_pause
1042			 * to allow the owner of the object lock a chance to
1043			 * run... otherwise, we're likely to trip over this
1044			 * object in the same state as we work our way through
1045			 * the queue... clumps of pages associated with the same
1046			 * object are fairly typical on the inactive and active queues
1047			 */
1048			if (m->object != object) {
1049			        if (object != NULL) {
1050				        vm_object_unlock(object);
1051					object = NULL;
1052					vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1053				}
1054			        if (!vm_object_lock_try_scan(m->object)) {
1055				        /*
1056					 * move page to end of active queue and continue
1057					 */
1058				        queue_remove(&vm_page_queue_active, m,
1059						     vm_page_t, pageq);
1060					queue_enter(&vm_page_queue_active, m,
1061						    vm_page_t, pageq);
1062
1063					try_failed = TRUE;
1064
1065					m = (vm_page_t) queue_first(&vm_page_queue_active);
1066					/*
1067					 * this is the next object we're going to be interested in
1068					 * try to make sure its available after the mutex_yield
1069					 * returns control
1070					 */
1071					vm_pageout_scan_wants_object = m->object;
1072
1073					goto done_with_activepage;
1074				}
1075				object = m->object;
1076
1077				try_failed = FALSE;
1078			}
1079
1080			/*
1081			 * if the page is BUSY, then we pull it
1082			 * off the active queue and leave it alone.
1083			 * when BUSY is cleared, it will get stuck
1084			 * back on the appropriate queue
1085			 */
1086			if (m->busy) {
1087				queue_remove(&vm_page_queue_active, m,
1088					     vm_page_t, pageq);
1089				m->pageq.next = NULL;
1090				m->pageq.prev = NULL;
1091
1092				if (!m->fictitious)
1093					vm_page_active_count--;
1094				m->active = FALSE;
1095
1096				goto done_with_activepage;
1097			}
1098
1099			/*
1100			 *	Deactivate the page while holding the object
1101			 *	locked, so we know the page is still not busy.
1102			 *	This should prevent races between pmap_enter
1103			 *	and pmap_clear_reference.  The page might be
1104			 *	absent or fictitious, but vm_page_deactivate
1105			 *	can handle that.
1106			 */
1107			vm_page_deactivate(m);
1108
1109			if (need_internal_inactive) {
1110				vm_pageout_scan_active_throttle_success++;
1111				need_internal_inactive--;
1112			}
1113done_with_activepage:
1114			if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1115
1116			        if (object != NULL) {
1117				        vm_object_unlock(object);
1118					object = NULL;
1119					vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1120				}
1121			        if (local_freeq) {
1122				        vm_page_free_list(local_freeq);
1123
1124					local_freeq = NULL;
1125					local_freed = 0;
1126				}
1127			        mutex_yield(&vm_page_queue_lock);
1128
1129				delayed_unlock = 1;
1130
1131				/*
1132				 * continue the while loop processing
1133				 * the active queue... need to hold
1134				 * the page queues lock
1135				 */
1136			}
1137		}
1138
1139
1140
1141		/**********************************************************************
1142		 * above this point we're playing with the active queue
1143		 * below this point we're playing with the throttling mechanisms
1144		 * and the inactive queue
1145		 **********************************************************************/
1146
1147done_moving_active_pages:
1148
1149		/*
1150		 *	We are done if we have met our target *and*
1151		 *	nobody is still waiting for a page.
1152		 */
1153		if (vm_page_free_count + local_freed >= vm_page_free_target) {
1154			if (object != NULL) {
1155			        vm_object_unlock(object);
1156				object = NULL;
1157			}
1158			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1159
1160			if (local_freeq) {
1161			        vm_page_free_list(local_freeq);
1162
1163				local_freeq = NULL;
1164				local_freed = 0;
1165			}
1166			/*
1167			 * inactive target still not met... keep going
1168			 * until we get the queues balanced
1169			 */
1170
1171			/*
1172			 *	Recalculate vm_page_inactivate_target.
1173			 */
1174			vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1175									  vm_page_inactive_count +
1176									  vm_page_speculative_count);
1177
1178#ifndef	CONFIG_EMBEDDED
1179			/*
1180			 * XXX: if no active pages can be reclaimed, pageout scan can be stuck trying
1181			 *      to balance the queues
1182			 */
1183			if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1184			    !queue_empty(&vm_page_queue_active))
1185			        continue;
1186#endif
1187
1188		        mutex_lock(&vm_page_queue_free_lock);
1189
1190			if ((vm_page_free_count >= vm_page_free_target) &&
1191			    (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1192
1193			        vm_page_unlock_queues();
1194
1195				thread_wakeup((event_t) &vm_pageout_garbage_collect);
1196
1197				assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1198
1199				return;
1200			}
1201			mutex_unlock(&vm_page_queue_free_lock);
1202		}
1203		/*
1204		 * Before anything, we check if we have any ripe volatile objects around.
1205		 * If so, purge the first and see what it gives us.
1206		 */
1207		assert (available_for_purge>=0);
1208		if (available_for_purge)
1209		{
1210		        if (object != NULL) {
1211			        vm_object_unlock(object);
1212				object = NULL;
1213			}
1214			vm_purgeable_object_purge_one();
1215			continue;
1216		}
1217
1218		if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1219		        /*
1220			 * try to pull pages from the aging bins
1221			 * see vm_page.h for an explanation of how
1222			 * this mechanism works
1223			 */
1224		        struct vm_speculative_age_q	*aq;
1225			mach_timespec_t	ts_fully_aged;
1226			boolean_t	can_steal = FALSE;
1227
1228			aq = &vm_page_queue_speculative[speculative_steal_index];
1229
1230			while (queue_empty(&aq->age_q)) {
1231
1232			        speculative_steal_index++;
1233
1234				if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1235				        speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1236
1237				aq = &vm_page_queue_speculative[speculative_steal_index];
1238			}
1239			if (vm_page_speculative_count > vm_page_speculative_target)
1240			        can_steal = TRUE;
1241			else {
1242			        ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1243				ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1244				                      * 1000 * NSEC_PER_USEC;
1245
1246				ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1247
1248			        clock_get_system_nanotime(&ts.tv_sec, (unsigned *)&ts.tv_nsec);
1249
1250				if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1251				        can_steal = TRUE;
1252			}
1253			if (can_steal == TRUE)
1254			        vm_page_speculate_ageit(aq);
1255		}
1256
1257		/*
1258		 * Sometimes we have to pause:
1259		 *	1) No inactive pages - nothing to do.
1260		 *	2) Flow control - default pageout queue is full
1261		 *	3) Loop control - no acceptable pages found on the inactive queue
1262		 *         within the last vm_pageout_burst_inactive_throttle iterations
1263		 */
1264		if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1265		    (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
1266		        vm_pageout_scan_empty_throttle++;
1267			msecs = vm_pageout_empty_wait;
1268			goto vm_pageout_scan_delay;
1269
1270		} else if (inactive_burst_count >=
1271			   MIN(vm_pageout_burst_inactive_throttle,
1272			       (vm_page_inactive_count +
1273				vm_page_speculative_count))) {
1274		        vm_pageout_scan_burst_throttle++;
1275			msecs = vm_pageout_burst_wait;
1276			goto vm_pageout_scan_delay;
1277
1278		} else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
1279
1280		        switch (flow_control.state) {
1281
1282			case FCS_IDLE:
1283reset_deadlock_timer:
1284			        ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1285				ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1286				clock_get_system_nanotime(&flow_control.ts.tv_sec,
1287							  (unsigned *)&flow_control.ts.tv_nsec);
1288				ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1289
1290				flow_control.state = FCS_DELAYED;
1291				msecs = vm_pageout_deadlock_wait;
1292
1293				break;
1294
1295			case FCS_DELAYED:
1296			        clock_get_system_nanotime(&ts.tv_sec,
1297							  (unsigned *)&ts.tv_nsec);
1298
1299				if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1300				        /*
1301					 * the pageout thread for the default pager is potentially
1302					 * deadlocked since the
1303					 * default pager queue has been throttled for more than the
1304					 * allowable time... we need to move some clean pages or dirty
1305					 * pages belonging to the external pagers if they aren't throttled
1306					 * vm_page_free_wanted represents the number of threads currently
1307					 * blocked waiting for pages... we'll move one page for each of
1308					 * these plus a fixed amount to break the logjam... once we're done
1309					 * moving this number of pages, we'll re-enter the FSC_DELAYED state
1310					 * with a new timeout target since we have no way of knowing
1311					 * whether we've broken the deadlock except through observation
1312					 * of the queue associated with the default pager... we need to
1313					 * stop moving pages and allow the system to run to see what
1314					 * state it settles into.
1315					 */
1316				        vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1317					vm_pageout_scan_deadlock_detected++;
1318					flow_control.state = FCS_DEADLOCK_DETECTED;
1319
1320					thread_wakeup((event_t) &vm_pageout_garbage_collect);
1321					goto consider_inactive;
1322				}
1323				/*
1324				 * just resniff instead of trying
1325				 * to compute a new delay time... we're going to be
1326				 * awakened immediately upon a laundry completion,
1327				 * so we won't wait any longer than necessary
1328				 */
1329				msecs = vm_pageout_idle_wait;
1330				break;
1331
1332			case FCS_DEADLOCK_DETECTED:
1333			        if (vm_pageout_deadlock_target)
1334				        goto consider_inactive;
1335				goto reset_deadlock_timer;
1336
1337			}
1338			vm_pageout_scan_throttle++;
1339			iq->pgo_throttled = TRUE;
1340vm_pageout_scan_delay:
1341			if (object != NULL) {
1342			        vm_object_unlock(object);
1343				object = NULL;
1344			}
1345			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1346
1347			if (local_freeq) {
1348			        vm_page_free_list(local_freeq);
1349
1350				local_freeq = NULL;
1351				local_freed = 0;
1352			}
1353#if CONFIG_EMBEDDED
1354			{
1355			int percent_avail;
1356
1357			/*
1358			 * Decide if we need to send a memory status notification.
1359			 */
1360			percent_avail =
1361				(vm_page_active_count + vm_page_inactive_count +
1362				 vm_page_speculative_count + vm_page_free_count +
1363				 (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
1364				atop_64(max_mem);
1365			if (percent_avail >= (kern_memorystatus_level + 5) ||
1366			    percent_avail <= (kern_memorystatus_level - 5)) {
1367				kern_memorystatus_level = percent_avail;
1368				thread_wakeup((event_t)&kern_memorystatus_wakeup);
1369			}
1370			}
1371#endif
1372			assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1373
1374			counter(c_vm_pageout_scan_block++);
1375
1376			vm_page_unlock_queues();
1377
1378			assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1379
1380			thread_block(THREAD_CONTINUE_NULL);
1381
1382			vm_page_lock_queues();
1383			delayed_unlock = 1;
1384
1385			iq->pgo_throttled = FALSE;
1386
1387			if (loop_count >= vm_page_inactive_count)
1388				loop_count = 0;
1389			inactive_burst_count = 0;
1390
1391			goto Restart;
1392			/*NOTREACHED*/
1393		}
1394
1395
1396		flow_control.state = FCS_IDLE;
1397consider_inactive:
1398		loop_count++;
1399		inactive_burst_count++;
1400		vm_pageout_inactive++;
1401
1402		/* Choose a victim. */
1403
1404		while (1) {
1405			m = NULL;
1406
1407			/*
1408			 * the most eligible pages are ones that were throttled because the
1409			 * pager wasn't ready at the time.  If a pager is ready now,
1410			 * see if one of these is useful.
1411			 */
1412			if (!VM_PAGE_Q_THROTTLED(iq) && !queue_empty(&vm_page_queue_throttled)) {
1413				m = (vm_page_t) queue_first(&vm_page_queue_throttled);
1414				break;
1415			}
1416
1417			/*
1418			 * The second most eligible pages are ones we paged in speculatively,
1419			 * but which have not yet been touched.
1420			 */
1421			if ( !queue_empty(&sq->age_q) ) {
1422			        m = (vm_page_t) queue_first(&sq->age_q);
1423				break;
1424			}
1425			/*
1426			 * Time for a zero-filled inactive page?
1427			 */
1428			if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1429			     queue_empty(&vm_page_queue_inactive)) {
1430				if ( !queue_empty(&vm_page_queue_zf) ) {
1431					m = (vm_page_t) queue_first(&vm_page_queue_zf);
1432					zf_run_count++;
1433					break;
1434				}
1435			}
1436			/*
1437			 * It's either a normal inactive page or nothing.
1438			 */
1439                        if ( !queue_empty(&vm_page_queue_inactive) ) {
1440                                m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1441                                zf_run_count = 0;
1442				break;
1443                        }
1444
1445                        panic("vm_pageout: no victim");
1446		}
1447
1448		assert(!m->active && (m->inactive || m->speculative || m->throttled));
1449		assert(!m->laundry);
1450		assert(m->object != kernel_object);
1451		assert(m->phys_page != vm_page_guard_addr);
1452
1453		DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1454
1455		/*
1456		 * check to see if we currently are working
1457		 * with the same object... if so, we've
1458		 * already got the lock
1459		 */
1460		if (m->object != object) {
1461		        /*
1462			 * the object associated with candidate page is
1463			 * different from the one we were just working
1464			 * with... dump the lock if we still own it
1465			 */
1466		        if (object != NULL) {
1467			        vm_object_unlock(object);
1468				object = NULL;
1469				vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1470			}
1471			/*
1472			 * Try to lock object; since we've alread got the
1473			 * page queues lock, we can only 'try' for this one.
1474			 * if the 'try' fails, we need to do a mutex_pause
1475			 * to allow the owner of the object lock a chance to
1476			 * run... otherwise, we're likely to trip over this
1477			 * object in the same state as we work our way through
1478			 * the queue... clumps of pages associated with the same
1479			 * object are fairly typical on the inactive and active queues
1480			 */
1481			if (!vm_object_lock_try_scan(m->object)) {
1482			        /*
1483				 *	Move page to end and continue.
1484				 * 	Don't re-issue ticket
1485				 */
1486			        if (m->zero_fill) {
1487				        queue_remove(&vm_page_queue_zf, m,
1488						     vm_page_t, pageq);
1489					queue_enter(&vm_page_queue_zf, m,
1490						    vm_page_t, pageq);
1491				} else if (m->speculative) {
1492				        remque(&m->pageq);
1493					m->speculative = FALSE;
1494					vm_page_speculative_count--;
1495
1496					/*
1497					 * move to the tail of the inactive queue
1498					 * to get it out of the way... the speculative
1499					 * queue is generally too small to depend
1500					 * on there being enough pages from other
1501					 * objects to make cycling it back on the
1502					 * same queue a winning proposition
1503					 */
1504					queue_enter(&vm_page_queue_inactive, m,
1505						    vm_page_t, pageq);
1506					m->inactive = TRUE;
1507					vm_page_inactive_count++;
1508					token_new_pagecount++;
1509				}  else if (m->throttled) {
1510					queue_remove(&vm_page_queue_throttled, m,
1511						     vm_page_t, pageq);
1512					m->throttled = FALSE;
1513					vm_page_throttled_count--;
1514
1515					/*
1516					 * not throttled any more, so can stick
1517					 * it on the inactive queue.
1518					 */
1519					queue_enter(&vm_page_queue_inactive, m,
1520						    vm_page_t, pageq);
1521					m->inactive = TRUE;
1522					vm_page_inactive_count++;
1523					token_new_pagecount++;
1524				} else {
1525				        queue_remove(&vm_page_queue_inactive, m,
1526						     vm_page_t, pageq);
1527#if MACH_ASSERT
1528					vm_page_inactive_count--;	/* balance for purgeable queue asserts */
1529#endif
1530					vm_purgeable_q_advance_all();
1531
1532					queue_enter(&vm_page_queue_inactive, m,
1533						    vm_page_t, pageq);
1534#if MACH_ASSERT
1535					vm_page_inactive_count++;	/* balance for purgeable queue asserts */
1536#endif
1537					token_new_pagecount++;
1538				}
1539				pmap_clear_reference(m->phys_page);
1540				m->reference = FALSE;
1541
1542				vm_pageout_inactive_nolock++;
1543
1544				if ( !queue_empty(&sq->age_q) )
1545				        m = (vm_page_t) queue_first(&sq->age_q);
1546				else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1547					  queue_empty(&vm_page_queue_inactive)) {
1548				        if ( !queue_empty(&vm_page_queue_zf) )
1549					        m = (vm_page_t) queue_first(&vm_page_queue_zf);
1550				} else if ( !queue_empty(&vm_page_queue_inactive) ) {
1551				        m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1552				}
1553				/*
1554				 * this is the next object we're going to be interested in
1555				 * try to make sure its available after the mutex_yield
1556				 * returns control
1557				 */
1558				vm_pageout_scan_wants_object = m->object;
1559
1560				/*
1561				 * force us to dump any collected free pages
1562				 * and to pause before moving on
1563				 */
1564				try_failed = TRUE;
1565
1566				goto done_with_inactivepage;
1567			}
1568			object = m->object;
1569			vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1570
1571			try_failed = FALSE;
1572		}
1573
1574		/*
1575		 *	Paging out pages of external objects which
1576		 *	are currently being created must be avoided.
1577		 *	The pager may claim for memory, thus leading to a
1578		 *	possible dead lock between it and the pageout thread,
1579		 *	if such pages are finally chosen. The remaining assumption
1580		 *	is that there will finally be enough available pages in the
1581		 *	inactive pool to page out in order to satisfy all memory
1582		 *	claimed by the thread which concurrently creates the pager.
1583		 */
1584		if (!object->pager_initialized && object->pager_created) {
1585			/*
1586			 *	Move page to end and continue, hoping that
1587			 *	there will be enough other inactive pages to
1588			 *	page out so that the thread which currently
1589			 *	initializes the pager will succeed.
1590			 *	Don't re-grant the ticket, the page should
1591			 *	pulled from the queue and paged out whenever
1592			 *	one of its logically adjacent fellows is
1593			 *	targeted.
1594			 *
1595			 *	Pages found on the speculative list can never be
1596			 *	in this state... they always have a pager associated
1597			 *	with them.
1598			 */
1599		        assert(!m->speculative);
1600
1601			if (m->zero_fill) {
1602				queue_remove(&vm_page_queue_zf, m,
1603					     vm_page_t, pageq);
1604				queue_enter(&vm_page_queue_zf, m,
1605					    vm_page_t, pageq);
1606			} else {
1607				queue_remove(&vm_page_queue_inactive, m,
1608					     vm_page_t, pageq);
1609#if MACH_ASSERT
1610				vm_page_inactive_count--;	/* balance for purgeable queue asserts */
1611#endif
1612				vm_purgeable_q_advance_all();
1613
1614				queue_enter(&vm_page_queue_inactive, m,
1615					    vm_page_t, pageq);
1616#if MACH_ASSERT
1617				vm_page_inactive_count++;	/* balance for purgeable queue asserts */
1618#endif
1619				token_new_pagecount++;
1620			}
1621			vm_pageout_inactive_avoid++;
1622
1623			goto done_with_inactivepage;
1624		}
1625		/*
1626		 *	Remove the page from its list.
1627		 */
1628		if (m->speculative) {
1629			remque(&m->pageq);
1630			m->speculative = FALSE;
1631			vm_page_speculative_count--;
1632		} else if (m->throttled) {
1633			queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
1634			m->throttled = FALSE;
1635			vm_page_throttled_count--;
1636		} else {
1637			if (m->zero_fill) {
1638				queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1639				vm_zf_queue_count--;
1640			} else {
1641			        queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1642			}
1643			m->inactive = FALSE;
1644			if (!m->fictitious)
1645				vm_page_inactive_count--;
1646				vm_purgeable_q_advance_all();
1647		}
1648
1649		/* If the object is empty, the page must be reclaimed even if dirty or used. */
1650		/* If the page belongs to a volatile object, we stick it back on. */
1651		if (object->copy == VM_OBJECT_NULL) {
1652			if(object->purgable == VM_PURGABLE_EMPTY && !m->cleaning) {
1653				m->busy = TRUE;
1654				if (m->pmapped == TRUE) {
1655					/* unmap the page */
1656					refmod_state = pmap_disconnect(m->phys_page);
1657					if (refmod_state & VM_MEM_MODIFIED) {
1658						m->dirty = TRUE;
1659					}
1660				}
1661				if (m->dirty || m->precious) {
1662					/* we saved the cost of cleaning this page ! */
1663					vm_page_purged_count++;
1664				}
1665				goto reclaim_page;
1666			}
1667			if (object->purgable == VM_PURGABLE_VOLATILE) {
1668				/* if it's wired, we can't put it on our queue */
1669				assert(m->wire_count == 0);
1670				/* just stick it back on! */
1671				goto reactivate_page;
1672			}
1673		}
1674		m->pageq.next = NULL;
1675		m->pageq.prev = NULL;
1676
1677		if ( !m->fictitious && catch_up_count)
1678		        catch_up_count--;
1679
1680		/*
1681		 * ENCRYPTED SWAP:
1682		 * if this page has already been picked up as part of a
1683		 * page-out cluster, it will be busy because it is being
1684		 * encrypted (see vm_object_upl_request()).  But we still
1685		 * want to demote it from "clean-in-place" (aka "adjacent")
1686		 * to "clean-and-free" (aka "target"), so let's ignore its
1687		 * "busy" bit here and proceed to check for "cleaning" a
1688		 * little bit below...
1689		 */
1690		if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1691			/*
1692			 *	Somebody is already playing with this page.
1693			 *	Leave it off the pageout queues.
1694			 *
1695			 */
1696			vm_pageout_inactive_busy++;
1697
1698			goto done_with_inactivepage;
1699		}
1700
1701		/*
1702		 *	If it's absent or in error, we can reclaim the page.
1703		 */
1704
1705		if (m->absent || m->error) {
1706			vm_pageout_inactive_absent++;
1707reclaim_page:
1708			if (vm_pageout_deadlock_target) {
1709				vm_pageout_scan_inactive_throttle_success++;
1710			        vm_pageout_deadlock_target--;
1711			}
1712
1713			DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1714
1715			if (m->object->internal) {
1716				DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1717			} else {
1718				DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1719			}
1720
1721			vm_page_free_prepare(m);
1722
1723			assert(m->pageq.next == NULL &&
1724			       m->pageq.prev == NULL);
1725			m->pageq.next = (queue_entry_t)local_freeq;
1726			local_freeq = m;
1727			local_freed++;
1728
1729			inactive_burst_count = 0;
1730
1731			goto done_with_inactivepage;
1732		}
1733
1734		assert(!m->private);
1735		assert(!m->fictitious);
1736
1737		/*
1738		 *	If already cleaning this page in place, convert from
1739		 *	"adjacent" to "target". We can leave the page mapped,
1740		 *	and vm_pageout_object_terminate will determine whether
1741		 *	to free or reactivate.
1742		 */
1743
1744		if (m->cleaning) {
1745			m->busy = TRUE;
1746			m->pageout = TRUE;
1747			m->dump_cleaning = TRUE;
1748			vm_page_wire(m);
1749
1750			CLUSTER_STAT(vm_pageout_cluster_conversions++);
1751
1752			inactive_burst_count = 0;
1753
1754			goto done_with_inactivepage;
1755		}
1756
1757		/*
1758		 *	If it's being used, reactivate.
1759		 *	(Fictitious pages are either busy or absent.)
1760		 *	First, update the reference and dirty bits
1761		 *	to make sure the page is unreferenced.
1762		 */
1763		refmod_state = -1;
1764
1765		if (m->reference == FALSE && m->pmapped == TRUE) {
1766		        refmod_state = pmap_get_refmod(m->phys_page);
1767
1768		        if (refmod_state & VM_MEM_REFERENCED)
1769			        m->reference = TRUE;
1770		        if (refmod_state & VM_MEM_MODIFIED)
1771			        m->dirty = TRUE;
1772		}
1773		if (m->reference && !m->no_cache) {
1774			/*
1775			 * The page we pulled off the inactive list has
1776			 * been referenced.  It is possible for other
1777			 * processors to be touching pages faster than we
1778			 * can clear the referenced bit and traverse the
1779			 * inactive queue, so we limit the number of
1780			 * reactivations.
1781			 */
1782			if (++reactivated_this_call >= reactivate_limit) {
1783				vm_pageout_reactivation_limit_exceeded++;
1784			} else if (catch_up_count) {
1785				vm_pageout_catch_ups++;
1786			} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
1787				vm_pageout_inactive_force_reclaim++;
1788			} else {
1789			        /*
1790				 * The page was being used, so put back on active list.
1791				 */
1792reactivate_page:
1793				vm_page_activate(m);
1794				VM_STAT_INCR(reactivations);
1795
1796				vm_pageout_inactive_used++;
1797				inactive_burst_count = 0;
1798
1799                                goto done_with_inactivepage;
1800			}
1801			/*
1802			 * Make sure we call pmap_get_refmod() if it
1803			 * wasn't already called just above, to update
1804			 * the dirty bit.
1805			 */
1806			if ((refmod_state == -1) && !m->dirty && m->pmapped) {
1807				refmod_state = pmap_get_refmod(m->phys_page);
1808				if (refmod_state & VM_MEM_MODIFIED)
1809					m->dirty = TRUE;
1810			}
1811			forced_reclaim = TRUE;
1812		} else {
1813			forced_reclaim = FALSE;
1814		}
1815
1816                XPR(XPR_VM_PAGEOUT,
1817                "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1818                (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1819
1820		/*
1821		 * we've got a candidate page to steal...
1822		 *
1823		 * m->dirty is up to date courtesy of the
1824		 * preceding check for m->reference... if
1825		 * we get here, then m->reference had to be
1826		 * FALSE (or possibly "reactivate_limit" was
1827                 * exceeded), but in either case we called
1828                 * pmap_get_refmod() and updated both
1829                 * m->reference and m->dirty
1830		 *
1831		 * if it's dirty or precious we need to
1832		 * see if the target queue is throtttled
1833		 * it if is, we need to skip over it by moving it back
1834		 * to the end of the inactive queue
1835		 */
1836		inactive_throttled = FALSE;
1837
1838		if (m->dirty || m->precious) {
1839		        if (object->internal) {
1840				if (VM_PAGE_Q_THROTTLED(iq))
1841				        inactive_throttled = TRUE;
1842			} else if (VM_PAGE_Q_THROTTLED(eq)) {
1843				inactive_throttled = TRUE;
1844			}
1845		}
1846		if (inactive_throttled == TRUE) {
1847throttle_inactive:
1848			if (!IP_VALID(memory_manager_default) &&
1849				object->internal &&
1850				(object->purgable == VM_PURGABLE_DENY ||
1851				 object->purgable == VM_PURGABLE_NONVOLATILE ||
1852				 object->purgable == VM_PURGABLE_VOLATILE )) {
1853			        queue_enter(&vm_page_queue_throttled, m,
1854					    vm_page_t, pageq);
1855				m->throttled = TRUE;
1856				vm_page_throttled_count++;
1857			} else {
1858			        if (m->zero_fill) {
1859					queue_enter(&vm_page_queue_zf, m,
1860						    vm_page_t, pageq);
1861					vm_zf_queue_count++;
1862				} else
1863					queue_enter(&vm_page_queue_inactive, m,
1864						    vm_page_t, pageq);
1865				m->inactive = TRUE;
1866				if (!m->fictitious) {
1867				        vm_page_inactive_count++;
1868					token_new_pagecount++;
1869				}
1870			}
1871			vm_pageout_scan_inactive_throttled++;
1872			goto done_with_inactivepage;
1873		}
1874
1875		/*
1876		 * we've got a page that we can steal...
1877		 * eliminate all mappings and make sure
1878		 * we have the up-to-date modified state
1879		 * first take the page BUSY, so that no new
1880		 * mappings can be made
1881		 */
1882		m->busy = TRUE;
1883
1884		/*
1885		 * if we need to do a pmap_disconnect then we
1886		 * need to re-evaluate m->dirty since the pmap_disconnect
1887		 * provides the true state atomically... the
1888		 * page was still mapped up to the pmap_disconnect
1889		 * and may have been dirtied at the last microsecond
1890		 *
1891		 * we also check for the page being referenced 'late'
1892		 * if it was, we first need to do a WAKEUP_DONE on it
1893		 * since we already set m->busy = TRUE, before
1894		 * going off to reactivate it
1895		 *
1896		 * Note that if 'pmapped' is FALSE then the page is not
1897		 * and has not been in any map, so there is no point calling
1898		 * pmap_disconnect().  m->dirty and/or m->reference could
1899		 * have been set in anticipation of likely usage of the page.
1900		 */
1901		if (m->pmapped == TRUE) {
1902		        refmod_state = pmap_disconnect(m->phys_page);
1903
1904		        if (refmod_state & VM_MEM_MODIFIED)
1905			        m->dirty = TRUE;
1906		        if (refmod_state & VM_MEM_REFERENCED) {
1907
1908				/* If m->reference is already set, this page must have
1909				 * already failed the reactivate_limit test, so don't
1910				 * bump the counts twice.
1911				 */
1912				if ( ! m->reference ) {
1913					m->reference = TRUE;
1914					if (forced_reclaim ||
1915					    ++reactivated_this_call >= reactivate_limit)
1916						vm_pageout_reactivation_limit_exceeded++;
1917					else {
1918						PAGE_WAKEUP_DONE(m);
1919						goto reactivate_page;
1920					}
1921				}
1922			}
1923		}
1924		/*
1925		 * reset our count of pages that have been reclaimed
1926		 * since the last page was 'stolen'
1927		 */
1928		inactive_reclaim_run = 0;
1929
1930		/*
1931		 *	If it's clean and not precious, we can free the page.
1932		 */
1933		if (!m->dirty && !m->precious) {
1934			vm_pageout_inactive_clean++;
1935			goto reclaim_page;
1936		}
1937
1938		/*
1939		 * The page may have been dirtied since the last check
1940		 * for a throttled target queue (which may have been skipped
1941		 * if the page was clean then).  With the dirty page
1942		 * disconnected here, we can make one final check.
1943		 */
1944		{
1945			boolean_t disconnect_throttled = FALSE;
1946			if (object->internal) {
1947				if (VM_PAGE_Q_THROTTLED(iq))
1948					disconnect_throttled = TRUE;
1949			} else if (VM_PAGE_Q_THROTTLED(eq)) {
1950				disconnect_throttled = TRUE;
1951			}
1952
1953			if (disconnect_throttled == TRUE) {
1954				PAGE_WAKEUP_DONE(m);
1955				goto throttle_inactive;
1956			}
1957		}
1958
1959		vm_pageout_cluster(m);
1960
1961		vm_pageout_inactive_dirty++;
1962
1963		inactive_burst_count = 0;
1964
1965done_with_inactivepage:
1966		if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1967
1968		        if (object != NULL) {
1969			        vm_object_unlock(object);
1970				object = NULL;
1971				vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1972			}
1973		        if (local_freeq) {
1974			        vm_page_free_list(local_freeq);
1975
1976				local_freeq = NULL;
1977				local_freed = 0;
1978			}
1979			mutex_yield(&vm_page_queue_lock);
1980
1981			delayed_unlock = 1;
1982		}
1983		/*
1984		 * back to top of pageout scan loop
1985		 */
1986	}
1987}
1988
1989
1990int vm_page_free_count_init;
1991
1992void
1993vm_page_free_reserve(
1994	int pages)
1995{
1996	int		free_after_reserve;
1997
1998	vm_page_free_reserved += pages;
1999
2000	free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
2001
2002	vm_page_free_min = vm_page_free_reserved +
2003		VM_PAGE_FREE_MIN(free_after_reserve);
2004
2005	if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
2006	        vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
2007
2008	vm_page_free_target = vm_page_free_reserved +
2009		VM_PAGE_FREE_TARGET(free_after_reserve);
2010
2011	if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
2012	        vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
2013
2014	if (vm_page_free_target < vm_page_free_min + 5)
2015		vm_page_free_target = vm_page_free_min + 5;
2016
2017}
2018
2019/*
2020 *	vm_pageout is the high level pageout daemon.
2021 */
2022
2023void
2024vm_pageout_continue(void)
2025{
2026	DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
2027	vm_pageout_scan_event_counter++;
2028	vm_pageout_scan();
2029	/* we hold vm_page_queue_free_lock now */
2030	assert(vm_page_free_wanted == 0);
2031	assert(vm_page_free_wanted_privileged == 0);
2032	assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2033	mutex_unlock(&vm_page_queue_free_lock);
2034
2035	counter(c_vm_pageout_block++);
2036	thread_block((thread_continue_t)vm_pageout_continue);
2037	/*NOTREACHED*/
2038}
2039
2040
2041/*
2042 * must be called with the
2043 * queues and object locks held
2044 */
2045static void
2046vm_pageout_queue_steal(vm_page_t m)
2047{
2048        struct vm_pageout_queue *q;
2049
2050	if (m->object->internal == TRUE)
2051	        q = &vm_pageout_queue_internal;
2052	else
2053	        q = &vm_pageout_queue_external;
2054
2055	m->laundry = FALSE;
2056	m->pageout_queue = FALSE;
2057	queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
2058
2059	m->pageq.next = NULL;
2060	m->pageq.prev = NULL;
2061
2062	vm_object_paging_end(m->object);
2063
2064	q->pgo_laundry--;
2065}
2066
2067
2068#ifdef FAKE_DEADLOCK
2069
2070#define FAKE_COUNT	5000
2071
2072int internal_count = 0;
2073int fake_deadlock = 0;
2074
2075#endif
2076
2077static void
2078vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2079{
2080	vm_page_t	m = NULL;
2081	vm_object_t	object;
2082	boolean_t	need_wakeup;
2083	memory_object_t	pager;
2084	thread_t	self = current_thread();
2085
2086	if ((vm_pageout_internal_iothread != THREAD_NULL)
2087	    && (self == vm_pageout_external_iothread )
2088	    && (self->options & TH_OPT_VMPRIV))
2089		self->options &= ~TH_OPT_VMPRIV;
2090
2091	vm_page_lockspin_queues();
2092
2093        while ( !queue_empty(&q->pgo_pending) ) {
2094
2095		   q->pgo_busy = TRUE;
2096		   queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2097		   m->pageout_queue = FALSE;
2098		   vm_page_unlock_queues();
2099
2100		   m->pageq.next = NULL;
2101		   m->pageq.prev = NULL;
2102#ifdef FAKE_DEADLOCK
2103		   if (q == &vm_pageout_queue_internal) {
2104		           vm_offset_t addr;
2105			   int	pg_count;
2106
2107			   internal_count++;
2108
2109			   if ((internal_count == FAKE_COUNT)) {
2110
2111				   pg_count = vm_page_free_count + vm_page_free_reserved;
2112
2113			           if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2114				           kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2115				   }
2116				   internal_count = 0;
2117				   fake_deadlock++;
2118			   }
2119		   }
2120#endif
2121		   object = m->object;
2122
2123		   vm_object_lock(object);
2124
2125		   if (!object->pager_initialized) {
2126
2127			   /*
2128			    *	If there is no memory object for the page, create
2129			    *	one and hand it to the default pager.
2130			    */
2131
2132			   if (!object->pager_initialized)
2133			           vm_object_collapse(object,
2134						      (vm_object_offset_t) 0,
2135						      TRUE);
2136			   if (!object->pager_initialized)
2137			           vm_object_pager_create(object);
2138			   if (!object->pager_initialized) {
2139			           /*
2140				    *	Still no pager for the object.
2141				    *	Reactivate the page.
2142				    *
2143				    *	Should only happen if there is no
2144				    *	default pager.
2145				    */
2146			           m->list_req_pending = FALSE;
2147				   m->cleaning = FALSE;
2148				   m->pageout = FALSE;
2149
2150			           vm_page_lockspin_queues();
2151				   vm_page_unwire(m);
2152				   vm_pageout_throttle_up(m);
2153				   vm_pageout_dirty_no_pager++;
2154				   vm_page_activate(m);
2155				   vm_page_unlock_queues();
2156
2157				   /*
2158				    *	And we are done with it.
2159				    */
2160				   PAGE_WAKEUP_DONE(m);
2161
2162			           vm_object_paging_end(object);
2163				   vm_object_unlock(object);
2164
2165				   vm_page_lockspin_queues();
2166				   continue;
2167			   }
2168		   }
2169		   pager = object->pager;
2170	           if (pager == MEMORY_OBJECT_NULL) {
2171		           /*
2172			    * This pager has been destroyed by either
2173			    * memory_object_destroy or vm_object_destroy, and
2174			    * so there is nowhere for the page to go.
2175			    * Just free the page... VM_PAGE_FREE takes
2176			    * care of cleaning up all the state...
2177			    * including doing the vm_pageout_throttle_up
2178			    */
2179
2180		           VM_PAGE_FREE(m);
2181
2182			   vm_object_paging_end(object);
2183			   vm_object_unlock(object);
2184
2185			   vm_page_lockspin_queues();
2186			   continue;
2187		   }
2188		   vm_object_unlock(object);
2189		   /*
2190		    * we expect the paging_in_progress reference to have
2191		    * already been taken on the object before it was added
2192		    * to the appropriate pageout I/O queue... this will
2193		    * keep the object from being terminated and/or the
2194		    * paging_offset from changing until the I/O has
2195		    * completed... therefore no need to lock the object to
2196		    * pull the paging_offset from it.
2197		    *
2198		    * Send the data to the pager.
2199		    * any pageout clustering happens there
2200		    */
2201		   memory_object_data_return(pager,
2202					     m->offset + object->paging_offset,
2203					     PAGE_SIZE,
2204					     NULL,
2205					     NULL,
2206					     FALSE,
2207					     FALSE,
2208					     0);
2209
2210		   vm_object_lock(object);
2211		   vm_object_paging_end(object);
2212		   vm_object_unlock(object);
2213
2214		   vm_page_lockspin_queues();
2215	}
2216	assert_wait((event_t) q, THREAD_UNINT);
2217
2218
2219	if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2220	        q->pgo_throttled = FALSE;
2221		need_wakeup = TRUE;
2222	} else
2223		need_wakeup = FALSE;
2224
2225	q->pgo_busy = FALSE;
2226	q->pgo_idle = TRUE;
2227	vm_page_unlock_queues();
2228
2229	if (need_wakeup == TRUE)
2230	        thread_wakeup((event_t) &q->pgo_laundry);
2231
2232	thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2233	/*NOTREACHED*/
2234}
2235
2236
2237static void
2238vm_pageout_iothread_external(void)
2239{
2240	thread_t	self = current_thread();
2241
2242	self->options |= TH_OPT_VMPRIV;
2243
2244	vm_pageout_iothread_continue(&vm_pageout_queue_external);
2245	/*NOTREACHED*/
2246}
2247
2248
2249static void
2250vm_pageout_iothread_internal(void)
2251{
2252	thread_t	self = current_thread();
2253
2254	self->options |= TH_OPT_VMPRIV;
2255
2256	vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2257	/*NOTREACHED*/
2258}
2259
2260static void
2261vm_pageout_garbage_collect(int collect)
2262{
2263	if (collect) {
2264		stack_collect();
2265
2266		/*
2267		 * consider_zone_gc should be last, because the other operations
2268		 * might return memory to zones.
2269		 */
2270		consider_machine_collect();
2271		consider_zone_gc();
2272
2273		consider_machine_adjust();
2274	}
2275
2276	assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2277
2278	thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2279	/*NOTREACHED*/
2280}
2281
2282
2283
2284void
2285vm_pageout(void)
2286{
2287	thread_t	self = current_thread();
2288	thread_t	thread;
2289	kern_return_t	result;
2290	spl_t		s;
2291
2292	/*
2293	 * Set thread privileges.
2294	 */
2295	s = splsched();
2296	thread_lock(self);
2297	self->priority = BASEPRI_PREEMPT - 1;
2298	set_sched_pri(self, self->priority);
2299	thread_unlock(self);
2300
2301	if (!self->reserved_stack)
2302		self->reserved_stack = self->kernel_stack;
2303
2304	splx(s);
2305
2306	/*
2307	 *	Initialize some paging parameters.
2308	 */
2309
2310	if (vm_pageout_idle_wait == 0)
2311		vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2312
2313	if (vm_pageout_burst_wait == 0)
2314		vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2315
2316	if (vm_pageout_empty_wait == 0)
2317		vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2318
2319	if (vm_pageout_deadlock_wait == 0)
2320		vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2321
2322	if (vm_pageout_deadlock_relief == 0)
2323		vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2324
2325	if (vm_pageout_inactive_relief == 0)
2326		vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2327
2328	if (vm_pageout_burst_active_throttle == 0)
2329	        vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2330
2331	if (vm_pageout_burst_inactive_throttle == 0)
2332	        vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2333
2334	/*
2335	 * Set kernel task to low backing store privileged
2336	 * status
2337	 */
2338	task_lock(kernel_task);
2339	kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2340	task_unlock(kernel_task);
2341
2342	vm_page_free_count_init = vm_page_free_count;
2343
2344	/*
2345	 * even if we've already called vm_page_free_reserve
2346	 * call it again here to insure that the targets are
2347	 * accurately calculated (it uses vm_page_free_count_init)
2348	 * calling it with an arg of 0 will not change the reserve
2349	 * but will re-calculate free_min and free_target
2350	 */
2351	if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2352		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2353	} else
2354		vm_page_free_reserve(0);
2355
2356
2357	queue_init(&vm_pageout_queue_external.pgo_pending);
2358	vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2359	vm_pageout_queue_external.pgo_laundry = 0;
2360	vm_pageout_queue_external.pgo_idle = FALSE;
2361	vm_pageout_queue_external.pgo_busy = FALSE;
2362	vm_pageout_queue_external.pgo_throttled = FALSE;
2363
2364	queue_init(&vm_pageout_queue_internal.pgo_pending);
2365	vm_pageout_queue_internal.pgo_maxlaundry = 0;
2366	vm_pageout_queue_internal.pgo_laundry = 0;
2367	vm_pageout_queue_internal.pgo_idle = FALSE;
2368	vm_pageout_queue_internal.pgo_busy = FALSE;
2369	vm_pageout_queue_internal.pgo_throttled = FALSE;
2370
2371
2372	/* internal pageout thread started when default pager registered first time */
2373	/* external pageout and garbage collection threads started here */
2374
2375	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2376					      BASEPRI_PREEMPT - 1,
2377					      &vm_pageout_external_iothread);
2378	if (result != KERN_SUCCESS)
2379		panic("vm_pageout_iothread_external: create failed");
2380
2381	thread_deallocate(vm_pageout_external_iothread);
2382
2383	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2384					      MINPRI_KERNEL,
2385					      &thread);
2386	if (result != KERN_SUCCESS)
2387		panic("vm_pageout_garbage_collect: create failed");
2388
2389	thread_deallocate(thread);
2390
2391	vm_object_reaper_init();
2392
2393
2394	vm_pageout_continue();
2395
2396	/*
2397	 * Unreached code!
2398	 *
2399	 * The vm_pageout_continue() call above never returns, so the code below is never
2400	 * executed.  We take advantage of this to declare several DTrace VM related probe
2401	 * points that our kernel doesn't have an analog for.  These are probe points that
2402	 * exist in Solaris and are in the DTrace documentation, so people may have written
2403	 * scripts that use them.  Declaring the probe points here means their scripts will
2404	 * compile and execute which we want for portability of the scripts, but since this
2405	 * section of code is never reached, the probe points will simply never fire.  Yes,
2406	 * this is basically a hack.  The problem is the DTrace probe points were chosen with
2407	 * Solaris specific VM events in mind, not portability to different VM implementations.
2408	 */
2409
2410	DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2411	DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2412	DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2413	DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2414	DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2415	DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2416	DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
2417	/*NOTREACHED*/
2418}
2419
2420kern_return_t
2421vm_pageout_internal_start(void)
2422{
2423	kern_return_t result;
2424
2425	vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2426	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2427	if (result == KERN_SUCCESS)
2428		thread_deallocate(vm_pageout_internal_iothread);
2429	return result;
2430}
2431
2432#define UPL_DELAYED_UNLOCK_LIMIT  (MAX_UPL_TRANSFER / 2)
2433
2434static upl_t
2435upl_create(int type, int flags, upl_size_t size)
2436{
2437	upl_t	upl;
2438	int	page_field_size = 0;
2439	int	upl_flags = 0;
2440	int	upl_size  = sizeof(struct upl);
2441
2442	if (type & UPL_CREATE_LITE) {
2443		page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2444		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2445
2446		upl_flags |= UPL_LITE;
2447	}
2448	if (type & UPL_CREATE_INTERNAL) {
2449		upl_size += sizeof(struct upl_page_info) * (size/PAGE_SIZE);
2450
2451		upl_flags |= UPL_INTERNAL;
2452	}
2453	upl = (upl_t)kalloc(upl_size + page_field_size);
2454
2455	if (page_field_size)
2456	        bzero((char *)upl + upl_size, page_field_size);
2457
2458	upl->flags = upl_flags | flags;
2459	upl->src_object = NULL;
2460	upl->kaddr = (vm_offset_t)0;
2461	upl->size = 0;
2462	upl->map_object = NULL;
2463	upl->ref_count = 1;
2464	upl->highest_page = 0;
2465	upl_lock_init(upl);
2466#ifdef UPL_DEBUG
2467	upl->ubc_alias1 = 0;
2468	upl->ubc_alias2 = 0;
2469#endif /* UPL_DEBUG */
2470	return(upl);
2471}
2472
2473static void
2474upl_destroy(upl_t upl)
2475{
2476	int	page_field_size;  /* bit field in word size buf */
2477        int	size;
2478
2479#ifdef UPL_DEBUG
2480	{
2481		vm_object_t	object;
2482
2483		if (upl->flags & UPL_SHADOWED) {
2484			object = upl->map_object->shadow;
2485		} else {
2486			object = upl->map_object;
2487		}
2488		vm_object_lock(object);
2489		queue_remove(&object->uplq, upl, upl_t, uplq);
2490		vm_object_unlock(object);
2491	}
2492#endif /* UPL_DEBUG */
2493	/*
2494	 * drop a reference on the map_object whether or
2495	 * not a pageout object is inserted
2496	 */
2497	if (upl->flags & UPL_SHADOWED)
2498		vm_object_deallocate(upl->map_object);
2499
2500        if (upl->flags & UPL_DEVICE_MEMORY)
2501	        size = PAGE_SIZE;
2502	else
2503	        size = upl->size;
2504	page_field_size = 0;
2505
2506	if (upl->flags & UPL_LITE) {
2507		page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2508		page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2509	}
2510	if (upl->flags & UPL_INTERNAL) {
2511		kfree(upl,
2512		      sizeof(struct upl) +
2513		      (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
2514		      + page_field_size);
2515	} else {
2516		kfree(upl, sizeof(struct upl) + page_field_size);
2517	}
2518}
2519
2520void uc_upl_dealloc(upl_t upl);
2521__private_extern__ void
2522uc_upl_dealloc(upl_t upl)
2523{
2524	if (--upl->ref_count == 0)
2525		upl_destroy(upl);
2526}
2527
2528void
2529upl_deallocate(upl_t upl)
2530{
2531	if (--upl->ref_count == 0)
2532		upl_destroy(upl);
2533}
2534
2535/*
2536 * Statistics about UPL enforcement of copy-on-write obligations.
2537 */
2538unsigned long upl_cow = 0;
2539unsigned long upl_cow_again = 0;
2540unsigned long upl_cow_contiguous = 0;
2541unsigned long upl_cow_pages = 0;
2542unsigned long upl_cow_again_pages = 0;
2543unsigned long upl_cow_contiguous_pages = 0;
2544
2545/*
2546 *	Routine:	vm_object_upl_request
2547 *	Purpose:
2548 *		Cause the population of a portion of a vm_object.
2549 *		Depending on the nature of the request, the pages
2550 *		returned may be contain valid data or be uninitialized.
2551 *		A page list structure, listing the physical pages
2552 *		will be returned upon request.
2553 *		This function is called by the file system or any other
2554 *		supplier of backing store to a pager.
2555 *		IMPORTANT NOTE: The caller must still respect the relationship
2556 *		between the vm_object and its backing memory object.  The
2557 *		caller MUST NOT substitute changes in the backing file
2558 *		without first doing a memory_object_lock_request on the
2559 *		target range unless it is know that the pages are not
2560 *		shared with another entity at the pager level.
2561 *		Copy_in_to:
2562 *			if a page list structure is present
2563 *			return the mapped physical pages, where a
2564 *			page is not present, return a non-initialized
2565 *			one.  If the no_sync bit is turned on, don't
2566 *			call the pager unlock to synchronize with other
2567 *			possible copies of the page. Leave pages busy
2568 *			in the original object, if a page list structure
2569 *			was specified.  When a commit of the page list
2570 *			pages is done, the dirty bit will be set for each one.
2571 *		Copy_out_from:
2572 *			If a page list structure is present, return
2573 *			all mapped pages.  Where a page does not exist
2574 *			map a zero filled one. Leave pages busy in
2575 *			the original object.  If a page list structure
2576 *			is not specified, this call is a no-op.
2577 *
2578 *		Note:  access of default pager objects has a rather interesting
2579 *		twist.  The caller of this routine, presumably the file system
2580 *		page cache handling code, will never actually make a request
2581 *		against a default pager backed object.  Only the default
2582 *		pager will make requests on backing store related vm_objects
2583 *		In this way the default pager can maintain the relationship
2584 *		between backing store files (abstract memory objects) and
2585 *		the vm_objects (cache objects), they support.
2586 *
2587 */
2588
2589__private_extern__ kern_return_t
2590vm_object_upl_request(
2591	vm_object_t		object,
2592	vm_object_offset_t	offset,
2593	upl_size_t		size,
2594	upl_t			*upl_ptr,
2595	upl_page_info_array_t	user_page_list,
2596	unsigned int		*page_list_count,
2597	int			cntrl_flags)
2598{
2599	vm_page_t		dst_page = VM_PAGE_NULL;
2600	vm_object_offset_t	dst_offset;
2601	upl_size_t		xfer_size;
2602	boolean_t		dirty;
2603	boolean_t		hw_dirty;
2604	upl_t			upl = NULL;
2605	unsigned int		entry;
2606#if MACH_CLUSTER_STATS
2607	boolean_t		encountered_lrp = FALSE;
2608#endif
2609	vm_page_t		alias_page = NULL;
2610        int			refmod_state = 0;
2611	wpl_array_t 		lite_list = NULL;
2612	vm_object_t		last_copy_object;
2613	int                     delayed_unlock = 0;
2614	int			j;
2615
2616	if (cntrl_flags & ~UPL_VALID_FLAGS) {
2617		/*
2618		 * For forward compatibility's sake,
2619		 * reject any unknown flag.
2620		 */
2621		return KERN_INVALID_VALUE;
2622	}
2623	if ( (!object->internal) && (object->paging_offset != 0) )
2624		panic("vm_object_upl_request: external object with non-zero paging offset\n");
2625	if (object->phys_contiguous)
2626	        panic("vm_object_upl_request: contiguous object specified\n");
2627
2628
2629	if ((size / PAGE_SIZE) > MAX_UPL_SIZE)
2630		size = MAX_UPL_SIZE * PAGE_SIZE;
2631
2632	if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
2633	        *page_list_count = MAX_UPL_SIZE;
2634
2635	if (cntrl_flags & UPL_SET_INTERNAL) {
2636	        if (cntrl_flags & UPL_SET_LITE) {
2637
2638			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
2639
2640			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2641			lite_list = (wpl_array_t)
2642					(((uintptr_t)user_page_list) +
2643					((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
2644		} else {
2645		        upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
2646
2647			user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2648		}
2649	} else {
2650	        if (cntrl_flags & UPL_SET_LITE) {
2651
2652			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
2653
2654			lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
2655		} else {
2656		        upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
2657		}
2658	}
2659	*upl_ptr = upl;
2660
2661	if (user_page_list)
2662	        user_page_list[0].device = FALSE;
2663
2664	if (cntrl_flags & UPL_SET_LITE) {
2665	        upl->map_object = object;
2666	} else {
2667	        upl->map_object = vm_object_allocate(size);
2668		/*
2669		 * No neeed to lock the new object: nobody else knows
2670		 * about it yet, so it's all ours so far.
2671		 */
2672		upl->map_object->shadow = object;
2673		upl->map_object->pageout = TRUE;
2674		upl->map_object->can_persist = FALSE;
2675		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2676		upl->map_object->shadow_offset = offset;
2677		upl->map_object->wimg_bits = object->wimg_bits;
2678
2679		VM_PAGE_GRAB_FICTITIOUS(alias_page);
2680
2681		upl->flags |= UPL_SHADOWED;
2682	}
2683	/*
2684	 * ENCRYPTED SWAP:
2685	 * Just mark the UPL as "encrypted" here.
2686	 * We'll actually encrypt the pages later,
2687	 * in upl_encrypt(), when the caller has
2688	 * selected which pages need to go to swap.
2689	 */
2690	if (cntrl_flags & UPL_ENCRYPT)
2691		upl->flags |= UPL_ENCRYPTED;
2692
2693	if (cntrl_flags & UPL_FOR_PAGEOUT)
2694		upl->flags |= UPL_PAGEOUT;
2695
2696	vm_object_lock(object);
2697	vm_object_paging_begin(object);
2698
2699	/*
2700	 * we can lock in the paging_offset once paging_in_progress is set
2701	 */
2702	upl->size = size;
2703	upl->offset = offset + object->paging_offset;
2704
2705#ifdef UPL_DEBUG
2706	queue_enter(&object->uplq, upl, upl_t, uplq);
2707#endif /* UPL_DEBUG */
2708
2709	if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
2710		/*
2711		 * Honor copy-on-write obligations
2712		 *
2713		 * The caller is gathering these pages and
2714		 * might modify their contents.  We need to
2715		 * make sure that the copy object has its own
2716		 * private copies of these pages before we let
2717		 * the caller modify them.
2718		 */
2719		vm_object_update(object,
2720				 offset,
2721				 size,
2722				 NULL,
2723				 NULL,
2724				 FALSE,	/* should_return */
2725				 MEMORY_OBJECT_COPY_SYNC,
2726				 VM_PROT_NO_CHANGE);
2727		upl_cow++;
2728		upl_cow_pages += size >> PAGE_SHIFT;
2729	}
2730	/*
2731	 * remember which copy object we synchronized with
2732	 */
2733	last_copy_object = object->copy;
2734	entry = 0;
2735
2736	xfer_size = size;
2737	dst_offset = offset;
2738
2739	while (xfer_size) {
2740
2741		if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
2742		        if (delayed_unlock) {
2743			        delayed_unlock = 0;
2744				vm_page_unlock_queues();
2745			}
2746			vm_object_unlock(object);
2747			VM_PAGE_GRAB_FICTITIOUS(alias_page);
2748			goto relock;
2749		}
2750		if (delayed_unlock == 0) {
2751			/*
2752			 * pageout_scan takes the vm_page_lock_queues first
2753			 * then tries for the object lock... to avoid what
2754			 * is effectively a lock inversion, we'll go to the
2755			 * trouble of taking them in that same order... otherwise
2756			 * if this object contains the majority of the pages resident
2757			 * in the UBC (or a small set of large objects actively being
2758			 * worked on contain the majority of the pages), we could
2759			 * cause the pageout_scan thread to 'starve' in its attempt
2760			 * to find pages to move to the free queue, since it has to
2761			 * successfully acquire the object lock of any candidate page
2762			 * before it can steal/clean it.
2763			 */
2764			vm_object_unlock(object);
2765relock:
2766			for (j = 0; ; j++) {
2767				vm_page_lock_queues();
2768
2769				if (vm_object_lock_try(object))
2770					break;
2771				vm_page_unlock_queues();
2772				mutex_pause(j);
2773			}
2774			delayed_unlock = 1;
2775		}
2776		if (cntrl_flags & UPL_COPYOUT_FROM) {
2777		        upl->flags |= UPL_PAGE_SYNC_DONE;
2778
2779			if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2780				dst_page->fictitious ||
2781				dst_page->absent ||
2782				dst_page->error ||
2783			       (dst_page->wire_count && !dst_page->pageout && !dst_page->list_req_pending)) {
2784
2785				if (user_page_list)
2786					user_page_list[entry].phys_addr = 0;
2787
2788				goto delay_unlock_queues;
2789			}
2790			/*
2791			 * grab this up front...
2792			 * a high percentange of the time we're going to
2793			 * need the hardware modification state a bit later
2794			 * anyway... so we can eliminate an extra call into
2795			 * the pmap layer by grabbing it here and recording it
2796			 */
2797			if (dst_page->pmapped)
2798			        refmod_state = pmap_get_refmod(dst_page->phys_page);
2799			else
2800			        refmod_state = 0;
2801
2802			if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
2803			        /*
2804				 * page is on inactive list and referenced...
2805				 * reactivate it now... this gets it out of the
2806				 * way of vm_pageout_scan which would have to
2807				 * reactivate it upon tripping over it
2808				 */
2809			        vm_page_activate(dst_page);
2810				VM_STAT_INCR(reactivations);
2811			}
2812			if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2813			        /*
2814				 * we're only asking for DIRTY pages to be returned
2815				 */
2816			        if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2817				        /*
2818					 * if we were the page stolen by vm_pageout_scan to be
2819					 * cleaned (as opposed to a buddy being clustered in
2820					 * or this request is not being driven by a PAGEOUT cluster
2821					 * then we only need to check for the page being dirty or
2822					 * precious to decide whether to return it
2823					 */
2824				        if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
2825					        goto check_busy;
2826					goto dont_return;
2827				}
2828				/*
2829				 * this is a request for a PAGEOUT cluster and this page
2830				 * is merely along for the ride as a 'buddy'... not only
2831				 * does it have to be dirty to be returned, but it also
2832				 * can't have been referenced recently... note that we've
2833				 * already filtered above based on whether this page is
2834				 * currently on the inactive queue or it meets the page
2835				 * ticket (generation count) check
2836				 */
2837				if ( !(refmod_state & VM_MEM_REFERENCED) &&
2838				     ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
2839				        goto check_busy;
2840				}
2841dont_return:
2842				/*
2843				 * if we reach here, we're not to return
2844				 * the page... go on to the next one
2845				 */
2846				if (user_page_list)
2847				        user_page_list[entry].phys_addr = 0;
2848
2849				goto delay_unlock_queues;
2850			}
2851check_busy:
2852			if (dst_page->busy && (!(dst_page->list_req_pending && dst_page->pageout))) {
2853			        if (cntrl_flags & UPL_NOBLOCK) {
2854				        if (user_page_list)
2855					        user_page_list[entry].phys_addr = 0;
2856
2857					goto delay_unlock_queues;
2858				}
2859				/*
2860				 * someone else is playing with the
2861				 * page.  We will have to wait.
2862				 */
2863				delayed_unlock = 0;
2864				vm_page_unlock_queues();
2865
2866				PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2867
2868				continue;
2869			}
2870			/*
2871			 * Someone else already cleaning the page?
2872			 */
2873			if ((dst_page->cleaning || dst_page->absent || dst_page->wire_count != 0) && !dst_page->list_req_pending) {
2874			        if (user_page_list)
2875				        user_page_list[entry].phys_addr = 0;
2876
2877				goto delay_unlock_queues;
2878			}
2879			/*
2880			 * ENCRYPTED SWAP:
2881			 * The caller is gathering this page and might
2882			 * access its contents later on.  Decrypt the
2883			 * page before adding it to the UPL, so that
2884			 * the caller never sees encrypted data.
2885			 */
2886			if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
2887			        int  was_busy;
2888
2889				delayed_unlock = 0;
2890				vm_page_unlock_queues();
2891				/*
2892				 * save the current state of busy
2893				 * mark page as busy while decrypt
2894				 * is in progress since it will drop
2895				 * the object lock...
2896				 */
2897				was_busy = dst_page->busy;
2898				dst_page->busy = TRUE;
2899
2900				vm_page_decrypt(dst_page, 0);
2901				vm_page_decrypt_for_upl_counter++;
2902				/*
2903				 * restore to original busy state
2904				 */
2905				dst_page->busy = was_busy;
2906
2907				vm_page_lock_queues();
2908				delayed_unlock = 1;
2909			}
2910			if (dst_page->pageout_queue == TRUE)
2911			        /*
2912				 * we've buddied up a page for a clustered pageout
2913				 * that has already been moved to the pageout
2914				 * queue by pageout_scan... we need to remove
2915				 * it from the queue and drop the laundry count
2916				 * on that queue
2917				 */
2918			        vm_pageout_queue_steal(dst_page);
2919#if MACH_CLUSTER_STATS
2920			/*
2921			 * pageout statistics gathering.  count
2922			 * all the pages we will page out that
2923			 * were not counted in the initial
2924			 * vm_pageout_scan work
2925			 */
2926			if (dst_page->list_req_pending)
2927			        encountered_lrp = TRUE;
2928			if ((dst_page->dirty ||	(dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
2929			        if (encountered_lrp)
2930				        CLUSTER_STAT(pages_at_higher_offsets++;)
2931				else
2932				        CLUSTER_STAT(pages_at_lower_offsets++;)
2933			}
2934#endif
2935			/*
2936			 * Turn off busy indication on pending
2937			 * pageout.  Note: we can only get here
2938			 * in the request pending case.
2939			 */
2940			dst_page->list_req_pending = FALSE;
2941			dst_page->busy = FALSE;
2942
2943			hw_dirty = refmod_state & VM_MEM_MODIFIED;
2944			dirty = hw_dirty ? TRUE : dst_page->dirty;
2945
2946			if (dst_page->phys_page > upl->highest_page)
2947			        upl->highest_page = dst_page->phys_page;
2948
2949			if (cntrl_flags & UPL_SET_LITE) {
2950			        int	pg_num;
2951
2952				pg_num = (dst_offset-offset)/PAGE_SIZE;
2953				lite_list[pg_num>>5] |= 1 << (pg_num & 31);
2954
2955				if (hw_dirty)
2956				        pmap_clear_modify(dst_page->phys_page);
2957
2958				/*
2959				 * Mark original page as cleaning
2960				 * in place.
2961				 */
2962				dst_page->cleaning = TRUE;
2963				dst_page->precious = FALSE;
2964			} else {
2965			        /*
2966				 * use pageclean setup, it is more
2967				 * convenient even for the pageout
2968				 * cases here
2969				 */
2970			        vm_object_lock(upl->map_object);
2971				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
2972				vm_object_unlock(upl->map_object);
2973
2974				alias_page->absent = FALSE;
2975				alias_page = NULL;
2976			}
2977#if     MACH_PAGEMAP
2978			/*
2979			 * Record that this page has been
2980			 * written out
2981			 */
2982			vm_external_state_set(object->existence_map, dst_page->offset);
2983#endif  /*MACH_PAGEMAP*/
2984			dst_page->dirty = dirty;
2985
2986			if (!dirty)
2987				dst_page->precious = TRUE;
2988
2989			if (dst_page->pageout)
2990			        dst_page->busy = TRUE;
2991
2992			if ( (cntrl_flags & UPL_ENCRYPT) ) {
2993			        /*
2994				 * ENCRYPTED SWAP:
2995				 * We want to deny access to the target page
2996				 * because its contents are about to be
2997				 * encrypted and the user would be very
2998				 * confused to see encrypted data instead
2999				 * of their data.
3000				 * We also set "encrypted_cleaning" to allow
3001				 * vm_pageout_scan() to demote that page
3002				 * from "adjacent/clean-in-place" to
3003				 * "target/clean-and-free" if it bumps into
3004				 * this page during its scanning while we're
3005				 * still processing this cluster.
3006				 */
3007			        dst_page->busy = TRUE;
3008				dst_page->encrypted_cleaning = TRUE;
3009			}
3010			if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
3011			        /*
3012				 * deny access to the target page
3013				 * while it is being worked on
3014				 */
3015			        if ((!dst_page->pageout) && (dst_page->wire_count == 0)) {
3016				        dst_page->busy = TRUE;
3017					dst_page->pageout = TRUE;
3018					vm_page_wire(dst_page);
3019				}
3020			}
3021		} else {
3022			if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
3023				/*
3024				 * Honor copy-on-write obligations
3025				 *
3026				 * The copy object has changed since we
3027				 * last synchronized for copy-on-write.
3028				 * Another copy object might have been
3029				 * inserted while we released the object's
3030				 * lock.  Since someone could have seen the
3031				 * original contents of the remaining pages
3032				 * through that new object, we have to
3033				 * synchronize with it again for the remaining
3034				 * pages only.  The previous pages are "busy"
3035				 * so they can not be seen through the new
3036				 * mapping.  The new mapping will see our
3037				 * upcoming changes for those previous pages,
3038				 * but that's OK since they couldn't see what
3039				 * was there before.  It's just a race anyway
3040				 * and there's no guarantee of consistency or
3041				 * atomicity.  We just don't want new mappings
3042				 * to see both the *before* and *after* pages.
3043				 */
3044				if (object->copy != VM_OBJECT_NULL) {
3045				        delayed_unlock = 0;
3046					vm_page_unlock_queues();
3047
3048					vm_object_update(
3049						object,
3050						dst_offset,/* current offset */
3051						xfer_size, /* remaining size */
3052						NULL,
3053						NULL,
3054						FALSE,	   /* should_return */
3055						MEMORY_OBJECT_COPY_SYNC,
3056						VM_PROT_NO_CHANGE);
3057
3058					upl_cow_again++;
3059					upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3060
3061					vm_page_lock_queues();
3062					delayed_unlock = 1;
3063				}
3064				/*
3065				 * remember the copy object we synced with
3066				 */
3067				last_copy_object = object->copy;
3068			}
3069			dst_page = vm_page_lookup(object, dst_offset);
3070
3071			if (dst_page != VM_PAGE_NULL) {
3072			        if ( !(dst_page->list_req_pending) ) {
3073				        if ((cntrl_flags & UPL_RET_ONLY_ABSENT) && !dst_page->absent) {
3074					        /*
3075						 * skip over pages already present in the cache
3076						 */
3077					        if (user_page_list)
3078						        user_page_list[entry].phys_addr = 0;
3079
3080						goto delay_unlock_queues;
3081					}
3082					if (dst_page->cleaning) {
3083					        /*
3084						 * someone else is writing to the page... wait...
3085						 */
3086					        delayed_unlock = 0;
3087						vm_page_unlock_queues();
3088
3089					        PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3090
3091						continue;
3092					}
3093				} else {
3094				        if (dst_page->fictitious &&
3095					    dst_page->phys_page == vm_page_fictitious_addr) {
3096					        assert( !dst_page->speculative);
3097					        /*
3098						 * dump the fictitious page
3099						 */
3100					        dst_page->list_req_pending = FALSE;
3101
3102						vm_page_free(dst_page);
3103
3104						dst_page = NULL;
3105					} else if (dst_page->absent) {
3106					        /*
3107						 * the default_pager case
3108						 */
3109					        dst_page->list_req_pending = FALSE;
3110						dst_page->busy = FALSE;
3111					}
3112				}
3113			}
3114			if (dst_page == VM_PAGE_NULL) {
3115				if (object->private) {
3116					/*
3117					 * This is a nasty wrinkle for users
3118					 * of upl who encounter device or
3119					 * private memory however, it is
3120					 * unavoidable, only a fault can
3121					 * resolve the actual backing
3122					 * physical page by asking the
3123					 * backing device.
3124					 */
3125					if (user_page_list)
3126						user_page_list[entry].phys_addr = 0;
3127
3128					goto delay_unlock_queues;
3129				}
3130				/*
3131				 * need to allocate a page
3132				 */
3133		 		dst_page = vm_page_grab();
3134
3135				if (dst_page == VM_PAGE_NULL) {
3136				        if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3137					       /*
3138						* we don't want to stall waiting for pages to come onto the free list
3139						* while we're already holding absent pages in this UPL
3140						* the caller will deal with the empty slots
3141						*/
3142					        if (user_page_list)
3143						        user_page_list[entry].phys_addr = 0;
3144
3145						goto try_next_page;
3146					}
3147				        /*
3148					 * no pages available... wait
3149					 * then try again for the same
3150					 * offset...
3151					 */
3152					delayed_unlock = 0;
3153					vm_page_unlock_queues();
3154
3155					vm_object_unlock(object);
3156					VM_PAGE_WAIT();
3157
3158					/*
3159					 * pageout_scan takes the vm_page_lock_queues first
3160					 * then tries for the object lock... to avoid what
3161					 * is effectively a lock inversion, we'll go to the
3162					 * trouble of taking them in that same order... otherwise
3163					 * if this object contains the majority of the pages resident
3164					 * in the UBC (or a small set of large objects actively being
3165					 * worked on contain the majority of the pages), we could
3166					 * cause the pageout_scan thread to 'starve' in its attempt
3167					 * to find pages to move to the free queue, since it has to
3168					 * successfully acquire the object lock of any candidate page
3169					 * before it can steal/clean it.
3170					 */
3171					for (j = 0; ; j++) {
3172						vm_page_lock_queues();
3173
3174						if (vm_object_lock_try(object))
3175							break;
3176						vm_page_unlock_queues();
3177						mutex_pause(j);
3178					}
3179					delayed_unlock = 1;
3180
3181					continue;
3182				}
3183				vm_page_insert_internal(dst_page, object, dst_offset, TRUE);
3184
3185				dst_page->absent = TRUE;
3186				dst_page->busy = FALSE;
3187
3188				if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3189				        /*
3190					 * if UPL_RET_ONLY_ABSENT was specified,
3191					 * than we're definitely setting up a
3192					 * upl for a clustered read/pagein
3193					 * operation... mark the pages as clustered
3194					 * so upl_commit_range can put them on the
3195					 * speculative list
3196					 */
3197				        dst_page->clustered = TRUE;
3198				}
3199			}
3200			/*
3201			 * ENCRYPTED SWAP:
3202			 */
3203			if (cntrl_flags & UPL_ENCRYPT) {
3204				/*
3205				 * The page is going to be encrypted when we
3206				 * get it from the pager, so mark it so.
3207				 */
3208				dst_page->encrypted = TRUE;
3209			} else {
3210				/*
3211				 * Otherwise, the page will not contain
3212				 * encrypted data.
3213				 */
3214				dst_page->encrypted = FALSE;
3215			}
3216			dst_page->overwriting = TRUE;
3217
3218			if (dst_page->fictitious) {
3219				panic("need corner case for fictitious page");
3220			}
3221			if (dst_page->busy) {
3222				/*
3223				 * someone else is playing with the
3224				 * page.  We will have to wait.
3225				 */
3226			        delayed_unlock = 0;
3227				vm_page_unlock_queues();
3228
3229				PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3230
3231				continue;
3232			}
3233			if (dst_page->pmapped) {
3234			        if ( !(cntrl_flags & UPL_FILE_IO))
3235				        /*
3236					 * eliminate all mappings from the
3237					 * original object and its prodigy
3238					 */
3239				        refmod_state = pmap_disconnect(dst_page->phys_page);
3240				else
3241				        refmod_state = pmap_get_refmod(dst_page->phys_page);
3242			} else
3243			        refmod_state = 0;
3244
3245			hw_dirty = refmod_state & VM_MEM_MODIFIED;
3246			dirty = hw_dirty ? TRUE : dst_page->dirty;
3247
3248			if (cntrl_flags & UPL_SET_LITE) {
3249				int	pg_num;
3250
3251				pg_num = (dst_offset-offset)/PAGE_SIZE;
3252				lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3253
3254				if (hw_dirty)
3255				        pmap_clear_modify(dst_page->phys_page);
3256
3257				/*
3258				 * Mark original page as cleaning
3259				 * in place.
3260				 */
3261				dst_page->cleaning = TRUE;
3262				dst_page->precious = FALSE;
3263			} else {
3264				/*
3265				 * use pageclean setup, it is more
3266				 * convenient even for the pageout
3267				 * cases here
3268				 */
3269			        vm_object_lock(upl->map_object);
3270				vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3271			        vm_object_unlock(upl->map_object);
3272
3273				alias_page->absent = FALSE;
3274				alias_page = NULL;
3275			}
3276
3277			if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3278				/*
3279				 * clean in place for read implies
3280				 * that a write will be done on all
3281				 * the pages that are dirty before
3282				 * a upl commit is done.  The caller
3283				 * is obligated to preserve the
3284				 * contents of all pages marked dirty
3285				 */
3286				upl->flags |= UPL_CLEAR_DIRTY;
3287			}
3288			dst_page->dirty = dirty;
3289
3290			if (!dirty)
3291				dst_page->precious = TRUE;
3292
3293			if (dst_page->wire_count == 0) {
3294			        /*
3295				 * deny access to the target page while
3296				 * it is being worked on
3297				 */
3298				dst_page->busy = TRUE;
3299			} else
3300		 		vm_page_wire(dst_page);
3301
3302			if (dst_page->clustered) {
3303			        /*
3304				 * expect the page not to be used
3305				 * since it's coming in as part
3306				 * of a speculative cluster...
3307				 * pages that are 'consumed' will
3308				 * get a hardware reference
3309				 */
3310			        dst_page->reference = FALSE;
3311			} else {
3312			        /*
3313				 * expect the page to be used
3314				 */
3315			        dst_page->reference = TRUE;
3316			}
3317			dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3318		}
3319		if (dst_page->phys_page > upl->highest_page)
3320		        upl->highest_page = dst_page->phys_page;
3321		if (user_page_list) {
3322			user_page_list[entry].phys_addr = dst_page->phys_page;
3323			user_page_list[entry].pageout	= dst_page->pageout;
3324			user_page_list[entry].absent	= dst_page->absent;
3325			user_page_list[entry].dirty	= dst_page->dirty;
3326			user_page_list[entry].precious	= dst_page->precious;
3327			user_page_list[entry].device	= FALSE;
3328			if (dst_page->clustered == TRUE)
3329			        user_page_list[entry].speculative = dst_page->speculative;
3330			else
3331			        user_page_list[entry].speculative = FALSE;
3332			user_page_list[entry].cs_validated = dst_page->cs_validated;
3333			user_page_list[entry].cs_tainted = dst_page->cs_tainted;
3334		}
3335	        /*
3336		 * if UPL_RET_ONLY_ABSENT is set, then
3337		 * we are working with a fresh page and we've
3338		 * just set the clustered flag on it to
3339		 * indicate that it was drug in as part of a
3340		 * speculative cluster... so leave it alone
3341		 */
3342		if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3343		        /*
3344			 * someone is explicitly grabbing this page...
3345			 * update clustered and speculative state
3346			 *
3347			 */
3348		        VM_PAGE_CONSUME_CLUSTERED(dst_page);
3349		}
3350delay_unlock_queues:
3351		if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
3352			/*
3353			 * pageout_scan takes the vm_page_lock_queues first
3354			 * then tries for the object lock... to avoid what
3355			 * is effectively a lock inversion, we'll go to the
3356			 * trouble of taking them in that same order... otherwise
3357			 * if this object contains the majority of the pages resident
3358			 * in the UBC (or a small set of large objects actively being
3359			 * worked on contain the majority of the pages), we could
3360			 * cause the pageout_scan thread to 'starve' in its attempt
3361			 * to find pages to move to the free queue, since it has to
3362			 * successfully acquire the object lock of any candidate page
3363			 * before it can steal/clean it.
3364			 */
3365			vm_object_unlock(object);
3366			mutex_yield(&vm_page_queue_lock);
3367
3368			for (j = 0; ; j++) {
3369				if (vm_object_lock_try(object))
3370					break;
3371				vm_page_unlock_queues();
3372				mutex_pause(j);
3373				vm_page_lock_queues();
3374			}
3375		        delayed_unlock = 1;
3376		}
3377try_next_page:
3378		entry++;
3379		dst_offset += PAGE_SIZE_64;
3380		xfer_size -= PAGE_SIZE;
3381	}
3382	if (alias_page != NULL) {
3383	        if (delayed_unlock == 0) {
3384		        vm_page_lock_queues();
3385			delayed_unlock = 1;
3386		}
3387		vm_page_free(alias_page);
3388	}
3389	if (delayed_unlock)
3390	        vm_page_unlock_queues();
3391
3392	if (page_list_count != NULL) {
3393	        if (upl->flags & UPL_INTERNAL)
3394			*page_list_count = 0;
3395		else if (*page_list_count > entry)
3396			*page_list_count = entry;
3397	}
3398	vm_object_unlock(object);
3399
3400	return KERN_SUCCESS;
3401}
3402
3403/* JMM - Backward compatability for now */
3404kern_return_t
3405vm_fault_list_request(			/* forward */
3406	memory_object_control_t		control,
3407	vm_object_offset_t	offset,
3408	upl_size_t		size,
3409	upl_t			*upl_ptr,
3410	upl_page_info_t		**user_page_list_ptr,
3411	unsigned int		page_list_count,
3412	int			cntrl_flags);
3413kern_return_t
3414vm_fault_list_request(
3415	memory_object_control_t		control,
3416	vm_object_offset_t	offset,
3417	upl_size_t		size,
3418	upl_t			*upl_ptr,
3419	upl_page_info_t		**user_page_list_ptr,
3420	unsigned int		page_list_count,
3421	int			cntrl_flags)
3422{
3423	unsigned int		local_list_count;
3424	upl_page_info_t		*user_page_list;
3425	kern_return_t		kr;
3426
3427	if (user_page_list_ptr != NULL) {
3428		local_list_count = page_list_count;
3429		user_page_list = *user_page_list_ptr;
3430	} else {
3431		local_list_count = 0;
3432		user_page_list = NULL;
3433	}
3434	kr =  memory_object_upl_request(control,
3435				offset,
3436				size,
3437				upl_ptr,
3438				user_page_list,
3439				&local_list_count,
3440				cntrl_flags);
3441
3442	if(kr != KERN_SUCCESS)
3443		return kr;
3444
3445	if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3446		*user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3447	}
3448
3449	return KERN_SUCCESS;
3450}
3451
3452
3453
3454/*
3455 *	Routine:	vm_object_super_upl_request
3456 *	Purpose:
3457 *		Cause the population of a portion of a vm_object
3458 *		in much the same way as memory_object_upl_request.
3459 *		Depending on the nature of the request, the pages
3460 *		returned may be contain valid data or be uninitialized.
3461 *		However, the region may be expanded up to the super
3462 *		cluster size provided.
3463 */
3464
3465__private_extern__ kern_return_t
3466vm_object_super_upl_request(
3467	vm_object_t object,
3468	vm_object_offset_t	offset,
3469	upl_size_t		size,
3470	upl_size_t		super_cluster,
3471	upl_t			*upl,
3472	upl_page_info_t		*user_page_list,
3473	unsigned int		*page_list_count,
3474	int			cntrl_flags)
3475{
3476	if (object->paging_offset > offset)
3477		return KERN_FAILURE;
3478
3479	assert(object->paging_in_progress);
3480	offset = offset - object->paging_offset;
3481
3482	if (super_cluster > size) {
3483
3484		vm_object_offset_t	base_offset;
3485		upl_size_t		super_size;
3486
3487		base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3488		super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
3489		super_size = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3490
3491		if (offset > (base_offset + super_size)) {
3492		        panic("vm_object_super_upl_request: Missed target pageout"
3493			      " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3494			      offset, base_offset, super_size, super_cluster,
3495			      size, object->paging_offset);
3496		}
3497		/*
3498		 * apparently there is a case where the vm requests a
3499		 * page to be written out who's offset is beyond the
3500		 * object size
3501		 */
3502		if ((offset + size) > (base_offset + super_size))
3503		        super_size = (offset + size) - base_offset;
3504
3505		offset = base_offset;
3506		size = super_size;
3507	}
3508	return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
3509}
3510
3511
3512kern_return_t
3513vm_map_create_upl(
3514	vm_map_t		map,
3515	vm_map_address_t	offset,
3516	upl_size_t		*upl_size,
3517	upl_t			*upl,
3518	upl_page_info_array_t	page_list,
3519	unsigned int		*count,
3520	int			*flags)
3521{
3522	vm_map_entry_t	entry;
3523	int		caller_flags;
3524	int		force_data_sync;
3525	int		sync_cow_data;
3526	vm_object_t	local_object;
3527	vm_map_offset_t	local_offset;
3528	vm_map_offset_t	local_start;
3529	kern_return_t	ret;
3530
3531	caller_flags = *flags;
3532
3533	if (caller_flags & ~UPL_VALID_FLAGS) {
3534		/*
3535		 * For forward compatibility's sake,
3536		 * reject any unknown flag.
3537		 */
3538		return KERN_INVALID_VALUE;
3539	}
3540	force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3541	sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3542
3543	if (upl == NULL)
3544		return KERN_INVALID_ARGUMENT;
3545
3546REDISCOVER_ENTRY:
3547	vm_map_lock(map);
3548
3549	if (vm_map_lookup_entry(map, offset, &entry)) {
3550
3551		if ((entry->vme_end - offset) < *upl_size)
3552			*upl_size = entry->vme_end - offset;
3553
3554		if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3555		        *flags = 0;
3556
3557			if (entry->object.vm_object != VM_OBJECT_NULL) {
3558			        if (entry->object.vm_object->private)
3559				        *flags = UPL_DEV_MEMORY;
3560
3561				if (entry->object.vm_object->phys_contiguous)
3562					*flags |= UPL_PHYS_CONTIG;
3563			}
3564			vm_map_unlock(map);
3565
3566			return KERN_SUCCESS;
3567		}
3568	        if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
3569        		if ((*upl_size/page_size) > MAX_UPL_SIZE)
3570               			*upl_size = MAX_UPL_SIZE * page_size;
3571		}
3572		/*
3573		 *      Create an object if necessary.
3574		 */
3575		if (entry->object.vm_object == VM_OBJECT_NULL) {
3576			entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
3577			entry->offset = 0;
3578		}
3579		if (!(caller_flags & UPL_COPYOUT_FROM)) {
3580			if (!(entry->protection & VM_PROT_WRITE)) {
3581				vm_map_unlock(map);
3582				return KERN_PROTECTION_FAILURE;
3583			}
3584			if (entry->needs_copy)  {
3585				vm_map_t		local_map;
3586				vm_object_t		object;
3587				vm_object_offset_t	new_offset;
3588				vm_prot_t		prot;
3589				boolean_t		wired;
3590				vm_map_version_t	version;
3591				vm_map_t		real_map;
3592
3593				local_map = map;
3594				vm_map_lock_write_to_read(map);
3595
3596				if (vm_map_lookup_locked(&local_map,
3597							 offset, VM_PROT_WRITE,
3598							 OBJECT_LOCK_EXCLUSIVE,
3599							 &version, &object,
3600							 &new_offset, &prot, &wired,
3601							 NULL,
3602							 &real_map)) {
3603				        vm_map_unlock(local_map);
3604					return KERN_FAILURE;
3605				}
3606				if (real_map != map)
3607					vm_map_unlock(real_map);
3608				vm_object_unlock(object);
3609				vm_map_unlock(local_map);
3610
3611				goto REDISCOVER_ENTRY;
3612			}
3613		}
3614		if (entry->is_sub_map) {
3615			vm_map_t	submap;
3616
3617			submap = entry->object.sub_map;
3618			local_start = entry->vme_start;
3619			local_offset = entry->offset;
3620
3621			vm_map_reference(submap);
3622			vm_map_unlock(map);
3623
3624			ret = vm_map_create_upl(submap,
3625						local_offset + (offset - local_start),
3626						upl_size, upl, page_list, count, flags);
3627			vm_map_deallocate(submap);
3628
3629			return ret;
3630		}
3631		if (sync_cow_data) {
3632			if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
3633				local_object = entry->object.vm_object;
3634				local_start = entry->vme_start;
3635				local_offset = entry->offset;
3636
3637				vm_object_reference(local_object);
3638				vm_map_unlock(map);
3639
3640				if (entry->object.vm_object->shadow && entry->object.vm_object->copy) {
3641				        vm_object_lock_request(
3642							       local_object->shadow,
3643							       (vm_object_offset_t)
3644							       ((offset - local_start) +
3645								local_offset) +
3646							       local_object->shadow_offset,
3647							       *upl_size, FALSE,
3648							       MEMORY_OBJECT_DATA_SYNC,
3649							       VM_PROT_NO_CHANGE);
3650				}
3651				sync_cow_data = FALSE;
3652				vm_object_deallocate(local_object);
3653
3654				goto REDISCOVER_ENTRY;
3655			}
3656		}
3657		if (force_data_sync) {
3658			local_object = entry->object.vm_object;
3659			local_start = entry->vme_start;
3660			local_offset = entry->offset;
3661
3662			vm_object_reference(local_object);
3663		        vm_map_unlock(map);
3664
3665			vm_object_lock_request(
3666					       local_object,
3667					       (vm_object_offset_t)
3668					       ((offset - local_start) + local_offset),
3669					       (vm_object_size_t)*upl_size, FALSE,
3670					       MEMORY_OBJECT_DATA_SYNC,
3671					       VM_PROT_NO_CHANGE);
3672
3673			force_data_sync = FALSE;
3674			vm_object_deallocate(local_object);
3675
3676			goto REDISCOVER_ENTRY;
3677		}
3678		if (entry->object.vm_object->private)
3679		        *flags = UPL_DEV_MEMORY;
3680		else
3681		        *flags = 0;
3682
3683		if (entry->object.vm_object->phys_contiguous)
3684		        *flags |= UPL_PHYS_CONTIG;
3685
3686		local_object = entry->object.vm_object;
3687		local_offset = entry->offset;
3688		local_start = entry->vme_start;
3689
3690		vm_object_reference(local_object);
3691		vm_map_unlock(map);
3692
3693		ret = vm_object_iopl_request(local_object,
3694					      (vm_object_offset_t) ((offset - local_start) + local_offset),
3695					      *upl_size,
3696					      upl,
3697					      page_list,
3698					      count,
3699					      caller_flags);
3700		vm_object_deallocate(local_object);
3701
3702		return(ret);
3703	}
3704	vm_map_unlock(map);
3705
3706	return(KERN_FAILURE);
3707}
3708
3709/*
3710 * Internal routine to enter a UPL into a VM map.
3711 *
3712 * JMM - This should just be doable through the standard
3713 * vm_map_enter() API.
3714 */
3715kern_return_t
3716vm_map_enter_upl(
3717	vm_map_t		map,
3718	upl_t			upl,
3719	vm_map_offset_t	*dst_addr)
3720{
3721	vm_map_size_t	 	size;
3722	vm_object_offset_t 	offset;
3723	vm_map_offset_t		addr;
3724	vm_page_t		m;
3725	kern_return_t		kr;
3726
3727	if (upl == UPL_NULL)
3728		return KERN_INVALID_ARGUMENT;
3729
3730	upl_lock(upl);
3731
3732	/*
3733	 * check to see if already mapped
3734	 */
3735	if (UPL_PAGE_LIST_MAPPED & upl->flags) {
3736		upl_unlock(upl);
3737		return KERN_FAILURE;
3738	}
3739
3740	if ((!(upl->flags & UPL_SHADOWED)) && !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3741					       (upl->map_object->phys_contiguous))) {
3742		vm_object_t 		object;
3743		vm_page_t		alias_page;
3744		vm_object_offset_t	new_offset;
3745		int			pg_num;
3746		wpl_array_t 		lite_list;
3747
3748		if (upl->flags & UPL_INTERNAL) {
3749			lite_list = (wpl_array_t)
3750				((((uintptr_t)upl) + sizeof(struct upl))
3751				 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3752		} else {
3753		        lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
3754		}
3755		object = upl->map_object;
3756		upl->map_object = vm_object_allocate(upl->size);
3757
3758		vm_object_lock(upl->map_object);
3759
3760		upl->map_object->shadow = object;
3761		upl->map_object->pageout = TRUE;
3762		upl->map_object->can_persist = FALSE;
3763		upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3764		upl->map_object->shadow_offset = upl->offset - object->paging_offset;
3765		upl->map_object->wimg_bits = object->wimg_bits;
3766		offset = upl->map_object->shadow_offset;
3767		new_offset = 0;
3768		size = upl->size;
3769
3770		upl->flags |= UPL_SHADOWED;
3771
3772		while (size) {
3773		        pg_num = (new_offset)/PAGE_SIZE;
3774
3775			if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3776
3777				VM_PAGE_GRAB_FICTITIOUS(alias_page);
3778
3779				vm_object_lock(object);
3780
3781				m = vm_page_lookup(object, offset);
3782				if (m == VM_PAGE_NULL) {
3783				        panic("vm_upl_map: page missing\n");
3784				}
3785
3786				/*
3787				 * Convert the fictitious page to a private
3788				 * shadow of the real page.
3789				 */
3790				assert(alias_page->fictitious);
3791				alias_page->fictitious = FALSE;
3792				alias_page->private = TRUE;
3793				alias_page->pageout = TRUE;
3794				/*
3795				 * since m is a page in the upl it must
3796				 * already be wired or BUSY, so it's
3797				 * safe to assign the underlying physical
3798				 * page to the alias
3799				 */
3800				alias_page->phys_page = m->phys_page;
3801
3802			        vm_object_unlock(object);
3803
3804				vm_page_lockspin_queues();
3805				vm_page_wire(alias_page);
3806				vm_page_unlock_queues();
3807
3808				/*
3809				 * ENCRYPTED SWAP:
3810				 * The virtual page ("m") has to be wired in some way
3811				 * here or its physical page ("m->phys_page") could
3812				 * be recycled at any time.
3813				 * Assuming this is enforced by the caller, we can't
3814				 * get an encrypted page here.  Since the encryption
3815				 * key depends on the VM page's "pager" object and
3816				 * the "paging_offset", we couldn't handle 2 pageable
3817				 * VM pages (with different pagers and paging_offsets)
3818				 * sharing the same physical page:  we could end up
3819				 * encrypting with one key (via one VM page) and
3820				 * decrypting with another key (via the alias VM page).
3821				 */
3822				ASSERT_PAGE_DECRYPTED(m);
3823
3824				vm_page_insert(alias_page, upl->map_object, new_offset);
3825
3826				assert(!alias_page->wanted);
3827				alias_page->busy = FALSE;
3828				alias_page->absent = FALSE;
3829			}
3830			size -= PAGE_SIZE;
3831			offset += PAGE_SIZE_64;
3832			new_offset += PAGE_SIZE_64;
3833		}
3834		vm_object_unlock(upl->map_object);
3835	}
3836	if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3837	        offset = upl->offset - upl->map_object->paging_offset;
3838	else
3839	        offset = 0;
3840	size = upl->size;
3841
3842	vm_object_reference(upl->map_object);
3843
3844	*dst_addr = 0;
3845	/*
3846	 * NEED A UPL_MAP ALIAS
3847	 */
3848	kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3849			  VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3850			  VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3851
3852	if (kr != KERN_SUCCESS) {
3853		upl_unlock(upl);
3854		return(kr);
3855	}
3856	vm_object_lock(upl->map_object);
3857
3858	for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
3859		m = vm_page_lookup(upl->map_object, offset);
3860
3861		if (m) {
3862		        unsigned int	cache_attr;
3863			cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3864
3865			m->pmapped = TRUE;
3866			m->wpmapped = TRUE;
3867
3868			PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
3869		}
3870		offset += PAGE_SIZE_64;
3871	}
3872	vm_object_unlock(upl->map_object);
3873
3874	/*
3875	 * hold a reference for the mapping
3876	 */
3877	upl->ref_count++;
3878	upl->flags |= UPL_PAGE_LIST_MAPPED;
3879	upl->kaddr = *dst_addr;
3880	upl_unlock(upl);
3881
3882	return KERN_SUCCESS;
3883}
3884
3885/*
3886 * Internal routine to remove a UPL mapping from a VM map.
3887 *
3888 * XXX - This should just be doable through a standard
3889 * vm_map_remove() operation.  Otherwise, implicit clean-up
3890 * of the target map won't be able to correctly remove
3891 * these (and release the reference on the UPL).  Having
3892 * to do this means we can't map these into user-space
3893 * maps yet.
3894 */
3895kern_return_t
3896vm_map_remove_upl(
3897	vm_map_t	map,
3898	upl_t		upl)
3899{
3900	vm_address_t	addr;
3901	upl_size_t	size;
3902
3903	if (upl == UPL_NULL)
3904		return KERN_INVALID_ARGUMENT;
3905
3906	upl_lock(upl);
3907
3908	if (upl->flags & UPL_PAGE_LIST_MAPPED) {
3909		addr = upl->kaddr;
3910		size = upl->size;
3911
3912		assert(upl->ref_count > 1);
3913		upl->ref_count--;		/* removing mapping ref */
3914
3915		upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3916		upl->kaddr = (vm_offset_t) 0;
3917		upl_unlock(upl);
3918
3919		vm_map_remove(map,
3920			      vm_map_trunc_page(addr),
3921			      vm_map_round_page(addr + size),
3922			      VM_MAP_NO_FLAGS);
3923
3924		return KERN_SUCCESS;
3925	}
3926	upl_unlock(upl);
3927
3928	return KERN_FAILURE;
3929}
3930
3931kern_return_t
3932upl_commit_range(
3933	upl_t			upl,
3934	upl_offset_t		offset,
3935	upl_size_t		size,
3936	int			flags,
3937	upl_page_info_t		*page_list,
3938	mach_msg_type_number_t	count,
3939	boolean_t		*empty)
3940{
3941	upl_size_t		xfer_size;
3942	vm_object_t		shadow_object;
3943	vm_object_t		object;
3944	vm_object_offset_t	target_offset;
3945	int			entry;
3946	wpl_array_t 		lite_list;
3947	int			occupied;
3948	int                     delayed_unlock = 0;
3949	int			clear_refmod = 0;
3950	int			pgpgout_count = 0;
3951	int			j;
3952
3953	*empty = FALSE;
3954
3955	if (upl == UPL_NULL)
3956		return KERN_INVALID_ARGUMENT;
3957
3958	if (count == 0)
3959		page_list = NULL;
3960
3961	if (upl->flags & UPL_DEVICE_MEMORY)
3962		xfer_size = 0;
3963	else if ((offset + size) <= upl->size)
3964	        xfer_size = size;
3965	else
3966		return KERN_FAILURE;
3967
3968	upl_lock(upl);
3969
3970	if (upl->flags & UPL_ACCESS_BLOCKED) {
3971		/*
3972		 * We used this UPL to block access to the pages by marking
3973		 * them "busy".  Now we need to clear the "busy" bit to allow
3974		 * access to these pages again.
3975		 */
3976		flags |= UPL_COMMIT_ALLOW_ACCESS;
3977	}
3978	if (upl->flags & UPL_CLEAR_DIRTY)
3979	        flags |= UPL_COMMIT_CLEAR_DIRTY;
3980
3981	if (upl->flags & UPL_INTERNAL)
3982		lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
3983					   + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3984	else
3985		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3986
3987	object = upl->map_object;
3988
3989	if (upl->flags & UPL_SHADOWED) {
3990	        vm_object_lock(object);
3991		shadow_object = object->shadow;
3992	} else {
3993		shadow_object = object;
3994	}
3995	entry = offset/PAGE_SIZE;
3996	target_offset = (vm_object_offset_t)offset;
3997
3998	/*
3999	 * pageout_scan takes the vm_page_lock_queues first
4000	 * then tries for the object lock... to avoid what
4001	 * is effectively a lock inversion, we'll go to the
4002	 * trouble of taking them in that same order... otherwise
4003	 * if this object contains the majority of the pages resident
4004	 * in the UBC (or a small set of large objects actively being
4005	 * worked on contain the majority of the pages), we could
4006	 * cause the pageout_scan thread to 'starve' in its attempt
4007	 * to find pages to move to the free queue, since it has to
4008	 * successfully acquire the object lock of any candidate page
4009	 * before it can steal/clean it.
4010	 */
4011	for (j = 0; ; j++) {
4012		vm_page_lock_queues();
4013
4014		if (vm_object_lock_try(shadow_object))
4015			break;
4016		vm_page_unlock_queues();
4017		mutex_pause(j);
4018	}
4019	delayed_unlock = 1;
4020
4021	if (shadow_object->code_signed) {
4022		/*
4023		 * CODE SIGNING:
4024		 * If the object is code-signed, do not let this UPL tell
4025		 * us if the pages are valid or not.  Let the pages be
4026		 * validated by VM the normal way (when they get mapped or
4027		 * copied).
4028		 */
4029		flags &= ~UPL_COMMIT_CS_VALIDATED;
4030	}
4031	if (! page_list) {
4032		/*
4033		 * No page list to get the code-signing info from !?
4034		 */
4035		flags &= ~UPL_COMMIT_CS_VALIDATED;
4036	}
4037
4038	while (xfer_size) {
4039		vm_page_t	t, m;
4040
4041		m = VM_PAGE_NULL;
4042
4043		if (upl->flags & UPL_LITE) {
4044		        int	pg_num;
4045
4046			pg_num = target_offset/PAGE_SIZE;
4047
4048			if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4049			        lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4050
4051				m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
4052			}
4053		}
4054		if (upl->flags & UPL_SHADOWED) {
4055			if ((t = vm_page_lookup(object, target_offset))	!= VM_PAGE_NULL) {
4056
4057				t->pageout = FALSE;
4058
4059				vm_page_free(t);
4060
4061				if (m == VM_PAGE_NULL)
4062					m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4063			}
4064		}
4065		if (m == VM_PAGE_NULL) {
4066			goto commit_next_page;
4067		}
4068
4069		clear_refmod = 0;
4070
4071		if (flags & UPL_COMMIT_CS_VALIDATED) {
4072			/*
4073			 * CODE SIGNING:
4074			 * Set the code signing bits according to
4075			 * what the UPL says they should be.
4076			 */
4077			m->cs_validated = page_list[entry].cs_validated;
4078			m->cs_tainted = page_list[entry].cs_tainted;
4079		}
4080		if (upl->flags & UPL_IO_WIRE) {
4081
4082			vm_page_unwire(m);
4083
4084			if (page_list)
4085				page_list[entry].phys_addr = 0;
4086
4087			if (flags & UPL_COMMIT_SET_DIRTY)
4088				m->dirty = TRUE;
4089			else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4090				m->dirty = FALSE;
4091				if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4092				    m->cs_validated && !m->cs_tainted) {
4093					/*
4094					 * CODE SIGNING:
4095					 * This page is no longer dirty
4096					 * but could have been modified,
4097					 * so it will need to be
4098					 * re-validated.
4099					 */
4100					m->cs_validated = FALSE;
4101					vm_cs_validated_resets++;
4102				}
4103				clear_refmod |= VM_MEM_MODIFIED;
4104			}
4105
4106			if (flags & UPL_COMMIT_INACTIVATE)
4107				vm_page_deactivate(m);
4108
4109			if (clear_refmod)
4110				pmap_clear_refmod(m->phys_page, clear_refmod);
4111
4112			if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4113				/*
4114				 * We blocked access to the pages in this UPL.
4115				 * Clear the "busy" bit and wake up any waiter
4116				 * for this page.
4117				 */
4118				PAGE_WAKEUP_DONE(m);
4119			}
4120			goto commit_next_page;
4121		}
4122		/*
4123		 * make sure to clear the hardware
4124		 * modify or reference bits before
4125		 * releasing the BUSY bit on this page
4126		 * otherwise we risk losing a legitimate
4127		 * change of state
4128		 */
4129		if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4130			m->dirty = FALSE;
4131
4132			if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4133			    m->cs_validated && !m->cs_tainted) {
4134				/*
4135				 * CODE SIGNING:
4136				 * This page is no longer dirty
4137				 * but could have been modified,
4138				 * so it will need to be
4139				 * re-validated.
4140				 */
4141				m->cs_validated = FALSE;
4142#if DEVELOPMENT || DEBUG
4143				vm_cs_validated_resets++;
4144#endif
4145			}
4146			clear_refmod |= VM_MEM_MODIFIED;
4147		}
4148		if (clear_refmod)
4149			pmap_clear_refmod(m->phys_page, clear_refmod);
4150
4151		if (page_list) {
4152			upl_page_info_t *p;
4153
4154			p = &(page_list[entry]);
4155
4156			if (p->phys_addr && p->pageout && !m->pageout) {
4157				m->busy = TRUE;
4158				m->pageout = TRUE;
4159				vm_page_wire(m);
4160			} else if (p->phys_addr &&
4161				   !p->pageout && m->pageout &&
4162				   !m->dump_cleaning) {
4163				m->pageout = FALSE;
4164				m->absent = FALSE;
4165				m->overwriting = FALSE;
4166				vm_page_unwire(m);
4167
4168				PAGE_WAKEUP_DONE(m);
4169			}
4170			page_list[entry].phys_addr = 0;
4171		}
4172		m->dump_cleaning = FALSE;
4173
4174		if (m->laundry)
4175			vm_pageout_throttle_up(m);
4176
4177		if (m->pageout) {
4178			m->cleaning = FALSE;
4179			m->encrypted_cleaning = FALSE;
4180			m->pageout = FALSE;
4181#if MACH_CLUSTER_STATS
4182			if (m->wanted) vm_pageout_target_collisions++;
4183#endif
4184			m->dirty = FALSE;
4185
4186			if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4187			    m->cs_validated && !m->cs_tainted) {
4188				/*
4189				 * CODE SIGNING:
4190				 * This page is no longer dirty
4191				 * but could have been modified,
4192				 * so it will need to be
4193				 * re-validated.
4194				 */
4195				m->cs_validated = FALSE;
4196#if DEVELOPMENT || DEBUG
4197				vm_cs_validated_resets++;
4198#endif
4199			}
4200
4201			if (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))
4202				m->dirty = TRUE;
4203
4204			if (m->dirty) {
4205				/*
4206				 * page was re-dirtied after we started
4207				 * the pageout... reactivate it since
4208				 * we don't know whether the on-disk
4209				 * copy matches what is now in memory
4210				 */
4211				vm_page_unwire(m);
4212
4213				if (upl->flags & UPL_PAGEOUT) {
4214					CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4215					VM_STAT_INCR(reactivations);
4216					DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4217				}
4218				PAGE_WAKEUP_DONE(m);
4219			} else {
4220				/*
4221				 * page has been successfully cleaned
4222				 * go ahead and free it for other use
4223				 */
4224
4225				if (m->object->internal) {
4226					DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4227				} else {
4228					DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4229				}
4230
4231				vm_page_free(m);
4232
4233				if (upl->flags & UPL_PAGEOUT) {
4234					CLUSTER_STAT(vm_pageout_target_page_freed++;)
4235
4236					if (page_list[entry].dirty) {
4237						VM_STAT_INCR(pageouts);
4238						DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4239						pgpgout_count++;
4240					}
4241				}
4242			}
4243			goto commit_next_page;
4244		}
4245#if MACH_CLUSTER_STATS
4246		if (m->wpmapped)
4247			m->dirty = pmap_is_modified(m->phys_page);
4248
4249		if (m->dirty)   vm_pageout_cluster_dirtied++;
4250		else            vm_pageout_cluster_cleaned++;
4251		if (m->wanted)  vm_pageout_cluster_collisions++;
4252#endif
4253		m->dirty = FALSE;
4254
4255		if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4256		    m->cs_validated && !m->cs_tainted) {
4257			/*
4258			 * CODE SIGNING:
4259			 * This page is no longer dirty
4260			 * but could have been modified,
4261			 * so it will need to be
4262			 * re-validated.
4263			 */
4264			m->cs_validated = FALSE;
4265#if DEVELOPMENT || DEBUG
4266			vm_cs_validated_resets++;
4267#endif
4268		}
4269
4270		if ((m->busy) && (m->cleaning)) {
4271			/*
4272			 * the request_page_list case
4273			 */
4274			m->absent = FALSE;
4275			m->overwriting = FALSE;
4276			m->busy = FALSE;
4277		} else if (m->overwriting) {
4278			/*
4279			 * alternate request page list, write to
4280			 * page_list case.  Occurs when the original
4281			 * page was wired at the time of the list
4282			 * request
4283			 */
4284			assert(m->wire_count != 0);
4285			vm_page_unwire(m);/* reactivates */
4286			m->overwriting = FALSE;
4287		}
4288		m->cleaning = FALSE;
4289		m->encrypted_cleaning = FALSE;
4290
4291		/*
4292		 * It is a part of the semantic of COPYOUT_FROM
4293		 * UPLs that a commit implies cache sync
4294		 * between the vm page and the backing store
4295		 * this can be used to strip the precious bit
4296		 * as well as clean
4297		 */
4298		if (upl->flags & UPL_PAGE_SYNC_DONE)
4299			m->precious = FALSE;
4300
4301		if (flags & UPL_COMMIT_SET_DIRTY)
4302			m->dirty = TRUE;
4303
4304		if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
4305			vm_page_deactivate(m);
4306		} else if (!m->active && !m->inactive && !m->speculative) {
4307
4308			if (m->clustered)
4309				vm_page_speculate(m, TRUE);
4310			else if (m->reference)
4311				vm_page_activate(m);
4312			else
4313				vm_page_deactivate(m);
4314		}
4315		if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4316			/*
4317			 * We blocked access to the pages in this URL.
4318			 * Clear the "busy" bit on this page before we
4319			 * wake up any waiter.
4320			 */
4321			m->busy = FALSE;
4322		}
4323		/*
4324		 * Wakeup any thread waiting for the page to be un-cleaning.
4325		 */
4326		PAGE_WAKEUP(m);
4327
4328commit_next_page:
4329		target_offset += PAGE_SIZE_64;
4330		xfer_size -= PAGE_SIZE;
4331		entry++;
4332
4333		if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4334			/*
4335			 * pageout_scan takes the vm_page_lock_queues first
4336			 * then tries for the object lock... to avoid what
4337			 * is effectively a lock inversion, we'll go to the
4338			 * trouble of taking them in that same order... otherwise
4339			 * if this object contains the majority of the pages resident
4340			 * in the UBC (or a small set of large objects actively being
4341			 * worked on contain the majority of the pages), we could
4342			 * cause the pageout_scan thread to 'starve' in its attempt
4343			 * to find pages to move to the free queue, since it has to
4344			 * successfully acquire the object lock of any candidate page
4345			 * before it can steal/clean it.
4346			 */
4347			vm_object_unlock(shadow_object);
4348			mutex_yield(&vm_page_queue_lock);
4349
4350			for (j = 0; ; j++) {
4351				if (vm_object_lock_try(shadow_object))
4352					break;
4353				vm_page_unlock_queues();
4354				mutex_pause(j);
4355				vm_page_lock_queues();
4356			}
4357		        delayed_unlock = 1;
4358		}
4359	}
4360	if (delayed_unlock)
4361	        vm_page_unlock_queues();
4362
4363	occupied = 1;
4364
4365	if (upl->flags & UPL_DEVICE_MEMORY)  {
4366		occupied = 0;
4367	} else if (upl->flags & UPL_LITE) {
4368		int	pg_num;
4369		int	i;
4370
4371		pg_num = upl->size/PAGE_SIZE;
4372		pg_num = (pg_num + 31) >> 5;
4373		occupied = 0;
4374
4375		for (i = 0; i < pg_num; i++) {
4376			if (lite_list[i] != 0) {
4377				occupied = 1;
4378				break;
4379			}
4380		}
4381	} else {
4382		if (queue_empty(&upl->map_object->memq))
4383			occupied = 0;
4384	}
4385	if (occupied == 0) {
4386		if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4387			*empty = TRUE;
4388
4389		if (object == shadow_object) {
4390		        /*
4391			 * this is not a paging object
4392			 * so we need to drop the paging reference
4393			 * that was taken when we created the UPL
4394			 * against this object
4395			 */
4396			vm_object_paging_end(shadow_object);
4397		} else {
4398		         /*
4399			  * we dontated the paging reference to
4400			  * the map object... vm_pageout_object_terminate
4401			  * will drop this reference
4402			  */
4403		}
4404	}
4405	vm_object_unlock(shadow_object);
4406	if (object != shadow_object)
4407	        vm_object_unlock(object);
4408	upl_unlock(upl);
4409
4410	if (pgpgout_count) {
4411		DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
4412	}
4413
4414	return KERN_SUCCESS;
4415}
4416
4417kern_return_t
4418upl_abort_range(
4419	upl_t			upl,
4420	upl_offset_t		offset,
4421	upl_size_t		size,
4422	int			error,
4423	boolean_t		*empty)
4424{
4425	upl_size_t		xfer_size;
4426	vm_object_t		shadow_object;
4427	vm_object_t		object;
4428	vm_object_offset_t	target_offset;
4429	int			entry;
4430	wpl_array_t 	 	lite_list;
4431	int			occupied;
4432	int			delayed_unlock = 0;
4433	int			j;
4434
4435	*empty = FALSE;
4436
4437	if (upl == UPL_NULL)
4438		return KERN_INVALID_ARGUMENT;
4439
4440	if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
4441		return upl_commit_range(upl, offset, size, 0, NULL, 0, empty);
4442
4443	if (upl->flags & UPL_DEVICE_MEMORY)
4444		xfer_size = 0;
4445	else if ((offset + size) <= upl->size)
4446	        xfer_size = size;
4447	else
4448		return KERN_FAILURE;
4449
4450	upl_lock(upl);
4451
4452	if (upl->flags & UPL_INTERNAL) {
4453		lite_list = (wpl_array_t)
4454			((((uintptr_t)upl) + sizeof(struct upl))
4455			+ ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4456	} else {
4457		lite_list = (wpl_array_t)
4458			(((uintptr_t)upl) + sizeof(struct upl));
4459	}
4460	object = upl->map_object;
4461
4462	if (upl->flags & UPL_SHADOWED) {
4463	        vm_object_lock(object);
4464		shadow_object = object->shadow;
4465	} else
4466		shadow_object = object;
4467
4468	entry = offset/PAGE_SIZE;
4469	target_offset = (vm_object_offset_t)offset;
4470
4471	/*
4472	 * pageout_scan takes the vm_page_lock_queues first
4473	 * then tries for the object lock... to avoid what
4474	 * is effectively a lock inversion, we'll go to the
4475	 * trouble of taking them in that same order... otherwise
4476	 * if this object contains the majority of the pages resident
4477	 * in the UBC (or a small set of large objects actively being
4478	 * worked on contain the majority of the pages), we could
4479	 * cause the pageout_scan thread to 'starve' in its attempt
4480	 * to find pages to move to the free queue, since it has to
4481	 * successfully acquire the object lock of any candidate page
4482	 * before it can steal/clean it.
4483	 */
4484	for (j = 0; ; j++) {
4485		vm_page_lock_queues();
4486
4487		if (vm_object_lock_try(shadow_object))
4488			break;
4489		vm_page_unlock_queues();
4490		mutex_pause(j);
4491	}
4492	delayed_unlock = 1;
4493
4494	while (xfer_size) {
4495		vm_page_t	t, m;
4496
4497		m = VM_PAGE_NULL;
4498
4499		if (upl->flags & UPL_LITE) {
4500			int	pg_num;
4501			pg_num = target_offset/PAGE_SIZE;
4502
4503			if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4504				lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4505
4506				m = vm_page_lookup(shadow_object, target_offset +
4507						   (upl->offset - shadow_object->paging_offset));
4508			}
4509		}
4510		if (upl->flags & UPL_SHADOWED) {
4511		        if ((t = vm_page_lookup(object, target_offset))	!= VM_PAGE_NULL) {
4512			        t->pageout = FALSE;
4513
4514				vm_page_free(t);
4515
4516				if (m == VM_PAGE_NULL)
4517					m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4518			}
4519		}
4520		if (m != VM_PAGE_NULL) {
4521
4522			if (m->absent) {
4523			        boolean_t must_free = TRUE;
4524
4525				m->clustered = FALSE;
4526				/*
4527				 * COPYOUT = FALSE case
4528				 * check for error conditions which must
4529				 * be passed back to the pages customer
4530				 */
4531				if (error & UPL_ABORT_RESTART) {
4532					m->restart = TRUE;
4533					m->absent = FALSE;
4534					m->error = TRUE;
4535					m->unusual = TRUE;
4536					must_free = FALSE;
4537				} else if (error & UPL_ABORT_UNAVAILABLE) {
4538					m->restart = FALSE;
4539					m->unusual = TRUE;
4540					must_free = FALSE;
4541				} else if (error & UPL_ABORT_ERROR) {
4542					m->restart = FALSE;
4543					m->absent = FALSE;
4544					m->error = TRUE;
4545					m->unusual = TRUE;
4546					must_free = FALSE;
4547				}
4548
4549				/*
4550				 * ENCRYPTED SWAP:
4551				 * If the page was already encrypted,
4552				 * we don't really need to decrypt it
4553				 * now.  It will get decrypted later,
4554				 * on demand, as soon as someone needs
4555				 * to access its contents.
4556				 */
4557
4558				m->cleaning = FALSE;
4559				m->encrypted_cleaning = FALSE;
4560				m->overwriting = FALSE;
4561				PAGE_WAKEUP_DONE(m);
4562
4563				if (must_free == TRUE)
4564					vm_page_free(m);
4565				else
4566					vm_page_activate(m);
4567			} else {
4568			        /*
4569				 * Handle the trusted pager throttle.
4570				 */
4571			        if (m->laundry)
4572				        vm_pageout_throttle_up(m);
4573
4574				if (m->pageout) {
4575				        assert(m->busy);
4576					assert(m->wire_count == 1);
4577					m->pageout = FALSE;
4578					vm_page_unwire(m);
4579				}
4580				m->dump_cleaning = FALSE;
4581				m->cleaning = FALSE;
4582				m->encrypted_cleaning = FALSE;
4583				m->overwriting = FALSE;
4584#if	MACH_PAGEMAP
4585				vm_external_state_clr(m->object->existence_map, m->offset);
4586#endif	/* MACH_PAGEMAP */
4587				if (error & UPL_ABORT_DUMP_PAGES) {
4588					pmap_disconnect(m->phys_page);
4589				        vm_page_free(m);
4590				} else {
4591				        if (error & UPL_ABORT_REFERENCE) {
4592						/*
4593						 * we've been told to explictly
4594						 * reference this page... for
4595						 * file I/O, this is done by
4596						 * implementing an LRU on the inactive q
4597						 */
4598						vm_page_lru(m);
4599					}
4600				        PAGE_WAKEUP_DONE(m);
4601				}
4602			}
4603		}
4604		if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4605			/*
4606			 * pageout_scan takes the vm_page_lock_queues first
4607			 * then tries for the object lock... to avoid what
4608			 * is effectively a lock inversion, we'll go to the
4609			 * trouble of taking them in that same order... otherwise
4610			 * if this object contains the majority of the pages resident
4611			 * in the UBC (or a small set of large objects actively being
4612			 * worked on contain the majority of the pages), we could
4613			 * cause the pageout_scan thread to 'starve' in its attempt
4614			 * to find pages to move to the free queue, since it has to
4615			 * successfully acquire the object lock of any candidate page
4616			 * before it can steal/clean it.
4617			 */
4618			vm_object_unlock(shadow_object);
4619			mutex_yield(&vm_page_queue_lock);
4620
4621			for (j = 0; ; j++) {
4622				if (vm_object_lock_try(shadow_object))
4623					break;
4624				vm_page_unlock_queues();
4625				mutex_pause(j);
4626				vm_page_lock_queues();
4627			}
4628		        delayed_unlock = 1;
4629		}
4630		target_offset += PAGE_SIZE_64;
4631		xfer_size -= PAGE_SIZE;
4632		entry++;
4633	}
4634	if (delayed_unlock)
4635	        vm_page_unlock_queues();
4636
4637	occupied = 1;
4638
4639	if (upl->flags & UPL_DEVICE_MEMORY)  {
4640		occupied = 0;
4641	} else if (upl->flags & UPL_LITE) {
4642		int	pg_num;
4643		int	i;
4644
4645		pg_num = upl->size/PAGE_SIZE;
4646		pg_num = (pg_num + 31) >> 5;
4647		occupied = 0;
4648
4649		for (i = 0; i < pg_num; i++) {
4650			if (lite_list[i] != 0) {
4651				occupied = 1;
4652				break;
4653			}
4654		}
4655	} else {
4656		if (queue_empty(&upl->map_object->memq))
4657			occupied = 0;
4658	}
4659	if (occupied == 0) {
4660		if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4661			*empty = TRUE;
4662
4663		if (object == shadow_object) {
4664		        /*
4665			 * this is not a paging object
4666			 * so we need to drop the paging reference
4667			 * that was taken when we created the UPL
4668			 * against this object
4669			 */
4670			vm_object_paging_end(shadow_object);
4671		} else {
4672		         /*
4673			  * we dontated the paging reference to
4674			  * the map object... vm_pageout_object_terminate
4675			  * will drop this reference
4676			  */
4677		}
4678	}
4679	vm_object_unlock(shadow_object);
4680	if (object != shadow_object)
4681	        vm_object_unlock(object);
4682	upl_unlock(upl);
4683
4684	return KERN_SUCCESS;
4685}
4686
4687
4688kern_return_t
4689upl_abort(
4690	upl_t	upl,
4691	int	error)
4692{
4693	boolean_t	empty;
4694
4695	return upl_abort_range(upl, 0, upl->size, error, &empty);
4696}
4697
4698
4699/* an option on commit should be wire */
4700kern_return_t
4701upl_commit(
4702	upl_t			upl,
4703	upl_page_info_t		*page_list,
4704	mach_msg_type_number_t	count)
4705{
4706	boolean_t	empty;
4707
4708	return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
4709}
4710
4711
4712kern_return_t
4713vm_object_iopl_request(
4714	vm_object_t		object,
4715	vm_object_offset_t	offset,
4716	upl_size_t		size,
4717	upl_t			*upl_ptr,
4718	upl_page_info_array_t	user_page_list,
4719	unsigned int		*page_list_count,
4720	int			cntrl_flags)
4721{
4722	vm_page_t		dst_page;
4723	vm_object_offset_t	dst_offset;
4724	upl_size_t		xfer_size;
4725	upl_t			upl = NULL;
4726	unsigned int		entry;
4727	wpl_array_t 		lite_list = NULL;
4728	int                     delayed_unlock = 0;
4729	int			no_zero_fill = FALSE;
4730	u_int32_t		psize;
4731	kern_return_t		ret;
4732	vm_prot_t		prot;
4733	struct vm_object_fault_info fault_info;
4734
4735
4736	if (cntrl_flags & ~UPL_VALID_FLAGS) {
4737		/*
4738		 * For forward compatibility's sake,
4739		 * reject any unknown flag.
4740		 */
4741		return KERN_INVALID_VALUE;
4742	}
4743	if (vm_lopage_poolsize == 0)
4744	        cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4745
4746	if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4747	        if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4748		        return KERN_INVALID_VALUE;
4749
4750		if (object->phys_contiguous) {
4751		        if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4752			        return KERN_INVALID_ADDRESS;
4753
4754			if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4755			        return KERN_INVALID_ADDRESS;
4756		}
4757	}
4758
4759	if (cntrl_flags & UPL_ENCRYPT) {
4760		/*
4761		 * ENCRYPTED SWAP:
4762		 * The paging path doesn't use this interface,
4763		 * so we don't support the UPL_ENCRYPT flag
4764		 * here.  We won't encrypt the pages.
4765		 */
4766		assert(! (cntrl_flags & UPL_ENCRYPT));
4767	}
4768	if (cntrl_flags & UPL_NOZEROFILL)
4769	        no_zero_fill = TRUE;
4770
4771	if (cntrl_flags & UPL_COPYOUT_FROM)
4772		prot = VM_PROT_READ;
4773	else
4774		prot = VM_PROT_READ | VM_PROT_WRITE;
4775
4776	if (((size/page_size) > MAX_UPL_SIZE) && !object->phys_contiguous)
4777		size = MAX_UPL_SIZE * page_size;
4778
4779	if (cntrl_flags & UPL_SET_INTERNAL) {
4780		if (page_list_count != NULL)
4781			*page_list_count = MAX_UPL_SIZE;
4782	}
4783	if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4784	    ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
4785	        return KERN_INVALID_ARGUMENT;
4786
4787	if ((!object->internal) && (object->paging_offset != 0))
4788		panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4789
4790
4791	if (object->phys_contiguous)
4792	        psize = PAGE_SIZE;
4793	else
4794	        psize = size;
4795
4796	if (cntrl_flags & UPL_SET_INTERNAL) {
4797	        upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4798
4799		user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
4800		lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
4801					   ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
4802	} else {
4803	        upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4804
4805		lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4806	}
4807	if (user_page_list)
4808	        user_page_list[0].device = FALSE;
4809	*upl_ptr = upl;
4810
4811	upl->map_object = object;
4812	upl->size = size;
4813
4814	vm_object_lock(object);
4815	vm_object_paging_begin(object);
4816	/*
4817	 * paging in progress also protects the paging_offset
4818	 */
4819	upl->offset = offset + object->paging_offset;
4820
4821	if (object->phys_contiguous) {
4822#ifdef UPL_DEBUG
4823		queue_enter(&object->uplq, upl, upl_t, uplq);
4824#endif /* UPL_DEBUG */
4825
4826		vm_object_unlock(object);
4827
4828		/*
4829		 * don't need any shadow mappings for this one
4830		 * since it is already I/O memory
4831		 */
4832		upl->flags |= UPL_DEVICE_MEMORY;
4833
4834		upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4835
4836		if (user_page_list) {
4837		        user_page_list[0].phys_addr = (offset + object->shadow_offset)>>PAGE_SHIFT;
4838			user_page_list[0].device = TRUE;
4839		}
4840		if (page_list_count != NULL) {
4841		        if (upl->flags & UPL_INTERNAL)
4842			        *page_list_count = 0;
4843			else
4844			        *page_list_count = 1;
4845		}
4846		return KERN_SUCCESS;
4847	}
4848	/*
4849	 * Protect user space from future COW operations
4850	 */
4851	object->true_share = TRUE;
4852
4853	if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4854	        object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4855
4856#ifdef UPL_DEBUG
4857	queue_enter(&object->uplq, upl, upl_t, uplq);
4858#endif /* UPL_DEBUG */
4859
4860	if (cntrl_flags & UPL_BLOCK_ACCESS) {
4861		/*
4862		 * The user requested that access to the pages in this URL
4863		 * be blocked until the UPL is commited or aborted.
4864		 */
4865		upl->flags |= UPL_ACCESS_BLOCKED;
4866	}
4867	entry = 0;
4868
4869	xfer_size = size;
4870	dst_offset = offset;
4871
4872	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
4873	fault_info.user_tag  = 0;
4874	fault_info.lo_offset = offset;
4875	fault_info.hi_offset = offset + xfer_size;
4876	fault_info.no_cache  = FALSE;
4877
4878	while (xfer_size) {
4879	        vm_fault_return_t	result;
4880	        int			pg_num;
4881
4882		dst_page = vm_page_lookup(object, dst_offset);
4883
4884		/*
4885		 * ENCRYPTED SWAP:
4886		 * If the page is encrypted, we need to decrypt it,
4887		 * so force a soft page fault.
4888		 */
4889		if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4890		    (dst_page->encrypted) ||
4891		    (dst_page->unusual && (dst_page->error ||
4892					   dst_page->restart ||
4893					   dst_page->absent ||
4894					   dst_page->fictitious))) {
4895
4896		   do {
4897			vm_page_t	top_page;
4898			kern_return_t	error_code;
4899			int		interruptible;
4900
4901		        if (delayed_unlock) {
4902			        delayed_unlock = 0;
4903			        vm_page_unlock_queues();
4904			}
4905			if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
4906				interruptible = THREAD_ABORTSAFE;
4907			else
4908				interruptible = THREAD_UNINT;
4909
4910			fault_info.interruptible = interruptible;
4911			fault_info.cluster_size = xfer_size;
4912
4913			result = vm_fault_page(object, dst_offset,
4914					       prot | VM_PROT_WRITE, FALSE,
4915					       &prot, &dst_page, &top_page,
4916					       (int *)0,
4917					       &error_code, no_zero_fill,
4918					       FALSE, &fault_info);
4919
4920			switch (result) {
4921
4922			case VM_FAULT_SUCCESS:
4923
4924				PAGE_WAKEUP_DONE(dst_page);
4925				/*
4926				 *	Release paging references and
4927				 *	top-level placeholder page, if any.
4928				 */
4929				if (top_page != VM_PAGE_NULL) {
4930					vm_object_t local_object;
4931
4932					local_object = top_page->object;
4933
4934					if (top_page->object != dst_page->object) {
4935						vm_object_lock(local_object);
4936						VM_PAGE_FREE(top_page);
4937						vm_object_paging_end(local_object);
4938						vm_object_unlock(local_object);
4939					} else {
4940						VM_PAGE_FREE(top_page);
4941						vm_object_paging_end(local_object);
4942					}
4943				}
4944				break;
4945
4946			case VM_FAULT_RETRY:
4947				vm_object_lock(object);
4948				vm_object_paging_begin(object);
4949				break;
4950
4951			case VM_FAULT_FICTITIOUS_SHORTAGE:
4952				vm_page_more_fictitious();
4953
4954				vm_object_lock(object);
4955				vm_object_paging_begin(object);
4956				break;
4957
4958			case VM_FAULT_MEMORY_SHORTAGE:
4959				if (vm_page_wait(interruptible)) {
4960					vm_object_lock(object);
4961					vm_object_paging_begin(object);
4962					break;
4963				}
4964				/* fall thru */
4965
4966			case VM_FAULT_INTERRUPTED:
4967				error_code = MACH_SEND_INTERRUPTED;
4968			case VM_FAULT_MEMORY_ERROR:
4969				ret = (error_code ? error_code:	KERN_MEMORY_ERROR);
4970
4971				vm_object_lock(object);
4972				vm_object_paging_begin(object);
4973				goto return_err;
4974			}
4975		   } while (result != VM_FAULT_SUCCESS);
4976		}
4977
4978		if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
4979		     dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
4980		        vm_page_t	low_page;
4981			int 		refmod;
4982
4983			/*
4984			 * support devices that can't DMA above 32 bits
4985			 * by substituting pages from a pool of low address
4986			 * memory for any pages we find above the 4G mark
4987			 * can't substitute if the page is already wired because
4988			 * we don't know whether that physical address has been
4989			 * handed out to some other 64 bit capable DMA device to use
4990			 */
4991			if (dst_page->wire_count) {
4992			        ret = KERN_PROTECTION_FAILURE;
4993				goto return_err;
4994			}
4995			if (delayed_unlock) {
4996			        delayed_unlock = 0;
4997				vm_page_unlock_queues();
4998			}
4999			low_page = vm_page_grablo();
5000
5001			if (low_page == VM_PAGE_NULL) {
5002			        ret = KERN_RESOURCE_SHORTAGE;
5003				goto return_err;
5004			}
5005			/*
5006			 * from here until the vm_page_replace completes
5007			 * we musn't drop the object lock... we don't
5008			 * want anyone refaulting this page in and using
5009			 * it after we disconnect it... we want the fault
5010			 * to find the new page being substituted.
5011			 */
5012			if (dst_page->pmapped)
5013			        refmod = pmap_disconnect(dst_page->phys_page);
5014			else
5015			        refmod = 0;
5016			vm_page_copy(dst_page, low_page);
5017
5018			low_page->reference = dst_page->reference;
5019			low_page->dirty     = dst_page->dirty;
5020
5021			if (refmod & VM_MEM_REFERENCED)
5022			        low_page->reference = TRUE;
5023			if (refmod & VM_MEM_MODIFIED)
5024			        low_page->dirty = TRUE;
5025
5026			vm_page_lock_queues();
5027			vm_page_replace(low_page, object, dst_offset);
5028			/*
5029			 * keep the queue lock since we're going to
5030			 * need it immediately
5031			 */
5032			delayed_unlock = 1;
5033
5034			dst_page = low_page;
5035			/*
5036			 * vm_page_grablo returned the page marked
5037			 * BUSY... we don't need a PAGE_WAKEUP_DONE
5038			 * here, because we've never dropped the object lock
5039			 */
5040			dst_page->busy = FALSE;
5041		}
5042		if (delayed_unlock == 0)
5043		        vm_page_lock_queues();
5044
5045		vm_page_wire(dst_page);
5046
5047		if (cntrl_flags & UPL_BLOCK_ACCESS) {
5048			/*
5049			 * Mark the page "busy" to block any future page fault
5050			 * on this page.  We'll also remove the mapping
5051			 * of all these pages before leaving this routine.
5052			 */
5053			assert(!dst_page->fictitious);
5054			dst_page->busy = TRUE;
5055		}
5056		pg_num = (dst_offset-offset)/PAGE_SIZE;
5057		lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5058
5059		/*
5060		 * expect the page to be used
5061		 * page queues lock must be held to set 'reference'
5062		 */
5063		dst_page->reference = TRUE;
5064
5065   		if (!(cntrl_flags & UPL_COPYOUT_FROM))
5066			dst_page->dirty = TRUE;
5067
5068		if (dst_page->phys_page > upl->highest_page)
5069		        upl->highest_page = dst_page->phys_page;
5070
5071		if (user_page_list) {
5072			user_page_list[entry].phys_addr	= dst_page->phys_page;
5073			user_page_list[entry].pageout	= dst_page->pageout;
5074			user_page_list[entry].absent	= dst_page->absent;
5075			user_page_list[entry].dirty 	= dst_page->dirty;
5076			user_page_list[entry].precious	= dst_page->precious;
5077			user_page_list[entry].device 	= FALSE;
5078			if (dst_page->clustered == TRUE)
5079			        user_page_list[entry].speculative = dst_page->speculative;
5080			else
5081			        user_page_list[entry].speculative = FALSE;
5082			user_page_list[entry].cs_validated = dst_page->cs_validated;
5083			user_page_list[entry].cs_tainted = dst_page->cs_tainted;
5084		}
5085		/*
5086		 * someone is explicitly grabbing this page...
5087		 * update clustered and speculative state
5088		 *
5089		 */
5090		VM_PAGE_CONSUME_CLUSTERED(dst_page);
5091
5092		if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
5093			mutex_yield(&vm_page_queue_lock);
5094		        delayed_unlock = 1;
5095		}
5096		entry++;
5097		dst_offset += PAGE_SIZE_64;
5098		xfer_size -= PAGE_SIZE;
5099	}
5100	if (delayed_unlock)
5101	        vm_page_unlock_queues();
5102
5103	if (page_list_count != NULL) {
5104	        if (upl->flags & UPL_INTERNAL)
5105			*page_list_count = 0;
5106		else if (*page_list_count > entry)
5107			*page_list_count = entry;
5108	}
5109	vm_object_unlock(object);
5110
5111	if (cntrl_flags & UPL_BLOCK_ACCESS) {
5112		/*
5113		 * We've marked all the pages "busy" so that future
5114		 * page faults will block.
5115		 * Now remove the mapping for these pages, so that they
5116		 * can't be accessed without causing a page fault.
5117		 */
5118		vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5119				       PMAP_NULL, 0, VM_PROT_NONE);
5120	}
5121	return KERN_SUCCESS;
5122
5123return_err:
5124	if (delayed_unlock)
5125	        vm_page_unlock_queues();
5126
5127	for (; offset < dst_offset; offset += PAGE_SIZE) {
5128	        dst_page = vm_page_lookup(object, offset);
5129
5130		if (dst_page == VM_PAGE_NULL)
5131		        panic("vm_object_iopl_request: Wired pages missing. \n");
5132
5133		vm_page_lockspin_queues();
5134		vm_page_unwire(dst_page);
5135		vm_page_unlock_queues();
5136
5137		VM_STAT_INCR(reactivations);
5138	}
5139	vm_object_paging_end(object);
5140	vm_object_unlock(object);
5141	upl_destroy(upl);
5142
5143	return ret;
5144}
5145
5146kern_return_t
5147upl_transpose(
5148	upl_t		upl1,
5149	upl_t		upl2)
5150{
5151	kern_return_t		retval;
5152	boolean_t		upls_locked;
5153	vm_object_t		object1, object2;
5154
5155	if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5156		return KERN_INVALID_ARGUMENT;
5157	}
5158
5159	upls_locked = FALSE;
5160
5161	/*
5162	 * Since we need to lock both UPLs at the same time,
5163	 * avoid deadlocks by always taking locks in the same order.
5164	 */
5165	if (upl1 < upl2) {
5166		upl_lock(upl1);
5167		upl_lock(upl2);
5168	} else {
5169		upl_lock(upl2);
5170		upl_lock(upl1);
5171	}
5172	upls_locked = TRUE;	/* the UPLs will need to be unlocked */
5173
5174	object1 = upl1->map_object;
5175	object2 = upl2->map_object;
5176
5177	if (upl1->offset != 0 || upl2->offset != 0 ||
5178	    upl1->size != upl2->size) {
5179		/*
5180		 * We deal only with full objects, not subsets.
5181		 * That's because we exchange the entire backing store info
5182		 * for the objects: pager, resident pages, etc...  We can't do
5183		 * only part of it.
5184		 */
5185		retval = KERN_INVALID_VALUE;
5186		goto done;
5187	}
5188
5189	/*
5190	 * Tranpose the VM objects' backing store.
5191	 */
5192	retval = vm_object_transpose(object1, object2,
5193				     (vm_object_size_t) upl1->size);
5194
5195	if (retval == KERN_SUCCESS) {
5196		/*
5197		 * Make each UPL point to the correct VM object, i.e. the
5198		 * object holding the pages that the UPL refers to...
5199		 */
5200#ifdef UPL_DEBUG
5201		queue_remove(&object1->uplq, upl1, upl_t, uplq);
5202		queue_remove(&object2->uplq, upl2, upl_t, uplq);
5203#endif
5204		upl1->map_object = object2;
5205		upl2->map_object = object1;
5206#ifdef UPL_DEBUG
5207		queue_enter(&object1->uplq, upl2, upl_t, uplq);
5208		queue_enter(&object2->uplq, upl1, upl_t, uplq);
5209#endif
5210	}
5211
5212done:
5213	/*
5214	 * Cleanup.
5215	 */
5216	if (upls_locked) {
5217		upl_unlock(upl1);
5218		upl_unlock(upl2);
5219		upls_locked = FALSE;
5220	}
5221
5222	return retval;
5223}
5224
5225/*
5226 * ENCRYPTED SWAP:
5227 *
5228 * Rationale:  the user might have some encrypted data on disk (via
5229 * FileVault or any other mechanism).  That data is then decrypted in
5230 * memory, which is safe as long as the machine is secure.  But that
5231 * decrypted data in memory could be paged out to disk by the default
5232 * pager.  The data would then be stored on disk in clear (not encrypted)
5233 * and it could be accessed by anyone who gets physical access to the
5234 * disk (if the laptop or the disk gets stolen for example).  This weakens
5235 * the security offered by FileVault.
5236 *
5237 * Solution:  the default pager will optionally request that all the
5238 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5239 * before it sends this UPL to disk via the vnode_pageout() path.
5240 *
5241 * Notes:
5242 *
5243 * To avoid disrupting the VM LRU algorithms, we want to keep the
5244 * clean-in-place mechanisms, which allow us to send some extra pages to
5245 * swap (clustering) without actually removing them from the user's
5246 * address space.  We don't want the user to unknowingly access encrypted
5247 * data, so we have to actually remove the encrypted pages from the page
5248 * table.  When the user accesses the data, the hardware will fail to
5249 * locate the virtual page in its page table and will trigger a page
5250 * fault.  We can then decrypt the page and enter it in the page table
5251 * again.  Whenever we allow the user to access the contents of a page,
5252 * we have to make sure it's not encrypted.
5253 *
5254 *
5255 */
5256/*
5257 * ENCRYPTED SWAP:
5258 * Reserve of virtual addresses in the kernel address space.
5259 * We need to map the physical pages in the kernel, so that we
5260 * can call the encryption/decryption routines with a kernel
5261 * virtual address.  We keep this pool of pre-allocated kernel
5262 * virtual addresses so that we don't have to scan the kernel's
5263 * virtaul address space each time we need to encrypt or decrypt
5264 * a physical page.
5265 * It would be nice to be able to encrypt and decrypt in physical
5266 * mode but that might not always be more efficient...
5267 */
5268decl_simple_lock_data(,vm_paging_lock)
5269#define VM_PAGING_NUM_PAGES	64
5270vm_map_offset_t vm_paging_base_address = 0;
5271boolean_t	vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5272int		vm_paging_max_index = 0;
5273int		vm_paging_page_waiter = 0;
5274int		vm_paging_page_waiter_total = 0;
5275unsigned long	vm_paging_no_kernel_page = 0;
5276unsigned long	vm_paging_objects_mapped = 0;
5277unsigned long	vm_paging_pages_mapped = 0;
5278unsigned long	vm_paging_objects_mapped_slow = 0;
5279unsigned long	vm_paging_pages_mapped_slow = 0;
5280
5281void
5282vm_paging_map_init(void)
5283{
5284	kern_return_t	kr;
5285	vm_map_offset_t	page_map_offset;
5286	vm_map_entry_t	map_entry;
5287
5288	assert(vm_paging_base_address == 0);
5289
5290	/*
5291	 * Initialize our pool of pre-allocated kernel
5292	 * virtual addresses.
5293	 */
5294	page_map_offset = 0;
5295	kr = vm_map_find_space(kernel_map,
5296			       &page_map_offset,
5297			       VM_PAGING_NUM_PAGES * PAGE_SIZE,
5298			       0,
5299			       0,
5300			       &map_entry);
5301	if (kr != KERN_SUCCESS) {
5302		panic("vm_paging_map_init: kernel_map full\n");
5303	}
5304	map_entry->object.vm_object = kernel_object;
5305	map_entry->offset =
5306		page_map_offset - VM_MIN_KERNEL_ADDRESS;
5307	vm_object_reference(kernel_object);
5308	vm_map_unlock(kernel_map);
5309
5310	assert(vm_paging_base_address == 0);
5311	vm_paging_base_address = page_map_offset;
5312}
5313
5314/*
5315 * ENCRYPTED SWAP:
5316 * vm_paging_map_object:
5317 *	Maps part of a VM object's pages in the kernel
5318 * 	virtual address space, using the pre-allocated
5319 *	kernel virtual addresses, if possible.
5320 * Context:
5321 * 	The VM object is locked.  This lock will get
5322 * 	dropped and re-acquired though, so the caller
5323 * 	must make sure the VM object is kept alive
5324 *	(by holding a VM map that has a reference
5325 * 	on it, for example, or taking an extra reference).
5326 * 	The page should also be kept busy to prevent
5327 *	it from being reclaimed.
5328 */
5329kern_return_t
5330vm_paging_map_object(
5331	vm_map_offset_t		*address,
5332	vm_page_t		page,
5333	vm_object_t		object,
5334	vm_object_offset_t	offset,
5335	vm_map_size_t		*size,
5336	vm_prot_t		protection,
5337	boolean_t		can_unlock_object)
5338{
5339	kern_return_t		kr;
5340	vm_map_offset_t		page_map_offset;
5341	vm_map_size_t		map_size;
5342	vm_object_offset_t	object_offset;
5343	int			i;
5344
5345
5346	if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5347		assert(page->busy);
5348		/*
5349		 * Use one of the pre-allocated kernel virtual addresses
5350		 * and just enter the VM page in the kernel address space
5351		 * at that virtual address.
5352		 */
5353		simple_lock(&vm_paging_lock);
5354
5355		/*
5356		 * Try and find an available kernel virtual address
5357		 * from our pre-allocated pool.
5358		 */
5359		page_map_offset = 0;
5360		for (;;) {
5361			for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5362				if (vm_paging_page_inuse[i] == FALSE) {
5363					page_map_offset =
5364						vm_paging_base_address +
5365						(i * PAGE_SIZE);
5366					break;
5367				}
5368			}
5369			if (page_map_offset != 0) {
5370				/* found a space to map our page ! */
5371				break;
5372			}
5373
5374			if (can_unlock_object) {
5375				/*
5376				 * If we can afford to unlock the VM object,
5377				 * let's take the slow path now...
5378				 */
5379				break;
5380			}
5381			/*
5382			 * We can't afford to unlock the VM object, so
5383			 * let's wait for a space to become available...
5384			 */
5385			vm_paging_page_waiter_total++;
5386			vm_paging_page_waiter++;
5387			thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
5388						       &vm_paging_lock,
5389						       THREAD_UNINT);
5390			vm_paging_page_waiter--;
5391			/* ... and try again */
5392		}
5393
5394		if (page_map_offset != 0) {
5395			/*
5396			 * We found a kernel virtual address;
5397			 * map the physical page to that virtual address.
5398			 */
5399			if (i > vm_paging_max_index) {
5400				vm_paging_max_index = i;
5401			}
5402			vm_paging_page_inuse[i] = TRUE;
5403			simple_unlock(&vm_paging_lock);
5404
5405			if (page->pmapped == FALSE) {
5406				pmap_sync_page_data_phys(page->phys_page);
5407			}
5408			page->pmapped = TRUE;
5409
5410			/*
5411			 * Keep the VM object locked over the PMAP_ENTER
5412			 * and the actual use of the page by the kernel,
5413			 * or this pmap mapping might get undone by a
5414			 * vm_object_pmap_protect() call...
5415			 */
5416			PMAP_ENTER(kernel_pmap,
5417				   page_map_offset,
5418				   page,
5419				   protection,
5420				   ((int) page->object->wimg_bits &
5421				    VM_WIMG_MASK),
5422				   TRUE);
5423			vm_paging_objects_mapped++;
5424			vm_paging_pages_mapped++;
5425			*address = page_map_offset;
5426
5427			/* all done and mapped, ready to use ! */
5428			return KERN_SUCCESS;
5429		}
5430
5431		/*
5432		 * We ran out of pre-allocated kernel virtual
5433		 * addresses.  Just map the page in the kernel
5434		 * the slow and regular way.
5435		 */
5436		vm_paging_no_kernel_page++;
5437		simple_unlock(&vm_paging_lock);
5438	}
5439
5440	if (! can_unlock_object) {
5441		return KERN_NOT_SUPPORTED;
5442	}
5443
5444	object_offset = vm_object_trunc_page(offset);
5445	map_size = vm_map_round_page(*size);
5446
5447	/*
5448	 * Try and map the required range of the object
5449	 * in the kernel_map
5450	 */
5451
5452	vm_object_reference_locked(object);	/* for the map entry */
5453	vm_object_unlock(object);
5454
5455	kr = vm_map_enter(kernel_map,
5456			  address,
5457			  map_size,
5458			  0,
5459			  VM_FLAGS_ANYWHERE,
5460			  object,
5461			  object_offset,
5462			  FALSE,
5463			  protection,
5464			  VM_PROT_ALL,
5465			  VM_INHERIT_NONE);
5466	if (kr != KERN_SUCCESS) {
5467		*address = 0;
5468		*size = 0;
5469		vm_object_deallocate(object);	/* for the map entry */
5470		vm_object_lock(object);
5471		return kr;
5472	}
5473
5474	*size = map_size;
5475
5476	/*
5477	 * Enter the mapped pages in the page table now.
5478	 */
5479	vm_object_lock(object);
5480	/*
5481	 * VM object must be kept locked from before PMAP_ENTER()
5482	 * until after the kernel is done accessing the page(s).
5483	 * Otherwise, the pmap mappings in the kernel could be
5484	 * undone by a call to vm_object_pmap_protect().
5485	 */
5486
5487	for (page_map_offset = 0;
5488	     map_size != 0;
5489	     map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5490		unsigned int	cache_attr;
5491
5492		page = vm_page_lookup(object, offset + page_map_offset);
5493		if (page == VM_PAGE_NULL) {
5494			printf("vm_paging_map_object: no page !?");
5495			vm_object_unlock(object);
5496			kr = vm_map_remove(kernel_map, *address, *size,
5497					   VM_MAP_NO_FLAGS);
5498			assert(kr == KERN_SUCCESS);
5499			*address = 0;
5500			*size = 0;
5501			vm_object_lock(object);
5502			return KERN_MEMORY_ERROR;
5503		}
5504		if (page->pmapped == FALSE) {
5505			pmap_sync_page_data_phys(page->phys_page);
5506		}
5507		page->pmapped = TRUE;
5508		cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5509
5510		//assert(pmap_verify_free(page->phys_page));
5511		PMAP_ENTER(kernel_pmap,
5512			   *address + page_map_offset,
5513			   page,
5514			   protection,
5515			   cache_attr,
5516			   TRUE);
5517	}
5518
5519	vm_paging_objects_mapped_slow++;
5520	vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5521
5522	return KERN_SUCCESS;
5523}
5524
5525/*
5526 * ENCRYPTED SWAP:
5527 * vm_paging_unmap_object:
5528 *	Unmaps part of a VM object's pages from the kernel
5529 * 	virtual address space.
5530 * Context:
5531 * 	The VM object is locked.  This lock will get
5532 * 	dropped and re-acquired though.
5533 */
5534void
5535vm_paging_unmap_object(
5536	vm_object_t	object,
5537	vm_map_offset_t	start,
5538	vm_map_offset_t	end)
5539{
5540	kern_return_t	kr;
5541	int		i;
5542
5543	if ((vm_paging_base_address == 0) ||
5544	    (start < vm_paging_base_address) ||
5545	    (end > (vm_paging_base_address
5546		     + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
5547		/*
5548		 * We didn't use our pre-allocated pool of
5549		 * kernel virtual address.  Deallocate the
5550		 * virtual memory.
5551		 */
5552		if (object != VM_OBJECT_NULL) {
5553			vm_object_unlock(object);
5554		}
5555		kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5556		if (object != VM_OBJECT_NULL) {
5557			vm_object_lock(object);
5558		}
5559		assert(kr == KERN_SUCCESS);
5560	} else {
5561		/*
5562		 * We used a kernel virtual address from our
5563		 * pre-allocated pool.  Put it back in the pool
5564		 * for next time.
5565		 */
5566		assert(end - start == PAGE_SIZE);
5567		i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5568
5569		/* undo the pmap mapping */
5570		pmap_remove(kernel_pmap, start, end);
5571
5572		simple_lock(&vm_paging_lock);
5573		vm_paging_page_inuse[i] = FALSE;
5574		if (vm_paging_page_waiter) {
5575			thread_wakeup(&vm_paging_page_waiter);
5576		}
5577		simple_unlock(&vm_paging_lock);
5578	}
5579}
5580
5581#if CRYPTO
5582/*
5583 * Encryption data.
5584 * "iv" is the "initial vector".  Ideally, we want to
5585 * have a different one for each page we encrypt, so that
5586 * crackers can't find encryption patterns too easily.
5587 */
5588#define SWAP_CRYPT_AES_KEY_SIZE	128	/* XXX 192 and 256 don't work ! */
5589boolean_t		swap_crypt_ctx_initialized = FALSE;
5590aes_32t 		swap_crypt_key[8]; /* big enough for a 256 key */
5591aes_ctx			swap_crypt_ctx;
5592const unsigned char	swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5593
5594#if DEBUG
5595boolean_t		swap_crypt_ctx_tested = FALSE;
5596unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5597unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5598unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5599#endif /* DEBUG */
5600
5601extern u_long random(void);
5602
5603/*
5604 * Initialize the encryption context: key and key size.
5605 */
5606void swap_crypt_ctx_initialize(void); /* forward */
5607void
5608swap_crypt_ctx_initialize(void)
5609{
5610	unsigned int	i;
5611
5612	/*
5613	 * No need for locking to protect swap_crypt_ctx_initialized
5614	 * because the first use of encryption will come from the
5615	 * pageout thread (we won't pagein before there's been a pageout)
5616	 * and there's only one pageout thread.
5617	 */
5618	if (swap_crypt_ctx_initialized == FALSE) {
5619		for (i = 0;
5620		     i < (sizeof (swap_crypt_key) /
5621			  sizeof (swap_crypt_key[0]));
5622		     i++) {
5623			swap_crypt_key[i] = random();
5624		}
5625		aes_encrypt_key((const unsigned char *) swap_crypt_key,
5626				SWAP_CRYPT_AES_KEY_SIZE,
5627				&swap_crypt_ctx.encrypt);
5628		aes_decrypt_key((const unsigned char *) swap_crypt_key,
5629				SWAP_CRYPT_AES_KEY_SIZE,
5630				&swap_crypt_ctx.decrypt);
5631		swap_crypt_ctx_initialized = TRUE;
5632	}
5633
5634#if DEBUG
5635	/*
5636	 * Validate the encryption algorithms.
5637	 */
5638	if (swap_crypt_ctx_tested == FALSE) {
5639		/* initialize */
5640		for (i = 0; i < 4096; i++) {
5641			swap_crypt_test_page_ref[i] = (char) i;
5642		}
5643		/* encrypt */
5644		aes_encrypt_cbc(swap_crypt_test_page_ref,
5645				swap_crypt_null_iv,
5646				PAGE_SIZE / AES_BLOCK_SIZE,
5647				swap_crypt_test_page_encrypt,
5648				&swap_crypt_ctx.encrypt);
5649		/* decrypt */
5650		aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5651				swap_crypt_null_iv,
5652				PAGE_SIZE / AES_BLOCK_SIZE,
5653				swap_crypt_test_page_decrypt,
5654				&swap_crypt_ctx.decrypt);
5655		/* compare result with original */
5656		for (i = 0; i < 4096; i ++) {
5657			if (swap_crypt_test_page_decrypt[i] !=
5658			    swap_crypt_test_page_ref[i]) {
5659				panic("encryption test failed");
5660			}
5661		}
5662
5663		/* encrypt again */
5664		aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5665				swap_crypt_null_iv,
5666				PAGE_SIZE / AES_BLOCK_SIZE,
5667				swap_crypt_test_page_decrypt,
5668				&swap_crypt_ctx.encrypt);
5669		/* decrypt in place */
5670		aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5671				swap_crypt_null_iv,
5672				PAGE_SIZE / AES_BLOCK_SIZE,
5673				swap_crypt_test_page_decrypt,
5674				&swap_crypt_ctx.decrypt);
5675		for (i = 0; i < 4096; i ++) {
5676			if (swap_crypt_test_page_decrypt[i] !=
5677			    swap_crypt_test_page_ref[i]) {
5678				panic("in place encryption test failed");
5679			}
5680		}
5681
5682		swap_crypt_ctx_tested = TRUE;
5683	}
5684#endif /* DEBUG */
5685}
5686
5687/*
5688 * ENCRYPTED SWAP:
5689 * vm_page_encrypt:
5690 * 	Encrypt the given page, for secure paging.
5691 * 	The page might already be mapped at kernel virtual
5692 * 	address "kernel_mapping_offset".  Otherwise, we need
5693 * 	to map it.
5694 *
5695 * Context:
5696 * 	The page's object is locked, but this lock will be released
5697 * 	and re-acquired.
5698 * 	The page is busy and not accessible by users (not entered in any pmap).
5699 */
5700void
5701vm_page_encrypt(
5702	vm_page_t	page,
5703	vm_map_offset_t	kernel_mapping_offset)
5704{
5705	kern_return_t		kr;
5706	vm_map_size_t		kernel_mapping_size;
5707	vm_offset_t		kernel_vaddr;
5708	union {
5709		unsigned char	aes_iv[AES_BLOCK_SIZE];
5710		struct {
5711			memory_object_t		pager_object;
5712			vm_object_offset_t	paging_offset;
5713		} vm;
5714	} encrypt_iv;
5715
5716	if (! vm_pages_encrypted) {
5717		vm_pages_encrypted = TRUE;
5718	}
5719
5720	assert(page->busy);
5721	assert(page->dirty || page->precious);
5722
5723	if (page->encrypted) {
5724		/*
5725		 * Already encrypted: no need to do it again.
5726		 */
5727		vm_page_encrypt_already_encrypted_counter++;
5728		return;
5729	}
5730	ASSERT_PAGE_DECRYPTED(page);
5731
5732	/*
5733	 * Take a paging-in-progress reference to keep the object
5734	 * alive even if we have to unlock it (in vm_paging_map_object()
5735	 * for example)...
5736	 */
5737	vm_object_paging_begin(page->object);
5738
5739	if (kernel_mapping_offset == 0) {
5740		/*
5741		 * The page hasn't already been mapped in kernel space
5742		 * by the caller.  Map it now, so that we can access
5743		 * its contents and encrypt them.
5744		 */
5745		kernel_mapping_size = PAGE_SIZE;
5746		kr = vm_paging_map_object(&kernel_mapping_offset,
5747					  page,
5748					  page->object,
5749					  page->offset,
5750					  &kernel_mapping_size,
5751					  VM_PROT_READ | VM_PROT_WRITE,
5752					  FALSE);
5753		if (kr != KERN_SUCCESS) {
5754			panic("vm_page_encrypt: "
5755			      "could not map page in kernel: 0x%x\n",
5756			      kr);
5757		}
5758	} else {
5759		kernel_mapping_size = 0;
5760	}
5761	kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5762
5763	if (swap_crypt_ctx_initialized == FALSE) {
5764		swap_crypt_ctx_initialize();
5765	}
5766	assert(swap_crypt_ctx_initialized);
5767
5768	/*
5769	 * Prepare an "initial vector" for the encryption.
5770	 * We use the "pager" and the "paging_offset" for that
5771	 * page to obfuscate the encrypted data a bit more and
5772	 * prevent crackers from finding patterns that they could
5773	 * use to break the key.
5774	 */
5775	bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5776	encrypt_iv.vm.pager_object = page->object->pager;
5777	encrypt_iv.vm.paging_offset =
5778		page->object->paging_offset + page->offset;
5779
5780	/* encrypt the "initial vector" */
5781	aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5782			swap_crypt_null_iv,
5783			1,
5784			&encrypt_iv.aes_iv[0],
5785			&swap_crypt_ctx.encrypt);
5786
5787	/*
5788	 * Encrypt the page.
5789	 */
5790	aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5791			&encrypt_iv.aes_iv[0],
5792			PAGE_SIZE / AES_BLOCK_SIZE,
5793			(unsigned char *) kernel_vaddr,
5794			&swap_crypt_ctx.encrypt);
5795
5796	vm_page_encrypt_counter++;
5797
5798	/*
5799	 * Unmap the page from the kernel's address space,
5800	 * if we had to map it ourselves.  Otherwise, let
5801	 * the caller undo the mapping if needed.
5802	 */
5803	if (kernel_mapping_size != 0) {
5804		vm_paging_unmap_object(page->object,
5805				       kernel_mapping_offset,
5806				       kernel_mapping_offset + kernel_mapping_size);
5807	}
5808
5809	/*
5810	 * Clear the "reference" and "modified" bits.
5811	 * This should clean up any impact the encryption had
5812	 * on them.
5813	 * The page was kept busy and disconnected from all pmaps,
5814	 * so it can't have been referenced or modified from user
5815	 * space.
5816	 * The software bits will be reset later after the I/O
5817	 * has completed (in upl_commit_range()).
5818	 */
5819	pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
5820
5821	page->encrypted = TRUE;
5822
5823	vm_object_paging_end(page->object);
5824}
5825
5826/*
5827 * ENCRYPTED SWAP:
5828 * vm_page_decrypt:
5829 * 	Decrypt the given page.
5830 * 	The page might already be mapped at kernel virtual
5831 * 	address "kernel_mapping_offset".  Otherwise, we need
5832 * 	to map it.
5833 *
5834 * Context:
5835 *	The page's VM object is locked but will be unlocked and relocked.
5836 * 	The page is busy and not accessible by users (not entered in any pmap).
5837 */
5838void
5839vm_page_decrypt(
5840	vm_page_t	page,
5841	vm_map_offset_t	kernel_mapping_offset)
5842{
5843	kern_return_t		kr;
5844	vm_map_size_t		kernel_mapping_size;
5845	vm_offset_t		kernel_vaddr;
5846	union {
5847		unsigned char	aes_iv[AES_BLOCK_SIZE];
5848		struct {
5849			memory_object_t		pager_object;
5850			vm_object_offset_t	paging_offset;
5851		} vm;
5852	} decrypt_iv;
5853
5854	assert(page->busy);
5855	assert(page->encrypted);
5856
5857	/*
5858	 * Take a paging-in-progress reference to keep the object
5859	 * alive even if we have to unlock it (in vm_paging_map_object()
5860	 * for example)...
5861	 */
5862	vm_object_paging_begin(page->object);
5863
5864	if (kernel_mapping_offset == 0) {
5865		/*
5866		 * The page hasn't already been mapped in kernel space
5867		 * by the caller.  Map it now, so that we can access
5868		 * its contents and decrypt them.
5869		 */
5870		kernel_mapping_size = PAGE_SIZE;
5871		kr = vm_paging_map_object(&kernel_mapping_offset,
5872					  page,
5873					  page->object,
5874					  page->offset,
5875					  &kernel_mapping_size,
5876					  VM_PROT_READ | VM_PROT_WRITE,
5877					  FALSE);
5878		if (kr != KERN_SUCCESS) {
5879			panic("vm_page_decrypt: "
5880			      "could not map page in kernel: 0x%x\n",
5881			      kr);
5882		}
5883	} else {
5884		kernel_mapping_size = 0;
5885	}
5886	kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5887
5888	assert(swap_crypt_ctx_initialized);
5889
5890	/*
5891	 * Prepare an "initial vector" for the decryption.
5892	 * It has to be the same as the "initial vector" we
5893	 * used to encrypt that page.
5894	 */
5895	bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5896	decrypt_iv.vm.pager_object = page->object->pager;
5897	decrypt_iv.vm.paging_offset =
5898		page->object->paging_offset + page->offset;
5899
5900	/* encrypt the "initial vector" */
5901	aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5902			swap_crypt_null_iv,
5903			1,
5904			&decrypt_iv.aes_iv[0],
5905			&swap_crypt_ctx.encrypt);
5906
5907	/*
5908	 * Decrypt the page.
5909	 */
5910	aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5911			&decrypt_iv.aes_iv[0],
5912			PAGE_SIZE / AES_BLOCK_SIZE,
5913			(unsigned char *) kernel_vaddr,
5914			&swap_crypt_ctx.decrypt);
5915	vm_page_decrypt_counter++;
5916
5917	/*
5918	 * Unmap the page from the kernel's address space,
5919	 * if we had to map it ourselves.  Otherwise, let
5920	 * the caller undo the mapping if needed.
5921	 */
5922	if (kernel_mapping_size != 0) {
5923		vm_paging_unmap_object(page->object,
5924				       kernel_vaddr,
5925				       kernel_vaddr + PAGE_SIZE);
5926	}
5927
5928	/*
5929	 * After decryption, the page is actually clean.
5930	 * It was encrypted as part of paging, which "cleans"
5931	 * the "dirty" pages.
5932	 * Noone could access it after it was encrypted
5933	 * and the decryption doesn't count.
5934	 */
5935	page->dirty = FALSE;
5936	if (page->cs_validated && !page->cs_tainted) {
5937		/*
5938		 * CODE SIGNING:
5939		 * This page is no longer dirty
5940		 * but could have been modified,
5941		 * so it will need to be
5942		 * re-validated.
5943		 */
5944		page->cs_validated = FALSE;
5945		vm_cs_validated_resets++;
5946	}
5947	pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
5948
5949	page->encrypted = FALSE;
5950
5951	/*
5952	 * We've just modified the page's contents via the data cache and part
5953	 * of the new contents might still be in the cache and not yet in RAM.
5954	 * Since the page is now available and might get gathered in a UPL to
5955	 * be part of a DMA transfer from a driver that expects the memory to
5956	 * be coherent at this point, we have to flush the data cache.
5957	 */
5958	pmap_sync_page_attributes_phys(page->phys_page);
5959	/*
5960	 * Since the page is not mapped yet, some code might assume that it
5961	 * doesn't need to invalidate the instruction cache when writing to
5962	 * that page.  That code relies on "pmapped" being FALSE, so that the
5963	 * caches get synchronized when the page is first mapped.
5964	 */
5965	assert(pmap_verify_free(page->phys_page));
5966	page->pmapped = FALSE;
5967	page->wpmapped = FALSE;
5968
5969	vm_object_paging_end(page->object);
5970}
5971
5972unsigned long upl_encrypt_upls = 0;
5973unsigned long upl_encrypt_pages = 0;
5974
5975/*
5976 * ENCRYPTED SWAP:
5977 *
5978 * upl_encrypt:
5979 * 	Encrypts all the pages in the UPL, within the specified range.
5980 *
5981 */
5982void
5983upl_encrypt(
5984	upl_t			upl,
5985	upl_offset_t		crypt_offset,
5986	upl_size_t		crypt_size)
5987{
5988	upl_size_t		upl_size;
5989	upl_offset_t		upl_offset;
5990	vm_object_t		upl_object;
5991	vm_page_t		page;
5992	vm_object_t		shadow_object;
5993	vm_object_offset_t	shadow_offset;
5994	vm_object_offset_t	paging_offset;
5995	vm_object_offset_t	base_offset;
5996
5997	upl_encrypt_upls++;
5998	upl_encrypt_pages += crypt_size / PAGE_SIZE;
5999
6000	upl_object = upl->map_object;
6001	upl_offset = upl->offset;
6002	upl_size = upl->size;
6003
6004	vm_object_lock(upl_object);
6005
6006	/*
6007	 * Find the VM object that contains the actual pages.
6008	 */
6009	if (upl_object->pageout) {
6010		shadow_object = upl_object->shadow;
6011		/*
6012		 * The offset in the shadow object is actually also
6013		 * accounted for in upl->offset.  It possibly shouldn't be
6014		 * this way, but for now don't account for it twice.
6015		 */
6016		shadow_offset = 0;
6017		assert(upl_object->paging_offset == 0);	/* XXX ? */
6018		vm_object_lock(shadow_object);
6019	} else {
6020		shadow_object = upl_object;
6021		shadow_offset = 0;
6022	}
6023
6024	paging_offset = shadow_object->paging_offset;
6025	vm_object_paging_begin(shadow_object);
6026
6027	if (shadow_object != upl_object)
6028	        vm_object_unlock(upl_object);
6029
6030
6031	base_offset = shadow_offset;
6032	base_offset += upl_offset;
6033	base_offset += crypt_offset;
6034	base_offset -= paging_offset;
6035
6036	assert(crypt_offset + crypt_size <= upl_size);
6037
6038	for (upl_offset = 0;
6039	     upl_offset < crypt_size;
6040	     upl_offset += PAGE_SIZE) {
6041		page = vm_page_lookup(shadow_object,
6042				      base_offset + upl_offset);
6043		if (page == VM_PAGE_NULL) {
6044			panic("upl_encrypt: "
6045			      "no page for (obj=%p,off=%lld+%d)!\n",
6046			      shadow_object,
6047			      base_offset,
6048			      upl_offset);
6049		}
6050		/*
6051		 * Disconnect the page from all pmaps, so that nobody can
6052		 * access it while it's encrypted.  After that point, all
6053		 * accesses to this page will cause a page fault and block
6054		 * while the page is busy being encrypted.  After the
6055		 * encryption completes, any access will cause a
6056		 * page fault and the page gets decrypted at that time.
6057		 */
6058		pmap_disconnect(page->phys_page);
6059		vm_page_encrypt(page, 0);
6060
6061		if (shadow_object == vm_pageout_scan_wants_object) {
6062			/*
6063			 * Give vm_pageout_scan() a chance to convert more
6064			 * pages from "clean-in-place" to "clean-and-free",
6065			 * if it's interested in the same pages we selected
6066			 * in this cluster.
6067			 */
6068			vm_object_unlock(shadow_object);
6069			vm_object_lock(shadow_object);
6070		}
6071	}
6072
6073	vm_object_paging_end(shadow_object);
6074	vm_object_unlock(shadow_object);
6075}
6076
6077#else /* CRYPTO */
6078void
6079upl_encrypt(
6080	__unused upl_t			upl,
6081	__unused upl_offset_t	crypt_offset,
6082	__unused upl_size_t	crypt_size)
6083{
6084}
6085
6086void
6087vm_page_encrypt(
6088	__unused vm_page_t		page,
6089	__unused vm_map_offset_t	kernel_mapping_offset)
6090{
6091}
6092
6093void
6094vm_page_decrypt(
6095	__unused vm_page_t		page,
6096	__unused vm_map_offset_t	kernel_mapping_offset)
6097{
6098}
6099
6100#endif /* CRYPTO */
6101
6102vm_size_t
6103upl_get_internal_pagelist_offset(void)
6104{
6105	return sizeof(struct upl);
6106}
6107
6108void
6109upl_clear_dirty(
6110	upl_t		upl,
6111	boolean_t 	value)
6112{
6113	if (value) {
6114		upl->flags |= UPL_CLEAR_DIRTY;
6115	} else {
6116		upl->flags &= ~UPL_CLEAR_DIRTY;
6117	}
6118}
6119
6120
6121#ifdef MACH_BSD
6122
6123boolean_t  upl_device_page(upl_page_info_t *upl)
6124{
6125	return(UPL_DEVICE_PAGE(upl));
6126}
6127boolean_t  upl_page_present(upl_page_info_t *upl, int index)
6128{
6129	return(UPL_PAGE_PRESENT(upl, index));
6130}
6131boolean_t  upl_speculative_page(upl_page_info_t *upl, int index)
6132{
6133	return(UPL_SPECULATIVE_PAGE(upl, index));
6134}
6135boolean_t  upl_dirty_page(upl_page_info_t *upl, int index)
6136{
6137	return(UPL_DIRTY_PAGE(upl, index));
6138}
6139boolean_t  upl_valid_page(upl_page_info_t *upl, int index)
6140{
6141	return(UPL_VALID_PAGE(upl, index));
6142}
6143ppnum_t  upl_phys_page(upl_page_info_t *upl, int index)
6144{
6145	return(UPL_PHYS_PAGE(upl, index));
6146}
6147
6148
6149void
6150vm_countdirtypages(void)
6151{
6152	vm_page_t m;
6153	int dpages;
6154	int pgopages;
6155	int precpages;
6156
6157
6158	dpages=0;
6159	pgopages=0;
6160	precpages=0;
6161
6162	vm_page_lock_queues();
6163	m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6164	do {
6165		if (m ==(vm_page_t )0) break;
6166
6167		if(m->dirty) dpages++;
6168		if(m->pageout) pgopages++;
6169		if(m->precious) precpages++;
6170
6171		assert(m->object != kernel_object);
6172		m = (vm_page_t) queue_next(&m->pageq);
6173		if (m ==(vm_page_t )0) break;
6174
6175	} while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
6176	vm_page_unlock_queues();
6177
6178	vm_page_lock_queues();
6179	m = (vm_page_t) queue_first(&vm_page_queue_throttled);
6180	do {
6181		if (m ==(vm_page_t )0) break;
6182
6183		dpages++;
6184		assert(m->dirty);
6185		assert(!m->pageout);
6186		assert(m->object != kernel_object);
6187		m = (vm_page_t) queue_next(&m->pageq);
6188		if (m ==(vm_page_t )0) break;
6189
6190	} while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
6191	vm_page_unlock_queues();
6192
6193	vm_page_lock_queues();
6194	m = (vm_page_t) queue_first(&vm_page_queue_zf);
6195	do {
6196		if (m ==(vm_page_t )0) break;
6197
6198		if(m->dirty) dpages++;
6199		if(m->pageout) pgopages++;
6200		if(m->precious) precpages++;
6201
6202		assert(m->object != kernel_object);
6203		m = (vm_page_t) queue_next(&m->pageq);
6204		if (m ==(vm_page_t )0) break;
6205
6206	} while (!queue_end(&vm_page_queue_zf,(queue_entry_t) m));
6207	vm_page_unlock_queues();
6208
6209	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6210
6211	dpages=0;
6212	pgopages=0;
6213	precpages=0;
6214
6215	vm_page_lock_queues();
6216	m = (vm_page_t) queue_first(&vm_page_queue_active);
6217
6218	do {
6219		if(m == (vm_page_t )0) break;
6220		if(m->dirty) dpages++;
6221		if(m->pageout) pgopages++;
6222		if(m->precious) precpages++;
6223
6224		assert(m->object != kernel_object);
6225		m = (vm_page_t) queue_next(&m->pageq);
6226		if(m == (vm_page_t )0) break;
6227
6228	} while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
6229	vm_page_unlock_queues();
6230
6231	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6232
6233}
6234#endif /* MACH_BSD */
6235
6236ppnum_t upl_get_highest_page(
6237			     upl_t			upl)
6238{
6239        return upl->highest_page;
6240}
6241
6242#ifdef UPL_DEBUG
6243kern_return_t  upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6244{
6245	upl->ubc_alias1 = alias1;
6246	upl->ubc_alias2 = alias2;
6247	return KERN_SUCCESS;
6248}
6249int  upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6250{
6251	if(al)
6252		*al = upl->ubc_alias1;
6253	if(al2)
6254		*al2 = upl->ubc_alias2;
6255	return KERN_SUCCESS;
6256}
6257#endif /* UPL_DEBUG */
6258
6259
6260
6261#if	MACH_KDB
6262#include <ddb/db_output.h>
6263#include <ddb/db_print.h>
6264#include <vm/vm_print.h>
6265
6266#define	printf	kdbprintf
6267void		db_pageout(void);
6268
6269void
6270db_vm(void)
6271{
6272
6273	iprintf("VM Statistics:\n");
6274	db_indent += 2;
6275	iprintf("pages:\n");
6276	db_indent += 2;
6277	iprintf("activ %5d  inact %5d  free  %5d",
6278		vm_page_active_count, vm_page_inactive_count,
6279		vm_page_free_count);
6280	printf("   wire  %5d  gobbl %5d\n",
6281	       vm_page_wire_count, vm_page_gobble_count);
6282	db_indent -= 2;
6283	iprintf("target:\n");
6284	db_indent += 2;
6285	iprintf("min   %5d  inact %5d  free  %5d",
6286		vm_page_free_min, vm_page_inactive_target,
6287		vm_page_free_target);
6288	printf("   resrv %5d\n", vm_page_free_reserved);
6289	db_indent -= 2;
6290	iprintf("pause:\n");
6291	db_pageout();
6292	db_indent -= 2;
6293}
6294
6295#if	MACH_COUNTERS
6296extern int c_laundry_pages_freed;
6297#endif	/* MACH_COUNTERS */
6298
6299void
6300db_pageout(void)
6301{
6302	iprintf("Pageout Statistics:\n");
6303	db_indent += 2;
6304	iprintf("active %5d  inactv %5d\n",
6305		vm_pageout_active, vm_pageout_inactive);
6306	iprintf("nolock %5d  avoid  %5d  busy   %5d  absent %5d\n",
6307		vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6308		vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6309	iprintf("used   %5d  clean  %5d  dirty  %5d\n",
6310		vm_pageout_inactive_used, vm_pageout_inactive_clean,
6311		vm_pageout_inactive_dirty);
6312#if	MACH_COUNTERS
6313	iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6314#endif	/* MACH_COUNTERS */
6315#if	MACH_CLUSTER_STATS
6316	iprintf("Cluster Statistics:\n");
6317	db_indent += 2;
6318	iprintf("dirtied   %5d   cleaned  %5d   collisions  %5d\n",
6319		vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6320		vm_pageout_cluster_collisions);
6321	iprintf("clusters  %5d   conversions  %5d\n",
6322		vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6323	db_indent -= 2;
6324	iprintf("Target Statistics:\n");
6325	db_indent += 2;
6326	iprintf("collisions   %5d   page_dirtied  %5d   page_freed  %5d\n",
6327		vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6328		vm_pageout_target_page_freed);
6329	db_indent -= 2;
6330#endif	/* MACH_CLUSTER_STATS */
6331	db_indent -= 2;
6332}
6333
6334#endif	/* MACH_KDB */
6335