1/*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 *	File:	vm/vm_object.c
60 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 *	Virtual memory object module.
63 */
64
65#include <debug.h>
66#include <mach_pagemap.h>
67#include <task_swapper.h>
68
69#include <mach/mach_types.h>
70#include <mach/memory_object.h>
71#include <mach/memory_object_default.h>
72#include <mach/memory_object_control_server.h>
73#include <mach/vm_param.h>
74
75#include <mach/sdt.h>
76
77#include <ipc/ipc_types.h>
78#include <ipc/ipc_port.h>
79
80#include <kern/kern_types.h>
81#include <kern/assert.h>
82#include <kern/queue.h>
83#include <kern/xpr.h>
84#include <kern/kalloc.h>
85#include <kern/zalloc.h>
86#include <kern/host.h>
87#include <kern/host_statistics.h>
88#include <kern/processor.h>
89#include <kern/misc_protos.h>
90
91#include <vm/memory_object.h>
92#include <vm/vm_compressor_pager.h>
93#include <vm/vm_fault.h>
94#include <vm/vm_map.h>
95#include <vm/vm_object.h>
96#include <vm/vm_page.h>
97#include <vm/vm_pageout.h>
98#include <vm/vm_protos.h>
99#include <vm/vm_purgeable_internal.h>
100
101#include <vm/vm_compressor.h>
102
103#if CONFIG_PHANTOM_CACHE
104#include <vm/vm_phantom_cache.h>
105#endif
106
107boolean_t vm_object_collapse_compressor_allowed = TRUE;
108
109struct vm_counters vm_counters;
110
111#if VM_OBJECT_TRACKING
112boolean_t vm_object_tracking_inited = FALSE;
113decl_simple_lock_data(static,vm_object_tracking_lock_data);
114btlog_t *vm_object_tracking_btlog;
115static void
116vm_object_tracking_lock(void *context)
117{
118	simple_lock((simple_lock_t)context);
119}
120static void
121vm_object_tracking_unlock(void *context)
122{
123	simple_unlock((simple_lock_t)context);
124}
125void
126vm_object_tracking_init(void)
127{
128	int vm_object_tracking;
129
130	vm_object_tracking = 1;
131	PE_parse_boot_argn("vm_object_tracking", &vm_object_tracking,
132			   sizeof (vm_object_tracking));
133
134	if (vm_object_tracking) {
135		simple_lock_init(&vm_object_tracking_lock_data, 0);
136		vm_object_tracking_btlog = btlog_create(
137			50000,
138			VM_OBJECT_TRACKING_BTDEPTH,
139			vm_object_tracking_lock,
140			vm_object_tracking_unlock,
141			&vm_object_tracking_lock_data);
142		assert(vm_object_tracking_btlog);
143		vm_object_tracking_inited = TRUE;
144	}
145}
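
/*
 * Note: when built with VM_OBJECT_TRACKING, tracking defaults to on and is
 * controlled by the "vm_object_tracking" boot-arg parsed above; for example,
 * booting with vm_object_tracking=0 skips the btlog creation and leaves
 * vm_object_tracking_inited FALSE.
 */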
146#endif /* VM_OBJECT_TRACKING */
147
/*
 *	Virtual memory objects maintain the actual data
 *	associated with allocated virtual memory.  A given
 *	page of memory exists within exactly one object.
 *
 *	An object is only deallocated when all "references"
 *	are given up.
 *
 *	Associated with each object is a list of all resident
 *	memory pages belonging to that object; this list is
 *	maintained by the "vm_page" module, but locked by the object's
 *	lock.
 *
 *	Each object also records the memory object reference
 *	that is used by the kernel to request and write
 *	back data (the memory object, field "pager"), etc...
 *
 *	Virtual memory objects are allocated to provide
 *	zero-filled memory (vm_allocate) or map a user-defined
 *	memory object into a virtual address space (vm_map).
 *
 *	Virtual memory objects that refer to a user-defined
 *	memory object are called "permanent", because all changes
 *	made in virtual memory are reflected back to the
 *	memory manager, which may then store them permanently.
 *	Other virtual memory objects are called "temporary",
 *	meaning that changes need be written back only when
 *	necessary to reclaim pages, and that storage associated
 *	with the object can be discarded once it is no longer
 *	mapped.
 *
 *	A permanent memory object may be mapped into more
 *	than one virtual address space.  Moreover, two threads
 *	may attempt to make the first mapping of a memory
 *	object concurrently.  Only one thread is allowed to
 *	complete this mapping; all others wait until the
 *	"pager_initialized" field is asserted, indicating
 *	that the first thread has initialized all of the
 *	necessary fields in the virtual memory object structure.
 *
 *	The kernel relies on a *default memory manager* to
 *	provide backing storage for the zero-filled virtual
 *	memory objects.  The pager memory objects associated
 *	with these temporary virtual memory objects are only
 *	requested from the default memory manager when it
 *	becomes necessary.  Virtual memory objects
 *	that depend on the default memory manager are called
 *	"internal".  The "pager_created" field is provided to
 *	indicate whether such a pager has ever been created.
 *
 *	The kernel may also create virtual memory objects to
 *	hold changed pages after a copy-on-write operation.
 *	In this case, the virtual memory object (and its
 *	backing storage -- its memory object) only contain
 *	those pages that have been changed.  The "shadow"
 *	field refers to the virtual memory object that contains
 *	the remainder of the contents.  The "shadow_offset"
 *	field indicates where in the "shadow" these contents begin.
 *	The "copy" field refers to a virtual memory object
 *	to which changed pages must be copied before changing
 *	this object, in order to implement another form
 *	of copy-on-write optimization.
 *
 *	The virtual memory object structure also records
 *	the attributes associated with its memory object.
 *	The "pager_ready", "can_persist" and "copy_strategy"
 *	fields represent those attributes.  The "cached_list"
 *	field is used in the implementation of the persistence
 *	attribute.
 *
 * ZZZ Continue this comment.
 */
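
/*
 * Conceptual sketch of how the "shadow" / "vo_shadow_offset" fields described
 * above chain objects together when looking for a page after a copy-on-write
 * operation.  The real lookup is done by the fault path with proper locking
 * and handling of pagers, copy objects and busy pages; this simplified walk
 * only illustrates the offset adjustment at each level.
 */
#if 0	/* illustrative sketch only -- not compiled, not the real fault path */
static vm_page_t
shadow_chain_lookup_sketch(
	vm_object_t		object,
	vm_object_offset_t	offset)
{
	vm_page_t	m;

	while (object != VM_OBJECT_NULL) {
		m = vm_page_lookup(object, offset);
		if (m != VM_PAGE_NULL)
			return m;	/* page resident in this object */
		/*
		 * Not resident here: the remainder of the contents lives in
		 * the backing (shadow) object, starting at vo_shadow_offset.
		 */
		offset += object->vo_shadow_offset;
		object = object->shadow;
	}
	return VM_PAGE_NULL;	/* zero-fill or page in from the pager */
}
#endif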
220
221/* Forward declarations for internal functions. */
222static kern_return_t	vm_object_terminate(
223				vm_object_t	object);
224
225extern void		vm_object_remove(
226				vm_object_t	object);
227
228static kern_return_t	vm_object_copy_call(
229				vm_object_t		src_object,
230				vm_object_offset_t	src_offset,
231				vm_object_size_t	size,
232				vm_object_t		*_result_object);
233
234static void		vm_object_do_collapse(
235				vm_object_t	object,
236				vm_object_t	backing_object);
237
238static void		vm_object_do_bypass(
239				vm_object_t	object,
240				vm_object_t	backing_object);
241
242static void		vm_object_release_pager(
243	                        memory_object_t	pager,
244				boolean_t	hashed);
245
246static zone_t		vm_object_zone;		/* vm backing store zone */
247
248/*
249 *	All wired-down kernel memory belongs to a single virtual
250 *	memory object (kernel_object) to avoid wasting data structures.
251 */
252static struct vm_object			kernel_object_store;
253vm_object_t						kernel_object;
254
255static struct vm_object			compressor_object_store;
256vm_object_t				compressor_object = &compressor_object_store;
257
258/*
259 *	The submap object is used as a placeholder for vm_map_submap
260 *	operations.  The object is declared in vm_map.c because it
261 *	is exported by the vm_map module.  The storage is declared
262 *	here because it must be initialized here.
263 */
264static struct vm_object			vm_submap_object_store;
265
266/*
267 *	Virtual memory objects are initialized from
268 *	a template (see vm_object_allocate).
269 *
270 *	When adding a new field to the virtual memory
271 *	object structure, be sure to add initialization
272 *	(see _vm_object_allocate()).
273 */
274static struct vm_object			vm_object_template;
275
276unsigned int vm_page_purged_wired = 0;
277unsigned int vm_page_purged_busy = 0;
278unsigned int vm_page_purged_others = 0;
279
280#if VM_OBJECT_CACHE
281/*
282 *	Virtual memory objects that are not referenced by
283 *	any address maps, but that are allowed to persist
284 *	(an attribute specified by the associated memory manager),
285 *	are kept in a queue (vm_object_cached_list).
286 *
287 *	When an object from this queue is referenced again,
288 *	for example to make another address space mapping,
289 *	it must be removed from the queue.  That is, the
290 *	queue contains *only* objects with zero references.
291 *
292 *	The kernel may choose to terminate objects from this
293 *	queue in order to reclaim storage.  The current policy
294 *	is to permit a fixed maximum number of unreferenced
295 *	objects (vm_object_cached_max).
296 *
297 *	A spin lock (accessed by routines
298 *	vm_object_cache_{lock,lock_try,unlock}) governs the
299 *	object cache.  It must be held when objects are
300 *	added to or removed from the cache (in vm_object_terminate).
301 *	The routines that acquire a reference to a virtual
302 *	memory object based on one of the memory object ports
303 *	must also lock the cache.
304 *
305 *	Ideally, the object cache should be more isolated
306 *	from the reference mechanism, so that the lock need
307 *	not be held to make simple references.
308 */
309static vm_object_t	vm_object_cache_trim(
310				boolean_t called_from_vm_object_deallocate);
311
312static void		vm_object_deactivate_all_pages(
313				vm_object_t	object);
314
315static int		vm_object_cached_high;	/* highest # cached objects */
316static int		vm_object_cached_max = 512;	/* may be patched*/
317
318#define vm_object_cache_lock()		\
319		lck_mtx_lock(&vm_object_cached_lock_data)
320#define vm_object_cache_lock_try()		\
321		lck_mtx_try_lock(&vm_object_cached_lock_data)
322
323#endif	/* VM_OBJECT_CACHE */
324
325static queue_head_t	vm_object_cached_list;
326static uint32_t		vm_object_cache_pages_freed = 0;
327static uint32_t		vm_object_cache_pages_moved = 0;
328static uint32_t		vm_object_cache_pages_skipped = 0;
329static uint32_t		vm_object_cache_adds = 0;
330static uint32_t		vm_object_cached_count = 0;
331static lck_mtx_t	vm_object_cached_lock_data;
332static lck_mtx_ext_t	vm_object_cached_lock_data_ext;
333
334static uint32_t		vm_object_page_grab_failed = 0;
335static uint32_t		vm_object_page_grab_skipped = 0;
336static uint32_t		vm_object_page_grab_returned = 0;
337static uint32_t		vm_object_page_grab_pmapped = 0;
338static uint32_t		vm_object_page_grab_reactivations = 0;
339
340#define vm_object_cache_lock_spin()		\
341		lck_mtx_lock_spin(&vm_object_cached_lock_data)
342#define vm_object_cache_unlock()	\
343		lck_mtx_unlock(&vm_object_cached_lock_data)
344
345static void	vm_object_cache_remove_locked(vm_object_t);
346
347
348#define	VM_OBJECT_HASH_COUNT		1024
349#define	VM_OBJECT_HASH_LOCK_COUNT	512
350
351static lck_mtx_t	vm_object_hashed_lock_data[VM_OBJECT_HASH_LOCK_COUNT];
352static lck_mtx_ext_t	vm_object_hashed_lock_data_ext[VM_OBJECT_HASH_LOCK_COUNT];
353
354static queue_head_t	vm_object_hashtable[VM_OBJECT_HASH_COUNT];
355static struct zone	*vm_object_hash_zone;
356
357struct vm_object_hash_entry {
358	queue_chain_t		hash_link;	/* hash chain link */
359	memory_object_t	pager;		/* pager we represent */
360	vm_object_t		object;		/* corresponding object */
361	boolean_t		waiting;	/* someone waiting for
362						 * termination */
363};
364
365typedef struct vm_object_hash_entry	*vm_object_hash_entry_t;
366#define VM_OBJECT_HASH_ENTRY_NULL	((vm_object_hash_entry_t) 0)
367
368#define VM_OBJECT_HASH_SHIFT	5
369#define vm_object_hash(pager) \
370	((int)((((uintptr_t)pager) >> VM_OBJECT_HASH_SHIFT) % VM_OBJECT_HASH_COUNT))
371
372#define vm_object_lock_hash(pager) \
373	((int)((((uintptr_t)pager) >> VM_OBJECT_HASH_SHIFT) % VM_OBJECT_HASH_LOCK_COUNT))
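
/*
 * For example, with VM_OBJECT_HASH_SHIFT == 5, a pager pointer of 0x1040
 * hashes to ((0x1040 >> 5) % 1024) == 130 for its bucket and
 * ((0x1040 >> 5) % 512) == 130 for its bucket lock; the shift discards the
 * low-order pointer bits, which carry little variation between allocated
 * pager structures.
 */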
374
375void vm_object_hash_entry_free(
376	vm_object_hash_entry_t	entry);
377
378static void vm_object_reap(vm_object_t object);
379static void vm_object_reap_async(vm_object_t object);
380static void vm_object_reaper_thread(void);
381
382static lck_mtx_t	vm_object_reaper_lock_data;
383static lck_mtx_ext_t	vm_object_reaper_lock_data_ext;
384
385static queue_head_t vm_object_reaper_queue; /* protected by vm_object_reaper_lock() */
386unsigned int vm_object_reap_count = 0;
387unsigned int vm_object_reap_count_async = 0;
388
389#define vm_object_reaper_lock()		\
390		lck_mtx_lock(&vm_object_reaper_lock_data)
391#define vm_object_reaper_lock_spin()		\
392		lck_mtx_lock_spin(&vm_object_reaper_lock_data)
393#define vm_object_reaper_unlock()	\
394		lck_mtx_unlock(&vm_object_reaper_lock_data)
395
396#if CONFIG_IOSCHED
397/* I/O Re-prioritization request list */
398queue_head_t 	io_reprioritize_list;
399lck_spin_t 	io_reprioritize_list_lock;
400
401#define IO_REPRIORITIZE_LIST_LOCK() 	\
402		lck_spin_lock(&io_reprioritize_list_lock)
403#define IO_REPRIORITIZE_LIST_UNLOCK() 	\
404		lck_spin_unlock(&io_reprioritize_list_lock)
405
406#define MAX_IO_REPRIORITIZE_REQS 	8192
407zone_t 		io_reprioritize_req_zone;
408
409/* I/O Re-prioritization thread */
410int io_reprioritize_wakeup = 0;
411static void io_reprioritize_thread(void *param __unused, wait_result_t wr __unused);
412
413#define IO_REPRIO_THREAD_WAKEUP() 	thread_wakeup((event_t)&io_reprioritize_wakeup)
414#define IO_REPRIO_THREAD_CONTINUATION() 				\
415{ 								\
416	assert_wait(&io_reprioritize_wakeup, THREAD_UNINT);	\
417	thread_block(io_reprioritize_thread);			\
418}
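
/*
 * Expected producer/consumer flow: code that wants an I/O re-prioritized
 * allocates a request from io_reprioritize_req_zone, queues it on
 * io_reprioritize_list under IO_REPRIORITIZE_LIST_LOCK(), and then calls
 * IO_REPRIO_THREAD_WAKEUP(); io_reprioritize_thread drains the list and
 * blocks again via IO_REPRIO_THREAD_CONTINUATION().
 */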
419
420void vm_page_request_reprioritize(vm_object_t, uint64_t, uint32_t, int);
421void vm_page_handle_prio_inversion(vm_object_t, vm_page_t);
422void vm_decmp_upl_reprioritize(upl_t, int);
423#endif
424
425#if 0
426#undef KERNEL_DEBUG
427#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
428#endif
429
430
431static lck_mtx_t *
432vm_object_hash_lock_spin(
433	memory_object_t	pager)
434{
435	int	index;
436
437	index = vm_object_lock_hash(pager);
438
439	lck_mtx_lock_spin(&vm_object_hashed_lock_data[index]);
440
441	return (&vm_object_hashed_lock_data[index]);
442}
443
444static void
445vm_object_hash_unlock(lck_mtx_t *lck)
446{
447	lck_mtx_unlock(lck);
448}
449
450
451/*
452 *	vm_object_hash_lookup looks up a pager in the hashtable
453 *	and returns the corresponding entry, with optional removal.
454 */
455static vm_object_hash_entry_t
456vm_object_hash_lookup(
457	memory_object_t	pager,
458	boolean_t	remove_entry)
459{
460	queue_t			bucket;
461	vm_object_hash_entry_t	entry;
462
463	bucket = &vm_object_hashtable[vm_object_hash(pager)];
464
465	entry = (vm_object_hash_entry_t)queue_first(bucket);
466	while (!queue_end(bucket, (queue_entry_t)entry)) {
467		if (entry->pager == pager) {
468			if (remove_entry) {
469				queue_remove(bucket, entry,
470					     vm_object_hash_entry_t, hash_link);
471			}
472			return(entry);
473		}
474		entry = (vm_object_hash_entry_t)queue_next(&entry->hash_link);
475	}
476	return(VM_OBJECT_HASH_ENTRY_NULL);
477}
478
/*
 *	vm_object_hash_insert enters the specified
 *	pager / object association in the hashtable.
 */
483
484static void
485vm_object_hash_insert(
486	vm_object_hash_entry_t	entry,
487	vm_object_t		object)
488{
489	queue_t		bucket;
490
491	vm_object_lock_assert_exclusive(object);
492
493	bucket = &vm_object_hashtable[vm_object_hash(entry->pager)];
494
495	queue_enter(bucket, entry, vm_object_hash_entry_t, hash_link);
496
497	entry->object = object;
498	object->hashed = TRUE;
499}
500
501static vm_object_hash_entry_t
502vm_object_hash_entry_alloc(
503	memory_object_t	pager)
504{
505	vm_object_hash_entry_t	entry;
506
507	entry = (vm_object_hash_entry_t)zalloc(vm_object_hash_zone);
508	entry->pager = pager;
509	entry->object = VM_OBJECT_NULL;
510	entry->waiting = FALSE;
511
512	return(entry);
513}
514
515void
516vm_object_hash_entry_free(
517	vm_object_hash_entry_t	entry)
518{
519	zfree(vm_object_hash_zone, entry);
520}
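
/*
 * Simplified sketch of how the hash helpers above compose to translate a
 * pager into its VM object.  Real consumers also handle the "waiting" flag
 * and insertion races; this only shows the bucket-lock / lookup / unlock
 * sequence.
 */
#if 0	/* illustrative sketch only -- not compiled */
static vm_object_t
vm_object_for_pager_sketch(
	memory_object_t	pager)
{
	lck_mtx_t		*lck;
	vm_object_hash_entry_t	entry;
	vm_object_t		object;

	lck = vm_object_hash_lock_spin(pager);	/* lock this pager's bucket */
	entry = vm_object_hash_lookup(pager, FALSE);
	object = (entry != VM_OBJECT_HASH_ENTRY_NULL) ?
		entry->object : VM_OBJECT_NULL;
	vm_object_hash_unlock(lck);

	return object;
}
#endif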
521
/*
 *	vm_object_allocate:
 *
 *	Returns a new object with the given size.
 *	(_vm_object_allocate() initializes a caller-supplied object
 *	from the template; vm_object_allocate() allocates one from
 *	vm_object_zone and then initializes it.)
 */
527
528__private_extern__ void
529_vm_object_allocate(
530	vm_object_size_t	size,
531	vm_object_t		object)
532{
533	XPR(XPR_VM_OBJECT,
534		"vm_object_allocate, object 0x%X size 0x%X\n",
535		object, size, 0,0,0);
536
537	*object = vm_object_template;
538	queue_init(&object->memq);
539	queue_init(&object->msr_q);
540#if UPL_DEBUG || CONFIG_IOSCHED
541	queue_init(&object->uplq);
542#endif
543	vm_object_lock_init(object);
544	object->vo_size = size;
545
546#if VM_OBJECT_TRACKING_OP_CREATED
547	if (vm_object_tracking_inited) {
548		void	*bt[VM_OBJECT_TRACKING_BTDEPTH];
549		int	numsaved = 0;
550
551		numsaved = OSBacktrace(bt, VM_OBJECT_TRACKING_BTDEPTH);
552		btlog_add_entry(vm_object_tracking_btlog,
553				object,
554				VM_OBJECT_TRACKING_OP_CREATED,
555				bt,
556				numsaved);
557	}
558#endif /* VM_OBJECT_TRACKING_OP_CREATED */
559}
560
561__private_extern__ vm_object_t
562vm_object_allocate(
563	vm_object_size_t	size)
564{
565	register vm_object_t object;
566
567	object = (vm_object_t) zalloc(vm_object_zone);
568
569//	dbgLog(object, size, 0, 2);			/* (TEST/DEBUG) */
570
571	if (object != VM_OBJECT_NULL)
572		_vm_object_allocate(size, object);
573
574	return object;
575}
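
/*
 * Sketch of the life cycle of an anonymous object as seen by a caller of
 * vm_object_allocate().  The object comes back with one reference;
 * vm_object_deallocate() (below) drops it and eventually reclaims the
 * object and its pages.
 */
#if 0	/* illustrative sketch only -- not compiled */
static void
vm_object_allocate_sketch(void)
{
	vm_object_t	object;

	/* a zero-filled, temporary, internal object covering four pages */
	object = vm_object_allocate((vm_object_size_t)(4 * PAGE_SIZE));
	if (object == VM_OBJECT_NULL)
		return;		/* vm_object_zone exhausted */

	/* ... enter it in a map, fault pages in, etc. ... */

	vm_object_deallocate(object);	/* drop the allocation's reference */
}
#endif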
576
577
578lck_grp_t		vm_object_lck_grp;
579lck_grp_t		vm_object_cache_lck_grp;
580lck_grp_attr_t		vm_object_lck_grp_attr;
581lck_attr_t		vm_object_lck_attr;
582lck_attr_t		kernel_object_lck_attr;
583lck_attr_t		compressor_object_lck_attr;
584
585/*
586 *	vm_object_bootstrap:
587 *
588 *	Initialize the VM objects module.
589 */
590__private_extern__ void
591vm_object_bootstrap(void)
592{
593	register int	i;
594
595	vm_object_zone = zinit((vm_size_t) sizeof(struct vm_object),
596				round_page(512*1024),
597				round_page(12*1024),
598				"vm objects");
599	zone_change(vm_object_zone, Z_CALLERACCT, FALSE); /* don't charge caller */
600	zone_change(vm_object_zone, Z_NOENCRYPT, TRUE);
601
602	vm_object_init_lck_grp();
603
604	queue_init(&vm_object_cached_list);
605
606	lck_mtx_init_ext(&vm_object_cached_lock_data,
607		&vm_object_cached_lock_data_ext,
608		&vm_object_cache_lck_grp,
609		&vm_object_lck_attr);
610
611	queue_init(&vm_object_reaper_queue);
612
613	for (i = 0; i < VM_OBJECT_HASH_LOCK_COUNT; i++) {
614		lck_mtx_init_ext(&vm_object_hashed_lock_data[i],
615				 &vm_object_hashed_lock_data_ext[i],
616				 &vm_object_lck_grp,
617				 &vm_object_lck_attr);
618	}
619	lck_mtx_init_ext(&vm_object_reaper_lock_data,
620		&vm_object_reaper_lock_data_ext,
621		&vm_object_lck_grp,
622		&vm_object_lck_attr);
623
624	vm_object_hash_zone =
625			zinit((vm_size_t) sizeof (struct vm_object_hash_entry),
626			      round_page(512*1024),
627			      round_page(12*1024),
628			      "vm object hash entries");
629	zone_change(vm_object_hash_zone, Z_CALLERACCT, FALSE);
630	zone_change(vm_object_hash_zone, Z_NOENCRYPT, TRUE);
631
632	for (i = 0; i < VM_OBJECT_HASH_COUNT; i++)
633		queue_init(&vm_object_hashtable[i]);
634
635
636	/*
637	 *	Fill in a template object, for quick initialization
638	 */
639
640	/* memq; Lock; init after allocation */
641	vm_object_template.memq.prev = NULL;
642	vm_object_template.memq.next = NULL;
643#if 0
644	/*
645	 * We can't call vm_object_lock_init() here because that will
646	 * allocate some memory and VM is not fully initialized yet.
647	 * The lock will be initialized for each allocated object in
648	 * _vm_object_allocate(), so we don't need to initialize it in
649	 * the vm_object_template.
650	 */
651	vm_object_lock_init(&vm_object_template);
652#endif
653	vm_object_template.vo_size = 0;
654	vm_object_template.memq_hint = VM_PAGE_NULL;
655	vm_object_template.ref_count = 1;
656#if	TASK_SWAPPER
657	vm_object_template.res_count = 1;
658#endif	/* TASK_SWAPPER */
659	vm_object_template.resident_page_count = 0;
660	vm_object_template.wired_page_count = 0;
661	vm_object_template.reusable_page_count = 0;
662	vm_object_template.copy = VM_OBJECT_NULL;
663	vm_object_template.shadow = VM_OBJECT_NULL;
664	vm_object_template.vo_shadow_offset = (vm_object_offset_t) 0;
665	vm_object_template.pager = MEMORY_OBJECT_NULL;
666	vm_object_template.paging_offset = 0;
667	vm_object_template.pager_control = MEMORY_OBJECT_CONTROL_NULL;
668	vm_object_template.copy_strategy = MEMORY_OBJECT_COPY_SYMMETRIC;
669	vm_object_template.paging_in_progress = 0;
670#if __LP64__
671	vm_object_template.__object1_unused_bits = 0;
672#endif /* __LP64__ */
673	vm_object_template.activity_in_progress = 0;
674
675	/* Begin bitfields */
676	vm_object_template.all_wanted = 0; /* all bits FALSE */
677	vm_object_template.pager_created = FALSE;
678	vm_object_template.pager_initialized = FALSE;
679	vm_object_template.pager_ready = FALSE;
680	vm_object_template.pager_trusted = FALSE;
681	vm_object_template.can_persist = FALSE;
682	vm_object_template.internal = TRUE;
683	vm_object_template.temporary = TRUE;
684	vm_object_template.private = FALSE;
685	vm_object_template.pageout = FALSE;
686	vm_object_template.alive = TRUE;
687	vm_object_template.purgable = VM_PURGABLE_DENY;
688	vm_object_template.purgeable_when_ripe = FALSE;
689	vm_object_template.shadowed = FALSE;
690	vm_object_template.advisory_pageout = FALSE;
691	vm_object_template.true_share = FALSE;
692	vm_object_template.terminating = FALSE;
693	vm_object_template.named = FALSE;
694	vm_object_template.shadow_severed = FALSE;
695	vm_object_template.phys_contiguous = FALSE;
696	vm_object_template.nophyscache = FALSE;
697	/* End bitfields */
698
699	vm_object_template.cached_list.prev = NULL;
700	vm_object_template.cached_list.next = NULL;
701	vm_object_template.msr_q.prev = NULL;
702	vm_object_template.msr_q.next = NULL;
703
704	vm_object_template.last_alloc = (vm_object_offset_t) 0;
705	vm_object_template.sequential = (vm_object_offset_t) 0;
706	vm_object_template.pages_created = 0;
707	vm_object_template.pages_used = 0;
708	vm_object_template.scan_collisions = 0;
709#if CONFIG_PHANTOM_CACHE
710	vm_object_template.phantom_object_id = 0;
711#endif
712#if	MACH_PAGEMAP
713	vm_object_template.existence_map = VM_EXTERNAL_NULL;
714#endif	/* MACH_PAGEMAP */
715	vm_object_template.cow_hint = ~(vm_offset_t)0;
716#if	MACH_ASSERT
717	vm_object_template.paging_object = VM_OBJECT_NULL;
718#endif	/* MACH_ASSERT */
719
720	/* cache bitfields */
721	vm_object_template.wimg_bits = VM_WIMG_USE_DEFAULT;
722	vm_object_template.set_cache_attr = FALSE;
723	vm_object_template.object_slid = FALSE;
724	vm_object_template.code_signed = FALSE;
725	vm_object_template.hashed = FALSE;
726	vm_object_template.transposed = FALSE;
727	vm_object_template.mapping_in_progress = FALSE;
728	vm_object_template.phantom_isssd = FALSE;
729	vm_object_template.volatile_empty = FALSE;
730	vm_object_template.volatile_fault = FALSE;
731	vm_object_template.all_reusable = FALSE;
732	vm_object_template.blocked_access = FALSE;
733	vm_object_template.__object2_unused_bits = 0;
734#if CONFIG_IOSCHED || UPL_DEBUG
735	vm_object_template.uplq.prev = NULL;
736	vm_object_template.uplq.next = NULL;
737#endif /* UPL_DEBUG */
738#ifdef VM_PIP_DEBUG
739	bzero(&vm_object_template.pip_holders,
740	      sizeof (vm_object_template.pip_holders));
741#endif /* VM_PIP_DEBUG */
742
743	vm_object_template.objq.next = NULL;
744	vm_object_template.objq.prev = NULL;
745
746	vm_object_template.purgeable_queue_type = PURGEABLE_Q_TYPE_MAX;
747	vm_object_template.purgeable_queue_group = 0;
748
749	vm_object_template.vo_cache_ts = 0;
750
751#if DEBUG
752	bzero(&vm_object_template.purgeable_owner_bt[0],
753	      sizeof (vm_object_template.purgeable_owner_bt));
754	vm_object_template.vo_purgeable_volatilizer = NULL;
755	bzero(&vm_object_template.purgeable_volatilizer_bt[0],
756	      sizeof (vm_object_template.purgeable_volatilizer_bt));
757#endif /* DEBUG */
758
759	/*
760	 *	Initialize the "kernel object"
761	 */
762
763	kernel_object = &kernel_object_store;
764
765/*
766 *	Note that in the following size specifications, we need to add 1 because
767 *	VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size.
768 */
769
770#ifdef ppc
771	_vm_object_allocate(vm_last_addr + 1,
772			    kernel_object);
773#else
774	_vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
775			    kernel_object);
776
777	_vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
778			    compressor_object);
779#endif
780	kernel_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
781	compressor_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
782
783	/*
784	 *	Initialize the "submap object".  Make it as large as the
785	 *	kernel object so that no limit is imposed on submap sizes.
786	 */
787
788	vm_submap_object = &vm_submap_object_store;
789#ifdef ppc
790	_vm_object_allocate(vm_last_addr + 1,
791			    vm_submap_object);
792#else
793	_vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
794			    vm_submap_object);
795#endif
796	vm_submap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
797
798	/*
799	 * Create an "extra" reference to this object so that we never
800	 * try to deallocate it; zfree doesn't like to be called with
801	 * non-zone memory.
802	 */
803	vm_object_reference(vm_submap_object);
804
805#if	MACH_PAGEMAP
806	vm_external_module_initialize();
807#endif	/* MACH_PAGEMAP */
808}
809
810#if CONFIG_IOSCHED
811void
812vm_io_reprioritize_init(void)
813{
814	kern_return_t 	result;
815	thread_t 	thread = THREAD_NULL;
816
	/* Initialize the I/O reprioritization subsystem */
	lck_spin_init(&io_reprioritize_list_lock, &vm_object_lck_grp, &vm_object_lck_attr);
	queue_init(&io_reprioritize_list);

	io_reprioritize_req_zone = zinit(sizeof(struct io_reprioritize_req),
					 MAX_IO_REPRIORITIZE_REQS * sizeof(struct io_reprioritize_req),
					 4096, "io_reprioritize_req");

	result = kernel_thread_start_priority(io_reprioritize_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
	if (result == KERN_SUCCESS) {
		thread_deallocate(thread);
	} else {
		panic("Could not create io_reprioritize_thread");
	}
}
832#endif
833
834void
835vm_object_reaper_init(void)
836{
837	kern_return_t	kr;
838	thread_t	thread;
839
840	kr = kernel_thread_start_priority(
841		(thread_continue_t) vm_object_reaper_thread,
842		NULL,
843		BASEPRI_PREEMPT - 1,
844		&thread);
845	if (kr != KERN_SUCCESS) {
846		panic("failed to launch vm_object_reaper_thread kr=0x%x", kr);
847	}
848	thread_deallocate(thread);
849}
850
851__private_extern__ void
852vm_object_init(void)
853{
854	/*
855	 *	Finish initializing the kernel object.
856	 */
857}
858
859
860__private_extern__ void
861vm_object_init_lck_grp(void)
862{
	/*
	 * initialize the vm_object lock world
	 */
866	lck_grp_attr_setdefault(&vm_object_lck_grp_attr);
867	lck_grp_init(&vm_object_lck_grp, "vm_object", &vm_object_lck_grp_attr);
868	lck_grp_init(&vm_object_cache_lck_grp, "vm_object_cache", &vm_object_lck_grp_attr);
869	lck_attr_setdefault(&vm_object_lck_attr);
870	lck_attr_setdefault(&kernel_object_lck_attr);
871	lck_attr_cleardebug(&kernel_object_lck_attr);
872	lck_attr_setdefault(&compressor_object_lck_attr);
873	lck_attr_cleardebug(&compressor_object_lck_attr);
874}
875
876#if VM_OBJECT_CACHE
877#define	MIGHT_NOT_CACHE_SHADOWS		1
878#if	MIGHT_NOT_CACHE_SHADOWS
879static int cache_shadows = TRUE;
880#endif	/* MIGHT_NOT_CACHE_SHADOWS */
881#endif
882
883/*
884 *	vm_object_deallocate:
885 *
886 *	Release a reference to the specified object,
887 *	gained either through a vm_object_allocate
888 *	or a vm_object_reference call.  When all references
889 *	are gone, storage associated with this object
890 *	may be relinquished.
891 *
892 *	No object may be locked.
893 */
894unsigned long vm_object_deallocate_shared_successes = 0;
895unsigned long vm_object_deallocate_shared_failures = 0;
896unsigned long vm_object_deallocate_shared_swap_failures = 0;
897__private_extern__ void
898vm_object_deallocate(
899	register vm_object_t	object)
900{
901#if VM_OBJECT_CACHE
902	boolean_t	retry_cache_trim = FALSE;
903	uint32_t	try_failed_count = 0;
904#endif
905	vm_object_t	shadow = VM_OBJECT_NULL;
906
907//	if(object)dbgLog(object, object->ref_count, object->can_persist, 3);	/* (TEST/DEBUG) */
908//	else dbgLog(object, 0, 0, 3);	/* (TEST/DEBUG) */
909
910	if (object == VM_OBJECT_NULL)
911	        return;
912
913	if (object == kernel_object || object == compressor_object) {
914		vm_object_lock_shared(object);
915
916		OSAddAtomic(-1, &object->ref_count);
917
918		if (object->ref_count == 0) {
919			if (object == kernel_object)
920				panic("vm_object_deallocate: losing kernel_object\n");
921			else
922				panic("vm_object_deallocate: losing compressor_object\n");
923		}
924		vm_object_unlock(object);
925		return;
926	}
927
928	if (object->ref_count == 2 &&
929	    object->named) {
930		/*
931		 * This "named" object's reference count is about to
932		 * drop from 2 to 1:
933		 * we'll need to call memory_object_last_unmap().
934		 */
935	} else if (object->ref_count == 2 &&
936		   object->internal &&
937		   object->shadow != VM_OBJECT_NULL) {
938		/*
939		 * This internal object's reference count is about to
940		 * drop from 2 to 1 and it has a shadow object:
941		 * we'll want to try and collapse this object with its
942		 * shadow.
943		 */
944	} else if (object->ref_count >= 2) {
945		UInt32		original_ref_count;
946		volatile UInt32	*ref_count_p;
947		Boolean		atomic_swap;
948
949		/*
950		 * The object currently looks like it is not being
951		 * kept alive solely by the reference we're about to release.
952		 * Let's try and release our reference without taking
953		 * all the locks we would need if we had to terminate the
954		 * object (cache lock + exclusive object lock).
955		 * Lock the object "shared" to make sure we don't race with
956		 * anyone holding it "exclusive".
957		 */
958	        vm_object_lock_shared(object);
959		ref_count_p = (volatile UInt32 *) &object->ref_count;
960		original_ref_count = object->ref_count;
961		/*
962		 * Test again as "ref_count" could have changed.
963		 * "named" shouldn't change.
964		 */
965		if (original_ref_count == 2 &&
966		    object->named) {
967			/* need to take slow path for m_o_last_unmap() */
968			atomic_swap = FALSE;
969		} else if (original_ref_count == 2 &&
970			   object->internal &&
971			   object->shadow != VM_OBJECT_NULL) {
972			/* need to take slow path for vm_object_collapse() */
973			atomic_swap = FALSE;
974		} else if (original_ref_count < 2) {
975			/* need to take slow path for vm_object_terminate() */
976			atomic_swap = FALSE;
977		} else {
978			/* try an atomic update with the shared lock */
979			atomic_swap = OSCompareAndSwap(
980				original_ref_count,
981				original_ref_count - 1,
982				(UInt32 *) &object->ref_count);
983			if (atomic_swap == FALSE) {
984				vm_object_deallocate_shared_swap_failures++;
985				/* fall back to the slow path... */
986			}
987		}
988
989		vm_object_unlock(object);
990
991		if (atomic_swap) {
992			/*
993			 * ref_count was updated atomically !
994			 */
995			vm_object_deallocate_shared_successes++;
996			return;
997		}
998
999		/*
1000		 * Someone else updated the ref_count at the same
1001		 * time and we lost the race.  Fall back to the usual
1002		 * slow but safe path...
1003		 */
1004		vm_object_deallocate_shared_failures++;
1005	}
1006
1007	while (object != VM_OBJECT_NULL) {
1008
1009		vm_object_lock(object);
1010
1011		assert(object->ref_count > 0);
1012
1013		/*
1014		 *	If the object has a named reference, and only
1015		 *	that reference would remain, inform the pager
1016		 *	about the last "mapping" reference going away.
1017		 */
1018		if ((object->ref_count == 2)  && (object->named)) {
1019			memory_object_t	pager = object->pager;
1020
1021			/* Notify the Pager that there are no */
1022			/* more mappers for this object */
1023
1024			if (pager != MEMORY_OBJECT_NULL) {
1025				vm_object_mapping_wait(object, THREAD_UNINT);
1026				vm_object_mapping_begin(object);
1027				vm_object_unlock(object);
1028
1029				memory_object_last_unmap(pager);
1030
1031				vm_object_lock(object);
1032				vm_object_mapping_end(object);
1033			}
1034			assert(object->ref_count > 0);
1035		}
1036
1037		/*
1038		 *	Lose the reference. If other references
1039		 *	remain, then we are done, unless we need
1040		 *	to retry a cache trim.
1041		 *	If it is the last reference, then keep it
1042		 *	until any pending initialization is completed.
1043		 */
1044
1045		/* if the object is terminating, it cannot go into */
1046		/* the cache and we obviously should not call      */
1047		/* terminate again.  */
1048
1049		if ((object->ref_count > 1) || object->terminating) {
1050			vm_object_lock_assert_exclusive(object);
1051			object->ref_count--;
1052			vm_object_res_deallocate(object);
1053
1054			if (object->ref_count == 1 &&
1055			    object->shadow != VM_OBJECT_NULL) {
1056				/*
1057				 * There's only one reference left on this
1058				 * VM object.  We can't tell if it's a valid
1059				 * one (from a mapping for example) or if this
1060				 * object is just part of a possibly stale and
1061				 * useless shadow chain.
1062				 * We would like to try and collapse it into
1063				 * its parent, but we don't have any pointers
1064				 * back to this parent object.
1065				 * But we can try and collapse this object with
1066				 * its own shadows, in case these are useless
1067				 * too...
1068				 * We can't bypass this object though, since we
1069				 * don't know if this last reference on it is
1070				 * meaningful or not.
1071				 */
1072				vm_object_collapse(object, 0, FALSE);
1073			}
1074			vm_object_unlock(object);
1075#if VM_OBJECT_CACHE
1076			if (retry_cache_trim &&
1077			    ((object = vm_object_cache_trim(TRUE)) !=
1078			     VM_OBJECT_NULL)) {
1079				continue;
1080			}
1081#endif
1082			return;
1083		}
1084
1085		/*
1086		 *	We have to wait for initialization
1087		 *	before destroying or caching the object.
1088		 */
1089
1090		if (object->pager_created && ! object->pager_initialized) {
1091			assert(! object->can_persist);
1092			vm_object_assert_wait(object,
1093					      VM_OBJECT_EVENT_INITIALIZED,
1094					      THREAD_UNINT);
1095			vm_object_unlock(object);
1096
1097			thread_block(THREAD_CONTINUE_NULL);
1098			continue;
1099		}
1100
1101#if VM_OBJECT_CACHE
1102		/*
1103		 *	If this object can persist, then enter it in
1104		 *	the cache. Otherwise, terminate it.
1105		 *
1106		 * 	NOTE:  Only permanent objects are cached, and
1107		 *	permanent objects cannot have shadows.  This
1108		 *	affects the residence counting logic in a minor
1109		 *	way (can do it in-line, mostly).
1110		 */
1111
1112		if ((object->can_persist) && (object->alive)) {
1113			/*
1114			 *	Now it is safe to decrement reference count,
1115			 *	and to return if reference count is > 0.
1116			 */
1117
1118			vm_object_lock_assert_exclusive(object);
1119			if (--object->ref_count > 0) {
1120				vm_object_res_deallocate(object);
1121				vm_object_unlock(object);
1122
1123				if (retry_cache_trim &&
1124				    ((object = vm_object_cache_trim(TRUE)) !=
1125				     VM_OBJECT_NULL)) {
1126					continue;
1127				}
1128				return;
1129			}
1130
1131#if	MIGHT_NOT_CACHE_SHADOWS
1132			/*
1133			 *	Remove shadow now if we don't
1134			 *	want to cache shadows.
1135			 */
1136			if (! cache_shadows) {
1137				shadow = object->shadow;
1138				object->shadow = VM_OBJECT_NULL;
1139			}
1140#endif	/* MIGHT_NOT_CACHE_SHADOWS */
1141
1142			/*
1143			 *	Enter the object onto the queue of
1144			 *	cached objects, and deactivate
1145			 *	all of its pages.
1146			 */
1147			assert(object->shadow == VM_OBJECT_NULL);
1148			VM_OBJ_RES_DECR(object);
1149			XPR(XPR_VM_OBJECT,
1150		      "vm_o_deallocate: adding %x to cache, queue = (%x, %x)\n",
1151				object,
1152				vm_object_cached_list.next,
1153				vm_object_cached_list.prev,0,0);
1154
1155
1156			vm_object_unlock(object);
1157
1158			try_failed_count = 0;
1159			for (;;) {
1160				vm_object_cache_lock();
1161
1162				/*
1163				 * if we try to take a regular lock here
1164				 * we risk deadlocking against someone
1165				 * holding a lock on this object while
1166				 * trying to vm_object_deallocate a different
1167				 * object
1168				 */
1169				if (vm_object_lock_try(object))
1170					break;
1171				vm_object_cache_unlock();
1172				try_failed_count++;
1173
1174				mutex_pause(try_failed_count);  /* wait a bit */
1175			}
1176			vm_object_cached_count++;
1177			if (vm_object_cached_count > vm_object_cached_high)
1178				vm_object_cached_high = vm_object_cached_count;
1179			queue_enter(&vm_object_cached_list, object,
1180				vm_object_t, cached_list);
1181			vm_object_cache_unlock();
1182
1183			vm_object_deactivate_all_pages(object);
1184			vm_object_unlock(object);
1185
1186#if	MIGHT_NOT_CACHE_SHADOWS
1187			/*
1188			 *	If we have a shadow that we need
1189			 *	to deallocate, do so now, remembering
1190			 *	to trim the cache later.
1191			 */
1192			if (! cache_shadows && shadow != VM_OBJECT_NULL) {
1193				object = shadow;
1194				retry_cache_trim = TRUE;
1195				continue;
1196			}
1197#endif	/* MIGHT_NOT_CACHE_SHADOWS */
1198
1199			/*
1200			 *	Trim the cache. If the cache trim
1201			 *	returns with a shadow for us to deallocate,
1202			 *	then remember to retry the cache trim
1203			 *	when we are done deallocating the shadow.
1204			 *	Otherwise, we are done.
1205			 */
1206
1207			object = vm_object_cache_trim(TRUE);
1208			if (object == VM_OBJECT_NULL) {
1209				return;
1210			}
1211			retry_cache_trim = TRUE;
1212		} else
1213#endif	/* VM_OBJECT_CACHE */
1214		{
1215			/*
1216			 *	This object is not cachable; terminate it.
1217			 */
1218			XPR(XPR_VM_OBJECT,
1219	 "vm_o_deallocate: !cacheable 0x%X res %d paging_ops %d thread 0x%p ref %d\n",
1220			    object, object->resident_page_count,
1221			    object->paging_in_progress,
1222			    (void *)current_thread(),object->ref_count);
1223
1224			VM_OBJ_RES_DECR(object);	/* XXX ? */
1225			/*
1226			 *	Terminate this object. If it had a shadow,
1227			 *	then deallocate it; otherwise, if we need
1228			 *	to retry a cache trim, do so now; otherwise,
1229			 *	we are done. "pageout" objects have a shadow,
1230			 *	but maintain a "paging reference" rather than
1231			 *	a normal reference.
1232			 */
1233			shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
1234
1235			if (vm_object_terminate(object) != KERN_SUCCESS) {
1236				return;
1237			}
1238			if (shadow != VM_OBJECT_NULL) {
1239				object = shadow;
1240				continue;
1241			}
1242#if VM_OBJECT_CACHE
1243			if (retry_cache_trim &&
1244			    ((object = vm_object_cache_trim(TRUE)) !=
1245			     VM_OBJECT_NULL)) {
1246				continue;
1247			}
1248#endif
1249			return;
1250		}
1251	}
1252#if VM_OBJECT_CACHE
1253	assert(! retry_cache_trim);
1254#endif
1255}
1256
1257
1258
1259vm_page_t
1260vm_object_page_grab(
1261	vm_object_t	object)
1262{
1263	vm_page_t	p, next_p;
1264	int		p_limit = 0;
1265	int		p_skipped = 0;
1266
1267	vm_object_lock_assert_exclusive(object);
1268
1269	next_p = (vm_page_t)queue_first(&object->memq);
1270	p_limit = MIN(50, object->resident_page_count);
1271
1272	while (!queue_end(&object->memq, (queue_entry_t)next_p) && --p_limit > 0) {
1273
1274		p = next_p;
1275		next_p = (vm_page_t)queue_next(&next_p->listq);
1276
1277		if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry || p->fictitious)
1278			goto move_page_in_obj;
1279
1280		if (p->pmapped || p->dirty || p->precious) {
1281			vm_page_lockspin_queues();
1282
1283			if (p->pmapped) {
1284				int refmod_state;
1285
1286				vm_object_page_grab_pmapped++;
1287
1288				if (p->reference == FALSE || p->dirty == FALSE) {
1289
1290					refmod_state = pmap_get_refmod(p->phys_page);
1291
1292					if (refmod_state & VM_MEM_REFERENCED)
1293						p->reference = TRUE;
1294					if (refmod_state & VM_MEM_MODIFIED) {
1295						SET_PAGE_DIRTY(p, FALSE);
1296					}
1297				}
1298				if (p->dirty == FALSE && p->precious == FALSE) {
1299
1300					refmod_state = pmap_disconnect(p->phys_page);
1301
1302					if (refmod_state & VM_MEM_REFERENCED)
1303						p->reference = TRUE;
1304					if (refmod_state & VM_MEM_MODIFIED) {
1305						SET_PAGE_DIRTY(p, FALSE);
1306					}
1307
1308					if (p->dirty == FALSE)
1309						goto take_page;
1310				}
1311			}
1312			if (p->inactive && p->reference == TRUE) {
1313				vm_page_activate(p);
1314
1315				VM_STAT_INCR(reactivations);
1316				vm_object_page_grab_reactivations++;
1317			}
1318			vm_page_unlock_queues();
1319move_page_in_obj:
1320			queue_remove(&object->memq, p, vm_page_t, listq);
1321			queue_enter(&object->memq, p, vm_page_t, listq);
1322
1323			p_skipped++;
1324			continue;
1325		}
1326		vm_page_lockspin_queues();
1327take_page:
1328		vm_page_free_prepare_queues(p);
1329		vm_object_page_grab_returned++;
1330		vm_object_page_grab_skipped += p_skipped;
1331
1332		vm_page_unlock_queues();
1333
1334		vm_page_free_prepare_object(p, TRUE);
1335
1336		return (p);
1337	}
1338	vm_object_page_grab_skipped += p_skipped;
1339	vm_object_page_grab_failed++;
1340
1341	return (NULL);
1342}
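
/*
 * vm_object_page_grab() is entered with the object locked exclusively (see
 * the assert above).  It scans up to ~50 resident pages looking for one it
 * can reclaim immediately: wired, busy, cleaning, laundry and fictitious
 * pages, and pages that turn out to be dirty or precious, are rotated to the
 * tail of memq so later calls make progress.  A reclaimable page is removed
 * from the page queues and the object and returned ready to be freed; NULL
 * is returned if nothing suitable is found within the scan limit.
 */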
1343
1344
1345
1346#define EVICT_PREPARE_LIMIT	64
1347#define EVICT_AGE		10
1348
1349static	clock_sec_t	vm_object_cache_aging_ts = 0;
1350
1351static void
1352vm_object_cache_remove_locked(
1353	vm_object_t	object)
1354{
1355	queue_remove(&vm_object_cached_list, object, vm_object_t, objq);
1356	object->objq.next = NULL;
1357	object->objq.prev = NULL;
1358
1359	vm_object_cached_count--;
1360}
1361
1362void
1363vm_object_cache_remove(
1364	vm_object_t	object)
1365{
1366	vm_object_cache_lock_spin();
1367
1368	if (object->objq.next || object->objq.prev)
1369		vm_object_cache_remove_locked(object);
1370
1371	vm_object_cache_unlock();
1372}
1373
1374void
1375vm_object_cache_add(
1376	vm_object_t	object)
1377{
1378	clock_sec_t sec;
1379	clock_nsec_t nsec;
1380
1381	if (object->resident_page_count == 0)
1382		return;
1383	clock_get_system_nanotime(&sec, &nsec);
1384
1385	vm_object_cache_lock_spin();
1386
1387	if (object->objq.next == NULL && object->objq.prev == NULL) {
1388		queue_enter(&vm_object_cached_list, object, vm_object_t, objq);
1389		object->vo_cache_ts = sec + EVICT_AGE;
1390		object->vo_cache_pages_to_scan = object->resident_page_count;
1391
1392		vm_object_cached_count++;
1393		vm_object_cache_adds++;
1394	}
1395	vm_object_cache_unlock();
1396}
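
/*
 * The cache ages by wall-clock seconds: with EVICT_AGE of 10, an object
 * added at sec == 1000 gets vo_cache_ts == 1010 and only becomes eligible
 * for eviction once clock_get_system_nanotime() reports sec >= 1010 (see
 * the vo_cache_ts check in vm_object_cache_evict() below).
 */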
1397
1398int
1399vm_object_cache_evict(
1400	int	num_to_evict,
1401	int	max_objects_to_examine)
1402{
1403	vm_object_t	object = VM_OBJECT_NULL;
1404	vm_object_t	next_obj = VM_OBJECT_NULL;
1405	vm_page_t	local_free_q = VM_PAGE_NULL;
1406	vm_page_t	p;
1407	vm_page_t	next_p;
1408	int		object_cnt = 0;
1409	vm_page_t	ep_array[EVICT_PREPARE_LIMIT];
1410	int		ep_count;
1411	int		ep_limit;
1412	int		ep_index;
1413	int		ep_freed = 0;
1414	int		ep_moved = 0;
1415	uint32_t	ep_skipped = 0;
1416	clock_sec_t	sec;
1417	clock_nsec_t	nsec;
1418
1419	KERNEL_DEBUG(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
1420	/*
1421	 * do a couple of quick checks to see if it's
1422	 * worthwhile grabbing the lock
1423	 */
1424	if (queue_empty(&vm_object_cached_list)) {
1425		KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, 0, 0, 0, 0, 0);
1426		return (0);
1427	}
1428	clock_get_system_nanotime(&sec, &nsec);
1429
1430	/*
1431	 * the object on the head of the queue has not
1432	 * yet sufficiently aged
1433	 */
1434	if (sec < vm_object_cache_aging_ts) {
1435		KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, 0, 0, 0, 0, 0);
1436		return (0);
1437	}
1438	/*
1439	 * don't need the queue lock to find
1440	 * and lock an object on the cached list
1441	 */
1442	vm_page_unlock_queues();
1443
1444	vm_object_cache_lock_spin();
1445
1446	for (;;) {
1447		next_obj = (vm_object_t)queue_first(&vm_object_cached_list);
1448
1449		while (!queue_end(&vm_object_cached_list, (queue_entry_t)next_obj) && object_cnt++ < max_objects_to_examine) {
1450
1451			object = next_obj;
1452			next_obj = (vm_object_t)queue_next(&next_obj->objq);
1453
1454			if (sec < object->vo_cache_ts) {
1455				KERNEL_DEBUG(0x130020c, object, object->resident_page_count, object->vo_cache_ts, sec, 0);
1456
1457				vm_object_cache_aging_ts = object->vo_cache_ts;
1458				object = VM_OBJECT_NULL;
1459				break;
1460			}
1461			if (!vm_object_lock_try_scan(object)) {
				/*
				 * just skip over this guy for now... if we find
				 * an object to steal pages from, we'll revisit it in a bit...
				 * hopefully, the lock will have cleared
				 */
1467				KERNEL_DEBUG(0x13001f8, object, object->resident_page_count, 0, 0, 0);
1468
1469				object = VM_OBJECT_NULL;
1470				continue;
1471			}
1472			if (queue_empty(&object->memq) || object->vo_cache_pages_to_scan == 0) {
1473				/*
1474				 * this case really shouldn't happen, but it's not fatal
1475				 * so deal with it... if we don't remove the object from
1476				 * the list, we'll never move past it.
1477				 */
1478				KERNEL_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved, 0);
1479
1480				vm_object_cache_remove_locked(object);
1481				vm_object_unlock(object);
1482				object = VM_OBJECT_NULL;
1483				continue;
1484			}
1485			/*
1486			 * we have a locked object with pages...
1487			 * time to start harvesting
1488			 */
1489			break;
1490		}
1491		vm_object_cache_unlock();
1492
1493		if (object == VM_OBJECT_NULL)
1494			break;
1495
1496		/*
1497		 * object is locked at this point and
1498		 * has resident pages
1499		 */
1500		next_p = (vm_page_t)queue_first(&object->memq);
1501
1502		/*
1503		 * break the page scan into 2 pieces to minimize the time spent
1504		 * behind the page queue lock...
1505		 * the list of pages on these unused objects is likely to be cold
1506		 * w/r to the cpu cache which increases the time to scan the list
1507		 * tenfold...  and we may have a 'run' of pages we can't utilize that
1508		 * needs to be skipped over...
1509		 */
1510		if ((ep_limit = num_to_evict - (ep_freed + ep_moved)) > EVICT_PREPARE_LIMIT)
1511			ep_limit = EVICT_PREPARE_LIMIT;
1512		ep_count = 0;
1513
1514		while (!queue_end(&object->memq, (queue_entry_t)next_p) && object->vo_cache_pages_to_scan && ep_count < ep_limit) {
1515
1516			p = next_p;
1517			next_p = (vm_page_t)queue_next(&next_p->listq);
1518
1519			object->vo_cache_pages_to_scan--;
1520
1521			if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry) {
1522				queue_remove(&object->memq, p, vm_page_t, listq);
1523				queue_enter(&object->memq, p, vm_page_t, listq);
1524
1525				ep_skipped++;
1526				continue;
1527			}
1528			if (p->wpmapped || p->dirty || p->precious) {
1529				queue_remove(&object->memq, p, vm_page_t, listq);
1530				queue_enter(&object->memq, p, vm_page_t, listq);
1531
1532				pmap_clear_reference(p->phys_page);
1533			}
1534			ep_array[ep_count++] = p;
1535		}
1536		KERNEL_DEBUG(0x13001f4 | DBG_FUNC_START, object, object->resident_page_count, ep_freed, ep_moved, 0);
1537
1538		vm_page_lockspin_queues();
1539
1540		for (ep_index = 0; ep_index < ep_count; ep_index++) {
1541
1542			p = ep_array[ep_index];
1543
1544			if (p->wpmapped || p->dirty || p->precious) {
1545				p->reference = FALSE;
1546				p->no_cache = FALSE;
1547
1548				/*
1549				 * we've already filtered out pages that are in the laundry
1550				 * so if we get here, this page can't be on the pageout queue
1551				 */
1552				assert(!p->pageout_queue);
1553
1554				VM_PAGE_QUEUES_REMOVE(p);
1555				VM_PAGE_ENQUEUE_INACTIVE(p, TRUE);
1556
1557				ep_moved++;
1558			} else {
1559#if CONFIG_PHANTOM_CACHE
1560				vm_phantom_cache_add_ghost(p);
1561#endif
1562				vm_page_free_prepare_queues(p);
1563
1564				assert(p->pageq.next == NULL && p->pageq.prev == NULL);
1565				/*
1566				 * Add this page to our list of reclaimed pages,
1567				 * to be freed later.
1568				 */
1569				p->pageq.next = (queue_entry_t) local_free_q;
1570				local_free_q = p;
1571
1572				ep_freed++;
1573			}
1574		}
1575		vm_page_unlock_queues();
1576
1577		KERNEL_DEBUG(0x13001f4 | DBG_FUNC_END, object, object->resident_page_count, ep_freed, ep_moved, 0);
1578
1579		if (local_free_q) {
1580			vm_page_free_list(local_free_q, TRUE);
1581			local_free_q = VM_PAGE_NULL;
1582		}
1583		if (object->vo_cache_pages_to_scan == 0) {
1584			KERNEL_DEBUG(0x1300208, object, object->resident_page_count, ep_freed, ep_moved, 0);
1585
1586			vm_object_cache_remove(object);
1587
1588			KERNEL_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved, 0);
1589		}
1590		/*
1591		 * done with this object
1592		 */
1593		vm_object_unlock(object);
1594		object = VM_OBJECT_NULL;
1595
1596		/*
1597		 * at this point, we are not holding any locks
1598		 */
1599		if ((ep_freed + ep_moved) >= num_to_evict) {
1600			/*
1601			 * we've reached our target for the
1602			 * number of pages to evict
1603			 */
1604			break;
1605		}
1606		vm_object_cache_lock_spin();
1607	}
1608	/*
1609	 * put the page queues lock back to the caller's
1610	 * idea of it
1611	 */
1612	vm_page_lock_queues();
1613
1614	vm_object_cache_pages_freed += ep_freed;
1615	vm_object_cache_pages_moved += ep_moved;
1616	vm_object_cache_pages_skipped += ep_skipped;
1617
1618	KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, ep_freed, 0, 0, 0, 0);
1619	return (ep_freed);
1620}
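
/*
 * Sketch of a hypothetical caller of vm_object_cache_evict(): it must be
 * entered with the page queues lock held (the function drops and retakes it
 * internally).  The actual numbers are whatever the reclaim policy chooses.
 */
#if 0	/* illustrative sketch only -- not compiled */
static void
vm_object_cache_evict_sketch(void)
{
	int	freed;

	vm_page_lock_queues();
	/* try to reclaim up to 100 pages, examining at most 10 cached objects */
	freed = vm_object_cache_evict(100, 10);
	vm_page_unlock_queues();

	(void) freed;
}
#endif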
1621
1622
1623#if VM_OBJECT_CACHE
1624/*
1625 *	Check to see whether we really need to trim
1626 *	down the cache. If so, remove an object from
1627 *	the cache, terminate it, and repeat.
1628 *
1629 *	Called with, and returns with, cache lock unlocked.
1630 */
1631vm_object_t
1632vm_object_cache_trim(
1633	boolean_t called_from_vm_object_deallocate)
1634{
1635	register vm_object_t object = VM_OBJECT_NULL;
1636	vm_object_t shadow;
1637
1638	for (;;) {
1639
1640		/*
1641		 *	If we no longer need to trim the cache,
1642		 *	then we are done.
1643		 */
1644		if (vm_object_cached_count <= vm_object_cached_max)
1645			return VM_OBJECT_NULL;
1646
1647		vm_object_cache_lock();
1648		if (vm_object_cached_count <= vm_object_cached_max) {
1649			vm_object_cache_unlock();
1650			return VM_OBJECT_NULL;
1651		}
1652
1653		/*
1654		 *	We must trim down the cache, so remove
1655		 *	the first object in the cache.
1656		 */
1657		XPR(XPR_VM_OBJECT,
1658		"vm_object_cache_trim: removing from front of cache (%x, %x)\n",
1659			vm_object_cached_list.next,
1660			vm_object_cached_list.prev, 0, 0, 0);
1661
1662		object = (vm_object_t) queue_first(&vm_object_cached_list);
1663		if(object == (vm_object_t) &vm_object_cached_list) {
1664			/* something's wrong with the calling parameter or */
1665			/* the value of vm_object_cached_count, just fix   */
1666			/* and return */
1667			if(vm_object_cached_max < 0)
1668				vm_object_cached_max = 0;
1669			vm_object_cached_count = 0;
1670			vm_object_cache_unlock();
1671			return VM_OBJECT_NULL;
1672		}
1673		vm_object_lock(object);
1674		queue_remove(&vm_object_cached_list, object, vm_object_t,
1675			     cached_list);
1676		vm_object_cached_count--;
1677
1678		vm_object_cache_unlock();
1679		/*
1680		 *	Since this object is in the cache, we know
1681		 *	that it is initialized and has no references.
1682		 *	Take a reference to avoid recursive deallocations.
1683		 */
1684
1685		assert(object->pager_initialized);
1686		assert(object->ref_count == 0);
1687		vm_object_lock_assert_exclusive(object);
1688		object->ref_count++;
1689
1690		/*
1691		 *	Terminate the object.
1692		 *	If the object had a shadow, we let vm_object_deallocate
1693		 *	deallocate it. "pageout" objects have a shadow, but
1694		 *	maintain a "paging reference" rather than a normal
1695		 *	reference.
1696		 *	(We are careful here to limit recursion.)
1697		 */
1698		shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
1699
1700		if(vm_object_terminate(object) != KERN_SUCCESS)
1701			continue;
1702
1703		if (shadow != VM_OBJECT_NULL) {
1704			if (called_from_vm_object_deallocate) {
1705				return shadow;
1706			} else {
1707				vm_object_deallocate(shadow);
1708			}
1709		}
1710	}
1711}
1712#endif
1713
1714
1715/*
1716 *	Routine:	vm_object_terminate
1717 *	Purpose:
1718 *		Free all resources associated with a vm_object.
1719 *	In/out conditions:
1720 *		Upon entry, the object must be locked,
1721 *		and the object must have exactly one reference.
1722 *
1723 *		The shadow object reference is left alone.
1724 *
 *		The object must be unlocked if it's found that pages
 *		must be flushed to a backing object.  If someone
 *		manages to map the object while it is being flushed,
 *		the object is returned unlocked and unchanged.  Otherwise,
1729 *		upon exit, the cache will be unlocked, and the
1730 *		object will cease to exist.
1731 */
1732static kern_return_t
1733vm_object_terminate(
1734	vm_object_t	object)
1735{
1736	vm_object_t	shadow_object;
1737
1738	XPR(XPR_VM_OBJECT, "vm_object_terminate, object 0x%X ref %d\n",
1739		object, object->ref_count, 0, 0, 0);
1740
1741	if (!object->pageout && (!object->temporary || object->can_persist) &&
1742	    (object->pager != NULL || object->shadow_severed)) {
1743		/*
1744		 * Clear pager_trusted bit so that the pages get yanked
1745		 * out of the object instead of cleaned in place.  This
1746		 * prevents a deadlock in XMM and makes more sense anyway.
1747		 */
1748		object->pager_trusted = FALSE;
1749
1750		vm_object_reap_pages(object, REAP_TERMINATE);
1751	}
1752	/*
1753	 *	Make sure the object isn't already being terminated
1754	 */
1755	if (object->terminating) {
1756		vm_object_lock_assert_exclusive(object);
1757		object->ref_count--;
1758		assert(object->ref_count > 0);
1759		vm_object_unlock(object);
1760		return KERN_FAILURE;
1761	}
1762
1763	/*
1764	 * Did somebody get a reference to the object while we were
1765	 * cleaning it?
1766	 */
1767	if (object->ref_count != 1) {
1768		vm_object_lock_assert_exclusive(object);
1769		object->ref_count--;
1770		assert(object->ref_count > 0);
1771		vm_object_res_deallocate(object);
1772		vm_object_unlock(object);
1773		return KERN_FAILURE;
1774	}
1775
1776	/*
1777	 *	Make sure no one can look us up now.
1778	 */
1779
1780	object->terminating = TRUE;
1781	object->alive = FALSE;
1782
1783	if ( !object->internal && (object->objq.next || object->objq.prev))
1784		vm_object_cache_remove(object);
1785
1786	if (object->hashed) {
1787		lck_mtx_t	*lck;
1788
1789		lck = vm_object_hash_lock_spin(object->pager);
1790		vm_object_remove(object);
1791		vm_object_hash_unlock(lck);
1792	}
1793	/*
1794	 *	Detach the object from its shadow if we are the shadow's
1795	 *	copy. The reference we hold on the shadow must be dropped
1796	 *	by our caller.
1797	 */
1798	if (((shadow_object = object->shadow) != VM_OBJECT_NULL) &&
1799	    !(object->pageout)) {
1800		vm_object_lock(shadow_object);
1801		if (shadow_object->copy == object)
1802			shadow_object->copy = VM_OBJECT_NULL;
1803		vm_object_unlock(shadow_object);
1804	}
1805
1806	if (object->paging_in_progress != 0 ||
1807	    object->activity_in_progress != 0) {
1808		/*
1809		 * There are still some paging_in_progress references
1810		 * on this object, meaning that there are some paging
1811		 * or other I/O operations in progress for this VM object.
1812		 * Such operations take some paging_in_progress references
1813		 * up front to ensure that the object doesn't go away, but
1814		 * they may also need to acquire a reference on the VM object,
1815		 * to map it in kernel space, for example.  That means that
1816		 * they may end up releasing the last reference on the VM
1817		 * object, triggering its termination, while still holding
1818		 * paging_in_progress references.  Waiting for these
1819		 * pending paging_in_progress references to go away here would
1820		 * deadlock.
1821		 *
1822		 * To avoid deadlocking, we'll let the vm_object_reaper_thread
1823		 * complete the VM object termination if it still holds
1824		 * paging_in_progress references at this point.
1825		 *
1826		 * No new paging_in_progress should appear now that the
1827		 * VM object is "terminating" and not "alive".
1828		 */
1829		vm_object_reap_async(object);
1830		vm_object_unlock(object);
1831		/*
1832		 * Return KERN_FAILURE to let the caller know that we
1833		 * haven't completed the termination and it can't drop this
1834		 * object's reference on its shadow object yet.
1835		 * The reaper thread will take care of that once it has
1836		 * completed this object's termination.
1837		 */
1838		return KERN_FAILURE;
1839	}
1840	/*
1841	 * complete the VM object termination
1842	 */
1843	vm_object_reap(object);
1844	object = VM_OBJECT_NULL;
1845
1846	/*
1847	 * the object lock was released by vm_object_reap()
1848	 *
1849	 * KERN_SUCCESS means that this object has been terminated
1850	 * and no longer needs its shadow object but still holds a
1851	 * reference on it.
1852	 * The caller is responsible for dropping that reference.
1853	 * We can't call vm_object_deallocate() here because that
1854	 * would create a recursion.
1855	 */
1856	return KERN_SUCCESS;
1857}
1858
1859
1860/*
1861 * vm_object_reap():
1862 *
1863 * Complete the termination of a VM object after it's been marked
1864 * as "terminating" and "!alive" by vm_object_terminate().
1865 *
1866 * The VM object must be locked by caller.
1867 * The lock will be released on return and the VM object is no longer valid.
1868 */
1869void
1870vm_object_reap(
1871	vm_object_t object)
1872{
1873	memory_object_t		pager;
1874
1875	vm_object_lock_assert_exclusive(object);
1876	assert(object->paging_in_progress == 0);
1877	assert(object->activity_in_progress == 0);
1878
1879	vm_object_reap_count++;
1880
1881	/*
1882	 * Disown this purgeable object to cleanup its owner's purgeable
1883	 * ledgers.  We need to do this before disconnecting the object
1884	 * from its pager, to properly account for compressed pages.
1885	 */
1886	if (object->internal &&
1887	    object->purgable != VM_PURGABLE_DENY) {
1888		vm_purgeable_accounting(object,
1889					object->purgable,
1890					TRUE); /* disown */
1891	}
1892
1893	pager = object->pager;
1894	object->pager = MEMORY_OBJECT_NULL;
1895
1896	if (pager != MEMORY_OBJECT_NULL)
1897		memory_object_control_disable(object->pager_control);
1898
1899	object->ref_count--;
1900#if	TASK_SWAPPER
1901	assert(object->res_count == 0);
1902#endif	/* TASK_SWAPPER */
1903
1904	assert(object->ref_count == 0);
1905
1906	/*
1907	 * remove from purgeable queue if it's on
1908	 */
1909	if (object->internal) {
1910		task_t owner;
1911
1912		owner = object->vo_purgeable_owner;
1913
1914		if (object->purgable == VM_PURGABLE_DENY) {
1915			/* not purgeable: nothing to do */
1916		} else if (object->purgable == VM_PURGABLE_VOLATILE) {
1917			purgeable_q_t queue;
1918
1919			assert(object->vo_purgeable_owner == NULL);
1920
1921			queue = vm_purgeable_object_remove(object);
1922			assert(queue);
1923
1924			if (object->purgeable_when_ripe) {
1925				/*
1926				 * Must take page lock for this -
1927				 * using it to protect token queue
1928				 */
1929				vm_page_lock_queues();
1930				vm_purgeable_token_delete_first(queue);
1931
1932				assert(queue->debug_count_objects >= 0);
1933				vm_page_unlock_queues();
1934			}
1935
1936			/*
1937			 * Update "vm_page_purgeable_count" in bulk and mark
1938			 * object as VM_PURGABLE_EMPTY to avoid updating
1939			 * "vm_page_purgeable_count" again in vm_page_remove()
1940			 * when reaping the pages.
1941			 */
1942			unsigned int delta;
1943			assert(object->resident_page_count >=
1944			       object->wired_page_count);
1945			delta = (object->resident_page_count -
1946				 object->wired_page_count);
1947			if (delta != 0) {
1948				assert(vm_page_purgeable_count >= delta);
1949				OSAddAtomic(-delta,
1950					    (SInt32 *)&vm_page_purgeable_count);
1951			}
1952			if (object->wired_page_count != 0) {
1953				assert(vm_page_purgeable_wired_count >=
1954				       object->wired_page_count);
1955				OSAddAtomic(-object->wired_page_count,
1956					    (SInt32 *)&vm_page_purgeable_wired_count);
1957			}
1958			object->purgable = VM_PURGABLE_EMPTY;
1959		}
1960		else if (object->purgable == VM_PURGABLE_NONVOLATILE ||
1961			 object->purgable == VM_PURGABLE_EMPTY) {
1962			/* remove from nonvolatile queue */
1963			assert(object->vo_purgeable_owner == TASK_NULL);
1964			vm_purgeable_nonvolatile_dequeue(object);
1965		} else {
1966			panic("object %p in unexpected purgeable state 0x%x\n",
1967			      object, object->purgable);
1968		}
1969		assert(object->objq.next == NULL);
1970		assert(object->objq.prev == NULL);
1971	}
1972
1973	/*
1974	 *	Clean or free the pages, as appropriate.
1975	 *	It is possible for us to find busy/absent pages,
1976	 *	if some faults on this object were aborted.
1977	 */
1978	if (object->pageout) {
1979		assert(object->shadow != VM_OBJECT_NULL);
1980
1981		vm_pageout_object_terminate(object);
1982
1983	} else if (((object->temporary && !object->can_persist) || (pager == MEMORY_OBJECT_NULL))) {
1984
1985		vm_object_reap_pages(object, REAP_REAP);
1986	}
1987	assert(queue_empty(&object->memq));
1988	assert(object->paging_in_progress == 0);
1989	assert(object->activity_in_progress == 0);
1990	assert(object->ref_count == 0);
1991
1992	/*
1993	 * If the pager has not already been released by
1994	 * vm_object_destroy, we need to terminate it and
1995	 * release our reference to it here.
1996	 */
1997	if (pager != MEMORY_OBJECT_NULL) {
1998		vm_object_unlock(object);
1999		vm_object_release_pager(pager, object->hashed);
2000		vm_object_lock(object);
2001	}
2002
2003	/* kick off anyone waiting on terminating */
2004	object->terminating = FALSE;
2005	vm_object_paging_begin(object);
2006	vm_object_paging_end(object);
2007	vm_object_unlock(object);
2008
2009#if	MACH_PAGEMAP
2010	vm_external_destroy(object->existence_map, object->vo_size);
2011#endif	/* MACH_PAGEMAP */
2012
2013	object->shadow = VM_OBJECT_NULL;
2014
2015#if VM_OBJECT_TRACKING
2016	if (vm_object_tracking_inited) {
2017		btlog_remove_entries_for_element(vm_object_tracking_btlog,
2018						 object);
2019	}
2020#endif /* VM_OBJECT_TRACKING */
2021
2022	vm_object_lock_destroy(object);
2023	/*
2024	 *	Free the space for the object.
2025	 */
2026	zfree(vm_object_zone, object);
2027	object = VM_OBJECT_NULL;
2028}
2029
2030
2031unsigned int vm_max_batch = 256;
2032
2033#define V_O_R_MAX_BATCH 128
2034
2035#define BATCH_LIMIT(max) 	(vm_max_batch >= max ? max : vm_max_batch)
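
/*
 * For example, with the defaults above (vm_max_batch = 256 and
 * V_O_R_MAX_BATCH = 128), BATCH_LIMIT(V_O_R_MAX_BATCH) evaluates to 128,
 * so the reap loop below drops the page queues lock and frees its local
 * list of reclaimed pages roughly every 128 pages.  Tuning vm_max_batch
 * below 128 would make it the effective batch size instead.
 */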
2036
2037
2038#define VM_OBJ_REAP_FREELIST(_local_free_q, do_disconnect)		\
2039	MACRO_BEGIN							\
2040	if (_local_free_q) {						\
2041		if (do_disconnect) {					\
2042			vm_page_t m;					\
2043			for (m = _local_free_q;				\
2044			     m != VM_PAGE_NULL;				\
2045			     m = (vm_page_t) m->pageq.next) {		\
2046				if (m->pmapped) {			\
2047					pmap_disconnect(m->phys_page);	\
2048				}					\
2049			}						\
2050		}							\
2051		vm_page_free_list(_local_free_q, TRUE);			\
2052		_local_free_q = VM_PAGE_NULL;				\
2053	}								\
2054	MACRO_END
2055
2056
2057void
2058vm_object_reap_pages(
2059	vm_object_t 	object,
2060	int		reap_type)
2061{
2062	vm_page_t	p;
2063	vm_page_t	next;
2064	vm_page_t	local_free_q = VM_PAGE_NULL;
2065	int		loop_count;
2066	boolean_t	disconnect_on_release;
2067	pmap_flush_context	pmap_flush_context_storage;
2068
2069	if (reap_type == REAP_DATA_FLUSH) {
2070		/*
2071		 * We need to disconnect pages from all pmaps before
2072		 * releasing them to the free list
2073		 */
2074		disconnect_on_release = TRUE;
2075	} else {
2076		/*
2077		 * Either the caller has already disconnected the pages
2078		 * from all pmaps, or we disconnect them here as we add
2079		 * them to our local list of pages to be released.
2080		 * No need to re-disconnect them when we release the pages
2081		 * to the free list.
2082		 */
2083		disconnect_on_release = FALSE;
2084	}
2085
2086restart_after_sleep:
2087	if (queue_empty(&object->memq))
2088		return;
2089	loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH);
2090
2091	if (reap_type == REAP_PURGEABLE)
2092		pmap_flush_context_init(&pmap_flush_context_storage);
2093
2094	vm_page_lockspin_queues();
2095
2096	next = (vm_page_t)queue_first(&object->memq);
2097
2098	while (!queue_end(&object->memq, (queue_entry_t)next)) {
2099
2100		p = next;
2101		next = (vm_page_t)queue_next(&next->listq);
2102
2103		if (--loop_count == 0) {
2104
2105			vm_page_unlock_queues();
2106
2107			if (local_free_q) {
2108
2109				if (reap_type == REAP_PURGEABLE) {
2110					pmap_flush(&pmap_flush_context_storage);
2111					pmap_flush_context_init(&pmap_flush_context_storage);
2112				}
2113				/*
2114				 * Free the pages we reclaimed so far
2115				 * and take a little break to avoid
2116				 * hogging the page queue lock too long
2117				 */
2118				VM_OBJ_REAP_FREELIST(local_free_q,
2119						     disconnect_on_release);
2120			} else
2121				mutex_pause(0);
2122
2123			loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH);
2124
2125			vm_page_lockspin_queues();
2126		}
2127		if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_TERMINATE) {
2128
2129			if (p->busy || p->cleaning) {
2130
2131				vm_page_unlock_queues();
2132				/*
2133				 * free the pages reclaimed so far
2134				 */
2135				VM_OBJ_REAP_FREELIST(local_free_q,
2136						     disconnect_on_release);
2137
2138				PAGE_SLEEP(object, p, THREAD_UNINT);
2139
2140				goto restart_after_sleep;
2141			}
2142			if (p->laundry) {
2143				p->pageout = FALSE;
2144
2145				vm_pageout_steal_laundry(p, TRUE);
2146			}
2147		}
2148		switch (reap_type) {
2149
2150		case REAP_DATA_FLUSH:
2151			if (VM_PAGE_WIRED(p)) {
2152				/*
2153				 * this is an odd case... perhaps we should
2154				 * zero-fill this page since we're conceptually
2155				 * tossing its data at this point, but leaving
2156				 * it on the object to honor the 'wire' contract
2157				 */
2158				continue;
2159			}
2160			break;
2161
2162		case REAP_PURGEABLE:
2163			if (VM_PAGE_WIRED(p)) {
2164				/*
2165				 * can't purge a wired page
2166				 */
2167				vm_page_purged_wired++;
2168				continue;
2169			}
2170			if (p->laundry && !p->busy && !p->cleaning) {
2171				p->pageout = FALSE;
2172
2173				vm_pageout_steal_laundry(p, TRUE);
2174			}
2175			if (p->cleaning || p->laundry || p->absent) {
2176				/*
2177				 * page is being acted upon,
2178				 * so don't mess with it
2179				 */
2180				vm_page_purged_others++;
2181				continue;
2182			}
2183			if (p->busy) {
2184				/*
2185				 * We can't reclaim a busy page, but since it's
2186				 * not wired we can deactivate it so that it
2187				 * gets considered by vm_pageout_scan()
2188				 * later.
2189				 */
2190				vm_page_deactivate(p);
2191				vm_page_purged_busy++;
2192				continue;
2193			}
2194
2195			assert(p->object != kernel_object);
2196
2197			/*
2198			 * we can discard this page...
2199			 */
2200			if (p->pmapped == TRUE) {
2201				/*
2202				 * unmap the page
2203				 */
2204				pmap_disconnect_options(p->phys_page, PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_NOREFMOD, (void *)&pmap_flush_context_storage);
2205			}
2206			vm_page_purged_count++;
2207
2208			break;
2209
2210		case REAP_TERMINATE:
2211			if (p->absent || p->private) {
2212				/*
2213				 *	For private pages, VM_PAGE_FREE just
2214				 *	leaves the page structure around for
2215				 *	its owner to clean up.  For absent
2216				 *	pages, the structure is returned to
2217				 *	the appropriate pool.
2218				 */
2219				break;
2220			}
2221			if (p->fictitious) {
2222			assert(p->phys_page == vm_page_guard_addr);
2223				break;
2224			}
2225			if (!p->dirty && p->wpmapped)
2226				p->dirty = pmap_is_modified(p->phys_page);
2227
2228			if ((p->dirty || p->precious) && !p->error && object->alive) {
2229
2230				if (!p->laundry) {
2231					VM_PAGE_QUEUES_REMOVE(p);
2232					/*
2233					 * flush page... page will be freed
2234					 * upon completion of I/O
2235					 */
2236					vm_pageout_cluster(p, TRUE);
2237				}
2238				vm_page_unlock_queues();
2239				/*
2240				 * free the pages reclaimed so far
2241				 */
2242				VM_OBJ_REAP_FREELIST(local_free_q,
2243						     disconnect_on_release);
2244
2245				vm_object_paging_wait(object, THREAD_UNINT);
2246
2247				goto restart_after_sleep;
2248			}
2249			break;
2250
2251		case REAP_REAP:
2252			break;
2253		}
2254		vm_page_free_prepare_queues(p);
2255		assert(p->pageq.next == NULL && p->pageq.prev == NULL);
2256		/*
2257		 * Add this page to our list of reclaimed pages,
2258		 * to be freed later.
2259		 */
2260		p->pageq.next = (queue_entry_t) local_free_q;
2261		local_free_q = p;
2262	}
2263	vm_page_unlock_queues();
2264
2265	/*
2266	 * Free the remaining reclaimed pages
2267	 */
2268	if (reap_type == REAP_PURGEABLE)
2269		pmap_flush(&pmap_flush_context_storage);
2270
2271	VM_OBJ_REAP_FREELIST(local_free_q,
2272			     disconnect_on_release);
2273}
2274
2275
2276void
2277vm_object_reap_async(
2278	vm_object_t	object)
2279{
2280	vm_object_lock_assert_exclusive(object);
2281
2282	vm_object_reaper_lock_spin();
2283
2284	vm_object_reap_count_async++;
2285
2286	/* enqueue the VM object... */
2287	queue_enter(&vm_object_reaper_queue, object,
2288		    vm_object_t, cached_list);
2289
2290	vm_object_reaper_unlock();
2291
2292	/* ... and wake up the reaper thread */
2293	thread_wakeup((event_t) &vm_object_reaper_queue);
2294}
2295
2296
2297void
2298vm_object_reaper_thread(void)
2299{
2300	vm_object_t	object, shadow_object;
2301
2302	vm_object_reaper_lock_spin();
2303
2304	while (!queue_empty(&vm_object_reaper_queue)) {
2305		queue_remove_first(&vm_object_reaper_queue,
2306				   object,
2307				   vm_object_t,
2308				   cached_list);
2309
2310		vm_object_reaper_unlock();
2311		vm_object_lock(object);
2312
2313		assert(object->terminating);
2314		assert(!object->alive);
2315
2316		/*
2317		 * The pageout daemon might be playing with our pages.
2318		 * Now that the object is dead, it won't touch any more
2319		 * pages, but some pages might already be on their way out.
2320		 * Hence, we wait until the active paging activities have
2321		 * ceased before we break the association with the pager
2322		 * itself.
2323		 */
2324		while (object->paging_in_progress != 0 ||
2325			object->activity_in_progress != 0) {
2326			vm_object_wait(object,
2327				       VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
2328				       THREAD_UNINT);
2329			vm_object_lock(object);
2330		}
2331
2332		shadow_object =
2333			object->pageout ? VM_OBJECT_NULL : object->shadow;
2334
2335		vm_object_reap(object);
2336		/* cache is unlocked and object is no longer valid */
2337		object = VM_OBJECT_NULL;
2338
2339		if (shadow_object != VM_OBJECT_NULL) {
2340			/*
2341			 * Drop the reference "object" was holding on
2342			 * its shadow object.
2343			 */
2344			vm_object_deallocate(shadow_object);
2345			shadow_object = VM_OBJECT_NULL;
2346		}
2347		vm_object_reaper_lock_spin();
2348	}
2349
2350	/* wait for more work... */
2351	assert_wait((event_t) &vm_object_reaper_queue, THREAD_UNINT);
2352
2353	vm_object_reaper_unlock();
2354
2355	thread_block((thread_continue_t) vm_object_reaper_thread);
2356	/*NOTREACHED*/
2357}
2358
2359/*
2360 *	Routine:	vm_object_pager_wakeup
2361 *	Purpose:	Wake up anyone waiting for termination of a pager.
2362 */
2363
2364static void
2365vm_object_pager_wakeup(
2366	memory_object_t	pager)
2367{
2368	vm_object_hash_entry_t	entry;
2369	boolean_t		waiting = FALSE;
2370	lck_mtx_t		*lck;
2371
2372	/*
2373	 *	If anyone was waiting for the memory_object_terminate
2374	 *	to be queued, wake them up now.
2375	 */
2376	lck = vm_object_hash_lock_spin(pager);
2377	entry = vm_object_hash_lookup(pager, TRUE);
2378	if (entry != VM_OBJECT_HASH_ENTRY_NULL)
2379		waiting = entry->waiting;
2380	vm_object_hash_unlock(lck);
2381
2382	if (entry != VM_OBJECT_HASH_ENTRY_NULL) {
2383		if (waiting)
2384			thread_wakeup((event_t) pager);
2385		vm_object_hash_entry_free(entry);
2386	}
2387}
2388
2389/*
2390 *	Routine:	vm_object_release_pager
2391 *	Purpose:	Terminate the pager and, upon completion,
2392 *			release our last reference to it.
2393 *			Just like memory_object_terminate, except
2394 *			that we wake up anyone blocked in vm_object_enter
2395 *			waiting for the termination message to be queued
2396 *			before calling memory_object_init.
2397 */
2398static void
2399vm_object_release_pager(
2400	memory_object_t	pager,
2401	boolean_t	hashed)
2402{
2403
2404	/*
2405	 *	Terminate the pager.
2406	 */
2407
2408	(void) memory_object_terminate(pager);
2409
2410	if (hashed == TRUE) {
2411		/*
2412		 *	Wake up anyone waiting for this termination
2413		 *	and remove the entry from the hash.
2414		 */
2415		vm_object_pager_wakeup(pager);
2416	}
2417	/*
2418	 *	Release reference to pager.
2419	 */
2420	memory_object_deallocate(pager);
2421}
2422
2423/*
2424 *	Routine:	vm_object_destroy
2425 *	Purpose:
2426 *		Shut down a VM object, despite the
2427 *		presence of address map (or other) references
2428 *		to the vm_object.
2429 */
2430kern_return_t
2431vm_object_destroy(
2432	vm_object_t		object,
2433	__unused kern_return_t		reason)
2434{
2435	memory_object_t		old_pager;
2436
2437	if (object == VM_OBJECT_NULL)
2438		return(KERN_SUCCESS);
2439
2440	/*
2441	 *	Remove the pager association immediately.
2442	 *
2443	 *	This will prevent the memory manager from further
2444	 *	meddling.  [If it wanted to flush data or make
2445	 *	other changes, it should have done so before performing
2446	 *	the destroy call.]
2447	 */
2448
2449	vm_object_lock(object);
2450	object->can_persist = FALSE;
2451	object->named = FALSE;
2452	object->alive = FALSE;
2453
2454	if (object->hashed) {
2455		lck_mtx_t	*lck;
2456		/*
2457		 *	Rip out the pager from the vm_object now...
2458		 */
2459		lck = vm_object_hash_lock_spin(object->pager);
2460		vm_object_remove(object);
2461		vm_object_hash_unlock(lck);
2462	}
2463	old_pager = object->pager;
2464	object->pager = MEMORY_OBJECT_NULL;
2465	if (old_pager != MEMORY_OBJECT_NULL)
2466		memory_object_control_disable(object->pager_control);
2467
2468	/*
2469	 * Wait for the existing paging activity (that got
2470	 * through before we nulled out the pager) to subside.
2471	 */
2472
2473	vm_object_paging_wait(object, THREAD_UNINT);
2474	vm_object_unlock(object);
2475
2476	/*
2477	 *	Terminate the object now.
2478	 */
2479	if (old_pager != MEMORY_OBJECT_NULL) {
2480		vm_object_release_pager(old_pager, object->hashed);
2481
2482		/*
2483		 * JMM - Release the caller's reference.  This assumes the
2484		 * caller had a reference to release, which is a big (but
2485		 * currently valid) assumption if this is driven from the
2486		 * vnode pager (it is holding a named reference when making
2487		 * this call).
2488		 */
2489		vm_object_deallocate(object);
2490
2491	}
2492	return(KERN_SUCCESS);
2493}
2494
2495
2496#if VM_OBJECT_CACHE
2497
2498#define VM_OBJ_DEACT_ALL_STATS DEBUG
2499#if VM_OBJ_DEACT_ALL_STATS
2500uint32_t vm_object_deactivate_all_pages_batches = 0;
2501uint32_t vm_object_deactivate_all_pages_pages = 0;
2502#endif /* VM_OBJ_DEACT_ALL_STATS */
2503/*
2504 *	vm_object_deactivate_all_pages
2505 *
2506 *	Deactivate all pages in the specified object.  (Keep its pages
2507 *	in memory even though it is no longer referenced.)
2508 *
2509 *	The object must be locked.
2510 */
2511static void
2512vm_object_deactivate_all_pages(
2513	register vm_object_t	object)
2514{
2515	register vm_page_t	p;
2516	int			loop_count;
2517#if VM_OBJ_DEACT_ALL_STATS
2518	int			pages_count;
2519#endif /* VM_OBJ_DEACT_ALL_STATS */
2520#define V_O_D_A_P_MAX_BATCH	256
2521
2522	loop_count = BATCH_LIMIT(V_O_D_A_P_MAX_BATCH);
2523#if VM_OBJ_DEACT_ALL_STATS
2524	pages_count = 0;
2525#endif /* VM_OBJ_DEACT_ALL_STATS */
2526	vm_page_lock_queues();
2527	queue_iterate(&object->memq, p, vm_page_t, listq) {
2528		if (--loop_count == 0) {
2529#if VM_OBJ_DEACT_ALL_STATS
2530			hw_atomic_add(&vm_object_deactivate_all_pages_batches,
2531				      1);
2532			hw_atomic_add(&vm_object_deactivate_all_pages_pages,
2533				      pages_count);
2534			pages_count = 0;
2535#endif /* VM_OBJ_DEACT_ALL_STATS */
2536			lck_mtx_yield(&vm_page_queue_lock);
2537			loop_count = BATCH_LIMIT(V_O_D_A_P_MAX_BATCH);
2538		}
2539		if (!p->busy && !p->throttled) {
2540#if VM_OBJ_DEACT_ALL_STATS
2541			pages_count++;
2542#endif /* VM_OBJ_DEACT_ALL_STATS */
2543			vm_page_deactivate(p);
2544		}
2545	}
2546#if VM_OBJ_DEACT_ALL_STATS
2547	if (pages_count) {
2548		hw_atomic_add(&vm_object_deactivate_all_pages_batches, 1);
2549		hw_atomic_add(&vm_object_deactivate_all_pages_pages,
2550			      pages_count);
2551		pages_count = 0;
2552	}
2553#endif /* VM_OBJ_DEACT_ALL_STATS */
2554	vm_page_unlock_queues();
2555}
2556#endif	/* VM_OBJECT_CACHE */
2557
2558
2559
2560/*
2561 * The "chunk" macros are used by routines below when looking for pages to deactivate.  These
2562 * exist because of the need to handle shadow chains.  When deactivating pages, we only
2563 * want to deactivate the ones at the topmost level in the object chain.  In order to do
2564 * this efficiently, the specified address range is divided up into "chunks" and we use
2565 * a bit map to keep track of which pages have already been processed as we descend down
2566 * the shadow chain.  These chunk macros hide the details of the bit map implementation
2567 * as much as we can.
2568 *
2569 * For convenience, we use a 64-bit data type as the bit map, and therefore a chunk is
2570 * set to 64 pages.  The bit map is indexed from the low-order end, so that the lowest
2571 * order bit represents page 0 in the current range and highest order bit represents
2572 * page 63.
2573 *
2574 * For further convenience, we also use negative logic for the page state in the bit map.
2575 * The bit is set to 1 to indicate it has not yet been seen, and to 0 to indicate it has
2576 * been processed.  This way we can simply test the 64-bit long word to see if it's zero
2577 * to easily tell if the whole range has been processed.  Therefore, the bit map starts
2578 * out with all the bits set.  The macros below hide all these details from the caller.
2579 */
2580
2581#define PAGES_IN_A_CHUNK	64	/* The number of pages in the chunk must */
2582					/* be the same as the number of bits in  */
2583					/* the chunk_state_t type. We use 64     */
2584					/* just for convenience.		 */
2585
2586#define CHUNK_SIZE	(PAGES_IN_A_CHUNK * PAGE_SIZE_64)	/* Size of a chunk in bytes */
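
/*
 * For example, with 4K pages a chunk spans 64 * 4K = 256K of the object's
 * range; with 16K pages it would span 1M.  The exact size depends on the
 * platform's PAGE_SIZE_64.
 */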
2587
2588typedef uint64_t	chunk_state_t;
2589
2590/*
2591 * The bit map uses negative logic, so we start out with all 64 bits set to indicate
2592 * that no pages have been processed yet.  Also, if len is less than the full CHUNK_SIZE,
2593 * then we mark pages beyond the len as having been "processed" so that we don't waste time
2594 * looking at pages in that range.  This can save us from unnecessarily chasing down the
2595 * shadow chain.
2596 */
2597
2598#define CHUNK_INIT(c, len) 						\
2599	MACRO_BEGIN							\
2600	uint64_t p;							\
2601									\
2602	(c) = 0xffffffffffffffffLL; 					\
2603									\
2604	for (p = (len) / PAGE_SIZE_64; p < PAGES_IN_A_CHUNK; p++)	\
2605		MARK_PAGE_HANDLED(c, p);				\
2606	MACRO_END
2607
2608
2609/*
2610 * Return true if all pages in the chunk have not yet been processed.
2611 */
2612
2613#define CHUNK_NOT_COMPLETE(c)	((c) != 0)
2614
2615/*
2616 * Return true if the page at offset 'p' in the bit map has already been handled
2617 * while processing a higher level object in the shadow chain.
2618 */
2619
2620#define PAGE_ALREADY_HANDLED(c, p)	(((c) & (1LL << (p))) == 0)
2621
2622/*
2623 * Mark the page at offset 'p' in the bit map as having been processed.
2624 */
2625
2626#define MARK_PAGE_HANDLED(c, p) \
2627MACRO_BEGIN \
2628	(c) = (c) & ~(1LL << (p)); \
2629MACRO_END
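

/*
 * Worked example (illustrative): suppose a chunk covers only 3 pages
 * because len == 3 * PAGE_SIZE_64.  CHUNK_INIT() first sets all 64 bits,
 * then marks pages 3..63 as handled, leaving only the low three bits set:
 *
 *	CHUNK_INIT(c, 3 * PAGE_SIZE_64);	c == 0x7
 *	MARK_PAGE_HANDLED(c, 0);		c == 0x6
 *	MARK_PAGE_HANDLED(c, 2);		c == 0x2
 *	CHUNK_NOT_COMPLETE(c)			still true: page 1 remains
 *	MARK_PAGE_HANDLED(c, 1);		c == 0, chunk complete
 *
 * PAGE_ALREADY_HANDLED(c, p) is what lets a lower level of the shadow
 * chain skip offsets that a higher-level object already satisfied.
 */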
2630
2631
2632/*
2633 * Return true if the page at the given offset has been paged out.  Object is
2634 * locked upon entry and returned locked.
2635 */
2636
2637static boolean_t
2638page_is_paged_out(
2639	vm_object_t		object,
2640	vm_object_offset_t	offset)
2641{
2642	kern_return_t	kr;
2643	memory_object_t	pager;
2644
2645	/*
2646	 * Check the existence map for the page if we have one, otherwise
2647	 * ask the pager about this page.
2648	 */
2649
2650#if MACH_PAGEMAP
2651	if (object->existence_map) {
2652		if (vm_external_state_get(object->existence_map, offset)
2653		    == VM_EXTERNAL_STATE_EXISTS) {
2654			/*
2655			 * We found the page
2656			 */
2657
2658			return TRUE;
2659		}
2660	} else
2661#endif /* MACH_PAGEMAP */
2662	if (object->internal &&
2663	   object->alive &&
2664	   !object->terminating &&
2665	   object->pager_ready) {
2666
2667		if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
2668			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
2669			    == VM_EXTERNAL_STATE_EXISTS) {
2670				return TRUE;
2671			} else {
2672				return FALSE;
2673			}
2674		}
2675
2676		/*
2677		 * We're already holding a "paging in progress" reference
2678		 * so the object can't disappear when we release the lock.
2679		 */
2680
2681		assert(object->paging_in_progress);
2682		pager = object->pager;
2683		vm_object_unlock(object);
2684
2685		kr = memory_object_data_request(
2686			pager,
2687			offset + object->paging_offset,
2688			0,	/* just poke the pager */
2689			VM_PROT_READ,
2690			NULL);
2691
2692		vm_object_lock(object);
2693
2694		if (kr == KERN_SUCCESS) {
2695
2696			/*
2697			 * We found the page
2698			 */
2699
2700			return TRUE;
2701		}
2702	}
2703
2704	return FALSE;
2705}
2706
2707
2708
2709/*
2710 * madvise_free_debug
2711 *
2712 * To help debug madvise(MADV_FREE*) mis-usage, this triggers a
2713 * zero-fill as soon as a page is affected by a madvise(MADV_FREE*), to
2714 * simulate the loss of the page's contents as if the page had been
2715 * reclaimed and then re-faulted.
2716 */
2717#if DEVELOPMENT || DEBUG
2718int madvise_free_debug = 1;
2719#else /* DEVELOPMENT || DEBUG */
2720int madvise_free_debug = 0;
2721#endif /* DEVELOPMENT || DEBUG */
2722
2723/*
2724 * Deactivate the pages in the specified object and range.  If kill_page is set, also discard any
2725 * page modified state from the pmap.  Update the chunk_state as we go along.  The caller must specify
2726 * a size that is less than or equal to the CHUNK_SIZE.
2727 */
2728
2729static void
2730deactivate_pages_in_object(
2731	vm_object_t		object,
2732	vm_object_offset_t	offset,
2733	vm_object_size_t	size,
2734	boolean_t               kill_page,
2735	boolean_t		reusable_page,
2736	boolean_t		all_reusable,
2737	chunk_state_t		*chunk_state,
2738	pmap_flush_context      *pfc)
2739{
2740	vm_page_t	m;
2741	int		p;
2742	struct vm_page_delayed_work	dw_array[DEFAULT_DELAYED_WORK_LIMIT];
2743	struct vm_page_delayed_work	*dwp;
2744	int		dw_count;
2745	int		dw_limit;
2746	unsigned int	reusable = 0;
2747
2748	/*
2749	 * Examine each page in the chunk.  The variable 'p' is the page number relative to the start of the
2750	 * chunk.  Since this routine is called once for each level in the shadow chain, the chunk_state may
2751	 * have pages marked as having been processed already.  We stop the loop early if we find we've handled
2752	 * all the pages in the chunk.
2753	 */
2754
2755	dwp = &dw_array[0];
2756	dw_count = 0;
2757	dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
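
	/*
	 * Note that page state changes below are not applied one page at
	 * a time: they are accumulated in dw_array[] and handed to
	 * vm_page_do_delayed_work() in batches, either when the array
	 * fills up (dw_count reaches dw_limit) or once at the end of the
	 * loop.  This keeps locking overhead down while walking the chunk.
	 */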
2758
2759	for (p = 0; size && CHUNK_NOT_COMPLETE(*chunk_state); p++, size -= PAGE_SIZE_64, offset += PAGE_SIZE_64) {
2760
2761		/*
2762		 * If this offset has already been found and handled in a higher level object, then don't
2763		 * do anything with it in the current shadow object.
2764		 */
2765
2766		if (PAGE_ALREADY_HANDLED(*chunk_state, p))
2767			continue;
2768
2769		/*
2770		 * See if the page at this offset is around.  First check to see if the page is resident,
2771		 * then if not, check the existence map or with the pager.
2772		 */
2773
2774	        if ((m = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
2775
2776			/*
2777			 * We found a page we were looking for.  Mark it as "handled" now in the chunk_state
2778			 * so that we won't bother looking for a page at this offset again if there are more
2779			 * shadow objects.  Then deactivate the page.
2780			 */
2781
2782			MARK_PAGE_HANDLED(*chunk_state, p);
2783
2784			if (( !VM_PAGE_WIRED(m)) && (!m->private) && (!m->gobbled) && (!m->busy) && (!m->laundry)) {
2785				int	clear_refmod;
2786				int	pmap_options;
2787
2788				dwp->dw_mask = 0;
2789
2790				pmap_options = 0;
2791				clear_refmod = VM_MEM_REFERENCED;
2792				dwp->dw_mask |= DW_clear_reference;
2793
2794				if ((kill_page) && (object->internal)) {
2795					if (madvise_free_debug) {
2796						/*
2797						 * zero-fill the page now
2798						 * to simulate it being
2799						 * reclaimed and re-faulted.
2800						 */
2801						pmap_zero_page(m->phys_page);
2802					}
2803			        	m->precious = FALSE;
2804				        m->dirty = FALSE;
2805
2806					clear_refmod |= VM_MEM_MODIFIED;
2807					if (m->throttled) {
2808						/*
2809						 * This page is now clean and
2810						 * reclaimable.  Move it out
2811						 * of the throttled queue, so
2812						 * that vm_pageout_scan() can
2813						 * find it.
2814						 */
2815						dwp->dw_mask |= DW_move_page;
2816					}
2817#if	MACH_PAGEMAP
2818					vm_external_state_clr(object->existence_map, offset);
2819#endif	/* MACH_PAGEMAP */
2820					VM_COMPRESSOR_PAGER_STATE_CLR(object,
2821								      offset);
2822
2823					if (reusable_page && !m->reusable) {
2824						assert(!all_reusable);
2825						assert(!object->all_reusable);
2826						m->reusable = TRUE;
2827						object->reusable_page_count++;
2828						assert(object->resident_page_count >= object->reusable_page_count);
2829						reusable++;
2830						/*
2831						 * Tell pmap this page is now
2832						 * "reusable" (to update pmap
2833						 * stats for all mappings).
2834						 */
2835						pmap_options |=	PMAP_OPTIONS_SET_REUSABLE;
2836					}
2837				}
2838				pmap_options |= PMAP_OPTIONS_NOFLUSH;
2839				pmap_clear_refmod_options(m->phys_page,
2840							  clear_refmod,
2841							  pmap_options,
2842							  (void *)pfc);
2843
2844				if (!m->throttled && !(reusable_page || all_reusable))
2845					dwp->dw_mask |= DW_move_page;
2846
2847				if (dwp->dw_mask)
2848					VM_PAGE_ADD_DELAYED_WORK(dwp, m,
2849								 dw_count);
2850
2851				if (dw_count >= dw_limit) {
2852					if (reusable) {
2853						OSAddAtomic(reusable,
2854							    &vm_page_stats_reusable.reusable_count);
2855						vm_page_stats_reusable.reusable += reusable;
2856						reusable = 0;
2857					}
2858					vm_page_do_delayed_work(object, &dw_array[0], dw_count);
2859
2860					dwp = &dw_array[0];
2861					dw_count = 0;
2862				}
2863			}
2864
2865		} else {
2866
2867			/*
2868			 * The page at this offset isn't memory resident, check to see if it's
2869			 * been paged out.  If so, mark it as handled so we don't bother looking
2870			 * for it in the shadow chain.
2871			 */
2872
2873			if (page_is_paged_out(object, offset)) {
2874				MARK_PAGE_HANDLED(*chunk_state, p);
2875
2876				/*
2877				 * If we're killing a non-resident page, then clear the page in the existence
2878				 * map so we don't bother paging it back in if it's touched again in the future.
2879				 */
2880
2881				if ((kill_page) && (object->internal)) {
2882#if	MACH_PAGEMAP
2883					vm_external_state_clr(object->existence_map, offset);
2884#endif	/* MACH_PAGEMAP */
2885					VM_COMPRESSOR_PAGER_STATE_CLR(object,
2886								      offset);
2887				}
2888			}
2889		}
2890	}
2891
2892	if (reusable) {
2893		OSAddAtomic(reusable, &vm_page_stats_reusable.reusable_count);
2894		vm_page_stats_reusable.reusable += reusable;
2895		reusable = 0;
2896	}
2897
2898	if (dw_count)
2899		vm_page_do_delayed_work(object, &dw_array[0], dw_count);
2900}
2901
2902
2903/*
2904 * Deactivate a "chunk" of the given range of the object starting at offset.  A "chunk"
2905 * will always be less than or equal to the given size.  The total range is divided up
2906 * into chunks for efficiency and performance related to the locks and handling the shadow
2907 * chain.  This routine returns how much of the given "size" it actually processed.  It's
2908 * up to the caller to loop and keep calling this routine until the entire range they want
2909 * to process has been done.
2910 */
2911
2912static vm_object_size_t
2913deactivate_a_chunk(
2914	vm_object_t		orig_object,
2915	vm_object_offset_t	offset,
2916	vm_object_size_t	size,
2917	boolean_t               kill_page,
2918	boolean_t		reusable_page,
2919	boolean_t		all_reusable,
2920	pmap_flush_context      *pfc)
2921{
2922	vm_object_t		object;
2923	vm_object_t		tmp_object;
2924	vm_object_size_t	length;
2925	chunk_state_t		chunk_state;
2926
2927
2928	/*
2929	 * Get set to do a chunk.  We'll do up to CHUNK_SIZE, but no more than the
2930	 * remaining size the caller asked for.
2931	 */
2932
2933	length = MIN(size, CHUNK_SIZE);
2934
2935	/*
2936	 * The chunk_state keeps track of which pages we've already processed if there's
2937	 * a shadow chain on this object.  At this point, we haven't done anything with this
2938	 * range of pages yet, so initialize the state to indicate no pages processed yet.
2939	 */
2940
2941	CHUNK_INIT(chunk_state, length);
2942	object = orig_object;
2943
2944	/*
2945	 * Start at the top level object and iterate around the loop once for each object
2946	 * in the shadow chain.  We stop processing early if we've already found all the pages
2947	 * in the range.  Otherwise we stop when we run out of shadow objects.
2948	 */
2949
2950	while (object && CHUNK_NOT_COMPLETE(chunk_state)) {
2951		vm_object_paging_begin(object);
2952
2953		deactivate_pages_in_object(object, offset, length, kill_page, reusable_page, all_reusable, &chunk_state, pfc);
2954
2955		vm_object_paging_end(object);
2956
2957		/*
2958		 * We've finished with this object, see if there's a shadow object.  If
2959		 * there is, update the offset and lock the new object.  We also turn off
2960		 * kill_page at this point since we only kill pages in the topmost object.
2961		 */
2962
2963		tmp_object = object->shadow;
2964
2965		if (tmp_object) {
2966			kill_page = FALSE;
2967			reusable_page = FALSE;
2968			all_reusable = FALSE;
2969		        offset += object->vo_shadow_offset;
2970		        vm_object_lock(tmp_object);
2971		}
2972
2973		if (object != orig_object)
2974		        vm_object_unlock(object);
2975
2976		object = tmp_object;
2977	}
2978
2979	if (object && object != orig_object)
2980	        vm_object_unlock(object);
2981
2982	return length;
2983}
2984
2985
2986
2987/*
2988 * Move any resident pages in the specified range to the inactive queue.  If kill_page is set,
2989 * we also clear the modified status of the page and "forget" any changes that have been made
2990 * to the page.
2991 */
2992
2993__private_extern__ void
2994vm_object_deactivate_pages(
2995	vm_object_t		object,
2996	vm_object_offset_t	offset,
2997	vm_object_size_t	size,
2998	boolean_t               kill_page,
2999	boolean_t		reusable_page)
3000{
3001	vm_object_size_t	length;
3002	boolean_t		all_reusable;
3003	pmap_flush_context	pmap_flush_context_storage;
3004
3005	/*
3006	 * We break the range up into chunks and do one chunk at a time.  This is for
3007	 * efficiency and performance while handling the shadow chains and the locks.
3008	 * The deactivate_a_chunk() function returns how much of the range it processed.
3009	 * We keep calling this routine until the given size is exhausted.
3010	 */
3011
3012
3013	all_reusable = FALSE;
3014#if 11
3015	/*
3016	 * For the sake of accurate "reusable" pmap stats, we need
3017	 * to tell pmap about each page that is no longer "reusable",
3018	 * so we can't do the "all_reusable" optimization.
3019	 */
3020#else
3021	if (reusable_page &&
3022	    object->internal &&
3023	    object->vo_size != 0 &&
3024	    object->vo_size == size &&
3025	    object->reusable_page_count == 0) {
3026		all_reusable = TRUE;
3027		reusable_page = FALSE;
3028	}
3029#endif
3030
3031	if ((reusable_page || all_reusable) && object->all_reusable) {
3032		/* This means MADV_FREE_REUSABLE has been called twice, which
3033		 * is probably illegal. */
3034		return;
3035	}
3036
3037	pmap_flush_context_init(&pmap_flush_context_storage);
3038
3039	while (size) {
3040		length = deactivate_a_chunk(object, offset, size, kill_page, reusable_page, all_reusable, &pmap_flush_context_storage);
3041
3042		size -= length;
3043		offset += length;
3044	}
3045	pmap_flush(&pmap_flush_context_storage);
3046
3047	if (all_reusable) {
3048		if (!object->all_reusable) {
3049			unsigned int reusable;
3050
3051			object->all_reusable = TRUE;
3052			assert(object->reusable_page_count == 0);
3053			/* update global stats */
3054			reusable = object->resident_page_count;
3055			OSAddAtomic(reusable,
3056				    &vm_page_stats_reusable.reusable_count);
3057			vm_page_stats_reusable.reusable += reusable;
3058			vm_page_stats_reusable.all_reusable_calls++;
3059		}
3060	} else if (reusable_page) {
3061		vm_page_stats_reusable.partial_reusable_calls++;
3062	}
3063}
3064
3065void
3066vm_object_reuse_pages(
3067	vm_object_t		object,
3068	vm_object_offset_t	start_offset,
3069	vm_object_offset_t	end_offset,
3070	boolean_t		allow_partial_reuse)
3071{
3072	vm_object_offset_t	cur_offset;
3073	vm_page_t		m;
3074	unsigned int		reused, reusable;
3075
3076#define VM_OBJECT_REUSE_PAGE(object, m, reused)				\
3077	MACRO_BEGIN							\
3078		if ((m) != VM_PAGE_NULL &&				\
3079		    (m)->reusable) {					\
3080			assert((object)->reusable_page_count <=		\
3081			       (object)->resident_page_count);		\
3082			assert((object)->reusable_page_count > 0);	\
3083			(object)->reusable_page_count--;		\
3084			(m)->reusable = FALSE;				\
3085			(reused)++;					\
3086			/*						\
3087			 * Tell pmap that this page is no longer	\
3088			 * "reusable", to update the "reusable" stats	\
3089			 * for all the pmaps that have mapped this	\
3090			 * page.					\
3091			 */						\
3092			pmap_clear_refmod_options((m)->phys_page,	\
3093						  0, /* refmod */	\
3094						  (PMAP_OPTIONS_CLEAR_REUSABLE \
3095						   | PMAP_OPTIONS_NOFLUSH), \
3096						  NULL);		\
3097		}							\
3098	MACRO_END
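
	/*
	 * Note: the macro above is shared by both scan strategies below
	 * (the per-offset look-up and the resident-page walk).  It only
	 * acts on pages currently marked "reusable": it clears the page's
	 * flag, drops the object's reusable_page_count, counts the page as
	 * "reused", and asks pmap to clear the per-mapping "reusable"
	 * attribute without forcing an immediate TLB flush
	 * (PMAP_OPTIONS_NOFLUSH).
	 */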
3099
3100	reused = 0;
3101	reusable = 0;
3102
3103	vm_object_lock_assert_exclusive(object);
3104
3105	if (object->all_reusable) {
3106		panic("object %p all_reusable: can't update pmap stats\n",
3107		      object);
3108		assert(object->reusable_page_count == 0);
3109		object->all_reusable = FALSE;
3110		if (end_offset - start_offset == object->vo_size ||
3111		    !allow_partial_reuse) {
3112			vm_page_stats_reusable.all_reuse_calls++;
3113			reused = object->resident_page_count;
3114		} else {
3115			vm_page_stats_reusable.partial_reuse_calls++;
3116			queue_iterate(&object->memq, m, vm_page_t, listq) {
3117				if (m->offset < start_offset ||
3118				    m->offset >= end_offset) {
3119					m->reusable = TRUE;
3120					object->reusable_page_count++;
3121					assert(object->resident_page_count >= object->reusable_page_count);
3122					continue;
3123				} else {
3124					assert(!m->reusable);
3125					reused++;
3126				}
3127			}
3128		}
3129	} else if (object->resident_page_count >
3130		   ((end_offset - start_offset) >> PAGE_SHIFT)) {
3131		vm_page_stats_reusable.partial_reuse_calls++;
3132		for (cur_offset = start_offset;
3133		     cur_offset < end_offset;
3134		     cur_offset += PAGE_SIZE_64) {
3135			if (object->reusable_page_count == 0) {
3136				break;
3137			}
3138			m = vm_page_lookup(object, cur_offset);
3139			VM_OBJECT_REUSE_PAGE(object, m, reused);
3140		}
3141	} else {
3142		vm_page_stats_reusable.partial_reuse_calls++;
3143		queue_iterate(&object->memq, m, vm_page_t, listq) {
3144			if (object->reusable_page_count == 0) {
3145				break;
3146			}
3147			if (m->offset < start_offset ||
3148			    m->offset >= end_offset) {
3149				continue;
3150			}
3151			VM_OBJECT_REUSE_PAGE(object, m, reused);
3152		}
3153	}
3154
3155	/* update global stats */
3156	OSAddAtomic(reusable-reused, &vm_page_stats_reusable.reusable_count);
3157	vm_page_stats_reusable.reused += reused;
3158	vm_page_stats_reusable.reusable += reusable;
3159}
3160
3161/*
3162 *	Routine:	vm_object_pmap_protect
3163 *
3164 *	Purpose:
3165 *		Reduces the permission for all physical
3166 *		pages in the specified object range.
3167 *
3168 *		If removing write permission only, it is
3169 *		sufficient to protect only the pages in
3170 *		the top-level object; only those pages may
3171 *		have write permission.
3172 *
3173 *		If removing all access, we must follow the
3174 *		shadow chain from the top-level object to
3175 *		remove access to all pages in shadowed objects.
3176 *
3177 *		The object must *not* be locked.  The object must
3178 *		be temporary/internal.
3179 *
3180 *              If pmap is not NULL, this routine assumes that
3181 *              the only mappings for the pages are in that
3182 *              pmap.
3183 */
3184
3185__private_extern__ void
3186vm_object_pmap_protect(
3187	register vm_object_t		object,
3188	register vm_object_offset_t	offset,
3189	vm_object_size_t		size,
3190	pmap_t				pmap,
3191	vm_map_offset_t			pmap_start,
3192	vm_prot_t			prot)
3193{
3194	vm_object_pmap_protect_options(object, offset, size,
3195				       pmap, pmap_start, prot, 0);
3196}
3197
3198__private_extern__ void
3199vm_object_pmap_protect_options(
3200	register vm_object_t		object,
3201	register vm_object_offset_t	offset,
3202	vm_object_size_t		size,
3203	pmap_t				pmap,
3204	vm_map_offset_t			pmap_start,
3205	vm_prot_t			prot,
3206	int				options)
3207{
3208	pmap_flush_context	pmap_flush_context_storage;
3209	boolean_t		delayed_pmap_flush = FALSE;
3210
3211	if (object == VM_OBJECT_NULL)
3212		return;
3213	size = vm_object_round_page(size);
3214	offset = vm_object_trunc_page(offset);
3215
3216	vm_object_lock(object);
3217
3218	if (object->phys_contiguous) {
3219		if (pmap != NULL) {
3220			vm_object_unlock(object);
3221			pmap_protect_options(pmap,
3222					     pmap_start,
3223					     pmap_start + size,
3224					     prot,
3225					     options & ~PMAP_OPTIONS_NOFLUSH,
3226					     NULL);
3227		} else {
3228			vm_object_offset_t phys_start, phys_end, phys_addr;
3229
3230			phys_start = object->vo_shadow_offset + offset;
3231			phys_end = phys_start + size;
3232			assert(phys_start <= phys_end);
3233			assert(phys_end <= object->vo_shadow_offset + object->vo_size);
3234			vm_object_unlock(object);
3235
3236			pmap_flush_context_init(&pmap_flush_context_storage);
3237			delayed_pmap_flush = FALSE;
3238
3239			for (phys_addr = phys_start;
3240			     phys_addr < phys_end;
3241			     phys_addr += PAGE_SIZE_64) {
3242				pmap_page_protect_options(
3243					(ppnum_t) (phys_addr >> PAGE_SHIFT),
3244					prot,
3245					options | PMAP_OPTIONS_NOFLUSH,
3246					(void *)&pmap_flush_context_storage);
3247				delayed_pmap_flush = TRUE;
3248			}
3249			if (delayed_pmap_flush == TRUE)
3250				pmap_flush(&pmap_flush_context_storage);
3251		}
3252		return;
3253	}
3254
3255	assert(object->internal);
3256
3257	while (TRUE) {
3258	   if (ptoa_64(object->resident_page_count) > size/2 && pmap != PMAP_NULL) {
3259		vm_object_unlock(object);
3260		pmap_protect_options(pmap, pmap_start, pmap_start + size, prot,
3261				     options & ~PMAP_OPTIONS_NOFLUSH, NULL);
3262		return;
3263	    }
3264
3265	   pmap_flush_context_init(&pmap_flush_context_storage);
3266	   delayed_pmap_flush = FALSE;
3267
3268	    /*
3269	     * If we are doing large ranges with respect to the resident
3270	     * page count, then we should iterate over the pages; otherwise
3271	     * an inverse page look-up will be faster.
3272	     */
3273	    if (ptoa_64(object->resident_page_count / 4) <  size) {
3274		vm_page_t		p;
3275		vm_object_offset_t	end;
3276
3277		end = offset + size;
3278
3279		queue_iterate(&object->memq, p, vm_page_t, listq) {
3280			if (!p->fictitious && (offset <= p->offset) && (p->offset < end)) {
3281				vm_map_offset_t start;
3282
3283				start = pmap_start + p->offset - offset;
3284
3285				if (pmap != PMAP_NULL)
3286					pmap_protect_options(
3287						pmap,
3288						start,
3289						start + PAGE_SIZE_64,
3290						prot,
3291						options | PMAP_OPTIONS_NOFLUSH,
3292						&pmap_flush_context_storage);
3293				else
3294					pmap_page_protect_options(
3295						p->phys_page,
3296						prot,
3297						options | PMAP_OPTIONS_NOFLUSH,
3298						&pmap_flush_context_storage);
3299				delayed_pmap_flush = TRUE;
3300			}
3301		}
3302
3303	   } else {
3304		vm_page_t		p;
3305		vm_object_offset_t	end;
3306		vm_object_offset_t	target_off;
3307
3308		end = offset + size;
3309
3310		for (target_off = offset;
3311		     target_off < end; target_off += PAGE_SIZE) {
3312
3313			p = vm_page_lookup(object, target_off);
3314
3315			if (p != VM_PAGE_NULL) {
3316				vm_object_offset_t start;
3317
3318				start = pmap_start + (p->offset - offset);
3319
3320				if (pmap != PMAP_NULL)
3321					pmap_protect_options(
3322						pmap,
3323						start,
3324						start + PAGE_SIZE_64,
3325						prot,
3326						options | PMAP_OPTIONS_NOFLUSH,
3327						&pmap_flush_context_storage);
3328				else
3329					pmap_page_protect_options(
3330						p->phys_page,
3331						prot,
3332						options | PMAP_OPTIONS_NOFLUSH,
3333						&pmap_flush_context_storage);
3334				delayed_pmap_flush = TRUE;
3335			}
3336		}
3337	    }
3338	    if (delayed_pmap_flush == TRUE)
3339		    pmap_flush(&pmap_flush_context_storage);
3340
3341	    if (prot == VM_PROT_NONE) {
3342		/*
3343		 * Must follow shadow chain to remove access
3344		 * to pages in shadowed objects.
3345		 */
3346		register vm_object_t	next_object;
3347
3348		next_object = object->shadow;
3349		if (next_object != VM_OBJECT_NULL) {
3350		    offset += object->vo_shadow_offset;
3351		    vm_object_lock(next_object);
3352		    vm_object_unlock(object);
3353		    object = next_object;
3354		}
3355		else {
3356		    /*
3357		     * End of chain - we are done.
3358		     */
3359		    break;
3360		}
3361	    }
3362	    else {
3363		/*
3364		 * Pages in shadowed objects may never have
3365		 * write permission - we may stop here.
3366		 */
3367		break;
3368	    }
3369	}
3370
3371	vm_object_unlock(object);
3372}
3373
3374/*
3375 *	Routine:	vm_object_copy_slowly
3376 *
3377 *	Description:
3378 *		Copy the specified range of the source
3379 *		virtual memory object without using
3380 *		protection-based optimizations (such
3381 *		as copy-on-write).  The pages in the
3382 *		region are actually copied.
3383 *
3384 *	In/out conditions:
3385 *		The caller must hold a reference and a lock
3386 *		for the source virtual memory object.  The source
3387 *		object will be returned *unlocked*.
3388 *
3389 *	Results:
3390 *		If the copy is completed successfully, KERN_SUCCESS is
3391 *		returned.  If the caller asserted the interruptible
3392 *		argument, and an interruption occurred while waiting
3393 *		for a user-generated event, MACH_SEND_INTERRUPTED is
3394 *		returned.  Other values may be returned to indicate
3395 *		hard errors during the copy operation.
3396 *
3397 *		A new virtual memory object is returned in a
3398 *		parameter (_result_object).  The contents of this
3399 *		new object, starting at a zero offset, are a copy
3400 *		of the source memory region.  In the event of
3401 *		an error, this parameter will contain the value
3402 *		VM_OBJECT_NULL.
3403 */
3404__private_extern__ kern_return_t
3405vm_object_copy_slowly(
3406	register vm_object_t	src_object,
3407	vm_object_offset_t	src_offset,
3408	vm_object_size_t	size,
3409	boolean_t		interruptible,
3410	vm_object_t		*_result_object)	/* OUT */
3411{
3412	vm_object_t		new_object;
3413	vm_object_offset_t	new_offset;
3414
3415	struct vm_object_fault_info fault_info;
3416
3417	XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n",
3418	    src_object, src_offset, size, 0, 0);
3419
3420	if (size == 0) {
3421		vm_object_unlock(src_object);
3422		*_result_object = VM_OBJECT_NULL;
3423		return(KERN_INVALID_ARGUMENT);
3424	}
3425
3426	/*
3427	 *	Prevent destruction of the source object while we copy.
3428	 */
3429
3430	vm_object_reference_locked(src_object);
3431	vm_object_unlock(src_object);
3432
3433	/*
3434	 *	Create a new object to hold the copied pages.
3435	 *	A few notes:
3436	 *		We fill the new object starting at offset 0,
3437	 *		 regardless of the input offset.
3438	 *		We don't bother to lock the new object within
3439	 *		 this routine, since we have the only reference.
3440	 */
3441
3442	new_object = vm_object_allocate(size);
3443	new_offset = 0;
3444
3445	assert(size == trunc_page_64(size));	/* Will the loop terminate? */
3446
3447	fault_info.interruptible = interruptible;
3448	fault_info.behavior  = VM_BEHAVIOR_SEQUENTIAL;
3449	fault_info.user_tag = 0;
3450	fault_info.pmap_options = 0;
3451	fault_info.lo_offset = src_offset;
3452	fault_info.hi_offset = src_offset + size;
3453	fault_info.no_cache  = FALSE;
3454	fault_info.stealth = TRUE;
3455	fault_info.io_sync = FALSE;
3456	fault_info.cs_bypass = FALSE;
3457	fault_info.mark_zf_absent = FALSE;
3458	fault_info.batch_pmap_op = FALSE;
3459
3460	for ( ;
3461	    size != 0 ;
3462	    src_offset += PAGE_SIZE_64,
3463			new_offset += PAGE_SIZE_64, size -= PAGE_SIZE_64
3464	    ) {
3465		vm_page_t	new_page;
3466		vm_fault_return_t result;
3467
3468		vm_object_lock(new_object);
3469
3470		while ((new_page = vm_page_alloc(new_object, new_offset))
3471				== VM_PAGE_NULL) {
3472
3473			vm_object_unlock(new_object);
3474
3475			if (!vm_page_wait(interruptible)) {
3476				vm_object_deallocate(new_object);
3477				vm_object_deallocate(src_object);
3478				*_result_object = VM_OBJECT_NULL;
3479				return(MACH_SEND_INTERRUPTED);
3480			}
3481			vm_object_lock(new_object);
3482		}
3483		vm_object_unlock(new_object);
3484
3485		do {
3486			vm_prot_t	prot = VM_PROT_READ;
3487			vm_page_t	_result_page;
3488			vm_page_t	top_page;
3489			register
3490			vm_page_t	result_page;
3491			kern_return_t	error_code;
3492
3493			vm_object_lock(src_object);
3494			vm_object_paging_begin(src_object);
3495
3496			if (size > (vm_size_t) -1) {
3497				/* 32-bit overflow */
3498				fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
3499			} else {
3500				fault_info.cluster_size = (vm_size_t) size;
3501				assert(fault_info.cluster_size == size);
3502			}
3503
3504			XPR(XPR_VM_FAULT,"vm_object_copy_slowly -> vm_fault_page",0,0,0,0,0);
3505			_result_page = VM_PAGE_NULL;
3506			result = vm_fault_page(src_object, src_offset,
3507				VM_PROT_READ, FALSE,
3508				FALSE, /* page not looked up */
3509				&prot, &_result_page, &top_page,
3510			        (int *)0,
3511				&error_code, FALSE, FALSE, &fault_info);
3512
3513			switch(result) {
3514			case VM_FAULT_SUCCESS:
3515				result_page = _result_page;
3516
3517				/*
3518				 *	Copy the page to the new object.
3519				 *
3520				 *	POLICY DECISION:
3521				 *		If result_page is clean,
3522				 *		we could steal it instead
3523				 *		of copying.
3524				 */
3525
3526				vm_page_copy(result_page, new_page);
3527				vm_object_unlock(result_page->object);
3528
3529				/*
3530				 *	Let go of both pages (make them
3531				 *	not busy, perform wakeup, activate).
3532				 */
3533				vm_object_lock(new_object);
3534				SET_PAGE_DIRTY(new_page, FALSE);
3535				PAGE_WAKEUP_DONE(new_page);
3536				vm_object_unlock(new_object);
3537
3538				vm_object_lock(result_page->object);
3539				PAGE_WAKEUP_DONE(result_page);
3540
3541				vm_page_lockspin_queues();
3542				if (!result_page->active &&
3543				    !result_page->inactive &&
3544				    !result_page->throttled)
3545					vm_page_activate(result_page);
3546				vm_page_activate(new_page);
3547				vm_page_unlock_queues();
3548
3549				/*
3550				 *	Release paging references and
3551				 *	top-level placeholder page, if any.
3552				 */
3553
3554				vm_fault_cleanup(result_page->object,
3555						 top_page);
3556
3557				break;
3558
3559			case VM_FAULT_RETRY:
3560				break;
3561
3562			case VM_FAULT_MEMORY_SHORTAGE:
3563				if (vm_page_wait(interruptible))
3564					break;
3565				/* fall thru */
3566
3567			case VM_FAULT_INTERRUPTED:
3568				vm_object_lock(new_object);
3569				VM_PAGE_FREE(new_page);
3570				vm_object_unlock(new_object);
3571
3572				vm_object_deallocate(new_object);
3573				vm_object_deallocate(src_object);
3574				*_result_object = VM_OBJECT_NULL;
3575				return(MACH_SEND_INTERRUPTED);
3576
3577			case VM_FAULT_SUCCESS_NO_VM_PAGE:
3578				/* success but no VM page: fail */
3579				vm_object_paging_end(src_object);
3580				vm_object_unlock(src_object);
3581				/*FALLTHROUGH*/
3582			case VM_FAULT_MEMORY_ERROR:
3583				/*
3584				 * A policy choice:
3585				 *	(a) ignore pages that we can't
3586				 *	    copy
3587				 *	(b) return the null object if
3588				 *	    any page fails [chosen]
3589				 */
3590
3591				vm_object_lock(new_object);
3592				VM_PAGE_FREE(new_page);
3593				vm_object_unlock(new_object);
3594
3595				vm_object_deallocate(new_object);
3596				vm_object_deallocate(src_object);
3597				*_result_object = VM_OBJECT_NULL;
3598				return(error_code ? error_code:
3599				       KERN_MEMORY_ERROR);
3600
3601			default:
3602				panic("vm_object_copy_slowly: unexpected error"
3603				      " 0x%x from vm_fault_page()\n", result);
3604			}
3605		} while (result != VM_FAULT_SUCCESS);
3606	}
3607
3608	/*
3609	 *	Lose the extra reference, and return our object.
3610	 */
3611	vm_object_deallocate(src_object);
3612	*_result_object = new_object;
3613	return(KERN_SUCCESS);
3614}
3615
3616/*
3617 *	Routine:	vm_object_copy_quickly
3618 *
3619 *	Purpose:
3620 *		Copy the specified range of the source virtual
3621 *		memory object, if it can be done without waiting
3622 *		for user-generated events.
3623 *
3624 *	Results:
3625 *		If the copy is successful, the copy is returned in
3626 *		the arguments; otherwise, the arguments are not
3627 *		affected.
3628 *
3629 *	In/out conditions:
3630 *		The object should be unlocked on entry and exit.
3631 */
3632
3633/*ARGSUSED*/
3634__private_extern__ boolean_t
3635vm_object_copy_quickly(
3636	vm_object_t		*_object,		/* INOUT */
3637	__unused vm_object_offset_t	offset,	/* IN */
3638	__unused vm_object_size_t	size,	/* IN */
3639	boolean_t		*_src_needs_copy,	/* OUT */
3640	boolean_t		*_dst_needs_copy)	/* OUT */
3641{
3642	vm_object_t	object = *_object;
3643	memory_object_copy_strategy_t copy_strategy;
3644
3645	XPR(XPR_VM_OBJECT, "v_o_c_quickly obj 0x%x off 0x%x size 0x%x\n",
3646	    *_object, offset, size, 0, 0);
3647	if (object == VM_OBJECT_NULL) {
3648		*_src_needs_copy = FALSE;
3649		*_dst_needs_copy = FALSE;
3650		return(TRUE);
3651	}
3652
3653	vm_object_lock(object);
3654
3655	copy_strategy = object->copy_strategy;
3656
3657	switch (copy_strategy) {
3658	case MEMORY_OBJECT_COPY_SYMMETRIC:
3659
3660		/*
3661		 *	Symmetric copy strategy.
3662		 *	Make another reference to the object.
3663		 *	Leave object/offset unchanged.
3664		 */
3665
3666		vm_object_reference_locked(object);
3667		object->shadowed = TRUE;
3668		vm_object_unlock(object);
3669
3670		/*
3671		 *	Both source and destination must make
3672		 *	shadows, and the source must be made
3673		 *	read-only if not already.
3674		 */
3675
3676		*_src_needs_copy = TRUE;
3677		*_dst_needs_copy = TRUE;
3678
3679		break;
3680
3681	case MEMORY_OBJECT_COPY_DELAY:
3682		vm_object_unlock(object);
3683		return(FALSE);
3684
3685	default:
3686		vm_object_unlock(object);
3687		return(FALSE);
3688	}
3689	return(TRUE);
3690}
3691
3692static int copy_call_count = 0;
3693static int copy_call_sleep_count = 0;
3694static int copy_call_restart_count = 0;
3695
3696/*
3697 *	Routine:	vm_object_copy_call [internal]
3698 *
3699 *	Description:
3700 *		Copy the source object (src_object), using the
3701 *		user-managed copy algorithm.
3702 *
3703 *	In/out conditions:
3704 *		The source object must be locked on entry.  It
3705 *		will be *unlocked* on exit.
3706 *
3707 *	Results:
3708 *		If the copy is successful, KERN_SUCCESS is returned.
3709 *		A new object that represents the copied virtual
3710 *		memory is returned in a parameter (*_result_object).
3711 *		If the return value indicates an error, this parameter
3712 *		is not valid.
3713 */
3714static kern_return_t
3715vm_object_copy_call(
3716	vm_object_t		src_object,
3717	vm_object_offset_t	src_offset,
3718	vm_object_size_t	size,
3719	vm_object_t		*_result_object)	/* OUT */
3720{
3721	kern_return_t	kr;
3722	vm_object_t	copy;
3723	boolean_t	check_ready = FALSE;
3724	uint32_t	try_failed_count = 0;
3725
3726	/*
3727	 *	If a copy is already in progress, wait and retry.
3728	 *
3729	 *	XXX
3730	 *	Consider making this call interruptible, as Mike
3731	 *	intended it to be.
3732	 *
3733	 *	XXXO
3734	 *	Need a counter or version or something to allow
3735	 *	us to use the copy that the currently requesting
3736	 *	thread is obtaining -- is it worth adding to the
3737	 *	vm object structure? Depends on how common this case is.
3738	 */
3739	copy_call_count++;
3740	while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
3741		vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
3742			       THREAD_UNINT);
3743		copy_call_restart_count++;
3744	}
3745
3746	/*
3747	 *	Indicate (for the benefit of memory_object_create_copy)
3748	 *	that we want a copy for src_object. (Note that we cannot
3749	 *	do a real assert_wait before calling memory_object_copy,
3750	 *	so we simply set the flag.)
3751	 */
3752
3753	vm_object_set_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL);
3754	vm_object_unlock(src_object);
3755
3756	/*
3757	 *	Ask the memory manager to give us a memory object
3758	 *	which represents a copy of the src object.
3759	 *	The memory manager may give us a memory object
3760	 *	which we already have, or it may give us a
3761	 *	new memory object. This memory object will arrive
3762	 *	via memory_object_create_copy.
3763	 */
3764
3765	kr = KERN_FAILURE;	/* XXX need to change memory_object.defs */
3766	if (kr != KERN_SUCCESS) {
3767		return kr;
3768	}
3769
3770	/*
3771	 *	Wait for the copy to arrive.
3772	 */
3773	vm_object_lock(src_object);
3774	while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
3775		vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
3776			       THREAD_UNINT);
3777		copy_call_sleep_count++;
3778	}
3779Retry:
3780	assert(src_object->copy != VM_OBJECT_NULL);
3781	copy = src_object->copy;
3782	if (!vm_object_lock_try(copy)) {
3783		vm_object_unlock(src_object);
3784
3785		try_failed_count++;
3786		mutex_pause(try_failed_count);	/* wait a bit */
3787
3788		vm_object_lock(src_object);
3789		goto Retry;
3790	}
3791	if (copy->vo_size < src_offset+size)
3792		copy->vo_size = src_offset+size;
3793
3794	if (!copy->pager_ready)
3795		check_ready = TRUE;
3796
3797	/*
3798	 *	Return the copy.
3799	 */
3800	*_result_object = copy;
3801	vm_object_unlock(copy);
3802	vm_object_unlock(src_object);
3803
3804	/* Wait for the copy to be ready. */
3805	if (check_ready == TRUE) {
3806		vm_object_lock(copy);
3807		while (!copy->pager_ready) {
3808			vm_object_sleep(copy, VM_OBJECT_EVENT_PAGER_READY, THREAD_UNINT);
3809		}
3810		vm_object_unlock(copy);
3811	}
3812
3813	return KERN_SUCCESS;
3814}
3815
3816static int copy_delayed_lock_collisions = 0;
3817static int copy_delayed_max_collisions = 0;
3818static int copy_delayed_lock_contention = 0;
3819static int copy_delayed_protect_iterate = 0;
3820
3821/*
3822 *	Routine:	vm_object_copy_delayed [internal]
3823 *
3824 *	Description:
3825 *		Copy the specified virtual memory object, using
3826 *		the asymmetric copy-on-write algorithm.
3827 *
3828 *	In/out conditions:
3829 *		The src_object must be locked on entry.  It will be unlocked
3830 *		on exit - so the caller must also hold a reference to it.
3831 *
3832 *		This routine will not block waiting for user-generated
3833 *		events.  It is not interruptible.
3834 */
3835__private_extern__ vm_object_t
3836vm_object_copy_delayed(
3837	vm_object_t		src_object,
3838	vm_object_offset_t	src_offset,
3839	vm_object_size_t	size,
3840	boolean_t		src_object_shared)
3841{
3842	vm_object_t		new_copy = VM_OBJECT_NULL;
3843	vm_object_t		old_copy;
3844	vm_page_t		p;
3845	vm_object_size_t	copy_size = src_offset + size;
3846	pmap_flush_context	pmap_flush_context_storage;
3847	boolean_t		delayed_pmap_flush = FALSE;
3848
3849
3850	int collisions = 0;
3851	/*
3852	 *	The user-level memory manager wants to see all of the changes
3853	 *	to this object, but it has promised not to make any changes on
3854	 *	its own.
3855	 *
3856	 *	Perform an asymmetric copy-on-write, as follows:
3857	 *		Create a new object, called a "copy object" to hold
3858	 *		 pages modified by the new mapping  (i.e., the copy,
3859	 *		 not the original mapping).
3860	 *		Record the original object as the backing object for
3861	 *		 the copy object.  If the original mapping does not
3862	 *		 change a page, it may be used read-only by the copy.
3863	 *		Record the copy object in the original object.
3864	 *		 When the original mapping causes a page to be modified,
3865	 *		 it must be copied to a new page that is "pushed" to
3866	 *		 the copy object.
3867	 *		Mark the new mapping (the copy object) copy-on-write.
3868	 *		 This makes the copy object itself read-only, allowing
3869	 *		 it to be reused if the original mapping makes no
3870	 *		 changes, and simplifying the synchronization required
3871	 *		 in the "push" operation described above.
3872	 *
3873	 *	The copy-on-write is said to be asymmetric because the original
3874	 *	object is *not* marked copy-on-write. A copied page is pushed
3875	 *	to the copy object, regardless of which party attempted to modify
3876	 *	the page.
3877	 *
3878	 *	Repeated asymmetric copy operations may be done. If the
3879	 *	original object has not been changed since the last copy, its
3880	 *	copy object can be reused. Otherwise, a new copy object can be
3881	 *	inserted between the original object and its previous copy
3882	 *	object.  Since any copy object is read-only, this cannot affect
3883	 *	the contents of the previous copy object.
3884	 *
3885	 *	Note that a copy object is higher in the object tree than the
3886	 *	original object; therefore, use of the copy object recorded in
3887	 *	the original object must be done carefully, to avoid deadlock.
3888	 */
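
	/*
	 *	Sketch of the object graph this routine establishes
	 *	(illustration only; the details are in the code below):
	 *
	 *		[ copy object ] --shadow--> [ src_object ]
	 *		        ^                         |
	 *		        +--------- copy ----------+
	 *
	 *	The copy object shadows src_object (taking a reference on
	 *	it), and src_object->copy points back at the copy object so
	 *	that a later copy request can reuse it if src_object has
	 *	not been modified in the meantime.
	 */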
3889
3890 Retry:
3891
3892	/*
3893	 * Wait for paging in progress.
3894	 */
3895	if (!src_object->true_share &&
3896	    (src_object->paging_in_progress != 0 ||
3897	     src_object->activity_in_progress != 0)) {
3898	        if (src_object_shared == TRUE) {
3899		        vm_object_unlock(src_object);
3900			vm_object_lock(src_object);
3901			src_object_shared = FALSE;
3902			goto Retry;
3903		}
3904		vm_object_paging_wait(src_object, THREAD_UNINT);
3905	}
3906	/*
3907	 *	See whether we can reuse the result of a previous
3908	 *	copy operation.
3909	 */
3910
3911	old_copy = src_object->copy;
3912	if (old_copy != VM_OBJECT_NULL) {
3913	        int lock_granted;
3914
3915		/*
3916		 *	Try to get the locks (out of order)
3917		 */
3918		if (src_object_shared == TRUE)
3919		        lock_granted = vm_object_lock_try_shared(old_copy);
3920		else
3921		        lock_granted = vm_object_lock_try(old_copy);
3922
3923		if (!lock_granted) {
3924			vm_object_unlock(src_object);
3925
3926			if (collisions++ == 0)
3927				copy_delayed_lock_contention++;
3928			mutex_pause(collisions);
3929
3930			/* Heisenberg Rules */
3931			copy_delayed_lock_collisions++;
3932
3933			if (collisions > copy_delayed_max_collisions)
3934				copy_delayed_max_collisions = collisions;
3935
3936			if (src_object_shared == TRUE)
3937			        vm_object_lock_shared(src_object);
3938			else
3939			        vm_object_lock(src_object);
3940
3941			goto Retry;
3942		}
3943
3944		/*
3945		 *	Determine whether the old copy object has
3946		 *	been modified.
3947		 */
3948
3949		if (old_copy->resident_page_count == 0 &&
3950		    !old_copy->pager_created) {
3951			/*
3952			 *	It has not been modified.
3953			 *
3954			 *	Return another reference to
3955			 *	the existing copy-object if
3956			 *	we can safely grow it (if
3957			 *	needed).
3958			 */
3959
3960			if (old_copy->vo_size < copy_size) {
3961			        if (src_object_shared == TRUE) {
3962				        vm_object_unlock(old_copy);
3963					vm_object_unlock(src_object);
3964
3965					vm_object_lock(src_object);
3966					src_object_shared = FALSE;
3967					goto Retry;
3968				}
3969				/*
3970				 * We can't perform a delayed copy if any of the
3971				 * pages in the extended range are wired (because
3972				 * we can't safely take write permission away from
3973				 * wired pages).  If the pages aren't wired, then
3974				 * go ahead and protect them.
3975				 */
3976				copy_delayed_protect_iterate++;
3977
3978				pmap_flush_context_init(&pmap_flush_context_storage);
3979				delayed_pmap_flush = FALSE;
3980
3981				queue_iterate(&src_object->memq, p, vm_page_t, listq) {
3982					if (!p->fictitious &&
3983					    p->offset >= old_copy->vo_size &&
3984					    p->offset < copy_size) {
3985						if (VM_PAGE_WIRED(p)) {
3986							vm_object_unlock(old_copy);
3987							vm_object_unlock(src_object);
3988
3989							if (new_copy != VM_OBJECT_NULL) {
3990								vm_object_unlock(new_copy);
3991								vm_object_deallocate(new_copy);
3992							}
3993							if (delayed_pmap_flush == TRUE)
3994								pmap_flush(&pmap_flush_context_storage);
3995
3996							return VM_OBJECT_NULL;
3997						} else {
3998							pmap_page_protect_options(p->phys_page, (VM_PROT_ALL & ~VM_PROT_WRITE),
3999										  PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage);
4000							delayed_pmap_flush = TRUE;
4001						}
4002					}
4003				}
4004				if (delayed_pmap_flush == TRUE)
4005					pmap_flush(&pmap_flush_context_storage);
4006
4007				old_copy->vo_size = copy_size;
4008			}
4009			if (src_object_shared == TRUE)
4010			        vm_object_reference_shared(old_copy);
4011			else
4012			        vm_object_reference_locked(old_copy);
4013			vm_object_unlock(old_copy);
4014			vm_object_unlock(src_object);
4015
4016			if (new_copy != VM_OBJECT_NULL) {
4017				vm_object_unlock(new_copy);
4018				vm_object_deallocate(new_copy);
4019			}
4020			return(old_copy);
4021		}
4022
4023
4024
4025		/*
4026		 * Adjust the size argument so that the newly-created
4027		 * copy object will be large enough to back either the
4028		 * old copy object or the new mapping.
4029		 */
4030		if (old_copy->vo_size > copy_size)
4031			copy_size = old_copy->vo_size;
4032
4033		if (new_copy == VM_OBJECT_NULL) {
4034			vm_object_unlock(old_copy);
4035			vm_object_unlock(src_object);
4036			new_copy = vm_object_allocate(copy_size);
4037			vm_object_lock(src_object);
4038			vm_object_lock(new_copy);
4039
4040			src_object_shared = FALSE;
4041			goto Retry;
4042		}
4043		new_copy->vo_size = copy_size;
4044
4045		/*
4046		 *	The copy-object is always made large enough to
4047		 *	completely shadow the original object, since
4048		 *	it may have several users who want to shadow
4049		 *	the original object at different points.
4050		 */
4051
4052		assert((old_copy->shadow == src_object) &&
4053		    (old_copy->vo_shadow_offset == (vm_object_offset_t) 0));
4054
4055	} else if (new_copy == VM_OBJECT_NULL) {
4056		vm_object_unlock(src_object);
4057		new_copy = vm_object_allocate(copy_size);
4058		vm_object_lock(src_object);
4059		vm_object_lock(new_copy);
4060
4061		src_object_shared = FALSE;
4062		goto Retry;
4063	}
4064
4065	/*
4066	 * We now have the src object locked, and the new copy object
4067	 * allocated and locked (and potentially the old copy locked).
4068	 * Before we go any further, make sure we can still perform
4069	 * a delayed copy, as the situation may have changed.
4070	 *
4071	 * Specifically, we can't perform a delayed copy if any of the
4072	 * pages in the range are wired (because we can't safely take
4073	 * write permission away from wired pages).  If the pages aren't
4074	 * wired, then go ahead and protect them.
4075	 */
4076	copy_delayed_protect_iterate++;
4077
4078	pmap_flush_context_init(&pmap_flush_context_storage);
4079	delayed_pmap_flush = FALSE;
4080
4081	queue_iterate(&src_object->memq, p, vm_page_t, listq) {
4082		if (!p->fictitious && p->offset < copy_size) {
4083			if (VM_PAGE_WIRED(p)) {
4084				if (old_copy)
4085					vm_object_unlock(old_copy);
4086				vm_object_unlock(src_object);
4087				vm_object_unlock(new_copy);
4088				vm_object_deallocate(new_copy);
4089
4090				if (delayed_pmap_flush == TRUE)
4091					pmap_flush(&pmap_flush_context_storage);
4092
4093				return VM_OBJECT_NULL;
4094			} else {
4095				pmap_page_protect_options(p->phys_page, (VM_PROT_ALL & ~VM_PROT_WRITE),
4096							  PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage);
4097				delayed_pmap_flush = TRUE;
4098			}
4099		}
4100	}
4101	if (delayed_pmap_flush == TRUE)
4102		pmap_flush(&pmap_flush_context_storage);
4103
4104	if (old_copy != VM_OBJECT_NULL) {
4105		/*
4106		 *	Make the old copy-object shadow the new one.
4107		 *	It will receive no more pages from the original
4108		 *	object.
4109		 */
4110
4111		/* remove ref. from old_copy */
4112		vm_object_lock_assert_exclusive(src_object);
4113		src_object->ref_count--;
4114		assert(src_object->ref_count > 0);
4115		vm_object_lock_assert_exclusive(old_copy);
4116		old_copy->shadow = new_copy;
4117		vm_object_lock_assert_exclusive(new_copy);
4118		assert(new_copy->ref_count > 0);
4119		new_copy->ref_count++;		/* for old_copy->shadow ref. */
4120
4121#if TASK_SWAPPER
4122		if (old_copy->res_count) {
4123			VM_OBJ_RES_INCR(new_copy);
4124			VM_OBJ_RES_DECR(src_object);
4125		}
4126#endif
4127
4128		vm_object_unlock(old_copy);	/* done with old_copy */
4129	}
4130
4131	/*
4132	 *	Point the new copy at the existing object.
4133	 */
4134	vm_object_lock_assert_exclusive(new_copy);
4135	new_copy->shadow = src_object;
4136	new_copy->vo_shadow_offset = 0;
4137	new_copy->shadowed = TRUE;	/* caller must set needs_copy */
4138
4139	vm_object_lock_assert_exclusive(src_object);
4140	vm_object_reference_locked(src_object);
4141	src_object->copy = new_copy;
4142	vm_object_unlock(src_object);
4143	vm_object_unlock(new_copy);
4144
4145	XPR(XPR_VM_OBJECT,
4146		"vm_object_copy_delayed: used copy object %X for source %X\n",
4147		new_copy, src_object, 0, 0, 0);
4148
4149	return new_copy;
4150}
4151
4152/*
4153 *	Routine:	vm_object_copy_strategically
4154 *
4155 *	Purpose:
4156 *		Perform a copy according to the source object's
4157 *		declared strategy.  This operation may block,
4158 *		and may be interrupted.
4159 */
4160__private_extern__ kern_return_t
4161vm_object_copy_strategically(
4162	register vm_object_t	src_object,
4163	vm_object_offset_t	src_offset,
4164	vm_object_size_t	size,
4165	vm_object_t		*dst_object,	/* OUT */
4166	vm_object_offset_t	*dst_offset,	/* OUT */
4167	boolean_t		*dst_needs_copy) /* OUT */
4168{
4169	kern_return_t	result;
4170	boolean_t	interruptible = THREAD_ABORTSAFE; /* XXX */
4171	boolean_t	object_lock_shared = FALSE;
4172	memory_object_copy_strategy_t copy_strategy;
4173
4174	assert(src_object != VM_OBJECT_NULL);
4175
4176	copy_strategy = src_object->copy_strategy;
4177
4178	if (copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4179	        vm_object_lock_shared(src_object);
4180		object_lock_shared = TRUE;
4181	} else
4182	        vm_object_lock(src_object);
4183
4184	/*
4185	 *	The copy strategy is only valid if the memory manager
4186	 *	is "ready". Internal objects are always ready.
4187	 */
4188
4189	while (!src_object->internal && !src_object->pager_ready) {
4190		wait_result_t wait_result;
4191
4192		if (object_lock_shared == TRUE) {
4193		        vm_object_unlock(src_object);
4194			vm_object_lock(src_object);
4195			object_lock_shared = FALSE;
4196			continue;
4197		}
4198		wait_result = vm_object_sleep(	src_object,
4199						VM_OBJECT_EVENT_PAGER_READY,
4200						interruptible);
4201		if (wait_result != THREAD_AWAKENED) {
4202			vm_object_unlock(src_object);
4203			*dst_object = VM_OBJECT_NULL;
4204			*dst_offset = 0;
4205			*dst_needs_copy = FALSE;
4206			return(MACH_SEND_INTERRUPTED);
4207		}
4208	}
4209
4210	/*
4211	 *	Use the appropriate copy strategy.
4212	 */
4213
4214	switch (copy_strategy) {
4215	    case MEMORY_OBJECT_COPY_DELAY:
4216		*dst_object = vm_object_copy_delayed(src_object,
4217						     src_offset, size, object_lock_shared);
4218		if (*dst_object != VM_OBJECT_NULL) {
4219			*dst_offset = src_offset;
4220			*dst_needs_copy = TRUE;
4221			result = KERN_SUCCESS;
4222			break;
4223		}
4224		vm_object_lock(src_object);
4225		/* fall thru when delayed copy not allowed */
4226
4227	    case MEMORY_OBJECT_COPY_NONE:
4228		result = vm_object_copy_slowly(src_object, src_offset, size,
4229					       interruptible, dst_object);
4230		if (result == KERN_SUCCESS) {
4231			*dst_offset = 0;
4232			*dst_needs_copy = FALSE;
4233		}
4234		break;
4235
4236	    case MEMORY_OBJECT_COPY_CALL:
4237		result = vm_object_copy_call(src_object, src_offset, size,
4238				dst_object);
4239		if (result == KERN_SUCCESS) {
4240			*dst_offset = src_offset;
4241			*dst_needs_copy = TRUE;
4242		}
4243		break;
4244
4245	    case MEMORY_OBJECT_COPY_SYMMETRIC:
4246		XPR(XPR_VM_OBJECT, "v_o_c_strategically obj 0x%x off 0x%x size 0x%x\n", src_object, src_offset, size, 0, 0);
4247		vm_object_unlock(src_object);
4248		result = KERN_MEMORY_RESTART_COPY;
4249		break;
4250
4251	    default:
4252		panic("copy_strategically: bad strategy");
4253		result = KERN_INVALID_ARGUMENT;
4254	}
4255	return(result);
4256}
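
/*
 * Illustrative sketch, not compiled into the kernel: a hypothetical
 * caller of vm_object_copy_strategically().  The caller is assumed to
 * hold a reference on src_object and to pass it unlocked; a
 * KERN_MEMORY_RESTART_COPY result means the object uses the symmetric
 * strategy and the caller must perform the copy itself.  Everything
 * except the vm_object_copy_strategically() signature is an assumption.
 */
#if 0
static kern_return_t
vm_object_copy_strategically_example(
	vm_object_t		src_object,
	vm_object_offset_t	src_offset,
	vm_object_size_t	size)
{
	vm_object_t		dst_object;
	vm_object_offset_t	dst_offset;
	boolean_t		dst_needs_copy;
	kern_return_t		kr;

	kr = vm_object_copy_strategically(src_object, src_offset, size,
					  &dst_object, &dst_offset,
					  &dst_needs_copy);
	if (kr == KERN_MEMORY_RESTART_COPY) {
		/* caller would restart with a symmetric (in-line) copy */
	} else if (kr == KERN_SUCCESS && dst_needs_copy) {
		/* caller would mark its new mapping copy-on-write */
	}
	return kr;
}
#endif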
4257
4258/*
4259 *	vm_object_shadow:
4260 *
4261 *	Create a new object which is backed by the
4262 *	specified existing object range.  The source
4263 *	object reference is deallocated.
4264 *
4265 *	The new object and offset into that object
4266 *	are returned in the source parameters.
4267 */
4268boolean_t vm_object_shadow_check = TRUE;
4269
4270__private_extern__ boolean_t
4271vm_object_shadow(
4272	vm_object_t		*object,	/* IN/OUT */
4273	vm_object_offset_t	*offset,	/* IN/OUT */
4274	vm_object_size_t	length)
4275{
4276	register vm_object_t	source;
4277	register vm_object_t	result;
4278
4279	source = *object;
4280	assert(source != VM_OBJECT_NULL);
4281	if (source == VM_OBJECT_NULL)
4282		return FALSE;
4283
4284#if 0
4285	/*
4286	 * XXX FBDP
4287	 * This assertion is valid but it gets triggered by Rosetta for example
4288	 * due to a combination of vm_remap() that changes a VM object's
4289	 * copy_strategy from SYMMETRIC to DELAY and vm_protect(VM_PROT_COPY)
4290	 * that then sets "needs_copy" on its map entry.  This creates a
4291	 * mapping situation that VM should never see and doesn't know how to
4292	 * handle.
4293	 * It's not clear if this can create any real problem but we should
4294	 * look into fixing this, probably by having vm_protect(VM_PROT_COPY)
4295	 * do more than just set "needs_copy" to handle the copy-on-write...
4296	 * In the meantime, let's disable the assertion.
4297	 */
4298	assert(source->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4299#endif
4300
4301	/*
4302	 *	Determine if we really need a shadow.
4303	 *
4304	 *	If the source object is larger than what we are trying
4305	 *	to create, then force the shadow creation even if the
4306	 *	ref count is 1.  This will allow us to [potentially]
4307	 *	collapse the underlying object away in the future
4308	 *	(freeing up the extra data it might contain and that
4309	 *	we don't need).
4310	 */
4311	if (vm_object_shadow_check &&
4312	    source->vo_size == length &&
4313	    source->ref_count == 1 &&
4314	    (source->shadow == VM_OBJECT_NULL ||
4315	     source->shadow->copy == VM_OBJECT_NULL) )
4316	{
4317		source->shadowed = FALSE;
4318		return FALSE;
4319	}
4320
4321	/*
4322	 *	Allocate a new object with the given length
4323	 */
4324
4325	if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL)
4326		panic("vm_object_shadow: no object for shadowing");
4327
4328	/*
4329	 *	The new object shadows the source object, adding
4330	 *	a reference to it.  Our caller changes its reference
4331	 *	to point to the new object, removing a reference to
4332	 *	the source object.  Net result: no change of reference
4333	 *	count.
4334	 */
4335	result->shadow = source;
4336
4337	/*
4338	 *	Store the offset into the source object,
4339	 *	and fix up the offset into the new object.
4340	 */
4341
4342	result->vo_shadow_offset = *offset;
4343
4344	/*
4345	 *	Return the new object and offset
4346	 */
4347
4348	*offset = 0;
4349	*object = result;
4350	return TRUE;
4351}
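
/*
 * Illustrative sketch, not compiled into the kernel: vm_object_shadow()
 * updates its object/offset arguments in place, so a hypothetical
 * caller typically passes the fields it will keep using afterwards.
 * Only the vm_object_shadow() signature comes from this file.
 */
#if 0
static void
vm_object_shadow_example(
	vm_object_t		object,		/* caller's reference */
	vm_object_offset_t	offset,
	vm_object_size_t	length)
{
	if (vm_object_shadow(&object, &offset, length)) {
		/*
		 * "object" now names the new shadow object and "offset"
		 * is 0; the caller's original reference was, in effect,
		 * transferred to the new object's shadow pointer.
		 */
	} else {
		/*
		 * No shadow was needed; "object" and "offset" are
		 * unchanged and the reference is still the caller's.
		 */
	}
}
#endif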
4352
4353/*
4354 *	The relationship between vm_object structures and
4355 *	the memory_object requires careful synchronization.
4356 *
4357 *	All associations are created by memory_object_create_named
4358 *	for external pagers and vm_object_pager_create for internal
4359 *	objects as follows:
4360 *
4361 *		pager:	the memory_object itself, supplied by
4362 *			the user requesting a mapping (or the kernel,
4363 *			when initializing internal objects); the
4364 *			kernel simulates holding send rights by keeping
4365 *			a port reference;
4366 *
4367 *		pager_request:
4368 *			the memory object control port,
4369 *			created by the kernel; the kernel holds
4370 *			receive (and ownership) rights to this
4371 *			port, but no other references.
4372 *
4373 *	When initialization is complete, the "initialized" field
4374 *	is asserted.  Other mappings using a particular memory object,
4375 *	and any references to the vm_object gained through the
4376 *	port association must wait for this initialization to occur.
4377 *
4378 *	In order to allow the memory manager to set attributes before
4379 *	requests (notably virtual copy operations, but also data or
4380 *	unlock requests) are made, a "ready" attribute is made available.
4381 *	Only the memory manager may affect the value of this attribute.
4382 *	Its value does not affect critical kernel functions, such as
4383 *	internal object initialization or destruction.  [Furthermore,
4384 *	memory objects created by the kernel are assumed to be ready
4385 *	immediately; the default memory manager need not explicitly
4386 *	set the "ready" attribute.]
4387 *
4388 *	[Both the "initialized" and "ready" attribute wait conditions
4389 *	use the "pager" field as the wait event.]
4390 *
4391 *	The port associations can be broken down by any of the
4392 *	following routines:
4393 *		vm_object_terminate:
4394 *			No references to the vm_object remain, and
4395 *			the object cannot (or will not) be cached.
4396 *			This is the normal case, and is done even
4397 *			though one of the other cases has already been
4398 *			done.
4399 *		memory_object_destroy:
4400 *			The memory manager has requested that the
4401 *			kernel relinquish references to the memory
4402 *			object. [The memory manager may not want to
4403 *			destroy the memory object, but may wish to
4404 *			refuse or tear down existing memory mappings.]
4405 *
4406 *	Each routine that breaks an association must break all of
4407 *	them at once.  At some later time, that routine must clear
4408 *	the pager field and release the memory object references.
4409 *	[Furthermore, each routine must cope with the simultaneous
4410 *	or previous operations of the others.]
4411 *
4412 *	In addition to the lock on the object, the vm_object_hash_lock
4413 *	governs the associations.  References gained through the
4414 *	association require use of the hash lock.
4415 *
4416 *	Because the pager field may be cleared spontaneously, it
4417 *	cannot be used to determine whether a memory object has
4418 *	ever been associated with a particular vm_object.  [This
4419 *	knowledge is important to the shadow object mechanism.]
4420 *	For this reason, an additional "created" attribute is
4421 *	provided.
4422 *
4423 *	During various paging operations, the pager reference found in the
4424 *	vm_object must be valid.  To prevent that reference from being
4425 *	released (other than being removed, i.e., made null), routines may
4426 *	use the vm_object_paging_begin/end routines [actually, macros].
4427 *	The implementation uses the "paging_in_progress" and "wanted" fields.
4428 *	[Operations that alter the validity of the pager values include the
4429 *	termination routines and vm_object_collapse.]
4430 */
4431
4432
4433/*
4434 *	Routine:	vm_object_enter
4435 *	Purpose:
4436 *		Find a VM object corresponding to the given
4437 *		pager; if no such object exists, create one,
4438 *		and initialize the pager.
4439 */
4440vm_object_t
4441vm_object_enter(
4442	memory_object_t		pager,
4443	vm_object_size_t	size,
4444	boolean_t		internal,
4445	boolean_t		init,
4446	boolean_t		named)
4447{
4448	register vm_object_t	object;
4449	vm_object_t		new_object;
4450	boolean_t		must_init;
4451	vm_object_hash_entry_t	entry, new_entry;
4452	uint32_t        try_failed_count = 0;
4453	lck_mtx_t	*lck;
4454
4455	if (pager == MEMORY_OBJECT_NULL)
4456		return(vm_object_allocate(size));
4457
4458	new_object = VM_OBJECT_NULL;
4459	new_entry = VM_OBJECT_HASH_ENTRY_NULL;
4460	must_init = init;
4461
4462	/*
4463	 *	Look for an object associated with this port.
4464	 */
4465Retry:
4466	lck = vm_object_hash_lock_spin(pager);
4467	do {
4468		entry = vm_object_hash_lookup(pager, FALSE);
4469
4470		if (entry == VM_OBJECT_HASH_ENTRY_NULL) {
4471			if (new_object == VM_OBJECT_NULL) {
4472				/*
4473				 *	We must unlock to create a new object;
4474				 *	if we do so, we must try the lookup again.
4475				 */
4476				vm_object_hash_unlock(lck);
4477				assert(new_entry == VM_OBJECT_HASH_ENTRY_NULL);
4478				new_entry = vm_object_hash_entry_alloc(pager);
4479				new_object = vm_object_allocate(size);
4480				lck = vm_object_hash_lock_spin(pager);
4481			} else {
4482				/*
4483				 *	Lookup failed twice, and we have something
4484				 *	to insert; set the object.
4485				 */
4486				vm_object_lock(new_object);
4487				vm_object_hash_insert(new_entry, new_object);
4488				vm_object_unlock(new_object);
4489				entry = new_entry;
4490				new_entry = VM_OBJECT_HASH_ENTRY_NULL;
4491				new_object = VM_OBJECT_NULL;
4492				must_init = TRUE;
4493			}
4494		} else if (entry->object == VM_OBJECT_NULL) {
4495			/*
4496			 *	If a previous object is being terminated,
4497			 *	we must wait for the termination message
4498			 *	to be queued (and look up the entry again).
4499			 */
4500			entry->waiting = TRUE;
4501			entry = VM_OBJECT_HASH_ENTRY_NULL;
4502			assert_wait((event_t) pager, THREAD_UNINT);
4503			vm_object_hash_unlock(lck);
4504
4505			thread_block(THREAD_CONTINUE_NULL);
4506			lck = vm_object_hash_lock_spin(pager);
4507		}
4508	} while (entry == VM_OBJECT_HASH_ENTRY_NULL);
4509
4510	object = entry->object;
4511	assert(object != VM_OBJECT_NULL);
4512
4513	if (!must_init) {
4514	        if ( !vm_object_lock_try(object)) {
4515
4516		        vm_object_hash_unlock(lck);
4517
4518		        try_failed_count++;
4519			mutex_pause(try_failed_count);  /* wait a bit */
4520			goto Retry;
4521		}
4522		assert(!internal || object->internal);
4523#if VM_OBJECT_CACHE
4524		if (object->ref_count == 0) {
4525			if ( !vm_object_cache_lock_try()) {
4526
4527				vm_object_hash_unlock(lck);
4528				vm_object_unlock(object);
4529
4530				try_failed_count++;
4531				mutex_pause(try_failed_count);  /* wait a bit */
4532				goto Retry;
4533			}
4534			XPR(XPR_VM_OBJECT_CACHE,
4535			    "vm_object_enter: removing %x from cache, head (%x, %x)\n",
4536				object,
4537				vm_object_cached_list.next,
4538				vm_object_cached_list.prev, 0,0);
4539			queue_remove(&vm_object_cached_list, object,
4540				     vm_object_t, cached_list);
4541			vm_object_cached_count--;
4542
4543			vm_object_cache_unlock();
4544		}
4545#endif
4546		if (named) {
4547			assert(!object->named);
4548			object->named = TRUE;
4549		}
4550		vm_object_lock_assert_exclusive(object);
4551		object->ref_count++;
4552		vm_object_res_reference(object);
4553
4554		vm_object_hash_unlock(lck);
4555		vm_object_unlock(object);
4556
4557		VM_STAT_INCR(hits);
4558	} else
4559		vm_object_hash_unlock(lck);
4560
4561	assert(object->ref_count > 0);
4562
4563	VM_STAT_INCR(lookups);
4564
4565	XPR(XPR_VM_OBJECT,
4566		"vm_o_enter: pager 0x%x obj 0x%x must_init %d\n",
4567		pager, object, must_init, 0, 0);
4568
4569	/*
4570	 *	If we raced to create a vm_object but lost, let's
4571	 *	throw away ours.
4572	 */
4573
4574	if (new_object != VM_OBJECT_NULL)
4575		vm_object_deallocate(new_object);
4576
4577	if (new_entry != VM_OBJECT_HASH_ENTRY_NULL)
4578		vm_object_hash_entry_free(new_entry);
4579
4580	if (must_init) {
4581		memory_object_control_t control;
4582
4583		/*
4584		 *	Allocate request port.
4585		 */
4586
4587		control = memory_object_control_allocate(object);
4588		assert (control != MEMORY_OBJECT_CONTROL_NULL);
4589
4590		vm_object_lock(object);
4591		assert(object != kernel_object);
4592
4593		/*
4594		 *	Copy the reference we were given.
4595		 */
4596
4597		memory_object_reference(pager);
4598		object->pager_created = TRUE;
4599		object->pager = pager;
4600		object->internal = internal;
4601		object->pager_trusted = internal;
4602		if (!internal) {
4603			/* copy strategy invalid until set by memory manager */
4604			object->copy_strategy = MEMORY_OBJECT_COPY_INVALID;
4605		}
4606		object->pager_control = control;
4607		object->pager_ready = FALSE;
4608
4609		vm_object_unlock(object);
4610
4611		/*
4612		 *	Let the pager know we're using it.
4613		 */
4614
4615		(void) memory_object_init(pager,
4616			object->pager_control,
4617			PAGE_SIZE);
4618
4619		vm_object_lock(object);
4620		if (named)
4621			object->named = TRUE;
4622		if (internal) {
4623			object->pager_ready = TRUE;
4624			vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY);
4625		}
4626
4627		object->pager_initialized = TRUE;
4628		vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED);
4629	} else {
4630		vm_object_lock(object);
4631	}
4632
4633	/*
4634	 *	[At this point, the object must be locked]
4635	 */
4636
4637	/*
4638	 *	Wait for the work above to be done by the first
4639	 *	thread to map this object.
4640	 */
4641
4642	while (!object->pager_initialized) {
4643		vm_object_sleep(object,
4644				VM_OBJECT_EVENT_INITIALIZED,
4645				THREAD_UNINT);
4646	}
4647	vm_object_unlock(object);
4648
4649	XPR(XPR_VM_OBJECT,
4650	    "vm_object_enter: vm_object %x, memory_object %x, internal %d\n",
4651	    object, object->pager, internal, 0,0);
4652	return(object);
4653}
4654
4655/*
4656 *	Routine:	vm_object_pager_create
4657 *	Purpose:
4658 *		Create a memory object for an internal object.
4659 *	In/out conditions:
4660 *		The object is locked on entry and exit;
4661 *		it may be unlocked within this call.
4662 *	Limitations:
4663 *		Only one thread may be performing a
4664 *		vm_object_pager_create on an object at
4665 *		a time.  Presumably, only the pageout
4666 *		daemon will be using this routine.
4667 */
4668
4669void
4670vm_object_pager_create(
4671	register vm_object_t	object)
4672{
4673	memory_object_t		pager;
4674	vm_object_hash_entry_t	entry;
4675	lck_mtx_t		*lck;
4676#if	MACH_PAGEMAP
4677	vm_object_size_t	size;
4678	vm_external_map_t	map;
4679#endif	/* MACH_PAGEMAP */
4680
4681	XPR(XPR_VM_OBJECT, "vm_object_pager_create, object 0x%X\n",
4682		object, 0,0,0,0);
4683
4684	assert(object != kernel_object);
4685
4686	if (memory_manager_default_check() != KERN_SUCCESS)
4687		return;
4688
4689	/*
4690	 *	Prevent collapse or termination by holding a paging reference
4691	 */
4692
4693	vm_object_paging_begin(object);
4694	if (object->pager_created) {
4695		/*
4696		 *	Someone else got to it first...
4697		 *	wait for them to finish initializing the ports
4698		 */
4699		while (!object->pager_initialized) {
4700			vm_object_sleep(object,
4701				        VM_OBJECT_EVENT_INITIALIZED,
4702				        THREAD_UNINT);
4703		}
4704		vm_object_paging_end(object);
4705		return;
4706	}
4707
4708	/*
4709	 *	Indicate that a memory object has been assigned
4710	 *	before dropping the lock, to prevent a race.
4711	 */
4712
4713	object->pager_created = TRUE;
4714	object->paging_offset = 0;
4715
4716#if	MACH_PAGEMAP
4717	size = object->vo_size;
4718#endif	/* MACH_PAGEMAP */
4719	vm_object_unlock(object);
4720
4721#if	MACH_PAGEMAP
4722	if (DEFAULT_PAGER_IS_ACTIVE) {
4723		map = vm_external_create(size);
4724		vm_object_lock(object);
4725		assert(object->vo_size == size);
4726		object->existence_map = map;
4727		vm_object_unlock(object);
4728	}
4729#endif	/* MACH_PAGEMAP */
4730
4731	if ((uint32_t) object->vo_size != object->vo_size) {
4732		panic("vm_object_pager_create(): object size 0x%llx >= 4GB\n",
4733		      (uint64_t) object->vo_size);
4734	}
4735
4736	/*
4737	 *	Create the [internal] pager, and associate it with this object.
4738	 *
4739	 *	We make the association here so that vm_object_enter()
4740	 * 	can look up the object to complete initializing it.  No
4741	 *	user will ever map this object.
4742	 */
4743	{
4744		memory_object_default_t		dmm;
4745
4746		/* acquire a reference for the default memory manager */
4747		dmm = memory_manager_default_reference();
4748
4749		assert(object->temporary);
4750
4751		/* create our new memory object */
4752		assert((vm_size_t) object->vo_size == object->vo_size);
4753		(void) memory_object_create(dmm, (vm_size_t) object->vo_size,
4754					    &pager);
4755
4756		memory_object_default_deallocate(dmm);
4757	}
4758
4759	entry = vm_object_hash_entry_alloc(pager);
4760
4761	vm_object_lock(object);
4762	lck = vm_object_hash_lock_spin(pager);
4763	vm_object_hash_insert(entry, object);
4764	vm_object_hash_unlock(lck);
4765	vm_object_unlock(object);
4766
4767	/*
4768	 *	A reference was returned by
4769	 *	memory_object_create(), and it is
4770	 *	copied by vm_object_enter().
4771	 */
4772
4773	if (vm_object_enter(pager, object->vo_size, TRUE, TRUE, FALSE) != object)
4774		panic("vm_object_pager_create: mismatch");
4775
4776	/*
4777	 *	Drop the reference we were passed.
4778	 */
4779	memory_object_deallocate(pager);
4780
4781	vm_object_lock(object);
4782
4783	/*
4784	 *	Release the paging reference
4785	 */
4786	vm_object_paging_end(object);
4787}
4788
4789void
4790vm_object_compressor_pager_create(
4791	register vm_object_t	object)
4792{
4793	memory_object_t		pager;
4794	vm_object_hash_entry_t	entry;
4795	lck_mtx_t		*lck;
4796	vm_object_t		pager_object = VM_OBJECT_NULL;
4797
4798	assert(object != kernel_object);
4799
4800	/*
4801	 *	Prevent collapse or termination by holding a paging reference
4802	 */
4803
4804	vm_object_paging_begin(object);
4805	if (object->pager_created) {
4806		/*
4807		 *	Someone else got to it first...
4808		 *	wait for them to finish initializing the ports
4809		 */
4810		while (!object->pager_initialized) {
4811			vm_object_sleep(object,
4812				        VM_OBJECT_EVENT_INITIALIZED,
4813				        THREAD_UNINT);
4814		}
4815		vm_object_paging_end(object);
4816		return;
4817	}
4818
4819	/*
4820	 *	Indicate that a memory object has been assigned
4821	 *	before dropping the lock, to prevent a race.
4822	 */
4823
4824	object->pager_created = TRUE;
4825	object->paging_offset = 0;
4826
4827	vm_object_unlock(object);
4828
4829	if ((uint32_t) (object->vo_size/PAGE_SIZE) !=
4830	    (object->vo_size/PAGE_SIZE)) {
4831		panic("vm_object_compressor_pager_create(%p): "
4832		      "object size 0x%llx >= 0x%llx\n",
4833		      object,
4834		      (uint64_t) object->vo_size,
4835		      0x0FFFFFFFFULL*PAGE_SIZE);
4836	}
4837
4838	/*
4839	 *	Create the [internal] pager, and associate it with this object.
4840	 *
4841	 *	We make the association here so that vm_object_enter()
4842	 * 	can look up the object to complete initializing it.  No
4843	 *	user will ever map this object.
4844	 */
4845	{
4846		assert(object->temporary);
4847
4848		/* create our new memory object */
4849		assert((uint32_t) (object->vo_size/PAGE_SIZE) ==
4850		       (object->vo_size/PAGE_SIZE));
4851		(void) compressor_memory_object_create(
4852			(memory_object_size_t) object->vo_size,
4853			&pager);
4854		if (pager == NULL) {
4855			panic("vm_object_compressor_pager_create(): "
4856			      "no pager for object %p size 0x%llx\n",
4857			      object, (uint64_t) object->vo_size);
4858		}
4859	}
4860
4861	entry = vm_object_hash_entry_alloc(pager);
4862
4863	vm_object_lock(object);
4864	lck = vm_object_hash_lock_spin(pager);
4865	vm_object_hash_insert(entry, object);
4866	vm_object_hash_unlock(lck);
4867	vm_object_unlock(object);
4868
4869	/*
4870	 *	A reference was returned by
4871	 *	compressor_memory_object_create(), and it is
4872	 *	copied by vm_object_enter().
4873	 */
4874
4875	pager_object = vm_object_enter(pager, object->vo_size, TRUE, TRUE, FALSE);
4876
4877	if (pager_object != object) {
4878		panic("vm_object_compressor_pager_create: mismatch (pager: %p, pager_object: %p, orig_object: %p, orig_object size: 0x%llx)\n", pager, pager_object, object, (uint64_t) object->vo_size);
4879	}
4880
4881	/*
4882	 *	Drop the reference we were passed.
4883	 */
4884	memory_object_deallocate(pager);
4885
4886	vm_object_lock(object);
4887
4888	/*
4889	 *	Release the paging reference
4890	 */
4891	vm_object_paging_end(object);
4892}
4893
4894/*
4895 *	Routine:	vm_object_remove
4896 *	Purpose:
4897 *		Eliminate the pager/object association
4898 *		for this pager.
4899 *	Conditions:
4900 *		The object cache must be locked.
4901 */
4902__private_extern__ void
4903vm_object_remove(
4904	vm_object_t	object)
4905{
4906	memory_object_t pager;
4907
4908	if ((pager = object->pager) != MEMORY_OBJECT_NULL) {
4909		vm_object_hash_entry_t	entry;
4910
4911		entry = vm_object_hash_lookup(pager, FALSE);
4912		if (entry != VM_OBJECT_HASH_ENTRY_NULL)
4913			entry->object = VM_OBJECT_NULL;
4914	}
4915
4916}
4917
4918/*
4919 *	Global variables for vm_object_collapse():
4920 *
4921 *		Counts for normal collapses and bypasses.
4922 *		Debugging variables, to watch or disable collapse.
4923 */
4924static long	object_collapses = 0;
4925static long	object_bypasses  = 0;
4926
4927static boolean_t	vm_object_collapse_allowed = TRUE;
4928static boolean_t	vm_object_bypass_allowed = TRUE;
4929
4930#if MACH_PAGEMAP
4931static int	vm_external_discarded;
4932static int	vm_external_collapsed;
4933#endif
4934
4935unsigned long vm_object_collapse_encrypted = 0;
4936
4937void vm_object_do_collapse_compressor(vm_object_t object,
4938				      vm_object_t backing_object);
4939void
4940vm_object_do_collapse_compressor(
4941	vm_object_t object,
4942	vm_object_t backing_object)
4943{
4944	vm_object_offset_t new_offset, backing_offset;
4945	vm_object_size_t size;
4946
4947	vm_counters.do_collapse_compressor++;
4948
4949	vm_object_lock_assert_exclusive(object);
4950	vm_object_lock_assert_exclusive(backing_object);
4951
4952	size = object->vo_size;
4953
4954	/*
4955	 *	Move all compressed pages from backing_object
4956	 *	to the parent.
4957	 */
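
	/*
	 *	Worked example of the offset arithmetic below (numbers are
	 *	for illustration only): with object->vo_shadow_offset 0x2000,
	 *	object->paging_offset 0 and backing_object->paging_offset
	 *	0x1000, a compressed page found at backing pager offset
	 *	0x5000 corresponds to backing_offset 0x4000, i.e. new_offset
	 *	0x2000 in "object", and is transferred to destination pager
	 *	offset 0x2000.
	 */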
4958
4959	for (backing_offset = object->vo_shadow_offset;
4960	     backing_offset < object->vo_shadow_offset + object->vo_size;
4961	     backing_offset += PAGE_SIZE) {
4962		memory_object_offset_t backing_pager_offset;
4963
4964		/* find the next compressed page at or after this offset */
4965		backing_pager_offset = (backing_offset +
4966					backing_object->paging_offset);
4967		backing_pager_offset = vm_compressor_pager_next_compressed(
4968			backing_object->pager,
4969			backing_pager_offset);
4970		if (backing_pager_offset == (memory_object_offset_t) -1) {
4971			/* no more compressed pages */
4972			break;
4973		}
4974		backing_offset = (backing_pager_offset -
4975				  backing_object->paging_offset);
4976
4977		new_offset = backing_offset - object->vo_shadow_offset;
4978
4979		if (new_offset >= object->vo_size) {
4980			/* we're out of the scope of "object": done */
4981			break;
4982		}
4983
4984		if ((vm_page_lookup(object, new_offset) != VM_PAGE_NULL) ||
4985		    (vm_compressor_pager_state_get(object->pager,
4986						   (new_offset +
4987						    object->paging_offset)) ==
4988		     VM_EXTERNAL_STATE_EXISTS)) {
4989			/*
4990			 * This page already exists in object, resident or
4991			 * compressed.
4992			 * We don't need this compressed page in backing_object
4993			 * and it will be reclaimed when we release
4994			 * backing_object.
4995			 */
4996			continue;
4997		}
4998
4999		/*
5000		 * backing_object has this page in the VM compressor and
5001		 * we need to transfer it to object.
5002		 */
5003		vm_counters.do_collapse_compressor_pages++;
5004		vm_compressor_pager_transfer(
5005			/* destination: */
5006			object->pager,
5007			(new_offset + object->paging_offset),
5008			/* source: */
5009			backing_object->pager,
5010			(backing_offset + backing_object->paging_offset));
5011	}
5012}
5013
5014/*
5015 *	Routine:	vm_object_do_collapse
5016 *	Purpose:
5017 *		Collapse an object with the object backing it.
5018 *		Pages in the backing object are moved into the
5019 *		parent, and the backing object is deallocated.
5020 *	Conditions:
5021 *		Both objects and the cache are locked; the page
5022 *		queues are unlocked.
5023 *
5024 */
5025static void
5026vm_object_do_collapse(
5027	vm_object_t object,
5028	vm_object_t backing_object)
5029{
5030	vm_page_t p, pp;
5031	vm_object_offset_t new_offset, backing_offset;
5032	vm_object_size_t size;
5033
5034	vm_object_lock_assert_exclusive(object);
5035	vm_object_lock_assert_exclusive(backing_object);
5036
5037	assert(object->purgable == VM_PURGABLE_DENY);
5038	assert(backing_object->purgable == VM_PURGABLE_DENY);
5039
5040	backing_offset = object->vo_shadow_offset;
5041	size = object->vo_size;
5042
5043	/*
5044	 *	Move all in-memory pages from backing_object
5045	 *	to the parent.  Pages that have been paged out
5046	 *	will be overwritten by any of the parent's
5047	 *	pages that shadow them.
5048	 */
5049
5050	while (!queue_empty(&backing_object->memq)) {
5051
5052		p = (vm_page_t) queue_first(&backing_object->memq);
5053
5054		new_offset = (p->offset - backing_offset);
5055
5056		assert(!p->busy || p->absent);
5057
5058		/*
5059		 *	If the parent has a page here, or if
5060		 *	this page falls outside the parent,
5061		 *	dispose of it.
5062		 *
5063		 *	Otherwise, move it as planned.
5064		 */
5065
5066		if (p->offset < backing_offset || new_offset >= size) {
5067			VM_PAGE_FREE(p);
5068		} else {
5069			/*
5070			 * ENCRYPTED SWAP:
5071			 * The encryption key includes the "pager" and the
5072			 * "paging_offset".  These will not change during the
5073			 * object collapse, so we can just move an encrypted
5074			 * page from one object to the other in this case.
5075			 * We can't decrypt the page here, since we can't drop
5076			 * the object lock.
5077			 */
5078			if (p->encrypted) {
5079				vm_object_collapse_encrypted++;
5080			}
5081			pp = vm_page_lookup(object, new_offset);
5082			if (pp == VM_PAGE_NULL) {
5083
5084				if (VM_COMPRESSOR_PAGER_STATE_GET(object,
5085								  new_offset)
5086				    == VM_EXTERNAL_STATE_EXISTS) {
5087					/*
5088					 * Parent object has this page
5089					 * in the VM compressor.
5090					 * Throw away the backing
5091					 * object's page.
5092					 */
5093					VM_PAGE_FREE(p);
5094				} else {
5095					/*
5096					 *	Parent now has no page.
5097					 *	Move the backing object's page
5098					 * 	up.
5099					 */
5100					vm_page_rename(p, object, new_offset,
5101						       TRUE);
5102				}
5103
5104#if	MACH_PAGEMAP
5105			} else if (pp->absent) {
5106
5107				/*
5108				 *	Parent has an absent page...
5109				 *	it's not being paged in, so
5110				 *	it must really be missing from
5111				 *	the parent.
5112				 *
5113				 *	Throw out the absent page...
5114				 *	any faults looking for that
5115				 *	page will restart with the new
5116				 *	one.
5117				 */
5118
5119				VM_PAGE_FREE(pp);
5120				vm_page_rename(p, object, new_offset, TRUE);
5121#endif	/* MACH_PAGEMAP */
5122			} else {
5123				assert(! pp->absent);
5124
5125				/*
5126				 *	Parent object has a real page.
5127				 *	Throw away the backing object's
5128				 *	page.
5129				 */
5130				VM_PAGE_FREE(p);
5131			}
5132		}
5133	}
5134
5135	if (vm_object_collapse_compressor_allowed &&
5136	    object->pager != MEMORY_OBJECT_NULL &&
5137	    backing_object->pager != MEMORY_OBJECT_NULL) {
5138
5139		/* move compressed pages from backing_object to object */
5140		vm_object_do_collapse_compressor(object, backing_object);
5141
5142	} else if (backing_object->pager != MEMORY_OBJECT_NULL) {
5143		vm_object_hash_entry_t	entry;
5144
5145#if	!MACH_PAGEMAP
5146		assert((!object->pager_created &&
5147			(object->pager == MEMORY_OBJECT_NULL)) ||
5148		       (!backing_object->pager_created &&
5149			(backing_object->pager == MEMORY_OBJECT_NULL)));
5150#else
5151		assert(!object->pager_created &&
5152		       object->pager == MEMORY_OBJECT_NULL);
5153#endif	/* !MACH_PAGEMAP */
5154
5155		/*
5156		 *	Move the pager from backing_object to object.
5157		 *
5158		 *	XXX We're only using part of the paging space
5159		 *	for keeps now... we ought to discard the
5160		 *	unused portion.
5161		 */
5162
5163		assert(!object->paging_in_progress);
5164		assert(!object->activity_in_progress);
5165		assert(!object->pager_created);
5166		assert(object->pager == NULL);
5167		object->pager = backing_object->pager;
5168
5169		if (backing_object->hashed) {
5170			lck_mtx_t	*lck;
5171
5172			lck = vm_object_hash_lock_spin(backing_object->pager);
5173			entry = vm_object_hash_lookup(object->pager, FALSE);
5174			assert(entry != VM_OBJECT_HASH_ENTRY_NULL);
5175			entry->object = object;
5176			vm_object_hash_unlock(lck);
5177
5178			object->hashed = TRUE;
5179		}
5180		object->pager_created = backing_object->pager_created;
5181		object->pager_control = backing_object->pager_control;
5182		object->pager_ready = backing_object->pager_ready;
5183		object->pager_initialized = backing_object->pager_initialized;
5184		object->paging_offset =
5185		    backing_object->paging_offset + backing_offset;
5186		if (object->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
5187			memory_object_control_collapse(object->pager_control,
5188						       object);
5189		}
5190		/* the backing_object has lost its pager: reset all fields */
5191		backing_object->pager_created = FALSE;
5192		backing_object->pager_control = NULL;
5193		backing_object->pager_ready = FALSE;
5194		backing_object->paging_offset = 0;
5195		backing_object->pager = NULL;
5196	}
5197
5198#if	MACH_PAGEMAP
5199	/*
5200	 *	If the shadow offset is 0, then use the existence map from
5201	 *	the backing object if there is one. If the shadow offset is
5202	 *	not zero, toss it.
5203	 *
5204	 *	XXX - If the shadow offset is not 0 then a bit copy is needed
5205	 *	if the map is to be salvaged.  For now, we just toss the
5206	 *	old map, giving the collapsed object no map. This means that
5207	 *	the pager is invoked for zero fill pages.  If analysis shows
5208	 *	that this happens frequently and is a performance hit, then
5209	 *	this code should be fixed to salvage the map.
5210	 */
5211	assert(object->existence_map == VM_EXTERNAL_NULL);
5212	if (backing_offset || (size != backing_object->vo_size)) {
5213		vm_external_discarded++;
5214		vm_external_destroy(backing_object->existence_map,
5215			backing_object->vo_size);
5216	}
5217	else {
5218		vm_external_collapsed++;
5219		object->existence_map = backing_object->existence_map;
5220	}
5221	backing_object->existence_map = VM_EXTERNAL_NULL;
5222#endif	/* MACH_PAGEMAP */
5223
5224	/*
5225	 *	Object now shadows whatever backing_object did.
5226	 *	Note that the reference to backing_object->shadow
5227	 *	moves from within backing_object to within object.
5228	 */
5229
5230	assert(!object->phys_contiguous);
5231	assert(!backing_object->phys_contiguous);
5232	object->shadow = backing_object->shadow;
5233	if (object->shadow) {
5234		object->vo_shadow_offset += backing_object->vo_shadow_offset;
5235		/* "backing_object" gave its shadow to "object" */
5236		backing_object->shadow = VM_OBJECT_NULL;
5237		backing_object->vo_shadow_offset = 0;
5238	} else {
5239		/* no shadow, therefore no shadow offset... */
5240		object->vo_shadow_offset = 0;
5241	}
5242	assert((object->shadow == VM_OBJECT_NULL) ||
5243	       (object->shadow->copy != backing_object));
5244
5245	/*
5246	 *	Discard backing_object.
5247	 *
5248	 *	Since the backing object has no pages, no
5249	 *	pager left, and no object references within it,
5250	 *	all that is necessary is to dispose of it.
5251	 */
5252	object_collapses++;
5253
5254	assert(backing_object->ref_count == 1);
5255	assert(backing_object->resident_page_count == 0);
5256	assert(backing_object->paging_in_progress == 0);
5257	assert(backing_object->activity_in_progress == 0);
5258	assert(backing_object->shadow == VM_OBJECT_NULL);
5259	assert(backing_object->vo_shadow_offset == 0);
5260
5261	if (backing_object->pager != MEMORY_OBJECT_NULL) {
5262		/* ... unless it has a pager; need to terminate pager too */
5263		vm_counters.do_collapse_terminate++;
5264		if (vm_object_terminate(backing_object) != KERN_SUCCESS) {
5265			vm_counters.do_collapse_terminate_failure++;
5266		}
5267		return;
5268	}
5269
5270	assert(backing_object->pager == NULL);
5271
5272	backing_object->alive = FALSE;
5273	vm_object_unlock(backing_object);
5274
5275	XPR(XPR_VM_OBJECT, "vm_object_collapse, collapsed 0x%X\n",
5276		backing_object, 0,0,0,0);
5277
5278#if VM_OBJECT_TRACKING
5279	if (vm_object_tracking_inited) {
5280		btlog_remove_entries_for_element(vm_object_tracking_btlog,
5281						 backing_object);
5282	}
5283#endif /* VM_OBJECT_TRACKING */
5284
5285	vm_object_lock_destroy(backing_object);
5286
5287	zfree(vm_object_zone, backing_object);
5288
5289}
5290
5291static void
5292vm_object_do_bypass(
5293	vm_object_t object,
5294	vm_object_t backing_object)
5295{
5296	/*
5297	 *	Make the parent shadow the next object
5298	 *	in the chain.
5299	 */
5300
5301	vm_object_lock_assert_exclusive(object);
5302	vm_object_lock_assert_exclusive(backing_object);
5303
5304#if	TASK_SWAPPER
5305	/*
5306	 *	Do object reference in-line to
5307	 *	conditionally increment shadow's
5308	 *	residence count.  If object is not
5309	 *	resident, leave residence count
5310	 *	on shadow alone.
5311	 */
5312	if (backing_object->shadow != VM_OBJECT_NULL) {
5313		vm_object_lock(backing_object->shadow);
5314		vm_object_lock_assert_exclusive(backing_object->shadow);
5315		backing_object->shadow->ref_count++;
5316		if (object->res_count != 0)
5317			vm_object_res_reference(backing_object->shadow);
5318		vm_object_unlock(backing_object->shadow);
5319	}
5320#else	/* TASK_SWAPPER */
5321	vm_object_reference(backing_object->shadow);
5322#endif	/* TASK_SWAPPER */
5323
5324	assert(!object->phys_contiguous);
5325	assert(!backing_object->phys_contiguous);
5326	object->shadow = backing_object->shadow;
5327	if (object->shadow) {
5328		object->vo_shadow_offset += backing_object->vo_shadow_offset;
5329	} else {
5330		/* no shadow, therefore no shadow offset... */
5331		object->vo_shadow_offset = 0;
5332	}
5333
5334	/*
5335	 *	Backing object might have had a copy pointer
5336	 *	to us.  If it did, clear it.
5337	 */
5338	if (backing_object->copy == object) {
5339		backing_object->copy = VM_OBJECT_NULL;
5340	}
5341
5342	/*
5343	 *	Drop the reference count on backing_object.
5344#if	TASK_SWAPPER
5345	 *	Since its ref_count was at least 2, it
5346	 *	will not vanish; so we don't need to call
5347	 *	vm_object_deallocate.
5348	 *	[with a caveat for "named" objects]
5349	 *
5350	 *	The res_count on the backing object is
5351	 *	conditionally decremented.  It's possible
5352	 *	(via vm_pageout_scan) to get here with
5353	 *	a "swapped" object, which has a 0 res_count,
5354	 *	in which case, the backing object res_count
5355	 *	is already down by one.
5356#else
5357	 *	Don't call vm_object_deallocate unless
5358	 *	ref_count drops to zero.
5359	 *
5360	 *	The ref_count can drop to zero here if the
5361	 *	backing object could be bypassed but not
5362	 *	collapsed, such as when the backing object
5363	 *	is temporary and cacheable.
5364#endif
5365	 */
5366	if (backing_object->ref_count > 2 ||
5367	    (!backing_object->named && backing_object->ref_count > 1)) {
5368		vm_object_lock_assert_exclusive(backing_object);
5369		backing_object->ref_count--;
5370#if	TASK_SWAPPER
5371		if (object->res_count != 0)
5372			vm_object_res_deallocate(backing_object);
5373		assert(backing_object->ref_count > 0);
5374#endif	/* TASK_SWAPPER */
5375		vm_object_unlock(backing_object);
5376	} else {
5377
5378		/*
5379		 *	Drop locks so that we can deallocate
5380		 *	the backing object.
5381		 */
5382
5383#if	TASK_SWAPPER
5384		if (object->res_count == 0) {
5385			/* XXX get a reference for the deallocate below */
5386			vm_object_res_reference(backing_object);
5387		}
5388#endif	/* TASK_SWAPPER */
5389		/*
5390		 * vm_object_collapse (the caller of this function) is
5391		 * now called from contexts that may not guarantee that a
5392		 * valid reference is held on the object... w/o a valid
5393		 * reference, it is unsafe and unwise (you will definitely
5394		 * regret it) to unlock the object and then retake the lock
5395		 * since the object may be terminated and recycled in between.
5396		 * The "activity_in_progress" reference will keep the object
5397		 * 'stable'.
5398		 */
5399		vm_object_activity_begin(object);
5400		vm_object_unlock(object);
5401
5402		vm_object_unlock(backing_object);
5403		vm_object_deallocate(backing_object);
5404
5405		/*
5406		 *	Relock object. We don't have to reverify
5407		 *	its state since vm_object_collapse will
5408		 *	do that for us as it starts at the
5409		 *	top of its loop.
5410		 */
5411
5412		vm_object_lock(object);
5413		vm_object_activity_end(object);
5414	}
5415
5416	object_bypasses++;
5417}
5418
5419
5420/*
5421 *	vm_object_collapse:
5422 *
5423 *	Perform an object collapse or an object bypass if appropriate.
5424 *	The real work of collapsing and bypassing is performed in
5425 *	the routines vm_object_do_collapse and vm_object_do_bypass.
5426 *
5427 *	Requires that the object be locked and the page queues be unlocked.
5428 *
5429 */
5430static unsigned long vm_object_collapse_calls = 0;
5431static unsigned long vm_object_collapse_objects = 0;
5432static unsigned long vm_object_collapse_do_collapse = 0;
5433static unsigned long vm_object_collapse_do_bypass = 0;
5434
5435__private_extern__ void
5436vm_object_collapse(
5437	register vm_object_t			object,
5438	register vm_object_offset_t		hint_offset,
5439	boolean_t				can_bypass)
5440{
5441	register vm_object_t			backing_object;
5442	register unsigned int			rcount;
5443	register unsigned int			size;
5444	vm_object_t				original_object;
5445	int					object_lock_type;
5446	int					backing_object_lock_type;
5447
5448	vm_object_collapse_calls++;
5449
5450	if (! vm_object_collapse_allowed &&
5451	    ! (can_bypass && vm_object_bypass_allowed)) {
5452		return;
5453	}
5454
5455	XPR(XPR_VM_OBJECT, "vm_object_collapse, obj 0x%X\n",
5456		object, 0,0,0,0);
5457
5458	if (object == VM_OBJECT_NULL)
5459		return;
5460
5461	original_object = object;
5462
5463	/*
5464	 * The top object was locked "exclusive" by the caller.
5465	 * In the first pass, to determine if we can collapse the shadow chain,
5466	 * take a "shared" lock on the shadow objects.  If we can collapse,
5467	 * we'll have to go down the chain again with exclusive locks.
5468	 */
5469	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5470	backing_object_lock_type = OBJECT_LOCK_SHARED;
5471
5472retry:
5473	object = original_object;
5474	vm_object_lock_assert_exclusive(object);
5475
5476	while (TRUE) {
5477		vm_object_collapse_objects++;
5478		/*
5479		 *	Verify that the conditions are right for either
5480		 *	collapse or bypass:
5481		 */
5482
5483		/*
5484		 *	There is a backing object, and
5485		 */
5486
5487		backing_object = object->shadow;
5488		if (backing_object == VM_OBJECT_NULL) {
5489			if (object != original_object) {
5490				vm_object_unlock(object);
5491			}
5492			return;
5493		}
5494		if (backing_object_lock_type == OBJECT_LOCK_SHARED) {
5495			vm_object_lock_shared(backing_object);
5496		} else {
5497			vm_object_lock(backing_object);
5498		}
5499
5500		/*
5501		 *	No pages in the object are currently
5502		 *	being paged out, and
5503		 */
5504		if (object->paging_in_progress != 0 ||
5505		    object->activity_in_progress != 0) {
5506			/* try and collapse the rest of the shadow chain */
5507			if (object != original_object) {
5508				vm_object_unlock(object);
5509			}
5510			object = backing_object;
5511			object_lock_type = backing_object_lock_type;
5512			continue;
5513		}
5514
5515		/*
5516		 *	...
5517		 *		The backing object is not read_only,
5518		 *		and no pages in the backing object are
5519		 *		currently being paged out.
5520		 *		The backing object is internal.
5521		 *
5522		 */
5523
5524		if (!backing_object->internal ||
5525		    backing_object->paging_in_progress != 0 ||
5526		    backing_object->activity_in_progress != 0) {
5527			/* try and collapse the rest of the shadow chain */
5528			if (object != original_object) {
5529				vm_object_unlock(object);
5530			}
5531			object = backing_object;
5532			object_lock_type = backing_object_lock_type;
5533			continue;
5534		}
5535
5536		/*
5537		 * Purgeable objects are not supposed to engage in
5538		 * copy-on-write activities, so should not have
5539		 * any shadow objects or be a shadow object to another
5540		 * object.
5541		 * Collapsing a purgeable object would require some
5542		 * updates to the purgeable compressed ledgers.
5543		 */
5544		if (object->purgable != VM_PURGABLE_DENY ||
5545		    backing_object->purgable != VM_PURGABLE_DENY) {
5546			panic("vm_object_collapse() attempting to collapse "
5547			      "purgeable object: %p(%d) %p(%d)\n",
5548			      object, object->purgable,
5549			      backing_object, backing_object->purgable);
5550			/* try and collapse the rest of the shadow chain */
5551			if (object != original_object) {
5552				vm_object_unlock(object);
5553			}
5554			object = backing_object;
5555			object_lock_type = backing_object_lock_type;
5556			continue;
5557		}
5558
5559		/*
5560		 *	The backing object can't be a copy-object:
5561		 *	the shadow_offset for the copy-object must stay
5562		 *	as 0.  Furthermore (for the 'we have all the
5563		 *	pages' case), if we bypass backing_object and
5564		 *	just shadow the next object in the chain, old
5565		 *	pages from that object would then have to be copied
5566		 *	BOTH into the (former) backing_object and into the
5567		 *	parent object.
5568		 */
5569		if (backing_object->shadow != VM_OBJECT_NULL &&
5570		    backing_object->shadow->copy == backing_object) {
5571			/* try and collapse the rest of the shadow chain */
5572			if (object != original_object) {
5573				vm_object_unlock(object);
5574			}
5575			object = backing_object;
5576			object_lock_type = backing_object_lock_type;
5577			continue;
5578		}
5579
5580		/*
5581		 *	We can now try to either collapse the backing
5582		 *	object (if the parent is the only reference to
5583		 *	it) or (perhaps) remove the parent's reference
5584		 *	to it.
5585		 *
5586		 *	If there is exactly one reference to the backing
5587		 *	object, we may be able to collapse it into the
5588		 *	parent.
5589		 *
5590		 *	If MACH_PAGEMAP is defined:
5591		 *	The parent must not have a pager created for it,
5592		 *	since collapsing a backing_object dumps new pages
5593		 *	into the parent that its pager doesn't know about
5594		 *	(and the collapse code can't merge the existence
5595		 *	maps).
5596		 *	Otherwise:
5597		 *	As long as one of the objects is still not known
5598		 *	to the pager, we can collapse them.
5599		 */
5600		if (backing_object->ref_count == 1 &&
5601		    (vm_object_collapse_compressor_allowed ||
5602		     !object->pager_created
5603#if	!MACH_PAGEMAP
5604		     || (!backing_object->pager_created)
5605#endif	/*!MACH_PAGEMAP */
5606		    ) && vm_object_collapse_allowed) {
5607
5608			/*
5609			 * We need the exclusive lock on the VM objects.
5610			 */
5611			if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
5612				/*
5613				 * We have an object and its shadow locked
5614				 * "shared".  We can't just upgrade the locks
5615				 * to "exclusive", as some other thread might
5616				 * also have these objects locked "shared" and
5617				 * attempt to upgrade one or the other to
5618				 * "exclusive".  The upgrades would block
5619				 * forever waiting for the other "shared" locks
5620				 * to get released.
5621				 * So we have to release the locks and go
5622				 * down the shadow chain again (since it could
5623				 * have changed) with "exclusive" locking.
5624				 */
5625				vm_object_unlock(backing_object);
5626				if (object != original_object)
5627					vm_object_unlock(object);
5628				object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5629				backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5630				goto retry;
5631			}
5632
5633			XPR(XPR_VM_OBJECT,
5634		   "vm_object_collapse: %x to %x, pager %x, pager_control %x\n",
5635				backing_object, object,
5636				backing_object->pager,
5637				backing_object->pager_control, 0);
5638
5639			/*
5640			 *	Collapse the object with its backing
5641			 *	object, and try again with the object's
5642			 *	new backing object.
5643			 */
5644
5645			vm_object_do_collapse(object, backing_object);
5646			vm_object_collapse_do_collapse++;
5647			continue;
5648		}
5649
5650		/*
5651		 *	Collapsing the backing object was not possible
5652		 *	or permitted, so let's try bypassing it.
5653		 */
5654
5655		if (! (can_bypass && vm_object_bypass_allowed)) {
5656			/* try and collapse the rest of the shadow chain */
5657			if (object != original_object) {
5658				vm_object_unlock(object);
5659			}
5660			object = backing_object;
5661			object_lock_type = backing_object_lock_type;
5662			continue;
5663		}
5664
5665
5666		/*
5667		 *	If the object doesn't have all its pages present,
5668		 *	we have to make sure no pages in the backing object
5669		 *	"show through" before bypassing it.
5670		 */
5671		size = (unsigned int)atop(object->vo_size);
5672		rcount = object->resident_page_count;
5673
5674		if (rcount != size) {
5675			vm_object_offset_t	offset;
5676			vm_object_offset_t	backing_offset;
5677			unsigned int     	backing_rcount;
5678
5679			/*
5680			 *	If the backing object has a pager but no pagemap,
5681			 *	then we cannot bypass it, because we don't know
5682			 *	what pages it has.
5683			 */
5684			if (backing_object->pager_created
5685#if	MACH_PAGEMAP
5686			    && (backing_object->existence_map == VM_EXTERNAL_NULL)
5687#endif	/* MACH_PAGEMAP */
5688				) {
5689				/* try and collapse the rest of the shadow chain */
5690				if (object != original_object) {
5691					vm_object_unlock(object);
5692				}
5693				object = backing_object;
5694				object_lock_type = backing_object_lock_type;
5695				continue;
5696			}
5697
5698			/*
5699			 *	If the object has a pager but no pagemap,
5700			 *	then we cannot bypass it, because we don't know
5701			 *	what pages it has.
5702			 */
5703			if (object->pager_created
5704#if	MACH_PAGEMAP
5705			    && (object->existence_map == VM_EXTERNAL_NULL)
5706#endif	/* MACH_PAGEMAP */
5707				) {
5708				/* try and collapse the rest of the shadow chain */
5709				if (object != original_object) {
5710					vm_object_unlock(object);
5711				}
5712				object = backing_object;
5713				object_lock_type = backing_object_lock_type;
5714				continue;
5715			}
5716
5717			backing_offset = object->vo_shadow_offset;
5718			backing_rcount = backing_object->resident_page_count;
5719
5720			if ( (int)backing_rcount - (int)(atop(backing_object->vo_size) - size) > (int)rcount) {
5721				/*
5722				 * we have enough pages in the backing object to guarantee that
5723				 * at least 1 of them must be 'uncovered' by a resident page
5724				 * in the object we're evaluating, so move on and
5725				 * try to collapse the rest of the shadow chain
5726				 */
5727				if (object != original_object) {
5728					vm_object_unlock(object);
5729				}
5730				object = backing_object;
5731				object_lock_type = backing_object_lock_type;
5732				continue;
5733			}
5734
5735			/*
5736			 *	If all of the pages in the backing object are
5737			 *	shadowed by the parent object, the parent
5738			 *	object no longer has to shadow the backing
5739			 *	object; it can shadow the next one in the
5740			 *	chain.
5741			 *
5742			 *	If the backing object has existence info,
5743		 *	we must examine its existence info
5744			 *	as well.
5745			 *
5746			 */
5747
5748#if	MACH_PAGEMAP
5749#define EXISTS_IN_OBJECT(obj, off, rc) \
5750	((vm_external_state_get((obj)->existence_map,	\
5751				(vm_offset_t)(off))	\
5752	  == VM_EXTERNAL_STATE_EXISTS) ||		\
5753	 (VM_COMPRESSOR_PAGER_STATE_GET((obj), (off))	\
5754	  == VM_EXTERNAL_STATE_EXISTS) ||		\
5755	 ((rc) && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))
5756#else	/* MACH_PAGEMAP */
5757#define EXISTS_IN_OBJECT(obj, off, rc)			\
5758	((VM_COMPRESSOR_PAGER_STATE_GET((obj), (off))	\
5759	  == VM_EXTERNAL_STATE_EXISTS) ||		\
5760	 ((rc) && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))
5761#endif	/* MACH_PAGEMAP */
5762
5763			/*
5764			 * Check the hint location first
5765			 * (since it is often the quickest way out of here).
5766			 */
5767			if (object->cow_hint != ~(vm_offset_t)0)
5768				hint_offset = (vm_object_offset_t)object->cow_hint;
5769			else
5770				hint_offset = (hint_offset > 8 * PAGE_SIZE_64) ?
5771				              (hint_offset - 8 * PAGE_SIZE_64) : 0;
5772
5773			if (EXISTS_IN_OBJECT(backing_object, hint_offset +
5774			                     backing_offset, backing_rcount) &&
5775			    !EXISTS_IN_OBJECT(object, hint_offset, rcount)) {
5776				/* dependency right at the hint */
5777				object->cow_hint = (vm_offset_t) hint_offset; /* atomic */
5778				/* try and collapse the rest of the shadow chain */
5779				if (object != original_object) {
5780					vm_object_unlock(object);
5781				}
5782				object = backing_object;
5783				object_lock_type = backing_object_lock_type;
5784				continue;
5785			}
5786
5787			/*
5788			 * If the object's window onto the backing_object
5789			 * is large compared to the number of resident
5790			 * pages in the backing object, it makes sense to
5791			 * walk the backing_object's resident pages first.
5792			 *
5793			 * NOTE: Pages may be in the existence map and/or
5794			 * resident, so if we don't find a dependency while
5795			 * walking the backing object's resident page list
5796			 * directly, and there is an existence map, we'll have
5797			 * to run the offset-based 2nd pass.  Because we may
5798			 * have to run both passes, we need to be careful
5799			 * not to decrement 'rcount' in the 1st pass.
5800			 */
5801			if (backing_rcount && backing_rcount < (size / 8)) {
5802				unsigned int rc = rcount;
5803				vm_page_t p;
5804
5805				backing_rcount = backing_object->resident_page_count;
5806				p = (vm_page_t)queue_first(&backing_object->memq);
5807				do {
5808					offset = (p->offset - backing_offset);
5809
5810					if (offset < object->vo_size &&
5811					    offset != hint_offset &&
5812					    !EXISTS_IN_OBJECT(object, offset, rc)) {
5813						/* found a dependency */
5814						object->cow_hint = (vm_offset_t) offset; /* atomic */
5815
5816						break;
5817					}
5818					p = (vm_page_t) queue_next(&p->listq);
5819
5820				} while (--backing_rcount);
5821				if (backing_rcount != 0 ) {
5822					/* try and collapse the rest of the shadow chain */
5823					if (object != original_object) {
5824						vm_object_unlock(object);
5825					}
5826					object = backing_object;
5827					object_lock_type = backing_object_lock_type;
5828					continue;
5829				}
5830			}
5831
5832			/*
5833			 * Walk through the offsets looking for pages in the
5834			 * backing object that show through to the object.
5835			 */
5836			if (backing_rcount
5837#if MACH_PAGEMAP
5838			    || backing_object->existence_map
5839#endif	/* MACH_PAGEMAP */
5840				) {
5841				offset = hint_offset;
5842
5843				while((offset =
5844				      (offset + PAGE_SIZE_64 < object->vo_size) ?
5845				      (offset + PAGE_SIZE_64) : 0) != hint_offset) {
5846
5847					if (EXISTS_IN_OBJECT(backing_object, offset +
5848				            backing_offset, backing_rcount) &&
5849					    !EXISTS_IN_OBJECT(object, offset, rcount)) {
5850						/* found a dependency */
5851						object->cow_hint = (vm_offset_t) offset; /* atomic */
5852						break;
5853					}
5854				}
5855				if (offset != hint_offset) {
5856					/* try and collapse the rest of the shadow chain */
5857					if (object != original_object) {
5858						vm_object_unlock(object);
5859					}
5860					object = backing_object;
5861					object_lock_type = backing_object_lock_type;
5862					continue;
5863				}
5864			}
5865		}
5866
5867		/*
5868		 * We need "exclusive" locks on the 2 VM objects.
5869		 */
5870		if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
5871			vm_object_unlock(backing_object);
5872			if (object != original_object)
5873				vm_object_unlock(object);
5874			object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5875			backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5876			goto retry;
5877		}
5878
5879		/* reset the offset hint for any objects deeper in the chain */
5880		object->cow_hint = (vm_offset_t)0;
5881
5882		/*
5883		 *	All interesting pages in the backing object
5884		 *	already live in the parent or its pager.
5885		 *	Thus we can bypass the backing object.
5886		 */
5887
5888		vm_object_do_bypass(object, backing_object);
5889		vm_object_collapse_do_bypass++;
5890
5891		/*
5892		 *	Try again with this object's new backing object.
5893		 */
5894
5895		continue;
5896	}
5897
5898	/* NOT REACHED */
5899	/*
5900	if (object != original_object) {
5901		vm_object_unlock(object);
5902	}
5903	*/
5904}
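
/*
 * Illustrative sketch (comment only, not compiled): how a caller in this
 * file drives the collapse machinery.  The call mirrors the one made from
 * vm_object_coalesce() further down; "object" stands for any top object
 * the caller already holds locked "exclusive".
 *
 *	vm_object_lock(object);
 *	vm_object_collapse(object,
 *			   (vm_object_offset_t) 0,	hint offset
 *			   TRUE);			bypass allowed
 *	...
 *	vm_object_unlock(object);
 *
 * On return the original object is still locked; zero or more backing
 * objects along its shadow chain may have been collapsed into it or
 * bypassed.
 */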
5905
5906/*
5907 *	Routine:	vm_object_page_remove: [internal]
5908 *	Purpose:
5909 *		Removes all physical pages in the specified
5910 *		object range from the object's list of pages.
5911 *
5912 *	In/out conditions:
5913 *		The object must be locked.
5914 *		The object must not have paging_in_progress, usually
5915 *		guaranteed by not having a pager.
5916 */
5917unsigned int vm_object_page_remove_lookup = 0;
5918unsigned int vm_object_page_remove_iterate = 0;
5919
5920__private_extern__ void
5921vm_object_page_remove(
5922	register vm_object_t		object,
5923	register vm_object_offset_t	start,
5924	register vm_object_offset_t	end)
5925{
5926	register vm_page_t	p, next;
5927
5928	/*
5929	 *	One and two page removals are most popular.
5930	 *	The factor of 16 here is somewhat arbitrary.
5931	 *	It balances vm_page_lookup vs. iteration.
5932	 */
5933
5934	if (atop_64(end - start) < (unsigned)object->resident_page_count/16) {
5935		vm_object_page_remove_lookup++;
5936
5937		for (; start < end; start += PAGE_SIZE_64) {
5938			p = vm_page_lookup(object, start);
5939			if (p != VM_PAGE_NULL) {
5940				assert(!p->cleaning && !p->pageout && !p->laundry);
5941				if (!p->fictitious && p->pmapped)
5942				        pmap_disconnect(p->phys_page);
5943				VM_PAGE_FREE(p);
5944			}
5945		}
5946	} else {
5947		vm_object_page_remove_iterate++;
5948
5949		p = (vm_page_t) queue_first(&object->memq);
5950		while (!queue_end(&object->memq, (queue_entry_t) p)) {
5951			next = (vm_page_t) queue_next(&p->listq);
5952			if ((start <= p->offset) && (p->offset < end)) {
5953				assert(!p->cleaning && !p->pageout && !p->laundry);
5954				if (!p->fictitious && p->pmapped)
5955				        pmap_disconnect(p->phys_page);
5956				VM_PAGE_FREE(p);
5957			}
5958			p = next;
5959		}
5960	}
5961}
5962
5963
5964/*
5965 *	Routine:	vm_object_coalesce
5966 *	Function:	Coalesces two objects backing up adjoining
5967 *			regions of memory into a single object.
5968 *
5969 *	returns TRUE if objects were combined.
5970 *
5971 *	NOTE:	Only works at the moment if the second object is NULL -
5972 *		if it's not, which object do we lock first?
5973 *
5974 *	Parameters:
5975 *		prev_object	First object to coalesce
5976 *		prev_offset	Offset into prev_object
5977 *		next_object	Second object to coalesce
5978 *		next_offset	Offset into next_object
5979 *
5980 *		prev_size	Size of reference to prev_object
5981 *		next_size	Size of reference to next_object
5982 *
5983 *	Conditions:
5984 *	The object(s) must *not* be locked. The map must be locked
5985 *	to preserve the reference to the object(s).
5986 */
5987static int vm_object_coalesce_count = 0;
5988
5989__private_extern__ boolean_t
5990vm_object_coalesce(
5991	register vm_object_t		prev_object,
5992	vm_object_t			next_object,
5993	vm_object_offset_t		prev_offset,
5994	__unused vm_object_offset_t next_offset,
5995	vm_object_size_t		prev_size,
5996	vm_object_size_t		next_size)
5997{
5998	vm_object_size_t	newsize;
5999
6000#ifdef	lint
6001	next_offset++;
6002#endif	/* lint */
6003
6004	if (next_object != VM_OBJECT_NULL) {
6005		return(FALSE);
6006	}
6007
6008	if (prev_object == VM_OBJECT_NULL) {
6009		return(TRUE);
6010	}
6011
6012	XPR(XPR_VM_OBJECT,
6013       "vm_object_coalesce: 0x%X prev_off 0x%X prev_size 0x%X next_size 0x%X\n",
6014		prev_object, prev_offset, prev_size, next_size, 0);
6015
6016	vm_object_lock(prev_object);
6017
6018	/*
6019	 *	Try to collapse the object first
6020	 */
6021	vm_object_collapse(prev_object, prev_offset, TRUE);
6022
6023	/*
6024	 *	Can't coalesce if pages not mapped to
6025	 *	prev_entry may be in use in any way:
6026	 *	. more than one reference
6027	 *	. paged out
6028	 *	. shadows another object
6029	 *	. has a copy elsewhere
6030	 *	. is purgeable
6031	 *	. paging references (pages might be in page-list)
6032	 */
6033
6034	if ((prev_object->ref_count > 1) ||
6035	    prev_object->pager_created ||
6036	    (prev_object->shadow != VM_OBJECT_NULL) ||
6037	    (prev_object->copy != VM_OBJECT_NULL) ||
6038	    (prev_object->true_share != FALSE) ||
6039	    (prev_object->purgable != VM_PURGABLE_DENY) ||
6040	    (prev_object->paging_in_progress != 0) ||
6041	    (prev_object->activity_in_progress != 0)) {
6042		vm_object_unlock(prev_object);
6043		return(FALSE);
6044	}
6045
6046	vm_object_coalesce_count++;
6047
6048	/*
6049	 *	Remove any pages that may still be in the object from
6050	 *	a previous deallocation.
6051	 */
6052	vm_object_page_remove(prev_object,
6053		prev_offset + prev_size,
6054		prev_offset + prev_size + next_size);
6055
6056	/*
6057	 *	Extend the object if necessary.
6058	 */
6059	newsize = prev_offset + prev_size + next_size;
6060	if (newsize > prev_object->vo_size) {
6061#if	MACH_PAGEMAP
6062		/*
6063		 *	We cannot extend an object that has existence info,
6064		 *	since the existence info might then fail to cover
6065		 *	the entire object.
6066		 *
6067		 *	This assertion must be true because the object
6068		 *	has no pager, and we only create existence info
6069		 *	for objects with pagers.
6070		 */
6071		assert(prev_object->existence_map == VM_EXTERNAL_NULL);
6072#endif	/* MACH_PAGEMAP */
6073		prev_object->vo_size = newsize;
6074	}
6075
6076	vm_object_unlock(prev_object);
6077	return(TRUE);
6078}
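
/*
 * Illustrative sketch (hypothetical caller, not compiled): asking whether
 * "prev_object" can absorb the next "next_size" bytes being allocated
 * right behind it.  Per the NOTE above, the second object must be
 * VM_OBJECT_NULL for coalescing to be attempted at all; the object is
 * passed unlocked and the map is assumed to be held locked.
 *
 *	if (vm_object_coalesce(prev_object,
 *			       VM_OBJECT_NULL,
 *			       prev_offset,
 *			       (vm_object_offset_t) 0,
 *			       prev_size,
 *			       next_size)) {
 *		the new range can simply reuse prev_object at
 *		offset prev_offset + prev_size
 *	}
 */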
6079
6080/*
6081 *	Attach a set of physical pages to an object, so that they can
6082 *	be mapped by mapping the object.  Typically used to map IO memory.
6083 *
6084 *	The mapping function and its private data are used to obtain the
6085 *	physical addresses for each page to be mapped.
6086 */
6087void
6088vm_object_page_map(
6089	vm_object_t		object,
6090	vm_object_offset_t	offset,
6091	vm_object_size_t	size,
6092	vm_object_offset_t	(*map_fn)(void *map_fn_data,
6093		vm_object_offset_t offset),
6094		void 		*map_fn_data)	/* private to map_fn */
6095{
6096	int64_t	num_pages;
6097	int	i;
6098	vm_page_t	m;
6099	vm_page_t	old_page;
6100	vm_object_offset_t	addr;
6101
6102	num_pages = atop_64(size);
6103
6104	for (i = 0; i < num_pages; i++, offset += PAGE_SIZE_64) {
6105
6106	    addr = (*map_fn)(map_fn_data, offset);
6107
6108	    while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL)
6109		vm_page_more_fictitious();
6110
6111	    vm_object_lock(object);
6112	    if ((old_page = vm_page_lookup(object, offset))
6113			!= VM_PAGE_NULL)
6114	    {
6115		    VM_PAGE_FREE(old_page);
6116	    }
6117
6118	    assert((ppnum_t) addr == addr);
6119	    vm_page_init(m, (ppnum_t) addr, FALSE);
6120	    /*
6121	     * private normally requires lock_queues but since we
6122	     * are initializing the page, it's not necessary here
6123	     */
6124	    m->private = TRUE;		/* don't free page */
6125	    m->wire_count = 1;
6126	    vm_page_insert(m, object, offset);
6127
6128	    PAGE_WAKEUP_DONE(m);
6129	    vm_object_unlock(object);
6130	}
6131}
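
/*
 * Illustrative sketch (hypothetical helper, not compiled): a map_fn for a
 * physically contiguous I/O region.  The callback must return, for each
 * object offset, the physical page number to map there (the result is
 * cast to ppnum_t above).  "struct io_region" and "base_page" are
 * assumptions for this example, not part of this file.
 *
 *	struct io_region {
 *		ppnum_t	base_page;
 *	};
 *
 *	static vm_object_offset_t
 *	io_region_map_fn(void *map_fn_data, vm_object_offset_t offset)
 *	{
 *		struct io_region *r = (struct io_region *) map_fn_data;
 *
 *		return (vm_object_offset_t) (r->base_page + atop_64(offset));
 *	}
 *
 *	vm_object_page_map(object, 0, region_size, io_region_map_fn, &region);
 */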
6132
6133kern_return_t
6134vm_object_populate_with_private(
6135		vm_object_t		object,
6136		vm_object_offset_t	offset,
6137		ppnum_t			phys_page,
6138		vm_size_t		size)
6139{
6140	ppnum_t			base_page;
6141	vm_object_offset_t	base_offset;
6142
6143
6144	if (!object->private)
6145		return KERN_FAILURE;
6146
6147	base_page = phys_page;
6148
6149	vm_object_lock(object);
6150
6151	if (!object->phys_contiguous) {
6152		vm_page_t	m;
6153
6154		if ((base_offset = trunc_page_64(offset)) != offset) {
6155			vm_object_unlock(object);
6156			return KERN_FAILURE;
6157		}
6158		base_offset += object->paging_offset;
6159
6160		while (size) {
6161			m = vm_page_lookup(object, base_offset);
6162
6163			if (m != VM_PAGE_NULL) {
6164				if (m->fictitious) {
6165					if (m->phys_page != vm_page_guard_addr) {
6166
6167						vm_page_lockspin_queues();
6168						m->private = TRUE;
6169						vm_page_unlock_queues();
6170
6171						m->fictitious = FALSE;
6172						m->phys_page = base_page;
6173					}
6174				} else if (m->phys_page != base_page) {
6175
6176				        if ( !m->private) {
6177						/*
6178						 * we'd leak a real page... that can't be right
6179						 */
6180						panic("vm_object_populate_with_private - %p not private", m);
6181					}
6182					if (m->pmapped) {
6183					        /*
6184						 * pmap call to clear old mapping
6185						 */
6186					        pmap_disconnect(m->phys_page);
6187					}
6188					m->phys_page = base_page;
6189				}
6190				if (m->encrypted) {
6191					/*
6192					 * we should never see this on a fictitious or private page
6193					 */
6194					panic("vm_object_populate_with_private - %p encrypted", m);
6195				}
6196
6197			} else {
6198				while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL)
6199                			vm_page_more_fictitious();
6200
6201				/*
6202				 * private normally requires lock_queues but since we
6203				 * are initializing the page, it's not necessary here
6204				 */
6205				m->private = TRUE;
6206				m->fictitious = FALSE;
6207				m->phys_page = base_page;
6208				m->unusual = TRUE;
6209				m->busy = FALSE;
6210
6211	    			vm_page_insert(m, object, base_offset);
6212			}
6213			base_page++;									/* Go to the next physical page */
6214			base_offset += PAGE_SIZE;
6215			size -= PAGE_SIZE;
6216		}
6217	} else {
6218		/* NOTE: we should check the original settings here */
6219		/* if we have a size > zero a pmap call should be made */
6220		/* to disable the range */
6221
6222		/* pmap_? */
6223
6224		/* shadows on contiguous memory are not allowed */
6225		/* we therefore can use the offset field */
6226		object->vo_shadow_offset = (vm_object_offset_t)phys_page << PAGE_SHIFT;
6227		object->vo_size = size;
6228	}
6229	vm_object_unlock(object);
6230
6231	return KERN_SUCCESS;
6232}
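
/*
 * Illustrative sketch (hypothetical caller, not compiled): backing a
 * "private" object with a run of device pages starting at phys_page.
 * The offset must be page aligned and size a multiple of PAGE_SIZE;
 * the object must have been created with its "private" flag set.
 *
 *	kern_return_t kr;
 *
 *	kr = vm_object_populate_with_private(object,
 *					     offset,
 *					     phys_page,
 *					     size);
 *	if (kr != KERN_SUCCESS) {
 *		object was not private, or offset was not page aligned
 *	}
 */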
6233
6234/*
6235 *	memory_object_free_from_cache:
6236 *
6237 *	Walk the vm_object cache list, removing and freeing vm_objects
6238 *	which are backed by the pager identified by the caller (pager_ops).
6239 *	Remove up to "count" objects, if that many are available
6240 *	in the cache.
6241 *
6242 *	Walk the list at most once, return the number of vm_objects
6243 *	actually freed.
6244 */
6245
6246__private_extern__ kern_return_t
6247memory_object_free_from_cache(
6248	__unused host_t		host,
6249	__unused memory_object_pager_ops_t pager_ops,
6250	int		*count)
6251{
6252#if VM_OBJECT_CACHE
6253	int	object_released = 0;
6254
6255	register vm_object_t object = VM_OBJECT_NULL;
6256	vm_object_t shadow;
6257
6258/*
6259	if(host == HOST_NULL)
6260		return(KERN_INVALID_ARGUMENT);
6261*/
6262
6263 try_again:
6264	vm_object_cache_lock();
6265
6266	queue_iterate(&vm_object_cached_list, object,
6267					vm_object_t, cached_list) {
6268		if (object->pager &&
6269		    (pager_ops == object->pager->mo_pager_ops)) {
6270			vm_object_lock(object);
6271			queue_remove(&vm_object_cached_list, object,
6272					vm_object_t, cached_list);
6273			vm_object_cached_count--;
6274
6275			vm_object_cache_unlock();
6276			/*
6277		 	*	Since this object is in the cache, we know
6278		 	*	that it is initialized and has only a pager's
6279			*	(implicit) reference. Take a reference to avoid
6280			*	recursive deallocations.
6281		 	*/
6282
6283			assert(object->pager_initialized);
6284			assert(object->ref_count == 0);
6285			vm_object_lock_assert_exclusive(object);
6286			object->ref_count++;
6287
6288			/*
6289		 	*	Terminate the object.
6290		 	*	If the object had a shadow, we let
6291			*	vm_object_deallocate deallocate it.
6292			*	"pageout" objects have a shadow, but
6293		 	*	maintain a "paging reference" rather
6294			*	than a normal reference.
6295		 	*	(We are careful here to limit recursion.)
6296		 	*/
6297			shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
6298
6299			if ((vm_object_terminate(object) == KERN_SUCCESS)
6300					&& (shadow != VM_OBJECT_NULL)) {
6301				vm_object_deallocate(shadow);
6302			}
6303
6304			if(object_released++ == *count)
6305				return KERN_SUCCESS;
6306			goto try_again;
6307		}
6308	}
6309	vm_object_cache_unlock();
6310	*count  = object_released;
6311#else
6312	*count = 0;
6313#endif
6314	return KERN_SUCCESS;
6315}
6316
6317
6318
6319kern_return_t
6320memory_object_create_named(
6321	memory_object_t	pager,
6322	memory_object_offset_t	size,
6323	memory_object_control_t		*control)
6324{
6325	vm_object_t 		object;
6326	vm_object_hash_entry_t	entry;
6327	lck_mtx_t		*lck;
6328
6329	*control = MEMORY_OBJECT_CONTROL_NULL;
6330	if (pager == MEMORY_OBJECT_NULL)
6331		return KERN_INVALID_ARGUMENT;
6332
6333	lck = vm_object_hash_lock_spin(pager);
6334	entry = vm_object_hash_lookup(pager, FALSE);
6335
6336	if ((entry != VM_OBJECT_HASH_ENTRY_NULL) &&
6337			(entry->object != VM_OBJECT_NULL)) {
6338		if (entry->object->named == TRUE)
6339			panic("memory_object_create_named: caller already holds the right");
	}
6340	vm_object_hash_unlock(lck);
6341
6342	if ((object = vm_object_enter(pager, size, FALSE, FALSE, TRUE)) == VM_OBJECT_NULL) {
6343		return(KERN_INVALID_OBJECT);
6344	}
6345
6346	/* wait for object (if any) to be ready */
6347	if (object != VM_OBJECT_NULL) {
6348		vm_object_lock(object);
6349		object->named = TRUE;
6350		while (!object->pager_ready) {
6351			vm_object_sleep(object,
6352					VM_OBJECT_EVENT_PAGER_READY,
6353					THREAD_UNINT);
6354		}
6355		*control = object->pager_control;
6356		vm_object_unlock(object);
6357	}
6358	return (KERN_SUCCESS);
6359}
6360
6361
6362/*
6363 *	Routine:	memory_object_recover_named [user interface]
6364 *	Purpose:
6365 *		Attempt to recover a named reference for a VM object.
6366 *		VM will verify that the object has not already started
6367 *		down the termination path, and if it has, will optionally
6368 *		wait for that to finish.
6369 *	Returns:
6370 *		KERN_SUCCESS - we recovered a named reference on the object
6371 *		KERN_FAILURE - we could not recover a reference (object dead)
6372 *		KERN_INVALID_ARGUMENT - bad memory object control
6373 */
6374kern_return_t
6375memory_object_recover_named(
6376	memory_object_control_t	control,
6377	boolean_t		wait_on_terminating)
6378{
6379	vm_object_t		object;
6380
6381	object = memory_object_control_to_vm_object(control);
6382	if (object == VM_OBJECT_NULL) {
6383		return (KERN_INVALID_ARGUMENT);
6384	}
6385restart:
6386	vm_object_lock(object);
6387
6388	if (object->terminating && wait_on_terminating) {
6389		vm_object_wait(object,
6390			VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
6391			THREAD_UNINT);
6392		goto restart;
6393	}
6394
6395	if (!object->alive) {
6396		vm_object_unlock(object);
6397		return KERN_FAILURE;
6398	}
6399
6400	if (object->named == TRUE) {
6401		vm_object_unlock(object);
6402		return KERN_SUCCESS;
6403	}
6404#if VM_OBJECT_CACHE
6405	if ((object->ref_count == 0) && (!object->terminating)) {
6406		if (!vm_object_cache_lock_try()) {
6407			vm_object_unlock(object);
6408			goto restart;
6409		}
6410		queue_remove(&vm_object_cached_list, object,
6411				     vm_object_t, cached_list);
6412		vm_object_cached_count--;
6413		XPR(XPR_VM_OBJECT_CACHE,
6414		    "memory_object_recover_named: removing %X, head (%X, %X)\n",
6415		    object,
6416		    vm_object_cached_list.next,
6417		    vm_object_cached_list.prev, 0,0);
6418
6419		vm_object_cache_unlock();
6420	}
6421#endif
6422	object->named = TRUE;
6423	vm_object_lock_assert_exclusive(object);
6424	object->ref_count++;
6425	vm_object_res_reference(object);
6426	while (!object->pager_ready) {
6427		vm_object_sleep(object,
6428				VM_OBJECT_EVENT_PAGER_READY,
6429				THREAD_UNINT);
6430	}
6431	vm_object_unlock(object);
6432	return (KERN_SUCCESS);
6433}
6434
6435
6436/*
6437 *	vm_object_release_name:
6438 *
6439 *	Enforces name semantic on memory_object reference count decrement
6440 *	This routine should not be called unless the caller holds a name
6441 *	reference gained through the memory_object_create_named.
6442 *
6443 *	If the TERMINATE_IDLE flag is set, the call will fail unless the
6444 *	reference count is 1, i.e. the object is idle with the only remaining
6445 *	reference being the name.
6446 *	If the decision is made to proceed, the named flag is cleared and
6447 *	the reference count is decremented.  If the RESPECT_CACHE
6448 *	flag is set and the reference count has gone to zero, the
6449 *	memory_object is checked to see if it is cacheable; otherwise, when
6450 *	the reference count reaches zero, it is simply terminated.
6451 */
6452
6453__private_extern__ kern_return_t
6454vm_object_release_name(
6455	vm_object_t	object,
6456	int		flags)
6457{
6458	vm_object_t	shadow;
6459	boolean_t	original_object = TRUE;
6460
6461	while (object != VM_OBJECT_NULL) {
6462
6463		vm_object_lock(object);
6464
6465		assert(object->alive);
6466		if (original_object)
6467			assert(object->named);
6468		assert(object->ref_count > 0);
6469
6470		/*
6471		 *	We have to wait for initialization before
6472		 *	destroying or caching the object.
6473		 */
6474
6475		if (object->pager_created && !object->pager_initialized) {
6476			assert(!object->can_persist);
6477			vm_object_assert_wait(object,
6478					VM_OBJECT_EVENT_INITIALIZED,
6479					THREAD_UNINT);
6480			vm_object_unlock(object);
6481			thread_block(THREAD_CONTINUE_NULL);
6482			continue;
6483		}
6484
6485		if (((object->ref_count > 1)
6486			&& (flags & MEMORY_OBJECT_TERMINATE_IDLE))
6487			|| (object->terminating)) {
6488			vm_object_unlock(object);
6489			return KERN_FAILURE;
6490		} else {
6491			if (flags & MEMORY_OBJECT_RELEASE_NO_OP) {
6492				vm_object_unlock(object);
6493				return KERN_SUCCESS;
6494			}
6495		}
6496
6497		if ((flags & MEMORY_OBJECT_RESPECT_CACHE) &&
6498					(object->ref_count == 1)) {
6499			if (original_object)
6500				object->named = FALSE;
6501			vm_object_unlock(object);
6502			/* let vm_object_deallocate push this thing into */
6503			/* the cache, if that is where it is bound */
6504			vm_object_deallocate(object);
6505			return KERN_SUCCESS;
6506		}
6507		VM_OBJ_RES_DECR(object);
6508		shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
6509
6510		if (object->ref_count == 1) {
6511			if (vm_object_terminate(object) != KERN_SUCCESS) {
6512				if (original_object) {
6513					return KERN_FAILURE;
6514				} else {
6515					return KERN_SUCCESS;
6516				}
6517			}
6518			if (shadow != VM_OBJECT_NULL) {
6519				original_object = FALSE;
6520				object = shadow;
6521				continue;
6522			}
6523			return KERN_SUCCESS;
6524		} else {
6525			vm_object_lock_assert_exclusive(object);
6526			object->ref_count--;
6527			assert(object->ref_count > 0);
6528			if(original_object)
6529				object->named = FALSE;
6530			vm_object_unlock(object);
6531			return KERN_SUCCESS;
6532		}
6533	}
6534	/*NOTREACHED*/
6535	assert(0);
6536	return KERN_FAILURE;
6537}
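
/*
 * Illustrative sketch (hypothetical caller, not compiled): dropping a name
 * reference gained through memory_object_create_named(), but only if the
 * object is otherwise idle, and letting the cache decide its fate once the
 * last reference goes away.
 *
 *	kern_return_t kr;
 *
 *	kr = vm_object_release_name(object,
 *				    MEMORY_OBJECT_TERMINATE_IDLE |
 *				    MEMORY_OBJECT_RESPECT_CACHE);
 *	if (kr == KERN_FAILURE) {
 *		the object still had other references (or was
 *		terminating); the name reference was left intact
 *	}
 */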
6538
6539
6540__private_extern__ kern_return_t
6541vm_object_lock_request(
6542	vm_object_t			object,
6543	vm_object_offset_t		offset,
6544	vm_object_size_t		size,
6545	memory_object_return_t		should_return,
6546	int				flags,
6547	vm_prot_t			prot)
6548{
6549	__unused boolean_t	should_flush;
6550
6551	should_flush = flags & MEMORY_OBJECT_DATA_FLUSH;
6552
6553        XPR(XPR_MEMORY_OBJECT,
6554	    "vm_o_lock_request, obj 0x%X off 0x%X size 0x%X flags %X prot %X\n",
6555	    object, offset, size,
6556 	    (((should_return&1)<<1)|should_flush), prot);
6557
6558	/*
6559	 *	Check for bogus arguments.
6560	 */
6561	if (object == VM_OBJECT_NULL)
6562		return (KERN_INVALID_ARGUMENT);
6563
6564	if ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE)
6565		return (KERN_INVALID_ARGUMENT);
6566
6567	size = round_page_64(size);
6568
6569	/*
6570	 *	Lock the object, and acquire a paging reference to
6571	 *	prevent the memory_object reference from being released.
6572	 */
6573	vm_object_lock(object);
6574	vm_object_paging_begin(object);
6575
6576	(void)vm_object_update(object,
6577		offset, size, NULL, NULL, should_return, flags, prot);
6578
6579	vm_object_paging_end(object);
6580	vm_object_unlock(object);
6581
6582	return (KERN_SUCCESS);
6583}
6584
6585/*
6586 * Empty a purgeable object by grabbing the physical pages assigned to it and
6587 * putting them on the free queue without writing them to backing store, etc.
6588 * When the pages are next touched they will be demand zero-fill pages.  We
6589 * skip pages which are busy, being paged in/out, wired, etc.  We do _not_
6590 * skip referenced/dirty pages, pages on the active queue, etc.  We're more
6591 * than happy to grab these since this is a purgeable object.  We mark the
6592 * object as "empty" after reaping its pages.
6593 *
6594 * On entry the object must be locked and it must be
6595 * purgeable with no delayed copies pending.
6596 */
6597void
6598vm_object_purge(vm_object_t object, int flags)
6599{
6600        vm_object_lock_assert_exclusive(object);
6601
6602	if (object->purgable == VM_PURGABLE_DENY)
6603		return;
6604
6605	assert(object->copy == VM_OBJECT_NULL);
6606	assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6607
6608	/*
6609	 * We need to set the object's state to VM_PURGABLE_EMPTY *before*
6610	 * reaping its pages.  We update vm_page_purgeable_count in bulk
6611	 * and we don't want vm_page_remove() to update it again for each
6612	 * page we reap later.
6613	 *
6614	 * For the purgeable ledgers, pages from VOLATILE and EMPTY objects
6615	 * are all accounted for in the "volatile" ledgers, so this does not
6616	 * make any difference.
6617	 * If we transitioned directly from NONVOLATILE to EMPTY,
6618	 * vm_page_purgeable_count must have been updated when the object
6619	 * was dequeued from its volatile queue and the purgeable ledgers
6620	 * must have also been updated accordingly at that time (in
6621	 * vm_object_purgable_control()).
6622	 */
6623	if (object->purgable == VM_PURGABLE_VOLATILE) {
6624		unsigned int delta;
6625		assert(object->resident_page_count >=
6626		       object->wired_page_count);
6627		delta = (object->resident_page_count -
6628			 object->wired_page_count);
6629		if (delta != 0) {
6630			assert(vm_page_purgeable_count >=
6631			       delta);
6632			OSAddAtomic(-delta,
6633				    (SInt32 *)&vm_page_purgeable_count);
6634		}
6635		if (object->wired_page_count != 0) {
6636			assert(vm_page_purgeable_wired_count >=
6637			       object->wired_page_count);
6638			OSAddAtomic(-object->wired_page_count,
6639				    (SInt32 *)&vm_page_purgeable_wired_count);
6640		}
6641		object->purgable = VM_PURGABLE_EMPTY;
6642	}
6643	assert(object->purgable == VM_PURGABLE_EMPTY);
6644
6645	vm_object_reap_pages(object, REAP_PURGEABLE);
6646
6647	if (object->pager != NULL &&
6648	    COMPRESSED_PAGER_IS_ACTIVE) {
6649		unsigned int pgcount;
6650
6651		if (object->activity_in_progress == 0 &&
6652		    object->paging_in_progress == 0) {
6653			/*
6654			 * Also reap any memory coming from this object
6655			 * in the VM compressor.
6656			 *
6657			 * There are no operations in progress on the VM object
6658			 * and no operation can start while we're holding the
6659			 * VM object lock, so it's safe to reap the compressed
6660			 * pages and update the page counts.
6661			 */
6662			pgcount = vm_compressor_pager_get_count(object->pager);
6663			if (pgcount) {
6664				pgcount = vm_compressor_pager_reap_pages(object->pager, flags);
6665				vm_compressor_pager_count(object->pager,
6666							  -pgcount,
6667							  FALSE, /* shared */
6668							  object);
6669				vm_purgeable_compressed_update(object,
6670							       -pgcount);
6671			}
6672			if ( !(flags & C_DONT_BLOCK)) {
6673				assert(vm_compressor_pager_get_count(object->pager)
6674				       == 0);
6675			}
6676		} else {
6677			/*
6678			 * There's some kind of paging activity in progress
6679			 * for this object, which could result in a page
6680			 * being compressed or decompressed, possibly while
6681			 * the VM object is not locked, so it could race
6682			 * with us.
6683			 *
6684			 * We can't really synchronize this without possibly
6685			 * causing a deadlock when the compressor needs to
6686			 * allocate or free memory while compressing or
6687			 * decompressing a page from a purgeable object
6688			 * mapped in the kernel_map...
6689			 *
6690			 * So let's not attempt to purge the compressor
6691			 * pager if there's any kind of operation in
6692			 * progress on the VM object.
6693			 */
6694		}
6695	}
6696
6697	vm_object_lock_assert_exclusive(object);
6698}
6699
6700
6701/*
6702 * vm_object_purgeable_control() allows the caller to control and investigate the
6703 * state of a purgeable object.  A purgeable object is created via a call to
6704 * vm_allocate() with VM_FLAGS_PURGABLE specified.  A purgeable object will
6705 * never be coalesced with any other object -- even other purgeable objects --
6706 * and will thus always remain a distinct object.  A purgeable object has
6707 * special semantics when its reference count is exactly 1.  If its reference
6708 * count is greater than 1, then a purgeable object will behave like a normal
6709 * object and attempts to use this interface will result in an error return
6710 * of KERN_INVALID_ARGUMENT.
6711 *
6712 * A purgeable object may be put into a "volatile" state which will make the
6713 * object's pages eligible for being reclaimed without paging to backing
6714 * store if the system runs low on memory.  If the pages in a volatile
6715 * purgeable object are reclaimed, the purgeable object is said to have been
6716 * "emptied."  When a purgeable object is emptied the system will reclaim as
6717 * many pages from the object as it can in a convenient manner (pages already
6718 * en route to backing store or busy for other reasons are left as is).  When
6719 * a purgeable object is made volatile, its pages will generally be reclaimed
6720 * before other pages in the application's working set.  This semantic is
6721 * generally used by applications which can recreate the data in the object
6722 * faster than it can be paged in.  One such example might be media assets
6723 * which can be reread from a much faster RAID volume.
6724 *
6725 * A purgeable object may be designated as "non-volatile" which means it will
6726 * behave like all other objects in the system with pages being written to and
6727 * read from backing store as needed to satisfy system memory needs.  If the
6728 * object was emptied before the object was made non-volatile, that fact will
6729 * be returned as the old state of the purgeable object (see
6730 * VM_PURGABLE_SET_STATE below).  In this case, any pages of the object which
6731 * were reclaimed as part of emptying the object will be refaulted in as
6732 * zero-fill on demand.  It is up to the application to note that an object
6733 * was emptied and recreate the object's contents if necessary.  When a
6734 * purgeable object is made non-volatile, its pages will generally not be paged
6735 * out to backing store in the immediate future.  A purgeable object may also
6736 * be manually emptied.
6737 *
6738 * Finally, the current state (non-volatile, volatile, volatile & empty) of a
6739 * volatile purgeable object may be queried at any time.  This information may
6740 * be used as a control input to let the application know when the system is
6741 * experiencing memory pressure and is reclaiming memory.
6742 *
6743 * The specified address may be any address within the purgeable object.  If
6744 * the specified address does not represent any object in the target task's
6745 * virtual address space, then KERN_INVALID_ADDRESS will be returned.  If the
6746 * object containing the specified address is not a purgeable object, then
6747 * KERN_INVALID_ARGUMENT will be returned.  Otherwise, KERN_SUCCESS will be
6748 * returned.
6749 *
6750 * The control parameter may be any one of VM_PURGABLE_SET_STATE or
6751 * VM_PURGABLE_GET_STATE.  For VM_PURGABLE_SET_STATE, the in/out parameter
6752 * state is used to set the new state of the purgeable object and return its
6753 * old state.  For VM_PURGABLE_GET_STATE, the current state of the purgeable
6754 * object is returned in the parameter state.
6755 *
6756 * The in/out parameter state may be one of VM_PURGABLE_NONVOLATILE,
6757 * VM_PURGABLE_VOLATILE or VM_PURGABLE_EMPTY.  These, respectively, represent
6758 * the non-volatile, volatile and volatile/empty states described above.
6759 * Setting the state of a purgeable object to VM_PURGABLE_EMPTY will
6760 * immediately reclaim as many pages in the object as can be conveniently
6761 * collected (some may have already been written to backing store or be
6762 * otherwise busy).
6763 *
6764 * The process of making a purgeable object non-volatile and determining its
6765 * previous state is atomic.  Thus, if a purgeable object is made
6766 * VM_PURGABLE_NONVOLATILE and the old state is returned as
6767 * VM_PURGABLE_VOLATILE, then the purgeable object's previous contents are
6768 * completely intact and will remain so until the object is made volatile
6769 * again.  If the old state is returned as VM_PURGABLE_EMPTY then the object
6770 * was reclaimed while it was in a volatile state and its previous contents
6771 * have been lost.
6772 */
6773/*
6774 * The object must be locked.
6775 */
6776kern_return_t
6777vm_object_purgable_control(
6778	vm_object_t	object,
6779	vm_purgable_t	control,
6780	int		*state)
6781{
6782	int		old_state;
6783	int		new_state;
6784
6785	if (object == VM_OBJECT_NULL) {
6786		/*
6787		 * Object must already be present or it can't be purgeable.
6788		 */
6789		return KERN_INVALID_ARGUMENT;
6790	}
6791
6792	vm_object_lock_assert_exclusive(object);
6793
6794	/*
6795	 * Get current state of the purgeable object.
6796	 */
6797	old_state = object->purgable;
6798	if (old_state == VM_PURGABLE_DENY)
6799		return KERN_INVALID_ARGUMENT;
6800
6801	/* purgeable objects can't have delayed copies - now or in the future */
6802	assert(object->copy == VM_OBJECT_NULL);
6803	assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6804
6805	/*
6806	 * Execute the desired operation.
6807	 */
6808	if (control == VM_PURGABLE_GET_STATE) {
6809		*state = old_state;
6810		return KERN_SUCCESS;
6811	}
6812
6813	if ((*state) & VM_PURGABLE_DEBUG_EMPTY) {
6814		object->volatile_empty = TRUE;
6815	}
6816	if ((*state) & VM_PURGABLE_DEBUG_FAULT) {
6817		object->volatile_fault = TRUE;
6818	}
6819
6820	new_state = *state & VM_PURGABLE_STATE_MASK;
6821	if (new_state == VM_PURGABLE_VOLATILE &&
6822	    object->volatile_empty) {
6823		new_state = VM_PURGABLE_EMPTY;
6824	}
6825
6826	switch (new_state) {
6827	case VM_PURGABLE_DENY:
6828	case VM_PURGABLE_NONVOLATILE:
6829		object->purgable = new_state;
6830
6831		if (old_state == VM_PURGABLE_VOLATILE) {
6832			unsigned int delta;
6833
6834			assert(object->resident_page_count >=
6835			       object->wired_page_count);
6836			delta = (object->resident_page_count -
6837				 object->wired_page_count);
6838
6839			assert(vm_page_purgeable_count >= delta);
6840
6841			if (delta != 0) {
6842				OSAddAtomic(-delta,
6843					    (SInt32 *)&vm_page_purgeable_count);
6844			}
6845			if (object->wired_page_count != 0) {
6846				assert(vm_page_purgeable_wired_count >=
6847				       object->wired_page_count);
6848				OSAddAtomic(-object->wired_page_count,
6849					    (SInt32 *)&vm_page_purgeable_wired_count);
6850			}
6851
6852			vm_page_lock_queues();
6853
6854			/* object should be on a queue */
6855			assert(object->objq.next != NULL &&
6856			       object->objq.prev != NULL);
6857			purgeable_q_t queue;
6858
6859			/*
6860			 * Move object from its volatile queue to the
6861			 * non-volatile queue...
6862			 */
6863			queue = vm_purgeable_object_remove(object);
6864			assert(queue);
6865
6866			if (object->purgeable_when_ripe) {
6867				vm_purgeable_token_delete_last(queue);
6868			}
6869			assert(queue->debug_count_objects>=0);
6870
6871			vm_page_unlock_queues();
6872		}
6873		if (old_state == VM_PURGABLE_VOLATILE ||
6874		    old_state == VM_PURGABLE_EMPTY) {
6875			/*
6876			 * Transfer the object's pages from the volatile to
6877			 * non-volatile ledgers.
6878			 */
6879			vm_purgeable_accounting(object, VM_PURGABLE_VOLATILE,
6880						FALSE);
6881		}
6882
6883		break;
6884
6885	case VM_PURGABLE_VOLATILE:
6886		if (object->volatile_fault) {
6887			vm_page_t	p;
6888			int		refmod;
6889
6890			queue_iterate(&object->memq, p, vm_page_t, listq) {
6891				if (p->busy ||
6892				    VM_PAGE_WIRED(p) ||
6893				    p->fictitious) {
6894					continue;
6895				}
6896				refmod = pmap_disconnect(p->phys_page);
6897				if ((refmod & VM_MEM_MODIFIED) &&
6898				    !p->dirty) {
6899					SET_PAGE_DIRTY(p, FALSE);
6900				}
6901			}
6902		}
6903
6904		if (old_state == VM_PURGABLE_EMPTY &&
6905		    object->resident_page_count == 0 &&
6906		    object->pager == NULL)
6907			break;
6908
6909		purgeable_q_t queue;
6910
6911		/* find the correct queue */
6912		if ((*state&VM_PURGABLE_ORDERING_MASK) == VM_PURGABLE_ORDERING_OBSOLETE)
6913		        queue = &purgeable_queues[PURGEABLE_Q_TYPE_OBSOLETE];
6914		else {
6915		        if ((*state&VM_PURGABLE_BEHAVIOR_MASK) == VM_PURGABLE_BEHAVIOR_FIFO)
6916			        queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO];
6917			else
6918			        queue = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO];
6919		}
6920
6921		if (old_state == VM_PURGABLE_NONVOLATILE ||
6922		    old_state == VM_PURGABLE_EMPTY) {
6923			unsigned int delta;
6924
6925			if ((*state & VM_PURGABLE_NO_AGING_MASK) ==
6926			    VM_PURGABLE_NO_AGING) {
6927				object->purgeable_when_ripe = FALSE;
6928			} else {
6929				object->purgeable_when_ripe = TRUE;
6930			}
6931
6932			if (object->purgeable_when_ripe) {
6933				kern_return_t result;
6934
6935				/* try to add token... this can fail */
6936				vm_page_lock_queues();
6937
6938				result = vm_purgeable_token_add(queue);
6939				if (result != KERN_SUCCESS) {
6940					vm_page_unlock_queues();
6941					return result;
6942				}
6943				vm_page_unlock_queues();
6944			}
6945
6946			assert(object->resident_page_count >=
6947			       object->wired_page_count);
6948			delta = (object->resident_page_count -
6949				 object->wired_page_count);
6950
6951			if (delta != 0) {
6952				OSAddAtomic(delta,
6953					    &vm_page_purgeable_count);
6954			}
6955			if (object->wired_page_count != 0) {
6956				OSAddAtomic(object->wired_page_count,
6957					    &vm_page_purgeable_wired_count);
6958			}
6959
6960			object->purgable = new_state;
6961
6962			/* object should be on "non-volatile" queue */
6963			assert(object->objq.next != NULL);
6964			assert(object->objq.prev != NULL);
6965		}
6966		else if (old_state == VM_PURGABLE_VOLATILE) {
6967			purgeable_q_t	old_queue;
6968			boolean_t	purgeable_when_ripe;
6969
6970		        /*
6971			 * If we are reassigning priorities / purgeable groups, we don't change
6972			 * the token queue, so moving priorities will not make pages stay around
6973			 * longer.  The reasoning is that the algorithm gives most priority to
6974			 * the most important object; if a new token is added, that object's
6975			 * priority is boosted.  This already biases the system towards purgeable
6976			 * queues that move a lot, so more biasing isn't needed when no new object is added.
6977			 */
6978		        assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */
6979
6980			old_queue = vm_purgeable_object_remove(object);
6981			assert(old_queue);
6982
6983			if ((*state & VM_PURGABLE_NO_AGING_MASK) ==
6984			    VM_PURGABLE_NO_AGING) {
6985				purgeable_when_ripe = FALSE;
6986			} else {
6987				purgeable_when_ripe = TRUE;
6988			}
6989
6990			if (old_queue != queue ||
6991			    (purgeable_when_ripe !=
6992			     object->purgeable_when_ripe)) {
6993				kern_return_t result;
6994
6995			        /* Changing queue. Have to move token. */
6996			        vm_page_lock_queues();
6997				if (object->purgeable_when_ripe) {
6998					vm_purgeable_token_delete_last(old_queue);
6999				}
7000				object->purgeable_when_ripe = purgeable_when_ripe;
7001				if (object->purgeable_when_ripe) {
7002					result = vm_purgeable_token_add(queue);
7003					assert(result==KERN_SUCCESS);   /* this should never fail since we just freed a token */
7004				}
7005				vm_page_unlock_queues();
7006
7007			}
7008		}
7009		vm_purgeable_object_add(object, queue, (*state&VM_VOLATILE_GROUP_MASK)>>VM_VOLATILE_GROUP_SHIFT );
7010		if (old_state == VM_PURGABLE_NONVOLATILE) {
7011			vm_purgeable_accounting(object, VM_PURGABLE_NONVOLATILE,
7012						FALSE);
7013		}
7014
7015		assert(queue->debug_count_objects>=0);
7016
7017		break;
7018
7019
7020	case VM_PURGABLE_EMPTY:
7021		if (object->volatile_fault) {
7022			vm_page_t	p;
7023			int		refmod;
7024
7025			queue_iterate(&object->memq, p, vm_page_t, listq) {
7026				if (p->busy ||
7027				    VM_PAGE_WIRED(p) ||
7028				    p->fictitious) {
7029					continue;
7030				}
7031				refmod = pmap_disconnect(p->phys_page);
7032				if ((refmod & VM_MEM_MODIFIED) &&
7033				    !p->dirty) {
7034					SET_PAGE_DIRTY(p, FALSE);
7035				}
7036			}
7037		}
7038
7039		if (old_state == new_state) {
7040			/* nothing changes */
7041			break;
7042		}
7043
7044		assert(old_state == VM_PURGABLE_NONVOLATILE ||
7045		       old_state == VM_PURGABLE_VOLATILE);
7046		if (old_state == VM_PURGABLE_VOLATILE) {
7047			purgeable_q_t old_queue;
7048
7049			/* object should be on a queue */
7050			assert(object->objq.next != NULL &&
7051			       object->objq.prev != NULL);
7052
7053			old_queue = vm_purgeable_object_remove(object);
7054			assert(old_queue);
7055			if (object->purgeable_when_ripe) {
7056				vm_page_lock_queues();
7057				vm_purgeable_token_delete_first(old_queue);
7058				vm_page_unlock_queues();
7059			}
7060		}
7061
7062		if (old_state == VM_PURGABLE_NONVOLATILE) {
7063			/*
7064			 * This object's pages were previously accounted as
7065			 * "non-volatile" and now need to be accounted as
7066			 * "volatile".
7067			 */
7068			vm_purgeable_accounting(object, VM_PURGABLE_NONVOLATILE,
7069						FALSE);
7070			/*
7071			 * Set to VM_PURGABLE_EMPTY because the pages are no
7072			 * longer accounted in the "non-volatile" ledger
7073			 * and are also not accounted for in
7074			 * "vm_page_purgeable_count".
7075			 */
7076			object->purgable = VM_PURGABLE_EMPTY;
7077		}
7078
7079		(void) vm_object_purge(object, 0);
7080		assert(object->purgable == VM_PURGABLE_EMPTY);
7081
7082		break;
7083	}
7084
7085	*state = old_state;
7086
7087	vm_object_lock_assert_exclusive(object);
7088
7089	return KERN_SUCCESS;
7090}
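
/*
 * Illustrative sketch (not compiled): making a purgeable object volatile
 * with FIFO aging and reading back its previous state.  The caller must
 * hold the object locked "exclusive" across the call, as asserted above.
 *
 *	int		state;
 *	kern_return_t	kr;
 *
 *	state = VM_PURGABLE_VOLATILE | VM_PURGABLE_BEHAVIOR_FIFO;
 *	vm_object_lock(object);
 *	kr = vm_object_purgable_control(object,
 *					VM_PURGABLE_SET_STATE,
 *					&state);
 *	vm_object_unlock(object);
 *
 * On success, "state" holds the old state (e.g. VM_PURGABLE_NONVOLATILE,
 * or VM_PURGABLE_EMPTY if the contents were already reclaimed).
 */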
7091
7092kern_return_t
7093vm_object_get_page_counts(
7094	vm_object_t		object,
7095	vm_object_offset_t	offset,
7096	vm_object_size_t	size,
7097	unsigned int		*resident_page_count,
7098	unsigned int		*dirty_page_count)
7099{
7100
7101	kern_return_t		kr = KERN_SUCCESS;
7102	boolean_t		count_dirty_pages = FALSE;
7103	vm_page_t		p = VM_PAGE_NULL;
7104	unsigned int 		local_resident_count = 0;
7105	unsigned int		local_dirty_count = 0;
7106	vm_object_offset_t	cur_offset = 0;
7107	vm_object_offset_t	end_offset = 0;
7108
7109	if (object == VM_OBJECT_NULL)
7110		return KERN_INVALID_ARGUMENT;
7111
7112
7113	cur_offset = offset;
7114
7115	end_offset = offset + size;
7116
7117	vm_object_lock_assert_exclusive(object);
7118
7119	if (dirty_page_count != NULL) {
7120
7121		count_dirty_pages = TRUE;
7122	}
7123
7124	if (resident_page_count != NULL && count_dirty_pages == FALSE) {
7125		/*
7126		 * Fast path when:
7127		 * - we only want the resident page count, and,
7128		 * - the entire object is exactly covered by the request.
7129		 */
7130		if (offset == 0 && (object->vo_size == size)) {
7131
7132			*resident_page_count = object->resident_page_count;
7133			goto out;
7134		}
7135	}
7136
7137	if (object->resident_page_count <= (size >> PAGE_SHIFT)) {
7138
7139		queue_iterate(&object->memq, p, vm_page_t, listq) {
7140
7141			if (p->offset >= cur_offset && p->offset < end_offset) {
7142
7143				local_resident_count++;
7144
7145				if (count_dirty_pages) {
7146
7147					if (p->dirty || (p->wpmapped && pmap_is_modified(p->phys_page))) {
7148
7149						local_dirty_count++;
7150					}
7151				}
7152			}
7153		}
7154	} else {
7155
7156		for (cur_offset = offset; cur_offset < end_offset; cur_offset += PAGE_SIZE_64) {
7157
7158			p = vm_page_lookup(object, cur_offset);
7159
7160			if (p != VM_PAGE_NULL) {
7161
7162				local_resident_count++;
7163
7164				if (count_dirty_pages) {
7165
7166					if (p->dirty || (p->wpmapped && pmap_is_modified(p->phys_page))) {
7167
7168						local_dirty_count++;
7169					}
7170				}
7171			}
7172		}
7173
7174	}
7175
7176	if (resident_page_count != NULL) {
7177		*resident_page_count = local_resident_count;
7178	}
7179
7180	if (dirty_page_count != NULL) {
7181		*dirty_page_count = local_dirty_count;
7182	}
7183
7184out:
7185	return kr;
7186}
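
/*
 * Illustrative sketch (hypothetical caller, not compiled): counting the
 * resident and dirty pages covering a range of an object.  The object
 * must be locked "exclusive", per the assertion above; either count
 * pointer may be NULL if that count is not wanted.
 *
 *	unsigned int	resident, dirty;
 *	kern_return_t	kr;
 *
 *	vm_object_lock(object);
 *	kr = vm_object_get_page_counts(object, offset, size,
 *				       &resident, &dirty);
 *	vm_object_unlock(object);
 */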
7187
7188
7189#if	TASK_SWAPPER
7190/*
7191 * vm_object_res_deallocate
7192 *
7193 * (recursively) decrement residence counts on vm objects and their shadows.
7194 * Called from vm_object_deallocate and when swapping out an object.
7195 *
7196 * The object is locked, and remains locked throughout the function,
7197 * even as we iterate down the shadow chain.  Locks on intermediate objects
7198 * will be dropped, but not the original object.
7199 *
7200 * NOTE: this function used to use recursion, rather than iteration.
7201 */
7202
7203__private_extern__ void
7204vm_object_res_deallocate(
7205	vm_object_t	object)
7206{
7207	vm_object_t orig_object = object;
7208	/*
7209	 * Object is locked so it can be called directly
7210	 * from vm_object_deallocate.  Original object is never
7211	 * unlocked.
7212	 */
7213	assert(object->res_count > 0);
7214	while  (--object->res_count == 0) {
7215		assert(object->ref_count >= object->res_count);
7216		vm_object_deactivate_all_pages(object);
7217		/* iterate on shadow, if present */
7218		if (object->shadow != VM_OBJECT_NULL) {
7219			vm_object_t tmp_object = object->shadow;
7220			vm_object_lock(tmp_object);
7221			if (object != orig_object)
7222				vm_object_unlock(object);
7223			object = tmp_object;
7224			assert(object->res_count > 0);
7225		} else
7226			break;
7227	}
7228	if (object != orig_object)
7229		vm_object_unlock(object);
7230}
7231
7232/*
7233 * vm_object_res_reference
7234 *
7235 * Internal function to increment residence count on a vm object
7236 * and its shadows.  It is called only from vm_object_reference, and
7237 * when swapping in a vm object, via vm_map_swap.
7238 *
7239 * The object is locked, and remains locked throughout the function,
7240 * even as we iterate down the shadow chain.  Locks on intermediate objects
7241 * will be dropped, but not the original object.
7242 *
7243 * NOTE: this function used to use recursion, rather than iteration.
7244 */
7245
7246__private_extern__ void
7247vm_object_res_reference(
7248	vm_object_t	object)
7249{
7250	vm_object_t orig_object = object;
7251	/*
7252	 * Object is locked, so this can be called directly
7253	 * from vm_object_reference.  This lock is never released.
7254	 */
7255	while  ((++object->res_count == 1)  &&
7256		(object->shadow != VM_OBJECT_NULL)) {
7257		vm_object_t tmp_object = object->shadow;
7258
7259		assert(object->ref_count >= object->res_count);
7260		vm_object_lock(tmp_object);
7261		if (object != orig_object)
7262			vm_object_unlock(object);
7263		object = tmp_object;
7264	}
7265	if (object != orig_object)
7266		vm_object_unlock(object);
7267	assert(orig_object->ref_count >= orig_object->res_count);
7268}
7269#endif	/* TASK_SWAPPER */
7270
7271/*
7272 *	vm_object_reference:
7273 *
7274 *	Gets another reference to the given object.
7275 */
7276#ifdef vm_object_reference
7277#undef vm_object_reference
7278#endif
7279__private_extern__ void
7280vm_object_reference(
7281	register vm_object_t	object)
7282{
7283	if (object == VM_OBJECT_NULL)
7284		return;
7285
7286	vm_object_lock(object);
7287	assert(object->ref_count > 0);
7288	vm_object_reference_locked(object);
7289	vm_object_unlock(object);
7290}
7291
7292#ifdef MACH_BSD
7293/*
7294 * Scale the vm_object_cache
7295 * This is required to make sure that the vm_object_cache is big
7296 * enough to effectively cache the mapped file.
7297 * This is really important with UBC as all the regular file vnodes
7298 * have a memory object associated with them. Having this cache too
7299 * small results in rapid reclaim of vnodes and hurts performance a LOT!
7300 *
7301 * This is also needed as the number of vnodes can be dynamically scaled.
7302 */
7303kern_return_t
7304adjust_vm_object_cache(
7305	__unused vm_size_t oval,
7306	__unused vm_size_t nval)
7307{
7308#if VM_OBJECT_CACHE
7309	vm_object_cached_max = nval;
7310	vm_object_cache_trim(FALSE);
7311#endif
7312	return (KERN_SUCCESS);
7313}
7314#endif /* MACH_BSD */
7315
7316
7317/*
7318 * vm_object_transpose
7319 *
7320 * This routine takes two VM objects of the same size and exchanges
7321 * their backing store.
7322 * The objects should be "quiesced" via a UPL operation with UPL_SET_IO_WIRE
7323 * and UPL_BLOCK_ACCESS if they are referenced anywhere.
7324 *
7325 * The VM objects must not be locked by caller.
7326 */
7327unsigned int vm_object_transpose_count = 0;
7328kern_return_t
7329vm_object_transpose(
7330	vm_object_t		object1,
7331	vm_object_t		object2,
7332	vm_object_size_t	transpose_size)
7333{
7334	vm_object_t		tmp_object;
7335	kern_return_t		retval;
7336	boolean_t		object1_locked, object2_locked;
7337	vm_page_t		page;
7338	vm_object_offset_t	page_offset;
7339	lck_mtx_t		*hash_lck;
7340	vm_object_hash_entry_t	hash_entry;
7341
7342	tmp_object = VM_OBJECT_NULL;
7343	object1_locked = FALSE; object2_locked = FALSE;
7344
7345	if (object1 == object2 ||
7346	    object1 == VM_OBJECT_NULL ||
7347	    object2 == VM_OBJECT_NULL) {
7348		/*
7349		 * If the 2 VM objects are the same, or either one is
7350		 * missing, there's no point in exchanging their backing store.
7351		 */
7352		retval = KERN_INVALID_VALUE;
7353		goto done;
7354	}
7355
7356	/*
7357	 * Since we need to lock both objects at the same time,
7358	 * make sure we always lock them in the same order to
7359	 * avoid deadlocks.
7360	 */
7361	if (object1 >  object2) {
7362		tmp_object = object1;
7363		object1 = object2;
7364		object2 = tmp_object;
7365	}
7366
7367	/*
7368	 * Allocate a temporary VM object to hold object1's contents
7369	 * while we copy object2 to object1.
7370	 */
7371	tmp_object = vm_object_allocate(transpose_size);
7372	vm_object_lock(tmp_object);
7373	tmp_object->can_persist = FALSE;
7374
7375
7376	/*
7377	 * Grab control of the 1st VM object.
7378	 */
7379	vm_object_lock(object1);
7380	object1_locked = TRUE;
7381	if (!object1->alive || object1->terminating ||
7382	    object1->copy || object1->shadow || object1->shadowed ||
7383	    object1->purgable != VM_PURGABLE_DENY) {
7384		/*
7385		 * We don't deal with copy or shadow objects (yet).
7386		 */
7387		retval = KERN_INVALID_VALUE;
7388		goto done;
7389	}
7390	/*
7391	 * We're about to mess with the object's backing store and
7392	 * taking a "paging_in_progress" reference wouldn't be enough
7393	 * to prevent any paging activity on this object, so the caller should
7394	 * have "quiesced" the objects beforehand, via a UPL operation with
7395	 * UPL_SET_IO_WIRE (to make sure all the pages are there and wired)
7396	 * and UPL_BLOCK_ACCESS (to mark the pages "busy").
7397	 *
7398	 * Wait for any paging operation to complete (but only paging, not
7399	 * other kinds of activity not linked to the pager).  After we're
7400	 * satisfied that there's no more paging in progress, we keep the
7401	 * object locked, to guarantee that no one tries to access its pager.
7402	 */
7403	vm_object_paging_only_wait(object1, THREAD_UNINT);
7404
7405	/*
7406	 * Same as above for the 2nd object...
7407	 */
7408	vm_object_lock(object2);
7409	object2_locked = TRUE;
7410	if (! object2->alive || object2->terminating ||
7411	    object2->copy || object2->shadow || object2->shadowed ||
7412	    object2->purgable != VM_PURGABLE_DENY) {
7413		retval = KERN_INVALID_VALUE;
7414		goto done;
7415	}
7416	vm_object_paging_only_wait(object2, THREAD_UNINT);
7417
7418
7419	if (object1->vo_size != object2->vo_size ||
7420	    object1->vo_size != transpose_size) {
7421		/*
7422		 * If the 2 objects don't have the same size, we can't
7423		 * exchange their backing stores or one would overflow.
7424		 * If their size doesn't match the caller's
7425		 * "transpose_size", we can't do it either because the
7426		 * transpose operation will affect the entire span of
7427		 * the objects.
7428		 */
7429		retval = KERN_INVALID_VALUE;
7430		goto done;
7431	}
7432
7433
7434	/*
7435	 * Transpose the lists of resident pages.
7436	 * This also updates the resident_page_count and the memq_hint.
7437	 */
7438	if (object1->phys_contiguous || queue_empty(&object1->memq)) {
7439		/*
7440		 * No pages in object1, just transfer pages
7441		 * from object2 to object1.  No need to go through
7442		 * an intermediate object.
7443		 */
7444		while (!queue_empty(&object2->memq)) {
7445			page = (vm_page_t) queue_first(&object2->memq);
7446			vm_page_rename(page, object1, page->offset, FALSE);
7447		}
7448		assert(queue_empty(&object2->memq));
7449	} else if (object2->phys_contiguous || queue_empty(&object2->memq)) {
7450		/*
7451		 * No pages in object2, just transfer pages
7452		 * from object1 to object2.  No need to go through
7453		 * an intermediate object.
7454		 */
7455		while (!queue_empty(&object1->memq)) {
7456			page = (vm_page_t) queue_first(&object1->memq);
7457			vm_page_rename(page, object2, page->offset, FALSE);
7458		}
7459		assert(queue_empty(&object1->memq));
7460	} else {
7461		/* transfer object1's pages to tmp_object */
7462		while (!queue_empty(&object1->memq)) {
7463			page = (vm_page_t) queue_first(&object1->memq);
7464			page_offset = page->offset;
7465			vm_page_remove(page, TRUE);
7466			page->offset = page_offset;
7467			queue_enter(&tmp_object->memq, page, vm_page_t, listq);
7468		}
7469		assert(queue_empty(&object1->memq));
7470		/* transfer object2's pages to object1 */
7471		while (!queue_empty(&object2->memq)) {
7472			page = (vm_page_t) queue_first(&object2->memq);
7473			vm_page_rename(page, object1, page->offset, FALSE);
7474		}
7475		assert(queue_empty(&object2->memq));
7476		/* transfer tmp_object's pages to object1 */
7477		while (!queue_empty(&tmp_object->memq)) {
7478			page = (vm_page_t) queue_first(&tmp_object->memq);
7479			queue_remove(&tmp_object->memq, page,
7480				     vm_page_t, listq);
7481			vm_page_insert(page, object2, page->offset);
7482		}
7483		assert(queue_empty(&tmp_object->memq));
7484	}
7485
7486#define __TRANSPOSE_FIELD(field)				\
7487MACRO_BEGIN							\
7488	tmp_object->field = object1->field;			\
7489	object1->field = object2->field;			\
7490	object2->field = tmp_object->field;			\
7491MACRO_END
7492
7493	/* "Lock" refers to the object not its contents */
7494	/* "size" should be identical */
7495	assert(object1->vo_size == object2->vo_size);
7496	/* "memq_hint" was updated above when transposing pages */
7497	/* "ref_count" refers to the object not its contents */
7498#if TASK_SWAPPER
7499	/* "res_count" refers to the object not its contents */
7500#endif
7501	/* "resident_page_count" was updated above when transposing pages */
7502	/* "wired_page_count" was updated above when transposing pages */
7503	/* "reusable_page_count" was updated above when transposing pages */
7504	/* there should be no "copy" */
7505	assert(!object1->copy);
7506	assert(!object2->copy);
7507	/* there should be no "shadow" */
7508	assert(!object1->shadow);
7509	assert(!object2->shadow);
7510	__TRANSPOSE_FIELD(vo_shadow_offset); /* used by phys_contiguous objects */
7511	__TRANSPOSE_FIELD(pager);
7512	__TRANSPOSE_FIELD(paging_offset);
7513	__TRANSPOSE_FIELD(pager_control);
7514	/* update the memory_objects' pointers back to the VM objects */
7515	if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
7516		memory_object_control_collapse(object1->pager_control,
7517					       object1);
7518	}
7519	if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
7520		memory_object_control_collapse(object2->pager_control,
7521					       object2);
7522	}
7523	__TRANSPOSE_FIELD(copy_strategy);
7524	/* "paging_in_progress" refers to the object not its contents */
7525	assert(!object1->paging_in_progress);
7526	assert(!object2->paging_in_progress);
7527	assert(object1->activity_in_progress);
7528	assert(object2->activity_in_progress);
7529	/* "all_wanted" refers to the object not its contents */
7530	__TRANSPOSE_FIELD(pager_created);
7531	__TRANSPOSE_FIELD(pager_initialized);
7532	__TRANSPOSE_FIELD(pager_ready);
7533	__TRANSPOSE_FIELD(pager_trusted);
7534	__TRANSPOSE_FIELD(can_persist);
7535	__TRANSPOSE_FIELD(internal);
7536	__TRANSPOSE_FIELD(temporary);
7537	__TRANSPOSE_FIELD(private);
7538	__TRANSPOSE_FIELD(pageout);
7539	/* "alive" should be set */
7540	assert(object1->alive);
7541	assert(object2->alive);
7542	/* "purgeable" should be non-purgeable */
7543	assert(object1->purgable == VM_PURGABLE_DENY);
7544	assert(object2->purgable == VM_PURGABLE_DENY);
7545	/* "shadowed" refers to the object not its contents */
7546	__TRANSPOSE_FIELD(purgeable_when_ripe);
7547	__TRANSPOSE_FIELD(advisory_pageout);
7548	__TRANSPOSE_FIELD(true_share);
7549	/* "terminating" should not be set */
7550	assert(!object1->terminating);
7551	assert(!object2->terminating);
7552	__TRANSPOSE_FIELD(named);
7553	/* "shadow_severed" refers to the object not its contents */
7554	__TRANSPOSE_FIELD(phys_contiguous);
7555	__TRANSPOSE_FIELD(nophyscache);
7556	/* "cached_list.next" points to transposed object */
7557	object1->cached_list.next = (queue_entry_t) object2;
7558	object2->cached_list.next = (queue_entry_t) object1;
7559	/* "cached_list.prev" should be NULL */
7560	assert(object1->cached_list.prev == NULL);
7561	assert(object2->cached_list.prev == NULL);
7562	/* "msr_q" is linked to the object not its contents */
7563	assert(queue_empty(&object1->msr_q));
7564	assert(queue_empty(&object2->msr_q));
7565	__TRANSPOSE_FIELD(last_alloc);
7566	__TRANSPOSE_FIELD(sequential);
7567	__TRANSPOSE_FIELD(pages_created);
7568	__TRANSPOSE_FIELD(pages_used);
7569	__TRANSPOSE_FIELD(scan_collisions);
7570#if MACH_PAGEMAP
7571	__TRANSPOSE_FIELD(existence_map);
7572#endif
7573	__TRANSPOSE_FIELD(cow_hint);
7574#if MACH_ASSERT
7575	__TRANSPOSE_FIELD(paging_object);
7576#endif
7577	__TRANSPOSE_FIELD(wimg_bits);
7578	__TRANSPOSE_FIELD(set_cache_attr);
7579	__TRANSPOSE_FIELD(code_signed);
7580	if (object1->hashed) {
7581		hash_lck = vm_object_hash_lock_spin(object2->pager);
7582		hash_entry = vm_object_hash_lookup(object2->pager, FALSE);
7583		assert(hash_entry != VM_OBJECT_HASH_ENTRY_NULL);
7584		hash_entry->object = object2;
7585		vm_object_hash_unlock(hash_lck);
7586	}
7587	if (object2->hashed) {
7588		hash_lck = vm_object_hash_lock_spin(object1->pager);
7589		hash_entry = vm_object_hash_lookup(object1->pager, FALSE);
7590		assert(hash_entry != VM_OBJECT_HASH_ENTRY_NULL);
7591		hash_entry->object = object1;
7592		vm_object_hash_unlock(hash_lck);
7593	}
7594	__TRANSPOSE_FIELD(hashed);
7595	object1->transposed = TRUE;
7596	object2->transposed = TRUE;
7597	__TRANSPOSE_FIELD(mapping_in_progress);
7598	__TRANSPOSE_FIELD(volatile_empty);
7599	__TRANSPOSE_FIELD(volatile_fault);
7600	__TRANSPOSE_FIELD(all_reusable);
7601	assert(object1->blocked_access);
7602	assert(object2->blocked_access);
7603	assert(object1->__object2_unused_bits == 0);
7604	assert(object2->__object2_unused_bits == 0);
7605#if UPL_DEBUG
7606	/* "uplq" refers to the object not its contents (see upl_transpose()) */
7607#endif
7608	assert(object1->objq.next == NULL);
7609	assert(object1->objq.prev == NULL);
7610	assert(object2->objq.next == NULL);
7611	assert(object2->objq.prev == NULL);
7612
7613#undef __TRANSPOSE_FIELD
7614
7615	retval = KERN_SUCCESS;
7616
7617done:
7618	/*
7619	 * Cleanup.
7620	 */
7621	if (tmp_object != VM_OBJECT_NULL) {
7622		vm_object_unlock(tmp_object);
7623		/*
7624		 * Re-initialize the temporary object to avoid
7625		 * deallocating a real pager.
7626		 */
7627		_vm_object_allocate(transpose_size, tmp_object);
7628		vm_object_deallocate(tmp_object);
7629		tmp_object = VM_OBJECT_NULL;
7630	}
7631
7632	if (object1_locked) {
7633		vm_object_unlock(object1);
7634		object1_locked = FALSE;
7635	}
7636	if (object2_locked) {
7637		vm_object_unlock(object2);
7638		object2_locked = FALSE;
7639	}
7640
7641	vm_object_transpose_count++;
7642
7643	return retval;
7644}
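
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * assuming "object1" and "object2" are two unlocked VM objects of identical
 * size that the caller has already quiesced as described above (UPL with
 * UPL_SET_IO_WIRE and UPL_BLOCK_ACCESS), the exchange itself reduces to:
 *
 *	kern_return_t kr;
 *
 *	kr = vm_object_transpose(object1, object2, object_size);
 *	if (kr != KERN_SUCCESS) {
 *		// nothing was exchanged: size mismatch, copy/shadow
 *		// involvement, purgeable object, etc.
 *	}
 *
 * "object_size" is a hypothetical vm_object_size_t matching both objects.
 */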
7645
7646
7647/*
7648 *      vm_object_cluster_size
7649 *
7650 *      Determine how big a cluster we should issue an I/O for...
7651 *
7652 *	Inputs:   *start == offset of page needed
7653 *		  *length == maximum cluster pager can handle
7654 *	Outputs:  *start == beginning offset of cluster
7655 *		  *length == length of cluster to try
7656 *
7657 *	The original *start will be encompassed by the cluster
7658 *
7659 */
7660extern int speculative_reads_disabled;
7661extern int ignore_is_ssd;
7662
7663unsigned int preheat_max_bytes = MAX_UPL_TRANSFER_BYTES;
7664unsigned int preheat_min_bytes = (1024 * 32);
7665
7666
7667__private_extern__ void
7668vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start,
7669		       vm_size_t *length, vm_object_fault_info_t fault_info, uint32_t *io_streaming)
7670{
7671	vm_size_t		pre_heat_size;
7672	vm_size_t		tail_size;
7673	vm_size_t		head_size;
7674	vm_size_t		max_length;
7675	vm_size_t		cluster_size;
7676	vm_object_offset_t	object_size;
7677	vm_object_offset_t	orig_start;
7678	vm_object_offset_t	target_start;
7679	vm_object_offset_t	offset;
7680	vm_behavior_t		behavior;
7681	boolean_t		look_behind = TRUE;
7682	boolean_t		look_ahead  = TRUE;
7683	boolean_t		isSSD = FALSE;
7684	uint32_t		throttle_limit;
7685	int			sequential_run;
7686	int			sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
7687	vm_size_t		max_ph_size;
7688	vm_size_t		min_ph_size;
7689
7690	assert( !(*length & PAGE_MASK));
7691	assert( !(*start & PAGE_MASK_64));
7692
7693	/*
7694	 * remember the maximum length of run requested
7695	 */
7696	max_length = *length;
7697	/*
7698	 * we'll always return a cluster size of at least
7699	 * 1 page, since the original fault must always
7700	 * be processed
7701	 */
7702	*length = PAGE_SIZE;
7703	*io_streaming = 0;
7704
7705	if (speculative_reads_disabled || fault_info == NULL) {
7706	        /*
7707		 * no cluster... just fault the page in
7708		 */
7709	        return;
7710	}
7711	orig_start = *start;
7712	target_start = orig_start;
7713	cluster_size = round_page(fault_info->cluster_size);
7714	behavior = fault_info->behavior;
7715
7716	vm_object_lock(object);
7717
7718	if (object->pager == MEMORY_OBJECT_NULL)
7719		goto out;	/* pager is gone for this object, nothing more to do */
7720
7721	if (!ignore_is_ssd)
7722		vnode_pager_get_isSSD(object->pager, &isSSD);
7723
7724	min_ph_size = round_page(preheat_min_bytes);
7725	max_ph_size = round_page(preheat_max_bytes);
7726
7727	if (isSSD) {
7728		min_ph_size /= 2;
7729		max_ph_size /= 8;
7730	}
7731	if (min_ph_size < PAGE_SIZE)
7732		min_ph_size = PAGE_SIZE;
7733
7734	if (max_ph_size < PAGE_SIZE)
7735		max_ph_size = PAGE_SIZE;
7736	else if (max_ph_size > MAX_UPL_TRANSFER_BYTES)
7737		max_ph_size = MAX_UPL_TRANSFER_BYTES;
7738
7739	if (max_length > max_ph_size)
7740	        max_length = max_ph_size;
7741
7742	if (max_length <= PAGE_SIZE)
7743		goto out;
7744
7745	if (object->internal)
7746	        object_size = object->vo_size;
7747	else
7748	        vnode_pager_get_object_size(object->pager, &object_size);
7749
7750	object_size = round_page_64(object_size);
7751
7752	if (orig_start >= object_size) {
7753	        /*
7754		 * fault occurred beyond the EOF...
7755		 * we need to punt w/o changing the
7756		 * starting offset
7757		 */
7758	        goto out;
7759	}
7760	if (object->pages_used > object->pages_created) {
7761	        /*
7762		 * must have wrapped our 32 bit counters
7763		 * so reset
7764		 */
7765 	        object->pages_used = object->pages_created = 0;
7766	}
7767	if ((sequential_run = object->sequential)) {
7768		  if (sequential_run < 0) {
7769		          sequential_behavior = VM_BEHAVIOR_RSEQNTL;
7770			  sequential_run = 0 - sequential_run;
7771		  } else {
7772		          sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
7773		  }
7774
7775	}
7776	switch (behavior) {
7777
7778	default:
7779	        behavior = VM_BEHAVIOR_DEFAULT;
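	        /* fall through: treat unknown behaviors as VM_BEHAVIOR_DEFAULT */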
7780
7781	case VM_BEHAVIOR_DEFAULT:
7782	        if (object->internal && fault_info->user_tag == VM_MEMORY_STACK)
7783		        goto out;
7784
7785		if (sequential_run >= (3 * PAGE_SIZE)) {
7786		        pre_heat_size = sequential_run + PAGE_SIZE;
7787
7788			if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL)
7789			        look_behind = FALSE;
7790			else
7791			        look_ahead = FALSE;
7792
7793			*io_streaming = 1;
7794		} else {
7795
7796			if (object->pages_created < (20 * (min_ph_size >> PAGE_SHIFT))) {
7797			        /*
7798				 * prime the pump
7799				 */
7800			        pre_heat_size = min_ph_size;
7801			} else {
7802				/*
7803				 * Linear growth in PH size: The maximum size is max_length...
7804				 * this calculation will result in a size that is neither a
7805				 * power of 2 nor a multiple of PAGE_SIZE... so round
7806				 * it up to the nearest PAGE_SIZE boundary
7807				 */
7808				pre_heat_size = (max_length * object->pages_used) / object->pages_created;
7809
7810				if (pre_heat_size < min_ph_size)
7811					pre_heat_size = min_ph_size;
7812				else
7813					pre_heat_size = round_page(pre_heat_size);
7814			}
7815		}
7816		break;
7817
7818	case VM_BEHAVIOR_RANDOM:
7819	        if ((pre_heat_size = cluster_size) <= PAGE_SIZE)
7820		        goto out;
7821	        break;
7822
7823	case VM_BEHAVIOR_SEQUENTIAL:
7824	        if ((pre_heat_size = cluster_size) == 0)
7825		        pre_heat_size = sequential_run + PAGE_SIZE;
7826		look_behind = FALSE;
7827		*io_streaming = 1;
7828
7829	        break;
7830
7831	case VM_BEHAVIOR_RSEQNTL:
7832	        if ((pre_heat_size = cluster_size) == 0)
7833		        pre_heat_size = sequential_run + PAGE_SIZE;
7834		look_ahead = FALSE;
7835		*io_streaming = 1;
7836
7837	        break;
7838
7839	}
7840	throttle_limit = (uint32_t) max_length;
7841	assert(throttle_limit == max_length);
7842
7843	if (vnode_pager_get_throttle_io_limit(object->pager, &throttle_limit) == KERN_SUCCESS) {
7844		if (max_length > throttle_limit)
7845			max_length = throttle_limit;
7846	}
7847	if (pre_heat_size > max_length)
7848	        pre_heat_size = max_length;
7849
7850	if (behavior == VM_BEHAVIOR_DEFAULT && (pre_heat_size > min_ph_size)) {
7851
7852		unsigned int consider_free = vm_page_free_count + vm_page_cleaned_count;
7853
7854		if (consider_free < vm_page_throttle_limit) {
7855			pre_heat_size = trunc_page(pre_heat_size / 16);
7856		} else if (consider_free < vm_page_free_target) {
7857			pre_heat_size = trunc_page(pre_heat_size / 4);
7858		}
7859
7860		if (pre_heat_size < min_ph_size)
7861			pre_heat_size = min_ph_size;
7862	}
7863	if (look_ahead == TRUE) {
7864	        if (look_behind == TRUE) {
7865			/*
7866			 * if we get here it's due to a random access...
7867			 * so we want to center the original fault address
7868			 * within the cluster we will issue... make sure
7869			 * to calculate 'head_size' as a multiple of PAGE_SIZE...
7870			 * 'pre_heat_size' is a multiple of PAGE_SIZE but not
7871			 * necessarily an even number of pages so we need to truncate
7872			 * the result to a PAGE_SIZE boundary
7873			 */
7874			head_size = trunc_page(pre_heat_size / 2);
7875
7876			if (target_start > head_size)
7877				target_start -= head_size;
7878			else
7879				target_start = 0;
7880
7881			/*
7882			 * 'target_start' at this point represents the beginning offset
7883			 * of the cluster we are considering... 'orig_start' will be in
7884			 * the center of this cluster if we didn't have to clip the start
7885			 * due to running into the start of the file
7886			 */
7887		}
7888	        if ((target_start + pre_heat_size) > object_size)
7889		        pre_heat_size = (vm_size_t)(round_page_64(object_size - target_start));
7890		/*
7891		 * at this point calculate the number of pages beyond the original fault
7892		 * address that we want to consider... this is guaranteed not to extend beyond
7893		 * the current EOF...
7894		 */
7895		assert((vm_size_t)(orig_start - target_start) == (orig_start - target_start));
7896	        tail_size = pre_heat_size - (vm_size_t)(orig_start - target_start) - PAGE_SIZE;
7897	} else {
7898	        if (pre_heat_size > target_start) {
7899			/*
7900			 * since pre_heat_size is always smaller than 2^32,
7901			 * if it is larger than target_start (a 64 bit value)
7902			 * it is safe to clip target_start to 32 bits
7903			 */
7904	                pre_heat_size = (vm_size_t) target_start;
7905		}
7906		tail_size = 0;
7907	}
7908	assert( !(target_start & PAGE_MASK_64));
7909	assert( !(pre_heat_size & PAGE_MASK));
7910
7911	if (pre_heat_size <= PAGE_SIZE)
7912	        goto out;
7913
7914	if (look_behind == TRUE) {
7915	        /*
7916		 * take a look at the pages before the original
7917		 * faulting offset... recalculate this in case
7918		 * we had to clip 'pre_heat_size' above to keep
7919		 * from running past the EOF.
7920		 */
7921	        head_size = pre_heat_size - tail_size - PAGE_SIZE;
7922
7923	        for (offset = orig_start - PAGE_SIZE_64; head_size; offset -= PAGE_SIZE_64, head_size -= PAGE_SIZE) {
7924		        /*
7925			 * don't poke below the lowest offset
7926			 */
7927		        if (offset < fault_info->lo_offset)
7928			        break;
7929		        /*
7930			 * for external objects and internal objects w/o an existence map
7931			 * vm_external_state_get will return VM_EXTERNAL_STATE_UNKNOWN
7932			 */
7933#if MACH_PAGEMAP
7934		        if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) {
7935			        /*
7936				 * we know for a fact that the pager can't provide the page
7937				 * so don't include it or any pages beyond it in this cluster
7938				 */
7939			        break;
7940			}
7941#endif /* MACH_PAGEMAP */
7942			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
7943			    == VM_EXTERNAL_STATE_ABSENT) {
7944				break;
7945			}
7946			if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
7947			        /*
7948				 * don't bridge resident pages
7949				 */
7950			        break;
7951			}
7952			*start = offset;
7953			*length += PAGE_SIZE;
7954		}
7955	}
7956	if (look_ahead == TRUE) {
7957	        for (offset = orig_start + PAGE_SIZE_64; tail_size; offset += PAGE_SIZE_64, tail_size -= PAGE_SIZE) {
7958		        /*
7959			 * don't poke above the highest offset
7960			 */
7961		        if (offset >= fault_info->hi_offset)
7962			        break;
7963			assert(offset < object_size);
7964
7965		        /*
7966			 * for external objects and internal objects w/o an existence map
7967			 * vm_external_state_get will return VM_EXTERNAL_STATE_UNKNOWN
7968			 */
7969#if MACH_PAGEMAP
7970		        if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) {
7971			        /*
7972				 * we know for a fact that the pager can't provide the page
7973				 * so don't include it or any pages beyond it in this cluster
7974				 */
7975			        break;
7976			}
7977#endif /* MACH_PAGEMAP */
7978			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) == VM_EXTERNAL_STATE_ABSENT) {
7979				break;
7980			}
7981			if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
7982			        /*
7983				 * don't bridge resident pages
7984				 */
7985			        break;
7986			}
7987			*length += PAGE_SIZE;
7988		}
7989	}
7990out:
7991	if (*length > max_length)
7992		*length = max_length;
7993
7994	vm_object_unlock(object);
7995
7996	DTRACE_VM1(clustersize, vm_size_t, *length);
7997}
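
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a fault path that wants a read-ahead cluster around a faulting offset
 * might use the routine roughly as follows; "object", "fault_offset" and
 * "fault_info" are assumed to be supplied by the caller:
 *
 *	vm_object_offset_t	cluster_start = trunc_page_64(fault_offset);
 *	vm_size_t		cluster_length = MAX_UPL_TRANSFER_BYTES;
 *	uint32_t		io_streaming = 0;
 *
 *	vm_object_cluster_size(object, &cluster_start, &cluster_length,
 *			       fault_info, &io_streaming);
 *
 * On return, the cluster [cluster_start, cluster_start + cluster_length)
 * is page aligned, always includes the original fault offset and never
 * extends past EOF; io_streaming is set when sequential access is detected.
 */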
7998
7999
8000/*
8001 * Allow manipulation of individual page state.  This is actually part of
8002 * the UPL regimen but takes place on the VM object rather than on a UPL
8003 */
8004
8005kern_return_t
8006vm_object_page_op(
8007	vm_object_t		object,
8008	vm_object_offset_t	offset,
8009	int			ops,
8010	ppnum_t			*phys_entry,
8011	int			*flags)
8012{
8013	vm_page_t		dst_page;
8014
8015	vm_object_lock(object);
8016
8017	if(ops & UPL_POP_PHYSICAL) {
8018		if(object->phys_contiguous) {
8019			if (phys_entry) {
8020				*phys_entry = (ppnum_t)
8021					(object->vo_shadow_offset >> PAGE_SHIFT);
8022			}
8023			vm_object_unlock(object);
8024			return KERN_SUCCESS;
8025		} else {
8026			vm_object_unlock(object);
8027			return KERN_INVALID_OBJECT;
8028		}
8029	}
8030	if(object->phys_contiguous) {
8031		vm_object_unlock(object);
8032		return KERN_INVALID_OBJECT;
8033	}
8034
8035	while(TRUE) {
8036		if((dst_page = vm_page_lookup(object,offset)) == VM_PAGE_NULL) {
8037			vm_object_unlock(object);
8038			return KERN_FAILURE;
8039		}
8040
8041		/* Sync up on getting the busy bit */
8042		if((dst_page->busy || dst_page->cleaning) &&
8043			   (((ops & UPL_POP_SET) &&
8044			   (ops & UPL_POP_BUSY)) || (ops & UPL_POP_DUMP))) {
8045			/* someone else is playing with the page, we will */
8046			/* have to wait */
8047			PAGE_SLEEP(object, dst_page, THREAD_UNINT);
8048			continue;
8049		}
8050
8051		if (ops & UPL_POP_DUMP) {
8052			if (dst_page->pmapped == TRUE)
8053			        pmap_disconnect(dst_page->phys_page);
8054
8055			VM_PAGE_FREE(dst_page);
8056			break;
8057		}
8058
8059		if (flags) {
8060		        *flags = 0;
8061
8062			/* Get the condition of flags before requested ops */
8063			/* are undertaken */
8064
8065			if(dst_page->dirty) *flags |= UPL_POP_DIRTY;
8066			if(dst_page->pageout) *flags |= UPL_POP_PAGEOUT;
8067			if(dst_page->precious) *flags |= UPL_POP_PRECIOUS;
8068			if(dst_page->absent) *flags |= UPL_POP_ABSENT;
8069			if(dst_page->busy) *flags |= UPL_POP_BUSY;
8070		}
8071
8072		/* The caller should have made a call either contingent with */
8073		/* or prior to this call to set UPL_POP_BUSY */
8074		if(ops & UPL_POP_SET) {
8075			/* The protection granted with this assert will */
8076			/* not be complete.  If the caller violates the */
8077			/* convention and attempts to change page state */
8078			/* without first setting busy we may not see it */
8079			/* because the page may already be busy.  However */
8080			/* if such violations occur we will assert sooner */
8081			/* or later. */
8082			assert(dst_page->busy || (ops & UPL_POP_BUSY));
8083			if (ops & UPL_POP_DIRTY) {
8084				SET_PAGE_DIRTY(dst_page, FALSE);
8085			}
8086			if (ops & UPL_POP_PAGEOUT) dst_page->pageout = TRUE;
8087			if (ops & UPL_POP_PRECIOUS) dst_page->precious = TRUE;
8088			if (ops & UPL_POP_ABSENT) dst_page->absent = TRUE;
8089			if (ops & UPL_POP_BUSY) dst_page->busy = TRUE;
8090		}
8091
8092		if(ops & UPL_POP_CLR) {
8093			assert(dst_page->busy);
8094			if (ops & UPL_POP_DIRTY) dst_page->dirty = FALSE;
8095			if (ops & UPL_POP_PAGEOUT) dst_page->pageout = FALSE;
8096			if (ops & UPL_POP_PRECIOUS) dst_page->precious = FALSE;
8097			if (ops & UPL_POP_ABSENT) dst_page->absent = FALSE;
8098			if (ops & UPL_POP_BUSY) {
8099			        dst_page->busy = FALSE;
8100				PAGE_WAKEUP(dst_page);
8101			}
8102		}
8103
8104		if (dst_page->encrypted) {
8105			/*
8106			 * ENCRYPTED SWAP:
8107			 * We need to decrypt this encrypted page before the
8108			 * caller can access its contents.
8109			 * But if the caller really wants to access the page's
8110			 * contents, they have to keep the page "busy".
8111			 * Otherwise, the page could get recycled or re-encrypted
8112			 * at any time.
8113			 */
8114			if ((ops & UPL_POP_SET) && (ops & UPL_POP_BUSY) &&
8115			    dst_page->busy) {
8116				/*
8117				 * The page is stable enough to be accessed by
8118				 * the caller, so make sure its contents are
8119				 * not encrypted.
8120				 */
8121				vm_page_decrypt(dst_page, 0);
8122			} else {
8123				/*
8124				 * The page is not busy, so don't bother
8125				 * decrypting it, since anything could
8126				 * happen to it between now and when the
8127				 * caller wants to access it.
8128				 * We should not give the caller access
8129				 * to this page.
8130				 */
8131				assert(!phys_entry);
8132			}
8133		}
8134
8135		if (phys_entry) {
8136			/*
8137			 * The physical page number will remain valid
8138			 * only if the page is kept busy.
8139			 * ENCRYPTED SWAP: make sure we don't let the
8140			 * caller access an encrypted page.
8141			 */
8142			assert(dst_page->busy);
8143			assert(!dst_page->encrypted);
8144			*phys_entry = dst_page->phys_page;
8145		}
8146
8147		break;
8148	}
8149
8150	vm_object_unlock(object);
8151	return KERN_SUCCESS;
8152
8153}
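
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a caller that wants the physical page number of a resident page, and
 * wants it to stay valid while it is being used, can mark the page busy
 * on the way in and clear it on the way out:
 *
 *	ppnum_t	phys;
 *	int	page_flags;
 *
 *	if (vm_object_page_op(object, offset,
 *			      UPL_POP_SET | UPL_POP_BUSY,
 *			      &phys, &page_flags) == KERN_SUCCESS) {
 *		// "phys" stays valid for as long as the page is busy
 *		use_physical_page(phys);	// hypothetical consumer
 *		vm_object_page_op(object, offset,
 *				  UPL_POP_CLR | UPL_POP_BUSY, NULL, NULL);
 *	}
 */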
8154
8155/*
8156 * vm_object_range_op offers performance enhancement over
8157 * vm_object_page_op for page_op functions which do not require page
8158 * level state to be returned from the call.  Page_op was created to provide
8159 * a low-cost alternative to page manipulation via UPLs when only a single
8160 * page was involved.  The range_op call establishes the ability in the _op
8161 * family of functions to work on multiple pages where the lack of page level
8162 * state handling allows the caller to avoid the overhead of the upl structures.
8163 */
8164
8165kern_return_t
8166vm_object_range_op(
8167	vm_object_t		object,
8168	vm_object_offset_t	offset_beg,
8169	vm_object_offset_t	offset_end,
8170	int                     ops,
8171	uint32_t		*range)
8172{
8173        vm_object_offset_t	offset;
8174	vm_page_t		dst_page;
8175
8176	if (offset_end - offset_beg > (uint32_t) -1) {
8177		/* range is too big and would overflow "*range" */
8178		return KERN_INVALID_ARGUMENT;
8179	}
8180	if (object->resident_page_count == 0) {
8181	        if (range) {
8182		        if (ops & UPL_ROP_PRESENT) {
8183			        *range = 0;
8184			} else {
8185			        *range = (uint32_t) (offset_end - offset_beg);
8186				assert(*range == (offset_end - offset_beg));
8187			}
8188		}
8189		return KERN_SUCCESS;
8190	}
8191	vm_object_lock(object);
8192
8193	if (object->phys_contiguous) {
8194		vm_object_unlock(object);
8195	        return KERN_INVALID_OBJECT;
8196	}
8197
8198	offset = offset_beg & ~PAGE_MASK_64;
8199
8200	while (offset < offset_end) {
8201		dst_page = vm_page_lookup(object, offset);
8202		if (dst_page != VM_PAGE_NULL) {
8203			if (ops & UPL_ROP_DUMP) {
8204				if (dst_page->busy || dst_page->cleaning) {
8205					/*
8206					 * someone else is playing with the
8207					 * page, we will have to wait
8208					 */
8209				        PAGE_SLEEP(object, dst_page, THREAD_UNINT);
8210					/*
8211					 * need to look the page up again since its
8212					 * state may have changed while we slept;
8213					 * it might even belong to a different object
8214					 * at this point
8215					 */
8216					continue;
8217				}
8218				if (dst_page->laundry) {
8219					dst_page->pageout = FALSE;
8220
8221					vm_pageout_steal_laundry(dst_page, FALSE);
8222				}
8223				if (dst_page->pmapped == TRUE)
8224				        pmap_disconnect(dst_page->phys_page);
8225
8226				VM_PAGE_FREE(dst_page);
8227
8228			} else if ((ops & UPL_ROP_ABSENT) && !dst_page->absent)
8229			        break;
8230		} else if (ops & UPL_ROP_PRESENT)
8231		        break;
8232
8233		offset += PAGE_SIZE;
8234	}
8235	vm_object_unlock(object);
8236
8237	if (range) {
8238	        if (offset > offset_end)
8239		        offset = offset_end;
8240		if(offset > offset_beg) {
8241			*range = (uint32_t) (offset - offset_beg);
8242			assert(*range == (offset - offset_beg));
8243		} else {
8244			*range = 0;
8245		}
8246	}
8247	return KERN_SUCCESS;
8248}
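
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * discarding every resident page in a byte range of an object, without
 * building a UPL, might look like this ("start_offset" and "size" are
 * hypothetical, page-aligned values supplied by the caller):
 *
 *	uint32_t	range_handled = 0;
 *	kern_return_t	kr;
 *
 *	kr = vm_object_range_op(object,
 *				start_offset, start_offset + size,
 *				UPL_ROP_DUMP, &range_handled);
 *
 * On success, "range_handled" reports how many bytes of the range were
 * covered before the scan stopped; with UPL_ROP_DUMP alone the scan runs
 * to the end of the range.
 */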
8249
8250/*
8251 * Used to point a pager directly to a range of memory (when the pager may be associated
8252 *   with a non-device vnode).  Takes a virtual address, an offset, and a size.  We currently
8253 *   expect that the virtual address will denote the start of a range that is physically contiguous.
8254 */
8255kern_return_t pager_map_to_phys_contiguous(
8256	memory_object_control_t	object,
8257	memory_object_offset_t	offset,
8258	addr64_t		base_vaddr,
8259	vm_size_t		size)
8260{
8261	ppnum_t page_num;
8262	boolean_t clobbered_private;
8263	kern_return_t retval;
8264	vm_object_t pager_object;
8265
8266	page_num = pmap_find_phys(kernel_pmap, base_vaddr);
8267
8268	if (!page_num) {
8269		retval = KERN_FAILURE;
8270		goto out;
8271	}
8272
8273	pager_object = memory_object_control_to_vm_object(object);
8274
8275	if (!pager_object) {
8276		retval = KERN_FAILURE;
8277		goto out;
8278	}
8279
8280	clobbered_private = pager_object->private;
8281	pager_object->private = TRUE;
8282	retval = vm_object_populate_with_private(pager_object, offset, page_num, size);
8283
8284	if (retval != KERN_SUCCESS)
8285		pager_object->private = clobbered_private;
8286
8287out:
8288	return retval;
8289}
8290
8291uint32_t scan_object_collision = 0;
8292
8293void
8294vm_object_lock(vm_object_t object)
8295{
8296        if (object == vm_pageout_scan_wants_object) {
8297	        scan_object_collision++;
8298	        mutex_pause(2);
8299	}
8300        lck_rw_lock_exclusive(&object->Lock);
8301}
8302
8303boolean_t
8304vm_object_lock_avoid(vm_object_t object)
8305{
8306        if (object == vm_pageout_scan_wants_object) {
8307	        scan_object_collision++;
8308		return TRUE;
8309	}
8310	return FALSE;
8311}
8312
8313boolean_t
8314_vm_object_lock_try(vm_object_t object)
8315{
8316	return (lck_rw_try_lock_exclusive(&object->Lock));
8317}
8318
8319boolean_t
8320vm_object_lock_try(vm_object_t object)
8321{
8322	/*
8323	 * Called from hibernate path so check before blocking.
8324	 */
8325	if (vm_object_lock_avoid(object) && ml_get_interrupts_enabled() && get_preemption_level()==0) {
8326		mutex_pause(2);
8327	}
8328	return _vm_object_lock_try(object);
8329}
8330
8331void
8332vm_object_lock_shared(vm_object_t object)
8333{
8334        if (vm_object_lock_avoid(object)) {
8335	        mutex_pause(2);
8336	}
8337	lck_rw_lock_shared(&object->Lock);
8338}
8339
8340boolean_t
8341vm_object_lock_try_shared(vm_object_t object)
8342{
8343        if (vm_object_lock_avoid(object)) {
8344	        mutex_pause(2);
8345	}
8346	return (lck_rw_try_lock_shared(&object->Lock));
8347}
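
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * read-mostly callers take the object lock shared and switch to the
 * exclusive lock only when they actually need to mutate the object:
 *
 *	vm_object_lock_shared(object);
 *	examine_object(object);		// hypothetical read-only inspection
 *	vm_object_unlock(object);
 *
 *	vm_object_lock(object);		// exclusive
 *	modify_object(object);		// hypothetical mutation
 *	vm_object_unlock(object);
 */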
8348
8349
8350unsigned int vm_object_change_wimg_mode_count = 0;
8351
8352/*
8353 * The object must be locked
8354 */
8355void
8356vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode)
8357{
8358	vm_page_t p;
8359
8360	vm_object_lock_assert_exclusive(object);
8361
8362	vm_object_paging_wait(object, THREAD_UNINT);
8363
8364	queue_iterate(&object->memq, p, vm_page_t, listq) {
8365
8366		if (!p->fictitious)
8367			pmap_set_cache_attributes(p->phys_page, wimg_mode);
8368	}
8369	if (wimg_mode == VM_WIMG_USE_DEFAULT)
8370		object->set_cache_attr = FALSE;
8371	else
8372		object->set_cache_attr = TRUE;
8373
8374	object->wimg_bits = wimg_mode;
8375
8376	vm_object_change_wimg_mode_count++;
8377}
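
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * since vm_object_change_wimg_mode() asserts the exclusive object lock,
 * callers wrap it along these lines (VM_WIMG_IO is just one example of a
 * non-default cache mode):
 *
 *	vm_object_lock(object);
 *	vm_object_change_wimg_mode(object, VM_WIMG_IO);
 *	vm_object_unlock(object);
 */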
8378
8379#if CONFIG_FREEZE
8380
8381kern_return_t vm_object_pack(
8382	unsigned int	*purgeable_count,
8383	unsigned int	*wired_count,
8384	unsigned int	*clean_count,
8385	unsigned int	*dirty_count,
8386	unsigned int	dirty_budget,
8387	boolean_t	*shared,
8388	vm_object_t	src_object,
8389	struct default_freezer_handle *df_handle)
8390{
8391	kern_return_t	kr = KERN_SUCCESS;
8392
8393	vm_object_lock(src_object);
8394
8395	*purgeable_count = *wired_count = *clean_count = *dirty_count = 0;
8396	*shared = FALSE;
8397
8398	if (!src_object->alive || src_object->terminating){
8399		kr = KERN_FAILURE;
8400		goto done;
8401	}
8402
8403	if (src_object->purgable == VM_PURGABLE_VOLATILE) {
8404		*purgeable_count = src_object->resident_page_count;
8405
8406		/* If the default freezer handle is null, we're just walking the pages to discover how many can be hibernated */
8407		if (df_handle != NULL) {
8408			purgeable_q_t queue;
8409			/* object should be on a queue */
8410			assert(src_object->objq.next != NULL &&
8411			       src_object->objq.prev != NULL);
8412
8413			queue = vm_purgeable_object_remove(src_object);
8414			assert(queue);
8415			if (src_object->purgeable_when_ripe) {
8416				vm_page_lock_queues();
8417				vm_purgeable_token_delete_first(queue);
8418				vm_page_unlock_queues();
8419			}
8420
8421			vm_object_purge(src_object, 0);
8422			assert(src_object->purgable == VM_PURGABLE_EMPTY);
8423
8424			/*
8425			 * This object was "volatile" so its pages must have
8426			 * already been accounted as "volatile": no change
8427			 * in accounting now that it's "empty".
8428			 */
8429		}
8430		goto done;
8431	}
8432
8433	if (src_object->ref_count == 1) {
8434		vm_object_pack_pages(wired_count, clean_count, dirty_count, dirty_budget, src_object, df_handle);
8435	} else {
8436		if (src_object->internal) {
8437			*shared = TRUE;
8438		}
8439	}
8440done:
8441	vm_object_unlock(src_object);
8442
8443	return kr;
8444}
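
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * passing a NULL default freezer handle turns vm_object_pack() into a pure
 * census of the object, which lets a caller size a freeze operation before
 * committing to it:
 *
 *	unsigned int	purgeable, wired, clean, dirty;
 *	boolean_t	shared;
 *	kern_return_t	kr;
 *
 *	kr = vm_object_pack(&purgeable, &wired, &clean, &dirty,
 *			    0,			// 0 == no pageout budget limit
 *			    &shared, src_object,
 *			    NULL);		// count only, don't pack
 */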
8445
8446
8447void
8448vm_object_pack_pages(
8449	unsigned int		*wired_count,
8450	unsigned int		*clean_count,
8451	unsigned int		*dirty_count,
8452	unsigned int		dirty_budget,
8453	vm_object_t		src_object,
8454	struct default_freezer_handle *df_handle)
8455{
8456	vm_page_t p, next;
8457
8458	next = (vm_page_t)queue_first(&src_object->memq);
8459
8460	while (!queue_end(&src_object->memq, (queue_entry_t)next)) {
8461		p = next;
8462		next = (vm_page_t)queue_next(&next->listq);
8463
8464		/* Finish up if we've hit our pageout limit */
8465		if (dirty_budget && (dirty_budget == *dirty_count)) {
8466			break;
8467		}
8468		assert(!p->laundry);
8469
8470		if (p->fictitious || p->busy )
8471			continue;
8472
8473		if (p->absent || p->unusual || p->error)
8474			continue;
8475
8476		if (VM_PAGE_WIRED(p)) {
8477			(*wired_count)++;
8478			continue;
8479		}
8480
8481		if (df_handle == NULL) {
8482			if (p->dirty || pmap_is_modified(p->phys_page)) {
8483				(*dirty_count)++;
8484			} else {
8485				(*clean_count)++;
8486			}
8487			continue;
8488		}
8489
8490		if (p->cleaning) {
8491			p->pageout = TRUE;
8492			continue;
8493		}
8494
8495		if (p->pmapped == TRUE) {
8496			int refmod_state;
8497		 	refmod_state = pmap_disconnect(p->phys_page);
8498			if (refmod_state & VM_MEM_MODIFIED) {
8499				SET_PAGE_DIRTY(p, FALSE);
8500			}
8501		}
8502
8503		if (p->dirty) {
8504			default_freezer_pack_page(p, df_handle);
8505			(*dirty_count)++;
8506		}
8507		else {
8508			VM_PAGE_FREE(p);
8509			(*clean_count)++;
8510		}
8511	}
8512}
8513
8514void
8515vm_object_pageout(
8516	vm_object_t object)
8517{
8518	vm_page_t 			p, next;
8519	struct	vm_pageout_queue 	*iq;
8520	boolean_t			set_pageout_bit = FALSE;
8521
8522	iq = &vm_pageout_queue_internal;
8523
8524	assert(object != VM_OBJECT_NULL );
8525
8526	vm_object_lock(object);
8527
8528	if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE) {
8529		if (!object->pager_initialized) {
8530			/*
8531			 * If there is no memory object for the page, create
8532			 * one and hand it to the default pager.
8533			 */
8534			vm_object_pager_create(object);
8535		}
8536
8537		set_pageout_bit = TRUE;
8538	}
8539
8540	if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
8541
8542		set_pageout_bit = FALSE;
8543	}
8544
8545ReScan:
8546	next = (vm_page_t)queue_first(&object->memq);
8547
8548	while (!queue_end(&object->memq, (queue_entry_t)next)) {
8549		p = next;
8550		next = (vm_page_t)queue_next(&next->listq);
8551
8552		/* Throw to the pageout queue */
8553		vm_page_lockspin_queues();
8554
8555		/*
8556		 * see if page is already in the process of
8557		 * being cleaned... if so, leave it alone
8558		 */
8559		if (!p->laundry) {
8560
8561			if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
8562
8563				if (VM_PAGE_Q_THROTTLED(iq)) {
8564
8565					iq->pgo_draining = TRUE;
8566
8567					assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
8568					vm_page_unlock_queues();
8569					vm_object_unlock(object);
8570
8571					thread_block(THREAD_CONTINUE_NULL);
8572
8573					vm_object_lock(object);
8574					goto ReScan;
8575				}
8576
8577				if (p->fictitious || p->busy ) {
8578					vm_page_unlock_queues();
8579					continue;
8580				}
8581
8582				if (p->absent || p->unusual || p->error || VM_PAGE_WIRED(p)) {
8583					vm_page_unlock_queues();
8584					continue;
8585				}
8586
8587				if (p->cleaning) {
8588					p->pageout = TRUE;
8589					vm_page_unlock_queues();
8590					continue;
8591				}
8592
8593				if (p->pmapped == TRUE) {
8594					int refmod_state;
8595		        		refmod_state = pmap_disconnect_options(p->phys_page, PMAP_OPTIONS_COMPRESSOR, NULL);
8596					if (refmod_state & VM_MEM_MODIFIED) {
8597						SET_PAGE_DIRTY(p, FALSE);
8598					}
8599				}
8600
8601				if (p->dirty == FALSE) {
8602					vm_page_unlock_queues();
8603					VM_PAGE_FREE(p);
8604					continue;
8605				}
8606			}
8607
8608			VM_PAGE_QUEUES_REMOVE(p);
8609			vm_pageout_cluster(p, set_pageout_bit);
8610		}
8611		vm_page_unlock_queues();
8612	}
8613
8614	vm_object_unlock(object);
8615}
8616
8617kern_return_t
8618vm_object_pagein(
8619	vm_object_t object)
8620{
8621	memory_object_t	pager;
8622	kern_return_t	kr;
8623
8624	vm_object_lock(object);
8625
8626	pager = object->pager;
8627
8628	if (!object->pager_ready || pager == MEMORY_OBJECT_NULL) {
8629		vm_object_unlock(object);
8630		return KERN_FAILURE;
8631	}
8632
8633	vm_object_paging_wait(object, THREAD_UNINT);
8634	vm_object_paging_begin(object);
8635
8636	object->blocked_access = TRUE;
8637	vm_object_unlock(object);
8638
8639	kr = memory_object_data_reclaim(pager, TRUE);
8640
8641	vm_object_lock(object);
8642
8643	object->blocked_access = FALSE;
8644	vm_object_paging_end(object);
8645
8646	vm_object_unlock(object);
8647
8648	return kr;
8649}
8650#endif /* CONFIG_FREEZE */
8651
8652
8653#if CONFIG_IOSCHED
8654void
8655vm_page_request_reprioritize(vm_object_t o, uint64_t blkno, uint32_t len, int prio)
8656{
8657	io_reprioritize_req_t 	req;
8658	struct vnode 		*devvp = NULL;
8659
8660	if(vnode_pager_get_object_devvp(o->pager, (uintptr_t *)&devvp) != KERN_SUCCESS)
8661		return;
8662
8663	/* Create the request for I/O reprioritization */
8664	req = (io_reprioritize_req_t)zalloc(io_reprioritize_req_zone);
8665	assert(req != NULL);
8666	req->blkno = blkno;
8667	req->len = len;
8668	req->priority = prio;
8669	req->devvp = devvp;
8670
8671	/* Insert request into the reprioritization list */
8672	IO_REPRIORITIZE_LIST_LOCK();
8673	queue_enter(&io_reprioritize_list, req, io_reprioritize_req_t, io_reprioritize_list);
8674	IO_REPRIORITIZE_LIST_UNLOCK();
8675
8676	/* Wakeup reprioritize thread */
8677	IO_REPRIO_THREAD_WAKEUP();
8678
8679	return;
8680}
8681
8682void
8683vm_decmp_upl_reprioritize(upl_t upl, int prio)
8684{
8685	int offset;
8686	vm_object_t object;
8687	io_reprioritize_req_t 	req;
8688	struct vnode            *devvp = NULL;
8689	uint64_t 		blkno;
8690	uint32_t 		len;
8691	upl_t 			io_upl;
8692	uint64_t 		*io_upl_reprio_info;
8693	int 			io_upl_size;
8694
8695	if ((upl->flags & UPL_TRACKED_BY_OBJECT) == 0 || (upl->flags & UPL_EXPEDITE_SUPPORTED) == 0)
8696		return;
8697
8698	/*
8699	 * We don't want to perform any allocations with the upl lock held since that might
8700	 * result in a deadlock. If the system is low on memory, the pageout thread would
8701	 * try to page things out and might wait on this lock. If we are waiting for the memory to
8702	 * be freed up by the pageout thread, it would be a deadlock.
8703	 */
8704
8705
8706	/* First step is just to get the size of the upl to find out how big the reprio info is */
8707	upl_lock(upl);
8708	if (upl->decmp_io_upl == NULL) {
8709		/* The real I/O upl was destroyed by the time we came in here. Nothing to do. */
8710		upl_unlock(upl);
8711		return;
8712	}
8713
8714	io_upl = upl->decmp_io_upl;
8715	assert((io_upl->flags & UPL_DECMP_REAL_IO) != 0);
8716	io_upl_size = io_upl->size;
8717	upl_unlock(upl);
8718
8719	/* Now perform the allocation */
8720	io_upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * (io_upl_size / PAGE_SIZE));
8721	if (io_upl_reprio_info == NULL)
8722		return;
8723
8724	/* Now again take the lock, recheck the state and grab out the required info */
8725	upl_lock(upl);
8726	if (upl->decmp_io_upl == NULL || upl->decmp_io_upl != io_upl) {
8727		/* The real I/O upl was destroyed by the time we came in here. Nothing to do. */
8728		upl_unlock(upl);
8729		goto out;
8730	}
8731	memcpy(io_upl_reprio_info, io_upl->upl_reprio_info, sizeof(uint64_t) * (io_upl_size / PAGE_SIZE));
8732
8733	/* Get the VM object for this UPL */
8734	if (io_upl->flags & UPL_SHADOWED) {
8735		object = io_upl->map_object->shadow;
8736	} else {
8737		object = io_upl->map_object;
8738	}
8739
8740	/* Get the dev vnode ptr for this object */
8741	if(!object || !object->pager ||
8742	   vnode_pager_get_object_devvp(object->pager, (uintptr_t *)&devvp) != KERN_SUCCESS) {
8743		upl_unlock(upl);
8744		goto out;
8745	}
8746
8747	upl_unlock(upl);
8748
8749	/* Now we have all the information needed to do the expedite */
8750
8751	offset = 0;
8752	while (offset < io_upl_size) {
8753		blkno 	= io_upl_reprio_info[(offset / PAGE_SIZE)] & UPL_REPRIO_INFO_MASK;
8754		len 	= (io_upl_reprio_info[(offset / PAGE_SIZE)] >> UPL_REPRIO_INFO_SHIFT) & UPL_REPRIO_INFO_MASK;
8755
8756		/*
8757		 * This implementation may cause some spurious expedites due to the
8758		 * fact that we don't clean up the blkno & len in the upl_reprio_info
8759		 * even after the I/O is complete.
8760		 */
8761
8762		if (blkno != 0 && len != 0) {
8763			/* Create the request for I/O reprioritization */
8764       	 		req = (io_reprioritize_req_t)zalloc(io_reprioritize_req_zone);
8765        		assert(req != NULL);
8766        		req->blkno = blkno;
8767        		req->len = len;
8768        		req->priority = prio;
8769        		req->devvp = devvp;
8770
8771        		/* Insert request into the reprioritization list */
8772        		IO_REPRIORITIZE_LIST_LOCK();
8773        		queue_enter(&io_reprioritize_list, req, io_reprioritize_req_t, io_reprioritize_list);
8774        		IO_REPRIORITIZE_LIST_UNLOCK();
8775
8776			offset += len;
8777		} else {
8778			offset += PAGE_SIZE;
8779		}
8780	}
8781
8782	/* Wakeup reprioritize thread */
8783        IO_REPRIO_THREAD_WAKEUP();
8784
8785out:
8786	kfree(io_upl_reprio_info, sizeof(uint64_t) * (io_upl_size / PAGE_SIZE));
8787	return;
8788}
8789
8790void
8791vm_page_handle_prio_inversion(vm_object_t o, vm_page_t m)
8792{
8793	upl_t upl;
8794        upl_page_info_t *pl;
8795        unsigned int i, num_pages;
8796        int cur_tier;
8797
8798	cur_tier = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
8799
8800	/*
8801	 * Scan through all UPLs associated with the object to find the
8802	 * UPL containing the contended page.
8803	 */
8804	queue_iterate(&o->uplq, upl, upl_t, uplq) {
8805		if (((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) || upl->upl_priority <= cur_tier)
8806			continue;
8807		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
8808                num_pages = (upl->size / PAGE_SIZE);
8809
8810		/*
8811		 * For each page in the UPL page list, see if it matches the
8812		 * contended page and was issued as a low prio I/O.
8813		 */
8814		for(i=0; i < num_pages; i++) {
8815			if(UPL_PAGE_PRESENT(pl,i) && m->phys_page == pl[i].phys_addr) {
8816				if ((upl->flags & UPL_DECMP_REQ) && upl->decmp_io_upl) {
8817                        		KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, upl->upl_creator, m, upl, upl->upl_priority, 0);
8818					vm_decmp_upl_reprioritize(upl, cur_tier);
8819					break;
8820				}
8821				KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, upl->upl_creator, m, upl->upl_reprio_info[i], upl->upl_priority, 0);
8822				if (UPL_REPRIO_INFO_BLKNO(upl, i) != 0 && UPL_REPRIO_INFO_LEN(upl, i) != 0)
8823					vm_page_request_reprioritize(o, UPL_REPRIO_INFO_BLKNO(upl, i), UPL_REPRIO_INFO_LEN(upl, i), cur_tier);
8824                                break;
8825                         }
8826		 }
8827		 /* Check if we found any hits */
8828                 if (i != num_pages)
8829			break;
8830	}
8831
8832	return;
8833}
8834
8835wait_result_t
8836vm_page_sleep(vm_object_t o, vm_page_t m, int interruptible)
8837{
8838	wait_result_t ret;
8839
8840	KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_START, o, m, 0, 0, 0);
8841
8842	if (o->io_tracking && ((m->busy == TRUE) || (m->cleaning == TRUE) || VM_PAGE_WIRED(m))) {
8843		/*
8844		 * The page is busy due to an I/O; issue a reprioritize request if necessary.
8845		 */
8846		vm_page_handle_prio_inversion(o,m);
8847	}
8848	m->wanted = TRUE;
8849	ret = thread_sleep_vm_object(o, m, interruptible);
8850	KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_END, o, m, 0, 0, 0);
8851	return ret;
8852}
8853
8854static void
8855io_reprioritize_thread(void *param __unused, wait_result_t wr __unused)
8856{
8857	io_reprioritize_req_t   req = NULL;
8858
8859	while(1) {
8860
8861		IO_REPRIORITIZE_LIST_LOCK();
8862		if (queue_empty(&io_reprioritize_list)) {
8863			IO_REPRIORITIZE_LIST_UNLOCK();
8864			break;
8865		}
8866
8867		queue_remove_first(&io_reprioritize_list, req, io_reprioritize_req_t, io_reprioritize_list);
8868		IO_REPRIORITIZE_LIST_UNLOCK();
8869
8870		vnode_pager_issue_reprioritize_io(req->devvp, req->blkno, req->len, req->priority);
8871		zfree(io_reprioritize_req_zone, req);
8872	}
8873
8874	IO_REPRIO_THREAD_CONTINUATION();
8875}
8876#endif
8877