vm_object.c revision 17334
1/*
2 * Copyright (c) 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
37 *
38 *
39 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40 * All rights reserved.
41 *
42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43 *
44 * Permission to use, copy, modify and distribute this software and
45 * its documentation is hereby granted, provided that both the copyright
46 * notice and this permission notice appear in all copies of the
47 * software, derivative works or modified versions, and any portions
48 * thereof, and that both notices appear in supporting documentation.
49 *
50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53 *
54 * Carnegie Mellon requests users of this software to return to
55 *
56 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57 *  School of Computer Science
58 *  Carnegie Mellon University
59 *  Pittsburgh PA 15213-3890
60 *
61 * any improvements or extensions that they make and grant Carnegie the
62 * rights to redistribute these changes.
63 *
64 * $Id: vm_object.c,v 1.77 1996/07/27 03:24:03 dyson Exp $
65 */
66
67/*
68 *	Virtual memory object module.
69 */
70#include "opt_ddb.h"
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/kernel.h>
75#include <sys/proc.h>		/* for curproc, pageproc */
76#include <sys/malloc.h>
77#include <sys/vnode.h>
78#include <sys/mount.h>
79#include <sys/vmmeter.h>
80#include <sys/mman.h>
81
82#include <vm/vm.h>
83#include <vm/vm_param.h>
84#include <vm/vm_prot.h>
85#include <vm/lock.h>
86#include <vm/pmap.h>
87#include <vm/vm_map.h>
88#include <vm/vm_object.h>
89#include <vm/vm_page.h>
90#include <vm/vm_pageout.h>
91#include <vm/vm_pager.h>
92#include <vm/swap_pager.h>
93#include <vm/vm_kern.h>
94#include <vm/vm_extern.h>
95
96#ifdef DDB
97static void	DDB_vm_object_check __P((void));
98#endif
99
100static void	_vm_object_allocate __P((objtype_t, vm_size_t, vm_object_t));
101#ifdef DDB
102static int	_vm_object_in_map __P((vm_map_t map, vm_object_t object,
103				       vm_map_entry_t entry));
104static int	vm_object_in_map __P((vm_object_t object));
105#endif
106static void	vm_object_qcollapse __P((vm_object_t object));
107#ifdef not_used
108static void	vm_object_deactivate_pages __P((vm_object_t));
109#endif
110static void	vm_object_terminate __P((vm_object_t));
111static void	vm_object_cache_trim __P((void));
112
113/*
114 *	Virtual memory objects maintain the actual data
115 *	associated with allocated virtual memory.  A given
116 *	page of memory exists within exactly one object.
117 *
118 *	An object is only deallocated when all "references"
119 *	are given up.  Only one "reference" to a given
120 *	region of an object should be writeable.
121 *
122 *	Associated with each object is a list of all resident
123 *	memory pages belonging to that object; this list is
124 *	maintained by the "vm_page" module, and locked by the object's
125 *	lock.
126 *
127 *	Each object also records a "pager" routine which is
128 *	used to retrieve (and store) pages to the proper backing
129 *	storage.  In addition, objects may be backed by other
130 *	objects from which they were virtual-copied.
131 *
132 *	The only items within the object structure which are
133 *	modified after time of creation are:
134 *		reference count		locked by object's lock
135 *		pager routine		locked by object's lock
136 *
137 */
138
139int vm_object_cache_max;
140struct object_q vm_object_cached_list;
141static int vm_object_cached;
142struct object_q vm_object_list;
143static long vm_object_count;
144vm_object_t kernel_object;
145vm_object_t kmem_object;
146static struct vm_object kernel_object_store;
147static struct vm_object kmem_object_store;
148extern int vm_pageout_page_count;
149
150static long object_collapses;
151static long object_bypasses;
152
153static void
154_vm_object_allocate(type, size, object)
155	objtype_t type;
156	vm_size_t size;
157	register vm_object_t object;
158{
159	TAILQ_INIT(&object->memq);
160	TAILQ_INIT(&object->shadow_head);
161
162	object->type = type;
163	object->size = size;
164	object->ref_count = 1;
165	object->flags = 0;
166	object->behavior = OBJ_NORMAL;
167	object->paging_in_progress = 0;
168	object->resident_page_count = 0;
169	object->shadow_count = 0;
170	object->handle = NULL;
171	object->paging_offset = (vm_ooffset_t) 0;
172	object->backing_object = NULL;
173	object->backing_object_offset = (vm_ooffset_t) 0;
174
175	object->last_read = 0;
176
177	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
178	vm_object_count++;
179}
180
181/*
182 *	vm_object_init:
183 *
184 *	Initialize the VM objects module.
185 */
186void
187vm_object_init()
188{
189	TAILQ_INIT(&vm_object_cached_list);
190	TAILQ_INIT(&vm_object_list);
191	vm_object_count = 0;
192
193	vm_object_cache_max = 84;
194	if (cnt.v_page_count > 1000)
195		vm_object_cache_max += (cnt.v_page_count - 1000) / 4;
196
197	kernel_object = &kernel_object_store;
198	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
199	    kernel_object);
200
201	kmem_object = &kmem_object_store;
202	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
203	    kmem_object);
204}
205
206/*
207 *	vm_object_allocate:
208 *
209 *	Returns a new object with the given size.
210 */
211
212vm_object_t
213vm_object_allocate(type, size)
214	objtype_t type;
215	vm_size_t size;
216{
217	register vm_object_t result;
218
219	result = (vm_object_t)
220	    malloc((u_long) sizeof *result, M_VMOBJ, M_WAITOK);
221
222
223	_vm_object_allocate(type, size, result);
224
225	return (result);
226}
227
228
229/*
230 *	vm_object_reference:
231 *
232 *	Gets another reference to the given object.
233 */
234inline void
235vm_object_reference(object)
236	register vm_object_t object;
237{
238	if (object == NULL)
239		return;
240
241	if (object->ref_count == 0) {
242		if ((object->flags & OBJ_CANPERSIST) == 0)
243			panic("vm_object_reference: non-persistent object with 0 ref_count");
244		TAILQ_REMOVE(&vm_object_cached_list, object, cached_list);
245		vm_object_cached--;
246	}
247	object->ref_count++;
248}
249
250/*
251 *	vm_object_deallocate:
252 *
253 *	Release a reference to the specified object,
254 *	gained either through a vm_object_allocate
255 *	or a vm_object_reference call.  When all references
256 *	are gone, storage associated with this object
257 *	may be relinquished.
258 *
259 *	No object may be locked.
260 */
261void
262vm_object_deallocate(object)
263	vm_object_t object;
264{
265	vm_object_t temp;
266
267	while (object != NULL) {
268
269		if (object->ref_count == 0)
270			panic("vm_object_deallocate: object deallocated too many times");
271
272		/*
273		 * Lose the reference
274		 */
275		object->ref_count--;
276		if (object->ref_count != 0) {
277			if ((object->ref_count == 1) &&
278			    (object->handle == NULL) &&
279			    (object->type == OBJT_DEFAULT ||
280			     object->type == OBJT_SWAP)) {
281				vm_object_t robject;
282				robject = TAILQ_FIRST(&object->shadow_head);
283				if ((robject != NULL) &&
284				    (robject->handle == NULL) &&
285				    (robject->type == OBJT_DEFAULT ||
286				     robject->type == OBJT_SWAP)) {
287					int s;
288					robject->ref_count += 2;
289					object->ref_count += 2;
290
291					do {
292						s = splvm();
293						while (robject->paging_in_progress) {
294							robject->flags |= OBJ_PIPWNT;
295							tsleep(robject, PVM, "objde1", 0);
296						}
297
298						while (object->paging_in_progress) {
299							object->flags |= OBJ_PIPWNT;
300							tsleep(object, PVM, "objde2", 0);
301						}
302						splx(s);
303
304					} while( object->paging_in_progress || robject->paging_in_progress);
305
306					object->ref_count -= 2;
307					robject->ref_count -= 2;
308					if( robject->ref_count == 0) {
309						robject->ref_count += 1;
310						object = robject;
311						continue;
312					}
313					vm_object_collapse(robject);
314					return;
315				}
316			}
317			/*
318			 * If there are still references, then we are done.
319			 */
320			return;
321		}
322
323		if (object->type == OBJT_VNODE) {
324			struct vnode *vp = object->handle;
325
326			vp->v_flag &= ~VTEXT;
327		}
328
329		/*
330		 * See if this object can persist and has some resident
331		 * pages.  If so, enter it in the cache.
332		 */
333		if (object->flags & OBJ_CANPERSIST) {
334			if (object->resident_page_count != 0) {
335				vm_object_page_clean(object, 0, 0 ,TRUE, TRUE);
336				TAILQ_INSERT_TAIL(&vm_object_cached_list, object,
337				    cached_list);
338				vm_object_cached++;
339
340				vm_object_cache_trim();
341				return;
342			} else {
343				object->flags &= ~OBJ_CANPERSIST;
344			}
345		}
346
347		/*
348		 * Make sure no one uses us.
349		 */
350		object->flags |= OBJ_DEAD;
351
352		temp = object->backing_object;
353		if (temp) {
354			TAILQ_REMOVE(&temp->shadow_head, object, shadow_list);
355			--temp->shadow_count;
356		}
357		vm_object_terminate(object);
358		/* unlocks and deallocates object */
359		object = temp;
360	}
361}
362
363/*
364 *	vm_object_terminate actually destroys the specified object, freeing
365 *	up all previously used resources.
366 *
367 *	The object must be locked.
368 */
369static void
370vm_object_terminate(object)
371	register vm_object_t object;
372{
373	register vm_page_t p;
374	int s;
375
376	/*
377	 * wait for the pageout daemon to be done with the object
378	 */
379	s = splvm();
380	while (object->paging_in_progress) {
381		object->flags |= OBJ_PIPWNT;
382		tsleep(object, PVM, "objtrm", 0);
383	}
384	splx(s);
385
386	if (object->paging_in_progress != 0)
387		panic("vm_object_deallocate: pageout in progress");
388
389	/*
390	 * Clean and free the pages, as appropriate. All references to the
391	 * object are gone, so we don't need to lock it.
392	 */
393	if (object->type == OBJT_VNODE) {
394		struct vnode *vp = object->handle;
395
396		VOP_LOCK(vp);
397		vm_object_page_clean(object, 0, 0, TRUE, FALSE);
398		vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0);
399		VOP_UNLOCK(vp);
400	}
401	/*
402	 * Now free the pages. For internal objects, this also removes them
403	 * from paging queues.
404	 */
405	while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
406		if (p->flags & PG_BUSY)
407			printf("vm_object_terminate: freeing busy page\n");
408		PAGE_WAKEUP(p);
409		vm_page_free(p);
410		cnt.v_pfree++;
411	}
412
413	/*
414	 * Let the pager know object is dead.
415	 */
416	vm_pager_deallocate(object);
417
418	TAILQ_REMOVE(&vm_object_list, object, object_list);
419	vm_object_count--;
420
421	wakeup(object);
422
423	/*
424	 * Free the space for the object.
425	 */
426	free((caddr_t) object, M_VMOBJ);
427}
428
429/*
430 *	vm_object_page_clean
431 *
432 *	Clean all dirty pages in the specified range of object.
433 *	Leaves page on whatever queue it is currently on.
434 *
435 *	Odd semantics: if start == end, we clean everything.
436 *
437 *	The object must be locked.
438 */
439
440void
441vm_object_page_clean(object, start, end, syncio, lockflag)
442	vm_object_t object;
443	vm_pindex_t start;
444	vm_pindex_t end;
445	boolean_t syncio;
446	boolean_t lockflag;
447{
448	register vm_page_t p, np, tp;
449	register vm_offset_t tstart, tend;
450	vm_pindex_t pi;
451	int s;
452	struct vnode *vp;
453	int runlen;
454	int maxf;
455	int chkb;
456	int maxb;
457	int i;
458	vm_page_t maf[vm_pageout_page_count];
459	vm_page_t mab[vm_pageout_page_count];
460	vm_page_t ma[vm_pageout_page_count];
461
462	if (object->type != OBJT_VNODE ||
463		(object->flags & OBJ_MIGHTBEDIRTY) == 0)
464		return;
465
466	vp = object->handle;
467
468	if (lockflag)
469		VOP_LOCK(vp);
470	object->flags |= OBJ_CLEANING;
471
472	tstart = start;
473	if (end == 0) {
474		tend = object->size;
475	} else {
476		tend = end;
477	}
478	if ((tstart == 0) && (tend == object->size)) {
479		object->flags &= ~(OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
480	}
481	for(p = TAILQ_FIRST(&object->memq); p; p = TAILQ_NEXT(p, listq))
482		p->flags |= PG_CLEANCHK;
483
484rescan:
485	for(p = TAILQ_FIRST(&object->memq); p; p = np) {
486		np = TAILQ_NEXT(p, listq);
487
488		pi = p->pindex;
489		if (((p->flags & PG_CLEANCHK) == 0) ||
490			(pi < tstart) || (pi >= tend) ||
491			(p->valid == 0) || (p->queue == PQ_CACHE)) {
492			p->flags &= ~PG_CLEANCHK;
493			continue;
494		}
495
496		vm_page_test_dirty(p);
497		if ((p->dirty & p->valid) == 0) {
498			p->flags &= ~PG_CLEANCHK;
499			continue;
500		}
501
502		s = splvm();
503		if ((p->flags & PG_BUSY) || p->busy) {
504			p->flags |= PG_WANTED|PG_REFERENCED;
505			tsleep(p, PVM, "vpcwai", 0);
506			splx(s);
507			goto rescan;
508		}
509		splx(s);
510
511		s = splvm();
512		maxf = 0;
513		for(i=1;i<vm_pageout_page_count;i++) {
514			if (tp = vm_page_lookup(object, pi + i)) {
515				if ((tp->flags & PG_BUSY) ||
516					(tp->flags & PG_CLEANCHK) == 0)
517					break;
518				if (tp->queue == PQ_CACHE) {
519					tp->flags &= ~PG_CLEANCHK;
520					break;
521				}
522				vm_page_test_dirty(tp);
523				if ((tp->dirty & tp->valid) == 0) {
524					tp->flags &= ~PG_CLEANCHK;
525					break;
526				}
527				maf[ i - 1 ] = tp;
528				maxf++;
529				continue;
530			}
531			break;
532		}
533
534		maxb = 0;
535		chkb = vm_pageout_page_count -  maxf;
536		if (chkb) {
537			for(i = 1; i < chkb;i++) {
538				if (tp = vm_page_lookup(object, pi - i)) {
539					if ((tp->flags & PG_BUSY) ||
540						(tp->flags & PG_CLEANCHK) == 0)
541						break;
542					if (tp->queue == PQ_CACHE) {
543						tp->flags &= ~PG_CLEANCHK;
544						break;
545					}
546					vm_page_test_dirty(tp);
547					if ((tp->dirty & tp->valid) == 0) {
548						tp->flags &= ~PG_CLEANCHK;
549						break;
550					}
551					mab[ i - 1 ] = tp;
552					maxb++;
553					continue;
554				}
555				break;
556			}
557		}
558
559		for(i=0;i<maxb;i++) {
560			int index = (maxb - i) - 1;
561			ma[index] = mab[i];
562			ma[index]->flags |= PG_BUSY;
563			ma[index]->flags &= ~PG_CLEANCHK;
564			vm_page_protect(ma[index], VM_PROT_READ);
565		}
566		vm_page_protect(p, VM_PROT_READ);
567		p->flags |= PG_BUSY;
568		p->flags &= ~PG_CLEANCHK;
569		ma[maxb] = p;
570		for(i=0;i<maxf;i++) {
571			int index = (maxb + i) + 1;
572			ma[index] = maf[i];
573			ma[index]->flags |= PG_BUSY;
574			ma[index]->flags &= ~PG_CLEANCHK;
575			vm_page_protect(ma[index], VM_PROT_READ);
576		}
577		runlen = maxb + maxf + 1;
578		splx(s);
579		vm_pageout_flush(ma, runlen, 0);
580		goto rescan;
581	}
582
583	VOP_FSYNC(vp, NULL, syncio, curproc);
584
585	if (lockflag)
586		VOP_UNLOCK(vp);
587	object->flags &= ~OBJ_CLEANING;
588	return;
589}
590
#ifdef not_used
/* XXX I cannot tell if this should be an exported symbol */
/*
 *	vm_object_deactivate_pages
 *
 *	Call vm_page_deactivate on every page resident in the object.
 *	(The pages stay in memory even though the object is no longer
 *	referenced.)
 *
 *	The object must be locked.
 */
static void
vm_object_deactivate_pages(object)
	register vm_object_t object;
{
	register vm_page_t page, following;

	page = TAILQ_FIRST(&object->memq);
	while (page != NULL) {
		/* Fetch the successor before the page is handed off. */
		following = TAILQ_NEXT(page, listq);
		vm_page_deactivate(page);
		page = following;
	}
}
#endif
613
614/*
615 *	Trim the object cache to size.
616 */
617static void
618vm_object_cache_trim()
619{
620	register vm_object_t object;
621
622	while (vm_object_cached > vm_object_cache_max) {
623		object = TAILQ_FIRST(&vm_object_cached_list);
624
625		vm_object_reference(object);
626		pager_cache(object, FALSE);
627	}
628}
629
630
631/*
632 *	vm_object_pmap_copy:
633 *
634 *	Makes all physical pages in the specified
635 *	object range copy-on-write.  No writeable
636 *	references to these pages should remain.
637 *
638 *	The object must *not* be locked.
639 */
640void
641vm_object_pmap_copy(object, start, end)
642	register vm_object_t object;
643	register vm_pindex_t start;
644	register vm_pindex_t end;
645{
646	register vm_page_t p;
647
648	if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
649		return;
650
651	for (p = TAILQ_FIRST(&object->memq);
652		p != NULL;
653		p = TAILQ_NEXT(p, listq)) {
654		vm_page_protect(p, VM_PROT_READ);
655	}
656
657	object->flags &= ~OBJ_WRITEABLE;
658}
659
660/*
661 *	vm_object_pmap_remove:
662 *
663 *	Removes all physical pages in the specified
664 *	object range from all physical maps.
665 *
666 *	The object must *not* be locked.
667 */
668void
669vm_object_pmap_remove(object, start, end)
670	register vm_object_t object;
671	register vm_pindex_t start;
672	register vm_pindex_t end;
673{
674	register vm_page_t p;
675	if (object == NULL)
676		return;
677	for (p = TAILQ_FIRST(&object->memq);
678		p != NULL;
679		p = TAILQ_NEXT(p, listq)) {
680		if (p->pindex >= start && p->pindex < end)
681			vm_page_protect(p, VM_PROT_NONE);
682	}
683}
684
685/*
686 *	vm_object_madvise:
687 *
688 *	Implements the madvise function at the object/page level.
689 */
690void
691vm_object_madvise(object, pindex, count, advise)
692	vm_object_t object;
693	vm_pindex_t pindex;
694	int count;
695	int advise;
696{
697	vm_pindex_t end;
698	vm_page_t m;
699
700	if (object == NULL)
701		return;
702
703	end = pindex + count;
704
705	for (; pindex < end; pindex += 1) {
706		m = vm_page_lookup(object, pindex);
707
708		/*
709		 * If the page is busy or not in a normal active state,
710		 * we skip it.  Things can break if we mess with pages
711		 * in any of the below states.
712		 */
713		if (m == NULL || m->busy || (m->flags & PG_BUSY) ||
714			m->hold_count || m->wire_count ||
715			m->valid != VM_PAGE_BITS_ALL)
716			continue;
717
718		if (advise == MADV_WILLNEED) {
719			if (m->queue != PQ_ACTIVE)
720				vm_page_activate(m);
721		} else if ((advise == MADV_DONTNEED) ||
722			((advise == MADV_FREE) &&
723				((object->type != OBJT_DEFAULT) &&
724					(object->type != OBJT_SWAP)))) {
725			vm_page_deactivate(m);
726		} else if (advise == MADV_FREE) {
727			/*
728			 * Force a demand-zero on next ref
729			 */
730			if (object->type == OBJT_SWAP)
731				swap_pager_dmzspace(object, m->pindex, 1);
732			vm_page_protect(m, VM_PROT_NONE);
733			vm_page_free(m);
734		}
735	}
736}
737
738/*
739 *	vm_object_shadow:
740 *
741 *	Create a new object which is backed by the
742 *	specified existing object range.  The source
743 *	object reference is deallocated.
744 *
745 *	The new object and offset into that object
746 *	are returned in the source parameters.
747 */
748
749void
750vm_object_shadow(object, offset, length)
751	vm_object_t *object;	/* IN/OUT */
752	vm_ooffset_t *offset;	/* IN/OUT */
753	vm_size_t length;
754{
755	register vm_object_t source;
756	register vm_object_t result;
757
758	source = *object;
759
760	/*
761	 * Allocate a new object with the given length
762	 */
763
764	if ((result = vm_object_allocate(OBJT_DEFAULT, length)) == NULL)
765		panic("vm_object_shadow: no object for shadowing");
766
767	/*
768	 * The new object shadows the source object, adding a reference to it.
769	 * Our caller changes his reference to point to the new object,
770	 * removing a reference to the source object.  Net result: no change
771	 * of reference count.
772	 */
773	result->backing_object = source;
774	if (source) {
775		TAILQ_INSERT_TAIL(&source->shadow_head, result, shadow_list);
776		++source->shadow_count;
777	}
778
779	/*
780	 * Store the offset into the source object, and fix up the offset into
781	 * the new object.
782	 */
783
784	result->backing_object_offset = *offset;
785
786	/*
787	 * Return the new things
788	 */
789
790	*offset = 0;
791	*object = result;
792}
793
794
795/*
796 * this version of collapse allows the operation to occur earlier and
797 * when paging_in_progress is true for an object...  This is not a complete
798 * operation, but should plug 99.9% of the rest of the leaks.
799 */
800static void
801vm_object_qcollapse(object)
802	register vm_object_t object;
803{
804	register vm_object_t backing_object;
805	register vm_pindex_t backing_offset_index, paging_offset_index;
806	vm_pindex_t backing_object_paging_offset_index;
807	vm_pindex_t new_pindex;
808	register vm_page_t p, pp;
809	register vm_size_t size;
810
811	backing_object = object->backing_object;
812	if (backing_object->ref_count != 1)
813		return;
814
815	backing_object->ref_count += 2;
816
817	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
818	backing_object_paging_offset_index = OFF_TO_IDX(backing_object->paging_offset);
819	paging_offset_index = OFF_TO_IDX(object->paging_offset);
820	size = object->size;
821	p = TAILQ_FIRST(&backing_object->memq);
822	while (p) {
823		vm_page_t next;
824
825		next = TAILQ_NEXT(p, listq);
826		if ((p->flags & (PG_BUSY | PG_FICTITIOUS)) ||
827		    (p->queue == PQ_CACHE) || !p->valid || p->hold_count || p->wire_count || p->busy) {
828			p = next;
829			continue;
830		}
831		new_pindex = p->pindex - backing_offset_index;
832		if (p->pindex < backing_offset_index ||
833		    new_pindex >= size) {
834			if (backing_object->type == OBJT_SWAP)
835				swap_pager_freespace(backing_object,
836				    backing_object_paging_offset_index+p->pindex,
837				    1);
838			vm_page_protect(p, VM_PROT_NONE);
839			vm_page_free(p);
840		} else {
841			pp = vm_page_lookup(object, new_pindex);
842			if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object,
843				    paging_offset_index + new_pindex, NULL, NULL))) {
844				if (backing_object->type == OBJT_SWAP)
845					swap_pager_freespace(backing_object,
846					    backing_object_paging_offset_index + p->pindex, 1);
847				vm_page_protect(p, VM_PROT_NONE);
848				vm_page_free(p);
849			} else {
850				if (backing_object->type == OBJT_SWAP)
851					swap_pager_freespace(backing_object,
852					    backing_object_paging_offset_index + p->pindex, 1);
853				vm_page_rename(p, object, new_pindex);
854				p->dirty = VM_PAGE_BITS_ALL;
855			}
856		}
857		p = next;
858	}
859	backing_object->ref_count -= 2;
860}
861
862/*
863 *	vm_object_collapse:
864 *
865 *	Collapse an object with the object backing it.
866 *	Pages in the backing object are moved into the
867 *	parent, and the backing object is deallocated.
868 */
869void
870vm_object_collapse(object)
871	vm_object_t object;
872
873{
874	vm_object_t backing_object;
875	vm_ooffset_t backing_offset;
876	vm_size_t size;
877	vm_pindex_t new_pindex, backing_offset_index;
878	vm_page_t p, pp;
879
880	while (TRUE) {
881		/*
882		 * Verify that the conditions are right for collapse:
883		 *
884		 * The object exists and no pages in it are currently being paged
885		 * out.
886		 */
887		if (object == NULL)
888			return;
889
890		/*
891		 * Make sure there is a backing object.
892		 */
893		if ((backing_object = object->backing_object) == NULL)
894			return;
895
896		/*
897		 * we check the backing object first, because it is most likely
898		 * not collapsable.
899		 */
900		if (backing_object->handle != NULL ||
901		    (backing_object->type != OBJT_DEFAULT &&
902		     backing_object->type != OBJT_SWAP) ||
903		    (backing_object->flags & OBJ_DEAD) ||
904		    object->handle != NULL ||
905		    (object->type != OBJT_DEFAULT &&
906		     object->type != OBJT_SWAP) ||
907		    (object->flags & OBJ_DEAD)) {
908			return;
909		}
910
911		if (object->paging_in_progress != 0 ||
912		    backing_object->paging_in_progress != 0) {
913			vm_object_qcollapse(object);
914			return;
915		}
916
917		/*
918		 * We know that we can either collapse the backing object (if
919		 * the parent is the only reference to it) or (perhaps) remove
920		 * the parent's reference to it.
921		 */
922
923		backing_offset = object->backing_object_offset;
924		backing_offset_index = OFF_TO_IDX(backing_offset);
925		size = object->size;
926
927		/*
928		 * If there is exactly one reference to the backing object, we
929		 * can collapse it into the parent.
930		 */
931
932		if (backing_object->ref_count == 1) {
933
934			backing_object->flags |= OBJ_DEAD;
935			/*
936			 * We can collapse the backing object.
937			 *
938			 * Move all in-memory pages from backing_object to the
939			 * parent.  Pages that have been paged out will be
940			 * overwritten by any of the parent's pages that
941			 * shadow them.
942			 */
943
944			while ((p = TAILQ_FIRST(&backing_object->memq)) != 0) {
945
946				new_pindex = p->pindex - backing_offset_index;
947
948				/*
949				 * If the parent has a page here, or if this
950				 * page falls outside the parent, dispose of
951				 * it.
952				 *
953				 * Otherwise, move it as planned.
954				 */
955
956				if (p->pindex < backing_offset_index ||
957				    new_pindex >= size) {
958					vm_page_protect(p, VM_PROT_NONE);
959					PAGE_WAKEUP(p);
960					vm_page_free(p);
961				} else {
962					pp = vm_page_lookup(object, new_pindex);
963					if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object,
964					    OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL))) {
965						vm_page_protect(p, VM_PROT_NONE);
966						PAGE_WAKEUP(p);
967						vm_page_free(p);
968					} else {
969						vm_page_rename(p, object, new_pindex);
970					}
971				}
972			}
973
974			/*
975			 * Move the pager from backing_object to object.
976			 */
977
978			if (backing_object->type == OBJT_SWAP) {
979				backing_object->paging_in_progress++;
980				if (object->type == OBJT_SWAP) {
981					object->paging_in_progress++;
982					/*
983					 * copy shadow object pages into ours
984					 * and destroy unneeded pages in
985					 * shadow object.
986					 */
987					swap_pager_copy(
988					    backing_object,
989					    OFF_TO_IDX(backing_object->paging_offset),
990					    object,
991					    OFF_TO_IDX(object->paging_offset),
992					    OFF_TO_IDX(object->backing_object_offset));
993					vm_object_pip_wakeup(object);
994				} else {
995					object->paging_in_progress++;
996					/*
997					 * move the shadow backing_object's pager data to
998					 * "object" and convert "object" type to OBJT_SWAP.
999					 */
1000					object->type = OBJT_SWAP;
1001					object->un_pager.swp.swp_nblocks =
1002					    backing_object->un_pager.swp.swp_nblocks;
1003					object->un_pager.swp.swp_allocsize =
1004					    backing_object->un_pager.swp.swp_allocsize;
1005					object->un_pager.swp.swp_blocks =
1006					    backing_object->un_pager.swp.swp_blocks;
1007					object->un_pager.swp.swp_poip =		/* XXX */
1008					    backing_object->un_pager.swp.swp_poip;
1009					object->paging_offset = backing_object->paging_offset + backing_offset;
1010					TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list);
1011
1012					/*
1013					 * Convert backing object from OBJT_SWAP to
1014					 * OBJT_DEFAULT. XXX - only the TAILQ_REMOVE is
1015					 * actually necessary.
1016					 */
1017					backing_object->type = OBJT_DEFAULT;
1018					TAILQ_REMOVE(&swap_pager_un_object_list, backing_object, pager_object_list);
1019					/*
1020					 * free unnecessary blocks
1021					 */
1022					swap_pager_freespace(object, 0,
1023						OFF_TO_IDX(object->paging_offset));
1024					vm_object_pip_wakeup(object);
1025				}
1026
1027				vm_object_pip_wakeup(backing_object);
1028			}
1029			/*
1030			 * Object now shadows whatever backing_object did.
1031			 * Note that the reference to backing_object->backing_object
1032			 * moves from within backing_object to within object.
1033			 */
1034
1035			TAILQ_REMOVE(&object->backing_object->shadow_head, object,
1036			    shadow_list);
1037			--object->backing_object->shadow_count;
1038			if (backing_object->backing_object) {
1039				TAILQ_REMOVE(&backing_object->backing_object->shadow_head,
1040				    backing_object, shadow_list);
1041				--backing_object->backing_object->shadow_count;
1042			}
1043			object->backing_object = backing_object->backing_object;
1044			if (object->backing_object) {
1045				TAILQ_INSERT_TAIL(&object->backing_object->shadow_head,
1046				    object, shadow_list);
1047				++object->backing_object->shadow_count;
1048			}
1049
1050			object->backing_object_offset += backing_object->backing_object_offset;
1051			/*
1052			 * Discard backing_object.
1053			 *
1054			 * Since the backing object has no pages, no pager left,
1055			 * and no object references within it, all that is
1056			 * necessary is to dispose of it.
1057			 */
1058
1059			TAILQ_REMOVE(&vm_object_list, backing_object,
1060			    object_list);
1061			vm_object_count--;
1062
1063			free((caddr_t) backing_object, M_VMOBJ);
1064
1065			object_collapses++;
1066		} else {
1067			/*
1068			 * If all of the pages in the backing object are
1069			 * shadowed by the parent object, the parent object no
1070			 * longer has to shadow the backing object; it can
1071			 * shadow the next one in the chain.
1072			 *
1073			 * The backing object must not be paged out - we'd have
1074			 * to check all of the paged-out pages, as well.
1075			 */
1076
1077			if (backing_object->type != OBJT_DEFAULT) {
1078				return;
1079			}
1080			/*
1081			 * Should have a check for a 'small' number of pages
1082			 * here.
1083			 */
1084
1085			for (p = TAILQ_FIRST(&backing_object->memq); p; p = TAILQ_NEXT(p, listq)) {
1086				new_pindex = p->pindex - backing_offset_index;
1087
1088				/*
1089				 * If the parent has a page here, or if this
1090				 * page falls outside the parent, keep going.
1091				 *
1092				 * Otherwise, the backing_object must be left in
1093				 * the chain.
1094				 */
1095
1096				if (p->pindex >= backing_offset_index &&
1097					new_pindex <= size) {
1098
1099					pp = vm_page_lookup(object, new_pindex);
1100
1101					if ((pp == NULL || pp->valid == 0) &&
1102				   	    !vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL)) {
1103						/*
1104						 * Page still needed. Can't go any
1105						 * further.
1106						 */
1107						return;
1108					}
1109				}
1110			}
1111
1112			/*
1113			 * Make the parent shadow the next object in the
1114			 * chain.  Deallocating backing_object will not remove
1115			 * it, since its reference count is at least 2.
1116			 */
1117
1118			TAILQ_REMOVE(&object->backing_object->shadow_head,
1119			    object, shadow_list);
1120			--object->backing_object->shadow_count;
1121			vm_object_reference(object->backing_object = backing_object->backing_object);
1122			if (object->backing_object) {
1123				TAILQ_INSERT_TAIL(&object->backing_object->shadow_head,
1124				    object, shadow_list);
1125				++object->backing_object->shadow_count;
1126			}
1127			object->backing_object_offset += backing_object->backing_object_offset;
1128
1129			/*
1130			 * Drop the reference count on backing_object. Since
1131			 * its ref_count was at least 2, it will not vanish;
1132			 * so we don't need to call vm_object_deallocate.
1133			 */
1134			if (backing_object->ref_count == 1)
1135				printf("should have called obj deallocate\n");
1136			backing_object->ref_count--;
1137
1138			object_bypasses++;
1139
1140		}
1141
1142		/*
1143		 * Try again with this object's new backing object.
1144		 */
1145	}
1146}
1147
1148/*
1149 *	vm_object_page_remove: [internal]
1150 *
1151 *	Removes all physical pages in the specified
1152 *	object range from the object's list of pages.
1153 *
1154 *	The object must be locked.
1155 */
1156void
1157vm_object_page_remove(object, start, end, clean_only)
1158	register vm_object_t object;
1159	register vm_pindex_t start;
1160	register vm_pindex_t end;
1161	boolean_t clean_only;
1162{
1163	register vm_page_t p, next;
1164	unsigned int size;
1165	int s;
1166
1167	if (object == NULL)
1168		return;
1169
1170	object->paging_in_progress++;
1171again:
1172	size = end - start;
1173	if (size > 4 || size >= object->size / 4) {
1174		for (p = TAILQ_FIRST(&object->memq); p != NULL; p = next) {
1175			next = TAILQ_NEXT(p, listq);
1176			if ((start <= p->pindex) && (p->pindex < end)) {
1177				if (p->wire_count != 0) {
1178					vm_page_protect(p, VM_PROT_NONE);
1179					p->valid = 0;
1180					continue;
1181				}
1182
1183				/*
1184				 * The busy flags are only cleared at
1185				 * interrupt -- minimize the spl transitions
1186				 */
1187				if ((p->flags & PG_BUSY) || p->busy) {
1188					s = splvm();
1189					if ((p->flags & PG_BUSY) || p->busy) {
1190						p->flags |= PG_WANTED;
1191						tsleep(p, PVM, "vmopar", 0);
1192						splx(s);
1193						goto again;
1194					}
1195					splx(s);
1196				}
1197
1198				if (clean_only) {
1199					vm_page_test_dirty(p);
1200					if (p->valid & p->dirty)
1201						continue;
1202				}
1203				vm_page_protect(p, VM_PROT_NONE);
1204				PAGE_WAKEUP(p);
1205				vm_page_free(p);
1206			}
1207		}
1208	} else {
1209		while (size > 0) {
1210			if ((p = vm_page_lookup(object, start)) != 0) {
1211				if (p->wire_count != 0) {
1212					p->valid = 0;
1213					vm_page_protect(p, VM_PROT_NONE);
1214					start += 1;
1215					size -= 1;
1216					continue;
1217				}
1218				/*
1219				 * The busy flags are only cleared at
1220				 * interrupt -- minimize the spl transitions
1221				 */
1222				if ((p->flags & PG_BUSY) || p->busy) {
1223					s = splvm();
1224					if ((p->flags & PG_BUSY) || p->busy) {
1225						p->flags |= PG_WANTED;
1226						tsleep(p, PVM, "vmopar", 0);
1227						splx(s);
1228						goto again;
1229					}
1230					splx(s);
1231				}
1232				if (clean_only) {
1233					vm_page_test_dirty(p);
1234					if (p->valid & p->dirty) {
1235						start += 1;
1236						size -= 1;
1237						continue;
1238					}
1239				}
1240				vm_page_protect(p, VM_PROT_NONE);
1241				PAGE_WAKEUP(p);
1242				vm_page_free(p);
1243			}
1244			start += 1;
1245			size -= 1;
1246		}
1247	}
1248	vm_object_pip_wakeup(object);
1249}
1250
1251/*
1252 *	Routine:	vm_object_coalesce
1253 *	Function:	Coalesces two objects backing up adjoining
1254 *			regions of memory into a single object.
1255 *
1256 *	returns TRUE if objects were combined.
1257 *
1258 *	NOTE:	Only works at the moment if the second object is NULL -
1259 *		if it's not, which object do we lock first?
1260 *
1261 *	Parameters:
1262 *		prev_object	First object to coalesce
1263 *		prev_offset	Offset into prev_object
1264 *		next_object	Second object into coalesce
1265 *		next_offset	Offset into next_object
1266 *
1267 *		prev_size	Size of reference to prev_object
1268 *		next_size	Size of reference to next_object
1269 *
1270 *	Conditions:
1271 *	The object must *not* be locked.
1272 */
1273boolean_t
1274vm_object_coalesce(prev_object, prev_pindex, prev_size, next_size)
1275	register vm_object_t prev_object;
1276	vm_pindex_t prev_pindex;
1277	vm_size_t prev_size, next_size;
1278{
1279	vm_size_t newsize;
1280
1281	if (prev_object == NULL) {
1282		return (TRUE);
1283	}
1284
1285	if (prev_object->type != OBJT_DEFAULT) {
1286		return (FALSE);
1287	}
1288
1289	/*
1290	 * Try to collapse the object first
1291	 */
1292	vm_object_collapse(prev_object);
1293
1294	/*
1295	 * Can't coalesce if: . more than one reference . paged out . shadows
1296	 * another object . has a copy elsewhere (any of which mean that the
1297	 * pages not mapped to prev_entry may be in use anyway)
1298	 */
1299
1300	if (prev_object->ref_count > 1 ||
1301	    prev_object->backing_object != NULL) {
1302		return (FALSE);
1303	}
1304
1305	prev_size >>= PAGE_SHIFT;
1306	next_size >>= PAGE_SHIFT;
1307	/*
1308	 * Remove any pages that may still be in the object from a previous
1309	 * deallocation.
1310	 */
1311
1312	vm_object_page_remove(prev_object,
1313	    prev_pindex + prev_size,
1314	    prev_pindex + prev_size + next_size, FALSE);
1315
1316	/*
1317	 * Extend the object if necessary.
1318	 */
1319	newsize = prev_pindex + prev_size + next_size;
1320	if (newsize > prev_object->size)
1321		prev_object->size = newsize;
1322
1323	return (TRUE);
1324}
1325
1326#ifdef DDB
1327
1328static int
1329_vm_object_in_map(map, object, entry)
1330	vm_map_t map;
1331	vm_object_t object;
1332	vm_map_entry_t entry;
1333{
1334	vm_map_t tmpm;
1335	vm_map_entry_t tmpe;
1336	vm_object_t obj;
1337	int entcount;
1338
1339	if (map == 0)
1340		return 0;
1341
1342	if (entry == 0) {
1343		tmpe = map->header.next;
1344		entcount = map->nentries;
1345		while (entcount-- && (tmpe != &map->header)) {
1346			if( _vm_object_in_map(map, object, tmpe)) {
1347				return 1;
1348			}
1349			tmpe = tmpe->next;
1350		}
1351	} else if (entry->is_sub_map || entry->is_a_map) {
1352		tmpm = entry->object.share_map;
1353		tmpe = tmpm->header.next;
1354		entcount = tmpm->nentries;
1355		while (entcount-- && tmpe != &tmpm->header) {
1356			if( _vm_object_in_map(tmpm, object, tmpe)) {
1357				return 1;
1358			}
1359			tmpe = tmpe->next;
1360		}
1361	} else if (obj = entry->object.vm_object) {
1362		for(; obj; obj=obj->backing_object)
1363			if( obj == object) {
1364				return 1;
1365			}
1366	}
1367	return 0;
1368}
1369
1370static int
1371vm_object_in_map( object)
1372	vm_object_t object;
1373{
1374	struct proc *p;
1375	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
1376		if( !p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
1377			continue;
1378		if( _vm_object_in_map(&p->p_vmspace->vm_map, object, 0))
1379			return 1;
1380	}
1381	if( _vm_object_in_map( kernel_map, object, 0))
1382		return 1;
1383	if( _vm_object_in_map( kmem_map, object, 0))
1384		return 1;
1385	if( _vm_object_in_map( pager_map, object, 0))
1386		return 1;
1387	if( _vm_object_in_map( buffer_map, object, 0))
1388		return 1;
1389	if( _vm_object_in_map( io_map, object, 0))
1390		return 1;
1391	if( _vm_object_in_map( phys_map, object, 0))
1392		return 1;
1393	if( _vm_object_in_map( mb_map, object, 0))
1394		return 1;
1395	if( _vm_object_in_map( u_map, object, 0))
1396		return 1;
1397	return 0;
1398}
1399
1400
1401#ifdef DDB
1402static void
1403DDB_vm_object_check()
1404{
1405	vm_object_t object;
1406
1407	/*
1408	 * make sure that internal objs are in a map somewhere
1409	 * and none have zero ref counts.
1410	 */
1411	for (object = TAILQ_FIRST(&vm_object_list);
1412			object != NULL;
1413			object = TAILQ_NEXT(object, object_list)) {
1414		if (object->handle == NULL &&
1415		    (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
1416			if (object->ref_count == 0) {
1417				printf("vmochk: internal obj has zero ref count: %d\n",
1418					object->size);
1419			}
1420			if (!vm_object_in_map(object)) {
1421				printf("vmochk: internal obj is not in a map: "
1422		"ref: %d, size: %d: 0x%x, backing_object: 0x%x\n",
1423				    object->ref_count, object->size,
1424				    object->size, object->backing_object);
1425			}
1426		}
1427	}
1428}
1429#endif /* DDB */
1430
1431/*
1432 *	vm_object_print:	[ debug ]
1433 */
1434void
1435vm_object_print(iobject, full, dummy3, dummy4)
1436	/* db_expr_t */ int iobject;
1437	boolean_t full;
1438	/* db_expr_t */ int dummy3;
1439	char *dummy4;
1440{
1441	vm_object_t object = (vm_object_t)iobject;	/* XXX */
1442	register vm_page_t p;
1443
1444	register int count;
1445
1446	if (object == NULL)
1447		return;
1448
1449	iprintf("Object 0x%x: size=0x%x, res=%d, ref=%d, ",
1450	    (int) object, (int) object->size,
1451	    object->resident_page_count, object->ref_count);
1452	printf("offset=0x%x, backing_object=(0x%x)+0x%x\n",
1453	    (int) object->paging_offset,
1454	    (int) object->backing_object, (int) object->backing_object_offset);
1455	printf("cache: next=%p, prev=%p\n",
1456	    TAILQ_NEXT(object, cached_list), TAILQ_PREV(object, cached_list));
1457
1458	if (!full)
1459		return;
1460
1461	indent += 2;
1462	count = 0;
1463	for (p = TAILQ_FIRST(&object->memq); p != NULL; p = TAILQ_NEXT(p, listq)) {
1464		if (count == 0)
1465			iprintf("memory:=");
1466		else if (count == 6) {
1467			printf("\n");
1468			iprintf(" ...");
1469			count = 0;
1470		} else
1471			printf(",");
1472		count++;
1473
1474		printf("(off=0x%lx,page=0x%lx)",
1475		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1476	}
1477	if (count != 0)
1478		printf("\n");
1479	indent -= 2;
1480}
1481#endif /* DDB */
1482