/* vm_object.c, revision 15367 */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $Id: vm_object.c,v 1.67 1996/03/29 06:28:48 davidg Exp $
 */

/*
 *	Virtual memory object module.
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>		/* for curproc, pageproc */
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#ifdef DDB
static void	DDB_vm_object_check __P((void));
#endif

static void	_vm_object_allocate __P((objtype_t, vm_size_t, vm_object_t));
#ifdef DDB
static int	_vm_object_in_map __P((vm_map_t map, vm_object_t object,
				       vm_map_entry_t entry));
static int	vm_object_in_map __P((vm_object_t object));
#endif
static void	vm_object_qcollapse __P((vm_object_t object));
#ifdef not_used
static void	vm_object_deactivate_pages __P((vm_object_t));
#endif
static void	vm_object_terminate __P((vm_object_t));
static void	vm_object_cache_trim __P((void));

/*
 *	Virtual memory objects maintain the actual data
 *	associated with allocated virtual memory.  A given
 *	page of memory exists within exactly one object.
 *
 *	An object is only deallocated when all "references"
 *	are given up.  Only one "reference" to a given
 *	region of an object should be writeable.
 *
 *	Associated with each object is a list of all resident
 *	memory pages belonging to that object; this list is
 *	maintained by the "vm_page" module, and locked by the object's
 *	lock.
 *
 *	Each object also records a "pager" routine which is
 *	used to retrieve (and store) pages to the proper backing
 *	storage.  In addition, objects may be backed by other
 *	objects from which they were virtual-copied.
 *
 *	The only items within the object structure which are
 *	modified after time of creation are:
 *		reference count		locked by object's lock
 *		pager routine		locked by object's lock
 *
 */
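
/*
 *	Lifecycle sketch (illustrative only; locking and the map plumbing a
 *	real caller needs are omitted):
 *
 *		vm_object_t obj;
 *
 *		obj = vm_object_allocate(OBJT_DEFAULT, size);
 *		vm_object_reference(obj);	(ref_count: 1 -> 2)
 *		vm_object_deallocate(obj);	(ref_count: 2 -> 1)
 *		vm_object_deallocate(obj);	(freed, or cached if the
 *						 object has OBJ_CANPERSIST)
 */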

int vm_object_cache_max;
struct object_q vm_object_cached_list;
static int vm_object_cached;
struct object_q vm_object_list;
static long vm_object_count;
vm_object_t kernel_object;
vm_object_t kmem_object;
static struct vm_object kernel_object_store;
static struct vm_object kmem_object_store;
extern int vm_pageout_page_count;

static long object_collapses;
static long object_bypasses;

static void
_vm_object_allocate(type, size, object)
	objtype_t type;
	vm_size_t size;
	register vm_object_t object;
{
	TAILQ_INIT(&object->memq);
	TAILQ_INIT(&object->shadow_head);

	object->type = type;
	object->size = size;
	object->ref_count = 1;
	object->flags = 0;
	object->paging_in_progress = 0;
	object->resident_page_count = 0;
	object->shadow_count = 0;
	object->handle = NULL;
	object->paging_offset = (vm_ooffset_t) 0;
	object->backing_object = NULL;
	object->backing_object_offset = (vm_ooffset_t) 0;

	object->last_read = 0;

	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
	vm_object_count++;
}

/*
 *	vm_object_init:
 *
 *	Initialize the VM objects module.
 */
void
vm_object_init()
{
	TAILQ_INIT(&vm_object_cached_list);
	TAILQ_INIT(&vm_object_list);
	vm_object_count = 0;

	vm_object_cache_max = 84;
	if (cnt.v_page_count > 1000)
		vm_object_cache_max += (cnt.v_page_count - 1000) / 4;

	kernel_object = &kernel_object_store;
	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
	    kernel_object);

	kmem_object = &kmem_object_store;
	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
	    kmem_object);
}
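
/*
 *	Example of the sizing rule above: with cnt.v_page_count == 5000,
 *	vm_object_cache_max = 84 + (5000 - 1000) / 4 = 1084 cached objects;
 *	machines with 1000 pages or fewer stay at the base value of 84.
 */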

/*
 *	vm_object_allocate:
 *
 *	Returns a new object with the given size.
 */

vm_object_t
vm_object_allocate(type, size)
	objtype_t type;
	vm_size_t size;
{
	register vm_object_t result;

	result = (vm_object_t)
	    malloc((u_long) sizeof *result, M_VMOBJ, M_WAITOK);

	_vm_object_allocate(type, size, result);

	return (result);
}
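
/*
 *	Usage sketch (illustrative; "nbytes" is a hypothetical byte count).
 *	Sizes are in pages here, as with the kernel objects created in
 *	vm_object_init() above:
 *
 *		vm_object_t obj;
 *
 *		obj = vm_object_allocate(OBJT_DEFAULT,
 *		    OFF_TO_IDX(round_page(nbytes)));
 *
 *	The returned object carries the single reference the caller owns.
 */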

/*
 *	vm_object_reference:
 *
 *	Gets another reference to the given object.
 */
inline void
vm_object_reference(object)
	register vm_object_t object;
{
	if (object == NULL)
		return;

	if (object->ref_count == 0) {
		if ((object->flags & OBJ_CANPERSIST) == 0)
			panic("vm_object_reference: non-persistent object with 0 ref_count");
		TAILQ_REMOVE(&vm_object_cached_list, object, cached_list);
		vm_object_cached--;
	}
	object->ref_count++;
}

/*
 *	vm_object_deallocate:
 *
 *	Release a reference to the specified object,
 *	gained either through a vm_object_allocate
 *	or a vm_object_reference call.  When all references
 *	are gone, storage associated with this object
 *	may be relinquished.
 *
 *	No object may be locked.
 */
void
vm_object_deallocate(object)
	vm_object_t object;
{
	vm_object_t temp;

	while (object != NULL) {

		if (object->ref_count == 0)
			panic("vm_object_deallocate: object deallocated too many times");

		/*
		 * Lose the reference
		 */
		object->ref_count--;
		if (object->ref_count != 0) {
			if ((object->ref_count == 1) &&
			    (object->handle == NULL) &&
			    (object->type == OBJT_DEFAULT ||
			     object->type == OBJT_SWAP)) {
				vm_object_t robject;
				robject = object->shadow_head.tqh_first;
				if ((robject != NULL) &&
				    (robject->handle == NULL) &&
				    (robject->type == OBJT_DEFAULT ||
				     robject->type == OBJT_SWAP)) {
					int s;
					robject->ref_count += 2;
					object->ref_count += 2;

					do {
						s = splhigh();
						while (robject->paging_in_progress) {
							robject->flags |= OBJ_PIPWNT;
							tsleep(robject, PVM, "objde1", 0);
						}

						while (object->paging_in_progress) {
							object->flags |= OBJ_PIPWNT;
							tsleep(object, PVM, "objde2", 0);
						}
						splx(s);

					} while (object->paging_in_progress || robject->paging_in_progress);

					object->ref_count -= 2;
					robject->ref_count -= 2;
					if (robject->ref_count == 0) {
						robject->ref_count += 1;
						object = robject;
						continue;
					}
					vm_object_collapse(robject);
					return;
				}
			}
			/*
			 * If there are still references, then we are done.
			 */
			return;
		}

		if (object->type == OBJT_VNODE) {
			struct vnode *vp = object->handle;

			vp->v_flag &= ~VTEXT;
		}

		/*
		 * See if this object can persist and has some resident
		 * pages.  If so, enter it in the cache.
		 */
		if (object->flags & OBJ_CANPERSIST) {
			if (object->resident_page_count != 0) {
				vm_object_page_clean(object, 0, 0, TRUE, TRUE);
				TAILQ_INSERT_TAIL(&vm_object_cached_list, object,
				    cached_list);
				vm_object_cached++;

				vm_object_cache_trim();
				return;
			} else {
				object->flags &= ~OBJ_CANPERSIST;
			}
		}

		/*
		 * Make sure no one uses us.
		 */
		object->flags |= OBJ_DEAD;

		temp = object->backing_object;
		if (temp) {
			TAILQ_REMOVE(&temp->shadow_head, object, shadow_list);
			--temp->shadow_count;
		}
		vm_object_terminate(object);
		/* unlocks and deallocates object */
		object = temp;
	}
}

/*
 *	vm_object_terminate actually destroys the specified object, freeing
 *	up all previously used resources.
 *
 *	The object must be locked.
 */
static void
vm_object_terminate(object)
	register vm_object_t object;
{
	register vm_page_t p;
	int s;

	/*
	 * wait for the pageout daemon to be done with the object
	 */
	s = splhigh();
	while (object->paging_in_progress) {
		object->flags |= OBJ_PIPWNT;
		tsleep(object, PVM, "objtrm", 0);
	}
	splx(s);

	if (object->paging_in_progress != 0)
		panic("vm_object_terminate: pageout in progress");

	/*
	 * Clean and free the pages, as appropriate. All references to the
	 * object are gone, so we don't need to lock it.
	 */
	if (object->type == OBJT_VNODE) {
		struct vnode *vp = object->handle;

		VOP_LOCK(vp);
		vm_object_page_clean(object, 0, 0, TRUE, FALSE);
		vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0);
		VOP_UNLOCK(vp);
	}

	/*
	 * Now free the pages. For internal objects, this also removes them
	 * from paging queues.
	 */
	while ((p = object->memq.tqh_first) != NULL) {
		if (p->flags & PG_BUSY)
			printf("vm_object_terminate: freeing busy page\n");
		PAGE_WAKEUP(p);
		vm_page_free(p);
		cnt.v_pfree++;
	}

	/*
	 * Let the pager know object is dead.
	 */
	vm_pager_deallocate(object);

	TAILQ_REMOVE(&vm_object_list, object, object_list);
	vm_object_count--;

	wakeup(object);

	/*
	 * Free the space for the object.
	 */
	free((caddr_t) object, M_VMOBJ);
}

/*
 *	vm_object_page_clean
 *
 *	Clean all dirty pages in the specified range of object.
 *	Leaves page on whatever queue it is currently on.
 *
 *	Odd semantics: a zero "end" means clean from "start" to the end
 *	of the object.
 *
 *	The object must be locked.
 */

void
vm_object_page_clean(object, start, end, syncio, lockflag)
	vm_object_t object;
	vm_pindex_t start;
	vm_pindex_t end;
	boolean_t syncio;
	boolean_t lockflag;
{
	register vm_page_t p, np, tp;
	register vm_pindex_t tstart, tend;
	vm_pindex_t pi;
	int s;
	struct vnode *vp;
	int runlen;
	int maxf;
	int chkb;
	int maxb;
	int i;
	vm_page_t maf[vm_pageout_page_count];
	vm_page_t mab[vm_pageout_page_count];
	vm_page_t ma[vm_pageout_page_count];

	if (object->type != OBJT_VNODE ||
	    (object->flags & OBJ_MIGHTBEDIRTY) == 0)
		return;

	vp = object->handle;

	if (lockflag)
		VOP_LOCK(vp);
	object->flags |= OBJ_CLEANING;

	tstart = start;
	if (end == 0) {
		tend = object->size;
	} else {
		tend = end;
	}
	if ((tstart == 0) && (tend == object->size)) {
		object->flags &= ~(OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
	}
	for (p = object->memq.tqh_first; p; p = p->listq.tqe_next)
		p->flags |= PG_CLEANCHK;

rescan:
	for (p = object->memq.tqh_first; p; p = np) {
		np = p->listq.tqe_next;

		pi = p->pindex;
		if (((p->flags & PG_CLEANCHK) == 0) ||
			(pi < tstart) || (pi >= tend) ||
			(p->valid == 0) || (p->queue == PQ_CACHE)) {
			p->flags &= ~PG_CLEANCHK;
			continue;
		}

		vm_page_test_dirty(p);
		if ((p->dirty & p->valid) == 0) {
			p->flags &= ~PG_CLEANCHK;
			continue;
		}

		s = splhigh();
		if ((p->flags & PG_BUSY) || p->busy) {
			p->flags |= PG_WANTED|PG_REFERENCED;
			tsleep(p, PVM, "vpcwai", 0);
			splx(s);
			goto rescan;
		}
		splx(s);

		maxf = 0;
		for (i = 1; i < vm_pageout_page_count; i++) {
			if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
				if ((tp->flags & PG_BUSY) ||
					(tp->flags & PG_CLEANCHK) == 0)
					break;
				vm_page_test_dirty(tp);
				if ((tp->dirty & tp->valid) == 0) {
					tp->flags &= ~PG_CLEANCHK;
					break;
				}
				maf[i - 1] = tp;
				maxf++;
				continue;
			}
			break;
		}

		maxb = 0;
		chkb = vm_pageout_page_count - maxf;
		if (chkb) {
			for (i = 1; i < chkb; i++) {
				if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
					if ((tp->flags & PG_BUSY) ||
						(tp->flags & PG_CLEANCHK) == 0)
						break;
					vm_page_test_dirty(tp);
					if ((tp->dirty & tp->valid) == 0) {
						tp->flags &= ~PG_CLEANCHK;
						break;
					}
					mab[i - 1] = tp;
					maxb++;
					continue;
				}
				break;
			}
		}

		for (i = 0; i < maxb; i++) {
			int index = (maxb - i) - 1;
			ma[index] = mab[i];
			ma[index]->flags |= PG_BUSY;
			ma[index]->flags &= ~PG_CLEANCHK;
			vm_page_protect(ma[index], VM_PROT_READ);
		}
		vm_page_protect(p, VM_PROT_READ);
		p->flags |= PG_BUSY;
		p->flags &= ~PG_CLEANCHK;
		ma[maxb] = p;
		for (i = 0; i < maxf; i++) {
			int index = (maxb + i) + 1;
			ma[index] = maf[i];
			ma[index]->flags |= PG_BUSY;
			ma[index]->flags &= ~PG_CLEANCHK;
			vm_page_protect(ma[index], VM_PROT_READ);
		}
		runlen = maxb + maxf + 1;
/*
		printf("maxb: %d, maxf: %d, runlen: %d, offset: %d\n", maxb, maxf, runlen, ma[0]->pindex);
*/
		vm_pageout_flush(ma, runlen, 0);
		goto rescan;
	}

	VOP_FSYNC(vp, NULL, syncio, curproc);

	if (lockflag)
		VOP_UNLOCK(vp);
	object->flags &= ~OBJ_CLEANING;
	return;
}
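
/*
 *	Example of the "end == 0" convention above (illustrative only):
 *
 *		vm_object_page_clean(object, 0, 0, TRUE, TRUE);
 *
 *	synchronously cleans every dirty page of a vnode-backed object,
 *	taking the vnode lock; a nonzero "end" bounds the scan to the
 *	half-open range [start, end).
 */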

#ifdef not_used
/* XXX I cannot tell if this should be an exported symbol */
/*
 *	vm_object_deactivate_pages
 *
 *	Deactivate all pages in the specified object.  (Keep its pages
 *	in memory even though it is no longer referenced.)
 *
 *	The object must be locked.
 */
static void
vm_object_deactivate_pages(object)
	register vm_object_t object;
{
	register vm_page_t p, next;

	for (p = object->memq.tqh_first; p != NULL; p = next) {
		next = p->listq.tqe_next;
		vm_page_deactivate(p);
	}
}
#endif

/*
 *	Trim the object cache to size.
 */
static void
vm_object_cache_trim()
{
	register vm_object_t object;

	while (vm_object_cached > vm_object_cache_max) {
		object = vm_object_cached_list.tqh_first;

		vm_object_reference(object);
		pager_cache(object, FALSE);
	}
}

/*
 *	vm_object_pmap_copy:
 *
 *	Makes all physical pages in the specified object
 *	copy-on-write by removing write permission.  (The range
 *	arguments are currently unused; every resident page is
 *	write-protected.)  No writeable references to these
 *	pages should remain.
 *
 *	The object must *not* be locked.
 */
void
vm_object_pmap_copy(object, start, end)
	register vm_object_t object;
	register vm_pindex_t start;
	register vm_pindex_t end;
{
	register vm_page_t p;

	if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
		return;

	for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) {
		vm_page_protect(p, VM_PROT_READ);
	}

	object->flags &= ~OBJ_WRITEABLE;
}

/*
 *	vm_object_pmap_remove:
 *
 *	Removes all physical pages in the specified
 *	object range from all physical maps.
 *
 *	The object must *not* be locked.
 */
void
vm_object_pmap_remove(object, start, end)
	register vm_object_t object;
	register vm_pindex_t start;
	register vm_pindex_t end;
{
	register vm_page_t p;

	if (object == NULL)
		return;
	for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) {
		if (p->pindex >= start && p->pindex < end)
			vm_page_protect(p, VM_PROT_NONE);
	}
}
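
/*
 *	Usage sketch (illustrative): to tear down all mappings of pages
 *	10..19 of an object without freeing the pages themselves,
 *
 *		vm_object_pmap_remove(object, 10, 20);
 *
 *	the range is half-open, [start, end).
 */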

/*
 *	vm_object_copy:
 *
 *	Create a new object which is a copy of an existing
 *	object, and mark all of the pages in the existing
 *	object 'copy-on-write'.  The new object is returned
 *	in *dst_object with one additional reference.
 *
 *	As implemented, the copy is always deferred: the source
 *	object itself is returned with its reference count bumped,
 *	and *src_needs_copy is set so that the caller shadows the
 *	object before writing to it.
 */
void
vm_object_copy(src_object, src_offset,
    dst_object, dst_offset, src_needs_copy)
	register vm_object_t src_object;
	vm_pindex_t src_offset;
	vm_object_t *dst_object;	/* OUT */
	vm_pindex_t *dst_offset;	/* OUT */
	boolean_t *src_needs_copy;	/* OUT */
{
	if (src_object == NULL) {
		/*
		 * Nothing to copy
		 */
		*dst_object = NULL;
		*dst_offset = 0;
		*src_needs_copy = FALSE;
		return;
	}

	/*
	 * Try to collapse the object before copying it.
	 */
	if (src_object->handle == NULL &&
	    (src_object->type == OBJT_DEFAULT ||
	     src_object->type == OBJT_SWAP))
		vm_object_collapse(src_object);

	/*
	 * Make another reference to the object
	 */
	src_object->ref_count++;
	*dst_object = src_object;
	*dst_offset = src_offset;

	/*
	 * Must make a shadow when write is desired
	 */
	*src_needs_copy = TRUE;
	return;
}

/*
 *	vm_object_shadow:
 *
 *	Create a new object which is backed by the
 *	specified existing object range.  The source
 *	object reference is deallocated.
 *
 *	The new object and offset into that object
 *	are returned in the source parameters.
 */

void
vm_object_shadow(object, offset, length)
	vm_object_t *object;	/* IN/OUT */
	vm_ooffset_t *offset;	/* IN/OUT */
	vm_size_t length;
{
	register vm_object_t source;
	register vm_object_t result;

	source = *object;

	/*
	 * Allocate a new object with the given length
	 */

	if ((result = vm_object_allocate(OBJT_DEFAULT, length)) == NULL)
		panic("vm_object_shadow: no object for shadowing");

	/*
	 * The new object shadows the source object, adding a reference to it.
	 * Our caller changes his reference to point to the new object,
	 * removing a reference to the source object.  Net result: no change
	 * of reference count.
	 */
	result->backing_object = source;
	if (source) {
		TAILQ_INSERT_TAIL(&source->shadow_head, result, shadow_list);
		++source->shadow_count;
	}

	/*
	 * Store the offset into the source object, and fix up the offset into
	 * the new object.
	 */

	result->backing_object_offset = *offset;

	/*
	 * Return the new things
	 */

	*offset = 0;
	*object = result;
}
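
/*
 *	Shadow-chain sketch (illustrative; "entry" is a hypothetical map
 *	entry being made copy-on-write):
 *
 *		vm_object_t obj = entry->object.vm_object;
 *		vm_ooffset_t off = entry->offset;
 *
 *		vm_object_shadow(&obj, &off,
 *		    OFF_TO_IDX(entry->end - entry->start));
 *		entry->object.vm_object = obj;		(the new shadow)
 *		entry->offset = off;			(now 0)
 *
 *	The original object becomes obj->backing_object; a fault that
 *	misses in the shadow falls through to it at backing_object_offset.
 */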

/*
 * this version of collapse allows the operation to occur earlier and
 * when paging_in_progress is true for an object...  This is not a complete
 * operation, but should plug 99.9% of the rest of the leaks.
 */
static void
vm_object_qcollapse(object)
	register vm_object_t object;
{
	register vm_object_t backing_object;
	register vm_pindex_t backing_offset_index, paging_offset_index;
	vm_pindex_t backing_object_paging_offset_index;
	vm_pindex_t new_pindex;
	register vm_page_t p, pp;
	register vm_size_t size;

	backing_object = object->backing_object;
	if (backing_object->ref_count != 1)
		return;

	backing_object->ref_count += 2;

	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
	backing_object_paging_offset_index = OFF_TO_IDX(backing_object->paging_offset);
	paging_offset_index = OFF_TO_IDX(object->paging_offset);
	size = object->size;
	p = backing_object->memq.tqh_first;
	while (p) {
		vm_page_t next;

		next = p->listq.tqe_next;
		if ((p->flags & (PG_BUSY | PG_FICTITIOUS)) ||
		    (p->queue == PQ_CACHE) || !p->valid || p->hold_count || p->wire_count || p->busy) {
			p = next;
			continue;
		}
		vm_page_protect(p, VM_PROT_NONE);
		new_pindex = p->pindex - backing_offset_index;
		if (p->pindex < backing_offset_index ||
		    new_pindex >= size) {
			if (backing_object->type == OBJT_SWAP)
				swap_pager_freespace(backing_object,
				    backing_object_paging_offset_index + p->pindex,
				    1);
			vm_page_free(p);
		} else {
			pp = vm_page_lookup(object, new_pindex);
			if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object,
				    paging_offset_index + new_pindex, NULL, NULL))) {
				if (backing_object->type == OBJT_SWAP)
					swap_pager_freespace(backing_object,
					    backing_object_paging_offset_index + p->pindex, 1);
				vm_page_free(p);
			} else {
				if (backing_object->type == OBJT_SWAP)
					swap_pager_freespace(backing_object,
					    backing_object_paging_offset_index + p->pindex, 1);
				vm_page_rename(p, object, new_pindex);
				p->dirty = VM_PAGE_BITS_ALL;
			}
		}
		p = next;
	}
	backing_object->ref_count -= 2;
}

/*
 *	vm_object_collapse:
 *
 *	Collapse an object with the object backing it.
 *	Pages in the backing object are moved into the
 *	parent, and the backing object is deallocated.
 */
void
vm_object_collapse(object)
	vm_object_t object;
{
	vm_object_t backing_object;
	vm_ooffset_t backing_offset;
	vm_size_t size;
	vm_pindex_t new_pindex, backing_offset_index;
	vm_page_t p, pp;

	while (TRUE) {
		/*
		 * Verify that the conditions are right for collapse:
		 *
		 * The object exists and no pages in it are currently being paged
		 * out.
		 */
		if (object == NULL)
			return;

		/*
		 * Make sure there is a backing object.
		 */
		if ((backing_object = object->backing_object) == NULL)
			return;

		/*
		 * we check the backing object first, because it is most likely
		 * not collapsable.
		 */
		if (backing_object->handle != NULL ||
		    (backing_object->type != OBJT_DEFAULT &&
		     backing_object->type != OBJT_SWAP) ||
		    (backing_object->flags & OBJ_DEAD) ||
		    object->handle != NULL ||
		    (object->type != OBJT_DEFAULT &&
		     object->type != OBJT_SWAP) ||
		    (object->flags & OBJ_DEAD)) {
			return;
		}

		if (object->paging_in_progress != 0 ||
		    backing_object->paging_in_progress != 0) {
			vm_object_qcollapse(object);
			return;
		}

		/*
		 * We know that we can either collapse the backing object (if
		 * the parent is the only reference to it) or (perhaps) remove
		 * the parent's reference to it.
		 */

		backing_offset = object->backing_object_offset;
		backing_offset_index = OFF_TO_IDX(backing_offset);
		size = object->size;

		/*
		 * If there is exactly one reference to the backing object, we
		 * can collapse it into the parent.
		 */

		if (backing_object->ref_count == 1) {

			backing_object->flags |= OBJ_DEAD;
			/*
			 * We can collapse the backing object.
			 *
			 * Move all in-memory pages from backing_object to the
			 * parent.  Pages that have been paged out will be
			 * overwritten by any of the parent's pages that
			 * shadow them.
			 */

			while ((p = backing_object->memq.tqh_first) != 0) {

				new_pindex = p->pindex - backing_offset_index;

				/*
				 * If the parent has a page here, or if this
				 * page falls outside the parent, dispose of
				 * it.
				 *
				 * Otherwise, move it as planned.
				 */

				if (p->pindex < backing_offset_index ||
				    new_pindex >= size) {
					vm_page_protect(p, VM_PROT_NONE);
					PAGE_WAKEUP(p);
					vm_page_free(p);
				} else {
					pp = vm_page_lookup(object, new_pindex);
					if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object,
					    OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL))) {
						vm_page_protect(p, VM_PROT_NONE);
						PAGE_WAKEUP(p);
						vm_page_free(p);
					} else {
						vm_page_rename(p, object, new_pindex);
					}
				}
			}

			/*
			 * Move the pager from backing_object to object.
			 */

			if (backing_object->type == OBJT_SWAP) {
				backing_object->paging_in_progress++;
				if (object->type == OBJT_SWAP) {
					object->paging_in_progress++;
					/*
					 * copy shadow object pages into ours
					 * and destroy unneeded pages in
					 * shadow object.
					 */
					swap_pager_copy(
					    backing_object,
					    OFF_TO_IDX(backing_object->paging_offset),
					    object,
					    OFF_TO_IDX(object->paging_offset),
					    OFF_TO_IDX(object->backing_object_offset));
					vm_object_pip_wakeup(object);
				} else {
					object->paging_in_progress++;
					/*
					 * move the shadow backing_object's pager data to
					 * "object" and convert "object" type to OBJT_SWAP.
					 */
					object->type = OBJT_SWAP;
					object->un_pager.swp.swp_nblocks =
					    backing_object->un_pager.swp.swp_nblocks;
					object->un_pager.swp.swp_allocsize =
					    backing_object->un_pager.swp.swp_allocsize;
					object->un_pager.swp.swp_blocks =
					    backing_object->un_pager.swp.swp_blocks;
					object->un_pager.swp.swp_poip =		/* XXX */
					    backing_object->un_pager.swp.swp_poip;
					object->paging_offset = backing_object->paging_offset + backing_offset;
					TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list);

					/*
					 * Convert backing object from OBJT_SWAP to
					 * OBJT_DEFAULT. XXX - only the TAILQ_REMOVE is
					 * actually necessary.
					 */
					backing_object->type = OBJT_DEFAULT;
					TAILQ_REMOVE(&swap_pager_un_object_list, backing_object, pager_object_list);
					/*
					 * free unnecessary blocks
					 */
					swap_pager_freespace(object, 0,
						OFF_TO_IDX(object->paging_offset));
					vm_object_pip_wakeup(object);
				}

				vm_object_pip_wakeup(backing_object);
			}
			/*
			 * Object now shadows whatever backing_object did.
			 * Note that the reference to backing_object->backing_object
			 * moves from within backing_object to within object.
			 */

			TAILQ_REMOVE(&object->backing_object->shadow_head, object,
			    shadow_list);
			--object->backing_object->shadow_count;
			if (backing_object->backing_object) {
				TAILQ_REMOVE(&backing_object->backing_object->shadow_head,
				    backing_object, shadow_list);
				--backing_object->backing_object->shadow_count;
			}
			object->backing_object = backing_object->backing_object;
			if (object->backing_object) {
				TAILQ_INSERT_TAIL(&object->backing_object->shadow_head,
				    object, shadow_list);
				++object->backing_object->shadow_count;
			}

			object->backing_object_offset += backing_object->backing_object_offset;
			/*
			 * Discard backing_object.
			 *
			 * Since the backing object has no pages, no pager left,
			 * and no object references within it, all that is
			 * necessary is to dispose of it.
			 */

			TAILQ_REMOVE(&vm_object_list, backing_object,
			    object_list);
			vm_object_count--;

			free((caddr_t) backing_object, M_VMOBJ);

			object_collapses++;
		} else {
			/*
			 * If all of the pages in the backing object are
			 * shadowed by the parent object, the parent object no
			 * longer has to shadow the backing object; it can
			 * shadow the next one in the chain.
			 *
			 * The backing object must not be paged out - we'd have
			 * to check all of the paged-out pages, as well.
			 */

			if (backing_object->type != OBJT_DEFAULT) {
				return;
			}
			/*
			 * Should have a check for a 'small' number of pages
			 * here.
			 */

			for (p = backing_object->memq.tqh_first; p; p = p->listq.tqe_next) {
				new_pindex = p->pindex - backing_offset_index;

				/*
				 * If the parent has a page here, or if this
				 * page falls outside the parent, keep going.
				 *
				 * Otherwise, the backing_object must be left in
				 * the chain.
				 */

				if (p->pindex >= backing_offset_index &&
				    new_pindex < size) {

					pp = vm_page_lookup(object, new_pindex);

					if ((pp == NULL || pp->valid == 0) &&
					    !vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL)) {
						/*
						 * Page still needed. Can't go any
						 * further.
						 */
						return;
					}
				}
			}

			/*
			 * Make the parent shadow the next object in the
			 * chain.  Deallocating backing_object will not remove
			 * it, since its reference count is at least 2.
			 */

			TAILQ_REMOVE(&object->backing_object->shadow_head,
			    object, shadow_list);
			--object->backing_object->shadow_count;
			vm_object_reference(object->backing_object = backing_object->backing_object);
			if (object->backing_object) {
				TAILQ_INSERT_TAIL(&object->backing_object->shadow_head,
				    object, shadow_list);
				++object->backing_object->shadow_count;
			}
			object->backing_object_offset += backing_object->backing_object_offset;

			/*
			 * Drop the reference count on backing_object. Since
			 * its ref_count was at least 2, it will not vanish;
			 * so we don't need to call vm_object_deallocate.
			 */
			if (backing_object->ref_count == 1)
				printf("should have called obj deallocate\n");
			backing_object->ref_count--;

			object_bypasses++;

		}

		/*
		 * Try again with this object's new backing object.
		 */
	}
}
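
/*
 *	Sketch of the two outcomes above (illustrative):
 *
 *	collapse (ref_count == 1):	A -> B -> C   becomes   A -> C
 *		with A having absorbed B's resident pages and swap
 *		blocks, and B freed outright;
 *
 *	bypass (ref_count > 1):		A -> B -> C   becomes   A -> C
 *		permitted only when every page of B in A's range is
 *		already shadowed by A, so B survives for its other
 *		referents.
 */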

/*
 *	vm_object_page_remove: [internal]
 *
 *	Removes all physical pages in the specified
 *	object range from the object's list of pages.
 *
 *	The object must be locked.
 */
void
vm_object_page_remove(object, start, end, clean_only)
	register vm_object_t object;
	register vm_pindex_t start;
	register vm_pindex_t end;
	boolean_t clean_only;
{
	register vm_page_t p, next;
	unsigned int size;
	int s;

	if (object == NULL)
		return;

	object->paging_in_progress++;
again:
	size = end - start;
	if (size > 4 || size >= object->size / 4) {
		for (p = object->memq.tqh_first; p != NULL; p = next) {
			next = p->listq.tqe_next;
			if ((start <= p->pindex) && (p->pindex < end)) {

				if (p->wire_count != 0) {
					vm_page_protect(p, VM_PROT_NONE);
					p->valid = 0;
					continue;
				}

				s = splhigh();
				if ((p->flags & PG_BUSY) || p->busy) {
					p->flags |= PG_WANTED;
					tsleep(p, PVM, "vmopar", 0);
					splx(s);
					goto again;
				}
				splx(s);

				if (clean_only) {
					vm_page_test_dirty(p);
					if (p->valid & p->dirty)
						continue;
				}
				vm_page_protect(p, VM_PROT_NONE);
				PAGE_WAKEUP(p);
				vm_page_free(p);
			}
		}
	} else {
		while (size > 0) {
			if ((p = vm_page_lookup(object, start)) != 0) {
				if (p->wire_count != 0) {
					p->valid = 0;
					vm_page_protect(p, VM_PROT_NONE);
					start += 1;
					size -= 1;
					continue;
				}
				s = splhigh();
				if ((p->flags & PG_BUSY) || p->busy) {
					p->flags |= PG_WANTED;
					tsleep(p, PVM, "vmopar", 0);
					splx(s);
					goto again;
				}
				splx(s);
				if (clean_only) {
					vm_page_test_dirty(p);
					if (p->valid & p->dirty) {
						start += 1;
						size -= 1;
						continue;
					}
				}
				vm_page_protect(p, VM_PROT_NONE);
				PAGE_WAKEUP(p);
				vm_page_free(p);
			}
			start += 1;
			size -= 1;
		}
	}
	vm_object_pip_wakeup(object);
}
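
/*
 *	Example (illustrative):
 *
 *		vm_object_page_remove(object, 0, object->size, FALSE);
 *
 *	frees every unwired resident page.  Ranges of more than four pages
 *	(or at least a quarter of the object) are handled with one walk of
 *	memq; smaller ranges use per-page vm_page_lookup() calls.
 */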

/*
 *	Routine:	vm_object_coalesce
 *	Function:	Coalesces two objects backing up adjoining
 *			regions of memory into a single object.
 *
 *	returns TRUE if objects were combined.
 *
 *	NOTE:	Only works at the moment if the second object is NULL -
 *		if it's not, which object do we lock first?
 *
 *	Parameters:
 *		prev_object	First object to coalesce
 *		prev_pindex	Page index into prev_object
 *		prev_size	Size of reference to prev_object, in bytes
 *		next_size	Size of the region to append, in bytes
 *
 *	Conditions:
 *	The object must *not* be locked.
 */
boolean_t
vm_object_coalesce(prev_object, prev_pindex, prev_size, next_size)
	register vm_object_t prev_object;
	vm_pindex_t prev_pindex;
	vm_size_t prev_size, next_size;
{
	vm_size_t newsize;

	if (prev_object == NULL) {
		return (TRUE);
	}

	if (prev_object->type != OBJT_DEFAULT) {
		return (FALSE);
	}

	/*
	 * Try to collapse the object first
	 */
	vm_object_collapse(prev_object);

	/*
	 * Can't coalesce if the object has more than one reference, is
	 * paged out, shadows another object, or has a copy elsewhere
	 * (any of which mean that the pages not mapped to prev_entry
	 * may be in use anyway).
	 */

	if (prev_object->ref_count > 1 ||
	    prev_object->backing_object != NULL) {
		return (FALSE);
	}

	prev_size >>= PAGE_SHIFT;
	next_size >>= PAGE_SHIFT;

	/*
	 * Remove any pages that may still be in the object from a previous
	 * deallocation.
	 */

	vm_object_page_remove(prev_object,
	    prev_pindex + prev_size,
	    prev_pindex + prev_size + next_size, FALSE);

	/*
	 * Extend the object if necessary.
	 */
	newsize = prev_pindex + prev_size + next_size;
	if (newsize > prev_object->size)
		prev_object->size = newsize;

	return (TRUE);
}
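
/*
 *	Usage sketch (illustrative; a hypothetical caller growing a map
 *	entry by "grow" bytes of anonymous memory):
 *
 *		if (vm_object_coalesce(entry->object.vm_object,
 *		    OFF_TO_IDX(entry->offset),
 *		    (vm_size_t) (entry->end - entry->start),
 *		    (vm_size_t) grow))
 *			entry->end += grow;
 *
 *	Note that prev_size and next_size arrive in bytes and are shifted
 *	down to pages internally.
 */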

#ifdef DDB

static int
_vm_object_in_map(map, object, entry)
	vm_map_t map;
	vm_object_t object;
	vm_map_entry_t entry;
{
	vm_map_t tmpm;
	vm_map_entry_t tmpe;
	vm_object_t obj;
	int entcount;

	if (map == 0)
		return 0;

	if (entry == 0) {
		tmpe = map->header.next;
		entcount = map->nentries;
		while (entcount-- && (tmpe != &map->header)) {
			if (_vm_object_in_map(map, object, tmpe)) {
				return 1;
			}
			tmpe = tmpe->next;
		}
	} else if (entry->is_sub_map || entry->is_a_map) {
		tmpm = entry->object.share_map;
		tmpe = tmpm->header.next;
		entcount = tmpm->nentries;
		while (entcount-- && tmpe != &tmpm->header) {
			if (_vm_object_in_map(tmpm, object, tmpe)) {
				return 1;
			}
			tmpe = tmpe->next;
		}
	} else if ((obj = entry->object.vm_object) != NULL) {
		for (; obj; obj = obj->backing_object)
			if (obj == object) {
				return 1;
			}
	}
	return 0;
}

static int
vm_object_in_map(object)
	vm_object_t object;
{
	struct proc *p;

	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
		if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
			continue;
/*
		if (p->p_stat != SRUN && p->p_stat != SSLEEP) {
			continue;
		}
*/
		if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0))
			return 1;
	}
	if (_vm_object_in_map(kernel_map, object, 0))
		return 1;
	if (_vm_object_in_map(kmem_map, object, 0))
		return 1;
	if (_vm_object_in_map(pager_map, object, 0))
		return 1;
	if (_vm_object_in_map(buffer_map, object, 0))
		return 1;
	if (_vm_object_in_map(io_map, object, 0))
		return 1;
	if (_vm_object_in_map(phys_map, object, 0))
		return 1;
	if (_vm_object_in_map(mb_map, object, 0))
		return 1;
	if (_vm_object_in_map(u_map, object, 0))
		return 1;
	return 0;
}

static void
DDB_vm_object_check()
{
	vm_object_t object;

	/*
	 * make sure that internal objs are in a map somewhere
	 * and none have zero ref counts.
	 */
	for (object = vm_object_list.tqh_first;
			object != NULL;
			object = object->object_list.tqe_next) {
		if (object->handle == NULL &&
		    (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
			if (object->ref_count == 0) {
				printf("vmochk: internal obj has zero ref count: size %d\n",
					object->size);
			}
			if (!vm_object_in_map(object)) {
				printf("vmochk: internal obj is not in a map: "
		"ref: %d, size: %d, object: 0x%x, backing_object: 0x%x\n",
				    object->ref_count, object->size,
				    object, object->backing_object);
			}
		}
	}
}

/*
 *	vm_object_print:	[ debug ]
 */
void
vm_object_print(iobject, full, dummy3, dummy4)
	/* db_expr_t */ int iobject;
	boolean_t full;
	/* db_expr_t */ int dummy3;
	char *dummy4;
{
	vm_object_t object = (vm_object_t)iobject;	/* XXX */
	register vm_page_t p;
	register int count;

	if (object == NULL)
		return;

	iprintf("Object 0x%x: size=0x%x, res=%d, ref=%d, ",
	    (int) object, (int) object->size,
	    object->resident_page_count, object->ref_count);
	printf("offset=0x%x, backing_object=(0x%x)+0x%x\n",
	    (int) object->paging_offset,
	    (int) object->backing_object, (int) object->backing_object_offset);
	printf("cache: next=%p, prev=%p\n",
	    object->cached_list.tqe_next, object->cached_list.tqe_prev);

	if (!full)
		return;

	indent += 2;
	count = 0;
	for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) {
		if (count == 0)
			iprintf("memory:=");
		else if (count == 6) {
			printf("\n");
			iprintf(" ...");
			count = 0;
		} else
			printf(",");
		count++;

		printf("(off=0x%lx,page=0x%lx)",
		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
	}
	if (count != 0)
		printf("\n");
	indent -= 2;
}
#endif /* DDB */