vm_object.c revision 207739
11573Srgrimes/*-
21573Srgrimes * Copyright (c) 1991, 1993
31573Srgrimes *	The Regents of the University of California.  All rights reserved.
41573Srgrimes *
51573Srgrimes * This code is derived from software contributed to Berkeley by
61573Srgrimes * The Mach Operating System project at Carnegie-Mellon University.
71573Srgrimes *
81573Srgrimes * Redistribution and use in source and binary forms, with or without
91573Srgrimes * modification, are permitted provided that the following conditions
101573Srgrimes * are met:
111573Srgrimes * 1. Redistributions of source code must retain the above copyright
121573Srgrimes *    notice, this list of conditions and the following disclaimer.
131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
141573Srgrimes *    notice, this list of conditions and the following disclaimer in the
151573Srgrimes *    documentation and/or other materials provided with the distribution.
161573Srgrimes * 4. Neither the name of the University nor the names of its contributors
171573Srgrimes *    may be used to endorse or promote products derived from this software
181573Srgrimes *    without specific prior written permission.
191573Srgrimes *
201573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
211573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
221573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
231573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
241573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
251573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
261573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
271573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
281573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
291573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
301573Srgrimes * SUCH DAMAGE.
311573Srgrimes *
3255837Sjasone *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
3355837Sjasone *
341573Srgrimes *
351573Srgrimes * Copyright (c) 1987, 1990 Carnegie-Mellon University.
361573Srgrimes * All rights reserved.
3723668Speter *
381573Srgrimes * Authors: Avadis Tevanian, Jr., Michael Wayne Young
391573Srgrimes *
401573Srgrimes * Permission to use, copy, modify and distribute this software and
4123768Sbde * its documentation is hereby granted, provided that both the copyright
427978Sbde * notice and this permission notice appear in all copies of the
431573Srgrimes * software, derivative works or modified versions, and any portions
441573Srgrimes * thereof, and that both notices appear in supporting documentation.
457978Sbde *
461573Srgrimes * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
471573Srgrimes * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
481573Srgrimes * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
491573Srgrimes *
5069841Sdeischen * Carnegie Mellon requests users of this software to return to
5169841Sdeischen *
521573Srgrimes *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
5323668Speter *  School of Computer Science
541573Srgrimes *  Carnegie Mellon University
551573Srgrimes *  Pittsburgh PA 15213-3890
561573Srgrimes *
571573Srgrimes * any improvements or extensions that they make and grant Carnegie the
581573Srgrimes * rights to redistribute these changes.
5923768Sbde */
6023668Speter
6123668Speter/*
621573Srgrimes *	Virtual memory object module.
6323668Speter */
6423668Speter
6523668Speter#include <sys/cdefs.h>
6623668Speter__FBSDID("$FreeBSD: head/sys/vm/vm_object.c 207739 2010-05-07 05:23:15Z alc $");
6723668Speter
6823668Speter#include "opt_vm.h"
6923668Speter
7023668Speter#include <sys/param.h>
7123768Sbde#include <sys/systm.h>
7223668Speter#include <sys/lock.h>
7323668Speter#include <sys/mman.h>
7423668Speter#include <sys/mount.h>
7514910Sbde#include <sys/kernel.h>
7614910Sbde#include <sys/sysctl.h>
7714910Sbde#include <sys/mutex.h>
7814910Sbde#include <sys/proc.h>		/* for curproc, pageproc */
7923668Speter#include <sys/socket.h>
8023768Sbde#include <sys/resourcevar.h>
8123668Speter#include <sys/vnode.h>
8214910Sbde#include <sys/vmmeter.h>
8323768Sbde#include <sys/sx.h>
8414910Sbde
8556698Sjasone#include <vm/vm.h>
8623668Speter#include <vm/vm_param.h>
8723768Sbde#include <vm/pmap.h>
8823768Sbde#include <vm/vm_map.h>
8923768Sbde#include <vm/vm_object.h>
9023768Sbde#include <vm/vm_page.h>
917978Sbde#include <vm/vm_pageout.h>
9223768Sbde#include <vm/vm_pager.h>
937978Sbde#include <vm/swap_pager.h>
9456698Sjasone#include <vm/vm_kern.h>
9569841Sdeischen#include <vm/vm_extern.h>
9623768Sbde#include <vm/vm_reserv.h>
9723668Speter#include <vm/uma.h>
9869841Sdeischen
9969841Sdeischen#define EASY_SCAN_FACTOR       8
10069841Sdeischen
10169841Sdeischen#define MSYNC_FLUSH_HARDSEQ	0x01
1021573Srgrimes#define MSYNC_FLUSH_SOFTSEQ	0x02
10323768Sbde
10423668Speter/*
10523668Speter * msync / VM object flushing optimizations
1061573Srgrimes */
10723668Speterstatic int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ;
10823668SpeterSYSCTL_INT(_vm, OID_AUTO, msync_flush_flags, CTLFLAG_RW, &msync_flush_flags, 0,
10923668Speter    "Enable sequential iteration optimization");
11023668Speter
11123668Speterstatic int old_msync;
11223668SpeterSYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0,
11323668Speter    "Use old (insecure) msync behavior");
11423668Speter
11523668Speterstatic void	vm_object_qcollapse(vm_object_t object);
11623668Speterstatic int	vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
11723768Sbdestatic void	vm_object_vndeallocate(vm_object_t object);
11823768Sbde
11923668Speter/*
12023668Speter *	Virtual memory objects maintain the actual data
12123668Speter *	associated with allocated virtual memory.  A given
12223668Speter *	page of memory exists within exactly one object.
12323668Speter *
12423668Speter *	An object is only deallocated when all "references"
12523668Speter *	are given up.  Only one "reference" to a given
12623668Speter *	region of an object should be writeable.
12723668Speter *
12823668Speter *	Associated with each object is a list of all resident
12923668Speter *	memory pages belonging to that object; this list is
13023668Speter *	maintained by the "vm_page" module, and locked by the object's
13123668Speter *	lock.
13223668Speter *
13323668Speter *	Each object also records a "pager" routine which is
13423668Speter *	used to retrieve (and store) pages to the proper backing
13523668Speter *	storage.  In addition, objects may be backed by other
13623668Speter *	objects from which they were virtual-copied.
13723668Speter *
13823668Speter *	The only items within the object structure which are
13923668Speter *	modified after time of creation are:
14023668Speter *		reference count		locked by object's lock
14123668Speter *		pager routine		locked by object's lock
14223668Speter *
14323668Speter */
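
/*
 * Editor's illustrative sketch (kept under "#if 0", not compiled): the
 * reference life cycle described above, expressed with the allocation and
 * reference interfaces defined later in this file.  The calling context is
 * hypothetical.
 */
#if 0
static void
vm_object_lifecycle_example(void)
{
	vm_object_t obj;

	/* Create an anonymous 16-page object; it starts with one reference. */
	obj = vm_object_allocate(OBJT_DEFAULT, 16);

	/* A second user of the object takes its own reference. */
	vm_object_reference(obj);

	/* Each user eventually drops its reference; the last drop terminates. */
	vm_object_deallocate(obj);
	vm_object_deallocate(obj);
}
#endif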
14423668Speter
14523668Speterstruct object_q vm_object_list;
14623668Speterstruct mtx vm_object_list_mtx;	/* lock for object list and count */
14723668Speter
14839327Simpstruct vm_object kernel_object_store;
14923768Sbdestruct vm_object kmem_object_store;
15023768Sbde
15123668SpeterSYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats");
15223668Speter
15323668Speterstatic long object_collapses;
15423668SpeterSYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
15523668Speter    &object_collapses, 0, "VM object collapses");
15623668Speter
15723668Speterstatic long object_bypasses;
15823668SpeterSYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
15923668Speter    &object_bypasses, 0, "VM object bypasses");
16023668Speter
16123668Speterstatic uma_zone_t obj_zone;
16223668Speter
16323668Speterstatic int vm_object_zinit(void *mem, int size, int flags);
16423668Speter
16523668Speter#ifdef INVARIANTS
16623668Speterstatic void vm_object_zdtor(void *mem, int size, void *arg);
16723668Speter
16823668Speterstatic void
16923668Spetervm_object_zdtor(void *mem, int size, void *arg)
17023668Speter{
17123668Speter	vm_object_t object;
17256698Sjasone
17356698Sjasone	object = (vm_object_t)mem;
17423768Sbde	KASSERT(TAILQ_EMPTY(&object->memq),
17523668Speter	    ("object %p has resident pages",
17623668Speter	    object));
17723768Sbde#if VM_NRESERVLEVEL > 0
17823668Speter	KASSERT(LIST_EMPTY(&object->rvq),
17923668Speter	    ("object %p has reservations",
18023668Speter	    object));
18123668Speter#endif
18223668Speter	KASSERT(object->cache == NULL,
18323668Speter	    ("object %p has cached pages",
18423668Speter	    object));
18523668Speter	KASSERT(object->paging_in_progress == 0,
18623668Speter	    ("object %p paging_in_progress = %d",
18723668Speter	    object, object->paging_in_progress));
18823668Speter	KASSERT(object->resident_page_count == 0,
18923668Speter	    ("object %p resident_page_count = %d",
19023668Speter	    object, object->resident_page_count));
19123668Speter	KASSERT(object->shadow_count == 0,
19223668Speter	    ("object %p shadow_count = %d",
19323668Speter	    object, object->shadow_count));
19423668Speter}
19523668Speter#endif
19623668Speter
19723668Speterstatic int
19823668Spetervm_object_zinit(void *mem, int size, int flags)
19923668Speter{
20023668Speter	vm_object_t object;
20123668Speter
20234357Sjb	object = (vm_object_t)mem;
20323668Speter	bzero(&object->mtx, sizeof(object->mtx));
20423668Speter	VM_OBJECT_LOCK_INIT(object, "standard object");
20523668Speter
20623668Speter	/* These are true for any object that has been freed */
20723668Speter	object->paging_in_progress = 0;
20823668Speter	object->resident_page_count = 0;
20923668Speter	object->shadow_count = 0;
21023668Speter	return (0);
21123668Speter}
21223668Speter
21323668Spetervoid
21423668Speter_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
21523668Speter{
21623668Speter
21723668Speter	TAILQ_INIT(&object->memq);
21823668Speter	LIST_INIT(&object->shadow_head);
21923668Speter
22023668Speter	object->root = NULL;
22123668Speter	object->type = type;
22223668Speter	object->size = size;
22323668Speter	object->generation = 1;
22423668Speter	object->ref_count = 1;
22523668Speter	object->memattr = VM_MEMATTR_DEFAULT;
22623668Speter	object->flags = 0;
22723668Speter	object->uip = NULL;
22823668Speter	object->charge = 0;
22923668Speter	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
23023668Speter		object->flags = OBJ_ONEMAPPING;
23123668Speter	object->pg_color = 0;
23223668Speter	object->handle = NULL;
23323668Speter	object->backing_object = NULL;
23423668Speter	object->backing_object_offset = (vm_ooffset_t) 0;
23523668Speter#if VM_NRESERVLEVEL > 0
23623668Speter	LIST_INIT(&object->rvq);
23723668Speter#endif
23823668Speter	object->cache = NULL;
23923668Speter
24023668Speter	mtx_lock(&vm_object_list_mtx);
24123668Speter	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
24223668Speter	mtx_unlock(&vm_object_list_mtx);
24323668Speter}
24423668Speter
24523668Speter/*
24623668Speter *	vm_object_init:
24723668Speter *
24823668Speter *	Initialize the VM objects module.
24923668Speter */
25023668Spetervoid
25123668Spetervm_object_init(void)
25223668Speter{
25323668Speter	TAILQ_INIT(&vm_object_list);
25423668Speter	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
25523668Speter
25623668Speter	VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object");
25723668Speter	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
25823668Speter	    kernel_object);
25923768Sbde#if VM_NRESERVLEVEL > 0
26023768Sbde	kernel_object->flags |= OBJ_COLORED;
26123668Speter	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
26223668Speter#endif
26323668Speter
26423668Speter	VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object");
26523668Speter	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
2661573Srgrimes	    kmem_object);
26723668Speter#if VM_NRESERVLEVEL > 0
26823668Speter	kmem_object->flags |= OBJ_COLORED;
2691573Srgrimes	kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
2701573Srgrimes#endif
2711573Srgrimes
2721573Srgrimes	/*
2737978Sbde	 * The lock portion of struct vm_object must be type stable due
27423668Speter	 * to vm_pageout_fallback_object_lock locking a vm object
27523768Sbde	 * without holding any references to it.
27623768Sbde	 */
27723768Sbde	obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
27823768Sbde#ifdef INVARIANTS
27956698Sjasone	    vm_object_zdtor,
28023768Sbde#else
28123768Sbde	    NULL,
2821573Srgrimes#endif
283	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
284}
285
286void
287vm_object_clear_flag(vm_object_t object, u_short bits)
288{
289
290	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
291	object->flags &= ~bits;
292}
293
294/*
295 *	Sets the default memory attribute for the specified object.  Pages
296 *	that are allocated to this object are by default assigned this memory
297 *	attribute.
298 *
299 *	Presently, this function must be called before any pages are allocated
300 *	to the object.  In the future, this requirement may be relaxed for
301 *	"default" and "swap" objects.
302 */
303int
304vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
305{
306
307	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
308	switch (object->type) {
309	case OBJT_DEFAULT:
310	case OBJT_DEVICE:
311	case OBJT_PHYS:
312	case OBJT_SG:
313	case OBJT_SWAP:
314	case OBJT_VNODE:
315		if (!TAILQ_EMPTY(&object->memq))
316			return (KERN_FAILURE);
317		break;
318	case OBJT_DEAD:
319		return (KERN_INVALID_ARGUMENT);
320	}
321	object->memattr = memattr;
322	return (KERN_SUCCESS);
323}
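
/*
 * Editor's illustrative sketch (kept under "#if 0", not compiled): setting
 * a memory attribute on a freshly created object before any pages are
 * allocated to it, as required above.  The attribute value is
 * machine-dependent and is therefore passed in as a hypothetical parameter.
 */
#if 0
static int
vm_object_memattr_example(vm_memattr_t ma)
{
	vm_object_t obj;
	int rv;

	obj = vm_object_allocate(OBJT_PHYS, 4);
	VM_OBJECT_LOCK(obj);
	rv = vm_object_set_memattr(obj, ma);
	VM_OBJECT_UNLOCK(obj);
	if (rv != KERN_SUCCESS) {
		vm_object_deallocate(obj);
		return (rv);
	}
	/* Pages allocated to "obj" from here on default to "ma". */
	vm_object_deallocate(obj);
	return (KERN_SUCCESS);
}
#endif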
324
325void
326vm_object_pip_add(vm_object_t object, short i)
327{
328
329	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
330	object->paging_in_progress += i;
331}
332
333void
334vm_object_pip_subtract(vm_object_t object, short i)
335{
336
337	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
338	object->paging_in_progress -= i;
339}
340
341void
342vm_object_pip_wakeup(vm_object_t object)
343{
344
345	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
346	object->paging_in_progress--;
347	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
348		vm_object_clear_flag(object, OBJ_PIPWNT);
349		wakeup(object);
350	}
351}
352
353void
354vm_object_pip_wakeupn(vm_object_t object, short i)
355{
356
357	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
358	if (i)
359		object->paging_in_progress -= i;
360	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
361		vm_object_clear_flag(object, OBJ_PIPWNT);
362		wakeup(object);
363	}
364}
365
366void
367vm_object_pip_wait(vm_object_t object, char *waitid)
368{
369
370	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
371	while (object->paging_in_progress) {
372		object->flags |= OBJ_PIPWNT;
373		msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0);
374	}
375}
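
/*
 * Editor's illustrative sketch (kept under "#if 0", not compiled) of the
 * paging-in-progress protocol implemented above: a pager-style caller
 * raises the count across a blocking I/O step so that waiters such as
 * vm_object_pip_wait() block until the I/O completes.  The I/O step itself
 * is hypothetical.
 */
#if 0
static void
vm_object_pip_example(vm_object_t object)
{

	VM_OBJECT_LOCK(object);
	vm_object_pip_add(object, 1);
	VM_OBJECT_UNLOCK(object);

	/* ... perform paging I/O without holding the object lock ... */

	VM_OBJECT_LOCK(object);
	vm_object_pip_wakeup(object);	/* Drop the count, wake any waiters. */
	VM_OBJECT_UNLOCK(object);
}
#endif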
376
377/*
378 *	vm_object_allocate:
379 *
380 *	Returns a new object with the given size.
381 */
382vm_object_t
383vm_object_allocate(objtype_t type, vm_pindex_t size)
384{
385	vm_object_t object;
386
387	object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK);
388	_vm_object_allocate(type, size, object);
389	return (object);
390}
391
392
393/*
394 *	vm_object_reference:
395 *
396 *	Gets another reference to the given object.  Note: OBJ_DEAD
397 *	objects can be referenced during final cleaning.
398 */
399void
400vm_object_reference(vm_object_t object)
401{
402	if (object == NULL)
403		return;
404	VM_OBJECT_LOCK(object);
405	vm_object_reference_locked(object);
406	VM_OBJECT_UNLOCK(object);
407}
408
409/*
410 *	vm_object_reference_locked:
411 *
412 *	Gets another reference to the given object.
413 *
414 *	The object must be locked.
415 */
416void
417vm_object_reference_locked(vm_object_t object)
418{
419	struct vnode *vp;
420
421	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
422	object->ref_count++;
423	if (object->type == OBJT_VNODE) {
424		vp = object->handle;
425		vref(vp);
426	}
427}
428
429/*
430 * Handle deallocating an object of type OBJT_VNODE.
431 */
432static void
433vm_object_vndeallocate(vm_object_t object)
434{
435	struct vnode *vp = (struct vnode *) object->handle;
436
437	VFS_ASSERT_GIANT(vp->v_mount);
438	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
439	KASSERT(object->type == OBJT_VNODE,
440	    ("vm_object_vndeallocate: not a vnode object"));
441	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
442#ifdef INVARIANTS
443	if (object->ref_count == 0) {
444		vprint("vm_object_vndeallocate", vp);
445		panic("vm_object_vndeallocate: bad object reference count");
446	}
447#endif
448
449	object->ref_count--;
450	if (object->ref_count == 0) {
451		mp_fixme("Unlocked vflag access.");
452		vp->v_vflag &= ~VV_TEXT;
453	}
454	VM_OBJECT_UNLOCK(object);
455	/*
456	 * vrele may need a vop lock
457	 */
458	vrele(vp);
459}
460
461/*
462 *	vm_object_deallocate:
463 *
464 *	Release a reference to the specified object,
465 *	gained either through a vm_object_allocate
466 *	or a vm_object_reference call.  When all references
467 *	are gone, storage associated with this object
468 *	may be relinquished.
469 *
470 *	No object may be locked.
471 */
472void
473vm_object_deallocate(vm_object_t object)
474{
475	vm_object_t temp;
476
477	while (object != NULL) {
478		int vfslocked;
479
480		vfslocked = 0;
481	restart:
482		VM_OBJECT_LOCK(object);
483		if (object->type == OBJT_VNODE) {
484			struct vnode *vp = (struct vnode *) object->handle;
485
486			/*
487			 * Conditionally acquire Giant for a vnode-backed
488			 * object.  We have to be careful since the type of
489			 * a vnode object can change while the object is
490			 * unlocked.
491			 */
492			if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) {
493				vfslocked = 1;
494				if (!mtx_trylock(&Giant)) {
495					VM_OBJECT_UNLOCK(object);
496					mtx_lock(&Giant);
497					goto restart;
498				}
499			}
500			vm_object_vndeallocate(object);
501			VFS_UNLOCK_GIANT(vfslocked);
502			return;
503		} else
504			/*
505			 * This is to handle the case that the object
506			 * changed type while we dropped its lock to
507			 * obtain Giant.
508			 */
509			VFS_UNLOCK_GIANT(vfslocked);
510
511		KASSERT(object->ref_count != 0,
512			("vm_object_deallocate: object deallocated too many times: %d", object->type));
513
514		/*
515		 * If the reference count goes to 0 we start calling
516		 * vm_object_terminate() on the object chain.
517		 * A ref count of 1 may be a special case depending on the
518		 * shadow count being 0 or 1.
519		 */
520		object->ref_count--;
521		if (object->ref_count > 1) {
522			VM_OBJECT_UNLOCK(object);
523			return;
524		} else if (object->ref_count == 1) {
525			if (object->shadow_count == 0 &&
526			    object->handle == NULL &&
527			    (object->type == OBJT_DEFAULT ||
528			     object->type == OBJT_SWAP)) {
529				vm_object_set_flag(object, OBJ_ONEMAPPING);
530			} else if ((object->shadow_count == 1) &&
531			    (object->handle == NULL) &&
532			    (object->type == OBJT_DEFAULT ||
533			     object->type == OBJT_SWAP)) {
534				vm_object_t robject;
535
536				robject = LIST_FIRST(&object->shadow_head);
537				KASSERT(robject != NULL,
538				    ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
539					 object->ref_count,
540					 object->shadow_count));
541				if (!VM_OBJECT_TRYLOCK(robject)) {
542					/*
543					 * Avoid a potential deadlock.
544					 */
545					object->ref_count++;
546					VM_OBJECT_UNLOCK(object);
547					/*
548					 * More likely than not the thread
549					 * holding robject's lock has lower
550					 * priority than the current thread.
551					 * Let the lower priority thread run.
552					 */
553					pause("vmo_de", 1);
554					continue;
555				}
556				/*
557				 * Collapse object into its shadow unless its
558				 * shadow is dead.  In that case, object will
559				 * be deallocated by the thread that is
560				 * deallocating its shadow.
561				 */
562				if ((robject->flags & OBJ_DEAD) == 0 &&
563				    (robject->handle == NULL) &&
564				    (robject->type == OBJT_DEFAULT ||
565				     robject->type == OBJT_SWAP)) {
566
567					robject->ref_count++;
568retry:
569					if (robject->paging_in_progress) {
570						VM_OBJECT_UNLOCK(object);
571						vm_object_pip_wait(robject,
572						    "objde1");
573						temp = robject->backing_object;
574						if (object == temp) {
575							VM_OBJECT_LOCK(object);
576							goto retry;
577						}
578					} else if (object->paging_in_progress) {
579						VM_OBJECT_UNLOCK(robject);
580						object->flags |= OBJ_PIPWNT;
581						msleep(object,
582						    VM_OBJECT_MTX(object),
583						    PDROP | PVM, "objde2", 0);
584						VM_OBJECT_LOCK(robject);
585						temp = robject->backing_object;
586						if (object == temp) {
587							VM_OBJECT_LOCK(object);
588							goto retry;
589						}
590					} else
591						VM_OBJECT_UNLOCK(object);
592
593					if (robject->ref_count == 1) {
594						robject->ref_count--;
595						object = robject;
596						goto doterm;
597					}
598					object = robject;
599					vm_object_collapse(object);
600					VM_OBJECT_UNLOCK(object);
601					continue;
602				}
603				VM_OBJECT_UNLOCK(robject);
604			}
605			VM_OBJECT_UNLOCK(object);
606			return;
607		}
608doterm:
609		temp = object->backing_object;
610		if (temp != NULL) {
611			VM_OBJECT_LOCK(temp);
612			LIST_REMOVE(object, shadow_list);
613			temp->shadow_count--;
614			temp->generation++;
615			VM_OBJECT_UNLOCK(temp);
616			object->backing_object = NULL;
617		}
618		/*
619		 * Don't double-terminate, we could be in a termination
620		 * recursion due to the terminate having to sync data
621		 * to disk.
622		 */
623		if ((object->flags & OBJ_DEAD) == 0)
624			vm_object_terminate(object);
625		else
626			VM_OBJECT_UNLOCK(object);
627		object = temp;
628	}
629}
630
631/*
632 *	vm_object_destroy removes the object from the global object list
633 *      and frees the space for the object.
634 */
635void
636vm_object_destroy(vm_object_t object)
637{
638
639	/*
640	 * Remove the object from the global object list.
641	 */
642	mtx_lock(&vm_object_list_mtx);
643	TAILQ_REMOVE(&vm_object_list, object, object_list);
644	mtx_unlock(&vm_object_list_mtx);
645
646	/*
647	 * Release the allocation charge.
648	 */
649	if (object->uip != NULL) {
650		KASSERT(object->type == OBJT_DEFAULT ||
651		    object->type == OBJT_SWAP,
652		    ("vm_object_terminate: non-swap obj %p has uip",
653		     object));
654		swap_release_by_uid(object->charge, object->uip);
655		object->charge = 0;
656		uifree(object->uip);
657		object->uip = NULL;
658	}
659
660	/*
661	 * Free the space for the object.
662	 */
663	uma_zfree(obj_zone, object);
664}
665
666/*
667 *	vm_object_terminate actually destroys the specified object, freeing
668 *	up all previously used resources.
669 *
670 *	The object must be locked.
671 *	This routine may block.
672 */
673void
674vm_object_terminate(vm_object_t object)
675{
676	vm_page_t p;
677
678	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
679
680	/*
681	 * Make sure no one uses us.
682	 */
683	vm_object_set_flag(object, OBJ_DEAD);
684
685	/*
686	 * wait for the pageout daemon to be done with the object
687	 */
688	vm_object_pip_wait(object, "objtrm");
689
690	KASSERT(!object->paging_in_progress,
691		("vm_object_terminate: pageout in progress"));
692
693	/*
694	 * Clean and free the pages, as appropriate. All references to the
695	 * object are gone, so we don't need to lock it.
696	 */
697	if (object->type == OBJT_VNODE) {
698		struct vnode *vp = (struct vnode *)object->handle;
699
700		/*
701		 * Clean pages and flush buffers.
702		 */
703		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
704		VM_OBJECT_UNLOCK(object);
705
706		vinvalbuf(vp, V_SAVE, 0, 0);
707
708		VM_OBJECT_LOCK(object);
709	}
710
711	KASSERT(object->ref_count == 0,
712		("vm_object_terminate: object with references, ref_count=%d",
713		object->ref_count));
714
715	/*
716	 * Now free any remaining pages. For internal objects, this also
717	 * removes them from paging queues. Don't free wired pages, just
718	 * remove them from the object.
719	 */
720	while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
721		KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
722			("vm_object_terminate: freeing busy page %p "
723			"p->busy = %d, p->oflags %x\n", p, p->busy, p->oflags));
724		vm_page_lock(p);
725		if (p->wire_count == 0) {
726			vm_page_free(p);
727			PCPU_INC(cnt.v_pfree);
728		} else
729			vm_page_remove(p);
730		vm_page_unlock(p);
731	}
732
733#if VM_NRESERVLEVEL > 0
734	if (__predict_false(!LIST_EMPTY(&object->rvq)))
735		vm_reserv_break_all(object);
736#endif
737	if (__predict_false(object->cache != NULL))
738		vm_page_cache_free(object, 0, 0);
739
740	/*
741	 * Let the pager know object is dead.
742	 */
743	vm_pager_deallocate(object);
744	VM_OBJECT_UNLOCK(object);
745
746	vm_object_destroy(object);
747}
748
749/*
750 *	vm_object_page_clean
751 *
752 *	Clean all dirty pages in the specified range of object.  Leaves page
753 * 	on whatever queue it is currently on.   If NOSYNC is set then do not
754 *	write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
755 *	leaving the object dirty.
756 *
757 *	When stuffing pages asynchronously, allow clustering.  XXX we need a
758 *	synchronous clustering mode implementation.
759 *
760 *	Odd semantics: if start == end, we clean everything.
761 *
762 *	The object must be locked.
763 */
764void
765vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int flags)
766{
767	vm_page_t p, np;
768	vm_pindex_t tstart, tend;
769	vm_pindex_t pi;
770	int clearobjflags;
771	int pagerflags;
772	int curgeneration;
773
774	mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
775	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
776	if ((object->flags & OBJ_MIGHTBEDIRTY) == 0)
777		return;
778	KASSERT(object->type == OBJT_VNODE, ("Not a vnode object"));
779
780	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
781	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
782
783	vm_object_set_flag(object, OBJ_CLEANING);
784
785	tstart = start;
786	if (end == 0) {
787		tend = object->size;
788	} else {
789		tend = end;
790	}
791
792	/*
793	 * If the caller is smart and only msync()s a range he knows is
794	 * dirty, we may be able to avoid an object scan.  This results in
795	 * a phenominal improvement in performance.  We cannot do this
796	 * as a matter of course because the object may be huge - e.g.
797	 * the size might be in the gigabytes or terrabytes.
798	 */
799	if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) {
800		vm_pindex_t tscan;
801		int scanlimit;
802		int scanreset;
803
804		scanreset = object->resident_page_count / EASY_SCAN_FACTOR;
805		if (scanreset < 16)
806			scanreset = 16;
807		pagerflags |= VM_PAGER_IGNORE_CLEANCHK;
808
809		scanlimit = scanreset;
810		tscan = tstart;
811		while (tscan < tend) {
812			curgeneration = object->generation;
813			p = vm_page_lookup(object, tscan);
814			if (p == NULL || p->valid == 0) {
815				if (--scanlimit == 0)
816					break;
817				++tscan;
818				continue;
819			}
820			vm_page_lock(p);
821			vm_page_lock_queues();
822			vm_page_test_dirty(p);
823			if (p->dirty == 0) {
824				vm_page_unlock_queues();
825				vm_page_unlock(p);
826				if (--scanlimit == 0)
827					break;
828				++tscan;
829				continue;
830			}
831			vm_page_unlock_queues();
832			vm_page_unlock(p);
833			/*
834			 * If we have been asked to skip nosync pages and
835			 * this is a nosync page, we can't continue.
836			 */
837			if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
838				if (--scanlimit == 0)
839					break;
840				++tscan;
841				continue;
842			}
843			scanlimit = scanreset;
844
845			/*
846			 * This returns 0 if it was unable to busy the first
847			 * page (i.e. had to sleep).
848			 */
849			tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags);
850
851		}
852
853		/*
854		 * If everything was dirty and we flushed it successfully,
855		 * and the requested range is not the entire object, we
856		 * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can
857		 * return immediately.
858		 */
859		if (tscan >= tend && (tstart || tend < object->size)) {
860			vm_object_clear_flag(object, OBJ_CLEANING);
861			return;
862		}
863		pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK;
864	}
865
866	/*
867	 * Generally set CLEANCHK interlock and make the page read-only so
868	 * we can then clear the object flags.
869	 *
870	 * However, if this is a nosync mmap then the object is likely to
871	 * stay dirty so do not mess with the page and do not clear the
872	 * object flags.
873	 */
874	clearobjflags = 1;
875	TAILQ_FOREACH(p, &object->memq, listq) {
876		p->oflags |= VPO_CLEANCHK;
877		if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC))
878			clearobjflags = 0;
879		else {
880			vm_page_lock(p);
881			vm_page_lock_queues();
882			pmap_remove_write(p);
883			vm_page_unlock_queues();
884			vm_page_unlock(p);
885		}
886	}
887
888	if (clearobjflags && (tstart == 0) && (tend == object->size))
889		vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
890
891rescan:
892	curgeneration = object->generation;
893
894	for (p = TAILQ_FIRST(&object->memq); p; p = np) {
895		int n;
896
897		np = TAILQ_NEXT(p, listq);
898
899again:
900		pi = p->pindex;
901		if ((p->oflags & VPO_CLEANCHK) == 0 ||
902			(pi < tstart) || (pi >= tend) ||
903		    p->valid == 0) {
904			p->oflags &= ~VPO_CLEANCHK;
905			continue;
906		}
907
908		vm_page_lock(p);
909		vm_page_lock_queues();
910		vm_page_test_dirty(p);
911		if (p->dirty == 0) {
912			vm_page_unlock_queues();
913			vm_page_unlock(p);
914			p->oflags &= ~VPO_CLEANCHK;
915			continue;
916		}
917		vm_page_unlock_queues();
918		vm_page_unlock(p);
919		/*
920		 * If we have been asked to skip nosync pages and this is a
921		 * nosync page, skip it.  Note that the object flags were
922		 * not cleared in this case so we do not have to set them.
923		 */
924		if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
925			p->oflags &= ~VPO_CLEANCHK;
926			continue;
927		}
928
929		n = vm_object_page_collect_flush(object, p,
930			curgeneration, pagerflags);
931		if (n == 0)
932			goto rescan;
933
934		if (object->generation != curgeneration)
935			goto rescan;
936
937		/*
938		 * Try to optimize the next page.  If we can't we pick up
939		 * our (random) scan where we left off.
940		 */
941		if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ)
942			if ((p = vm_page_lookup(object, pi + n)) != NULL)
943				goto again;
944	}
945#if 0
946	VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc);
947#endif
948
949	vm_object_clear_flag(object, OBJ_CLEANING);
950	return;
951}
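
/*
 * Editor's illustrative sketch (kept under "#if 0", not compiled): because
 * of the "start == end cleans everything" convention documented above, a
 * caller that wants to flush an entire vnode-backed object synchronously
 * passes a zero-length range, exactly as vm_object_terminate() does.
 */
#if 0
static void
vm_object_page_clean_example(vm_object_t object)
{

	VM_OBJECT_LOCK(object);
	/* Write out every dirty page in the object and wait for the I/O. */
	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
	VM_OBJECT_UNLOCK(object);
}
#endif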
952
953static int
954vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags)
955{
956	int runlen;
957	int maxf;
958	int chkb;
959	int maxb;
960	int i;
961	vm_pindex_t pi;
962	vm_page_t maf[vm_pageout_page_count];
963	vm_page_t mab[vm_pageout_page_count];
964	vm_page_t ma[vm_pageout_page_count];
965
966	mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
967	vm_page_lock_assert(p, MA_NOTOWNED);
968	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
969	pi = p->pindex;
970	while (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) {
971		if (object->generation != curgeneration) {
972			return(0);
973		}
974	}
975	maxf = 0;
976	for(i = 1; i < vm_pageout_page_count; i++) {
977		vm_page_t tp;
978
979		if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
980			if ((tp->oflags & VPO_BUSY) ||
981				((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
982				 (tp->oflags & VPO_CLEANCHK) == 0) ||
983				(tp->busy != 0))
984				break;
985			vm_page_lock(tp);
986			vm_page_lock_queues();
987			vm_page_test_dirty(tp);
988			if (tp->dirty == 0) {
989				vm_page_unlock(tp);
990				vm_page_unlock_queues();
991				tp->oflags &= ~VPO_CLEANCHK;
992				break;
993			}
994			vm_page_unlock(tp);
995			vm_page_unlock_queues();
996			maf[ i - 1 ] = tp;
997			maxf++;
998			continue;
999		}
1000		break;
1001	}
1002
1003	maxb = 0;
1004	chkb = vm_pageout_page_count -  maxf;
1005	if (chkb) {
1006		for(i = 1; i < chkb;i++) {
1007			vm_page_t tp;
1008
1009			if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
1010				if ((tp->oflags & VPO_BUSY) ||
1011					((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1012					 (tp->oflags & VPO_CLEANCHK) == 0) ||
1013					(tp->busy != 0))
1014					break;
1015				vm_page_lock(tp);
1016				vm_page_lock_queues();
1017				vm_page_test_dirty(tp);
1018				if (tp->dirty == 0) {
1019					vm_page_unlock_queues();
1020					vm_page_unlock(tp);
1021					tp->oflags &= ~VPO_CLEANCHK;
1022					break;
1023				}
1024				vm_page_unlock_queues();
1025				vm_page_unlock(tp);
1026				mab[ i - 1 ] = tp;
1027				maxb++;
1028				continue;
1029			}
1030			break;
1031		}
1032	}
1033
1034	for(i = 0; i < maxb; i++) {
1035		int index = (maxb - i) - 1;
1036		ma[index] = mab[i];
1037		ma[index]->oflags &= ~VPO_CLEANCHK;
1038	}
1039	p->oflags &= ~VPO_CLEANCHK;
1040	ma[maxb] = p;
1041	for(i = 0; i < maxf; i++) {
1042		int index = (maxb + i) + 1;
1043		ma[index] = maf[i];
1044		ma[index]->oflags &= ~VPO_CLEANCHK;
1045	}
1046	runlen = maxb + maxf + 1;
1047
1048	vm_pageout_flush(ma, runlen, pagerflags);
1049	for (i = 0; i < runlen; i++) {
1050		if (ma[i]->dirty) {
1051			vm_page_lock(ma[i]);
1052			vm_page_lock_queues();
1053			pmap_remove_write(ma[i]);
1054			vm_page_unlock_queues();
1055			vm_page_unlock(ma[i]);
1056			ma[i]->oflags |= VPO_CLEANCHK;
1057
1058			/*
1059			 * maxf will end up being the actual number of pages
1060			 * we wrote out contiguously, non-inclusive of the
1061			 * first page.  We do not count look-behind pages.
1062			 */
1063			if (i >= maxb + 1 && (maxf > i - maxb - 1))
1064				maxf = i - maxb - 1;
1065		}
1066	}
1067	return(maxf + 1);
1068}
1069
1070/*
1071 * Note that there is absolutely no sense in writing out
1072 * anonymous objects, so we track down the vnode object
1073 * to write out.
1074 * We invalidate (remove) all pages from the address space
1075 * for semantic correctness.
1076 *
1077 * Note: certain anonymous maps, such as MAP_NOSYNC maps,
1078 * may start out with a NULL object.
1079 */
1080void
1081vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
1082    boolean_t syncio, boolean_t invalidate)
1083{
1084	vm_object_t backing_object;
1085	struct vnode *vp;
1086	struct mount *mp;
1087	int flags;
1088
1089	if (object == NULL)
1090		return;
1091	VM_OBJECT_LOCK(object);
1092	while ((backing_object = object->backing_object) != NULL) {
1093		VM_OBJECT_LOCK(backing_object);
1094		offset += object->backing_object_offset;
1095		VM_OBJECT_UNLOCK(object);
1096		object = backing_object;
1097		if (object->size < OFF_TO_IDX(offset + size))
1098			size = IDX_TO_OFF(object->size) - offset;
1099	}
1100	/*
1101	 * Flush pages if writing is allowed, invalidate them
1102	 * if invalidation requested.  Pages undergoing I/O
1103	 * will be ignored by vm_object_page_remove().
1104	 *
1105	 * We cannot lock the vnode and then wait for paging
1106	 * to complete without deadlocking against vm_fault.
1107	 * Instead we simply call vm_object_page_remove() and
1108	 * allow it to block internally on a page-by-page
1109	 * basis when it encounters pages undergoing async
1110	 * I/O.
1111	 */
1112	if (object->type == OBJT_VNODE &&
1113	    (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
1114		int vfslocked;
1115		vp = object->handle;
1116		VM_OBJECT_UNLOCK(object);
1117		(void) vn_start_write(vp, &mp, V_WAIT);
1118		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1119		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1120		flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
1121		flags |= invalidate ? OBJPC_INVAL : 0;
1122		VM_OBJECT_LOCK(object);
1123		vm_object_page_clean(object,
1124		    OFF_TO_IDX(offset),
1125		    OFF_TO_IDX(offset + size + PAGE_MASK),
1126		    flags);
1127		VM_OBJECT_UNLOCK(object);
1128		VOP_UNLOCK(vp, 0);
1129		VFS_UNLOCK_GIANT(vfslocked);
1130		vn_finished_write(mp);
1131		VM_OBJECT_LOCK(object);
1132	}
1133	if ((object->type == OBJT_VNODE ||
1134	     object->type == OBJT_DEVICE) && invalidate) {
1135		boolean_t purge;
1136		purge = old_msync || (object->type == OBJT_DEVICE);
1137		vm_object_page_remove(object,
1138		    OFF_TO_IDX(offset),
1139		    OFF_TO_IDX(offset + size + PAGE_MASK),
1140		    purge ? FALSE : TRUE);
1141	}
1142	VM_OBJECT_UNLOCK(object);
1143}
1144
1145/*
1146 *	vm_object_madvise:
1147 *
1148 *	Implements the madvise function at the object/page level.
1149 *
1150 *	MADV_WILLNEED	(any object)
1151 *
1152 *	    Activate the specified pages if they are resident.
1153 *
1154 *	MADV_DONTNEED	(any object)
1155 *
1156 *	    Deactivate the specified pages if they are resident.
1157 *
1158 *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects,
1159 *			 OBJ_ONEMAPPING only)
1160 *
1161 *	    Deactivate and clean the specified pages if they are
1162 *	    resident.  This permits the process to reuse the pages
1163 *	    without faulting or the kernel to reclaim the pages
1164 *	    without I/O.
1165 */
1166void
1167vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
1168{
1169	vm_pindex_t end, tpindex;
1170	vm_object_t backing_object, tobject;
1171	vm_page_t m;
1172
1173	if (object == NULL)
1174		return;
1175	VM_OBJECT_LOCK(object);
1176	end = pindex + count;
1177	/*
1178	 * Locate and adjust resident pages
1179	 */
1180	for (; pindex < end; pindex += 1) {
1181relookup:
1182		tobject = object;
1183		tpindex = pindex;
1184shadowlookup:
1185		/*
1186		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1187		 * and those pages must be OBJ_ONEMAPPING.
1188		 */
1189		if (advise == MADV_FREE) {
1190			if ((tobject->type != OBJT_DEFAULT &&
1191			     tobject->type != OBJT_SWAP) ||
1192			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
1193				goto unlock_tobject;
1194			}
1195		} else if (tobject->type == OBJT_PHYS)
1196			goto unlock_tobject;
1197		m = vm_page_lookup(tobject, tpindex);
1198		if (m == NULL && advise == MADV_WILLNEED) {
1199			/*
1200			 * If the page is cached, reactivate it.
1201			 */
1202			m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
1203			    VM_ALLOC_NOBUSY);
1204		}
1205		if (m == NULL) {
1206			/*
1207			 * There may be swap even if there is no backing page
1208			 */
1209			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1210				swap_pager_freespace(tobject, tpindex, 1);
1211			/*
1212			 * next object
1213			 */
1214			backing_object = tobject->backing_object;
1215			if (backing_object == NULL)
1216				goto unlock_tobject;
1217			VM_OBJECT_LOCK(backing_object);
1218			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
1219			if (tobject != object)
1220				VM_OBJECT_UNLOCK(tobject);
1221			tobject = backing_object;
1222			goto shadowlookup;
1223		} else if (m->valid != VM_PAGE_BITS_ALL)
1224			goto unlock_tobject;
1225		/*
1226		 * If the page is not in a normal state, skip it.
1227		 */
1228		vm_page_lock(m);
1229		vm_page_lock_queues();
1230		if (m->hold_count != 0 || m->wire_count != 0) {
1231			vm_page_unlock_queues();
1232			vm_page_unlock(m);
1233			goto unlock_tobject;
1234		}
1235		if ((m->oflags & VPO_BUSY) || m->busy) {
1236			if (advise == MADV_WILLNEED)
1237				/*
1238				 * Reference the page before unlocking and
1239				 * sleeping so that the page daemon is less
1240				 * likely to reclaim it.
1241				 */
1242				vm_page_flag_set(m, PG_REFERENCED);
1243			vm_page_unlock_queues();
1244			vm_page_unlock(m);
1245			if (object != tobject)
1246				VM_OBJECT_UNLOCK(object);
1247			m->oflags |= VPO_WANTED;
1248			msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo",
1249			    0);
1250			VM_OBJECT_LOCK(object);
1251  			goto relookup;
1252		}
1253		if (advise == MADV_WILLNEED) {
1254			vm_page_activate(m);
1255		} else if (advise == MADV_DONTNEED) {
1256			vm_page_dontneed(m);
1257		} else if (advise == MADV_FREE) {
1258			/*
1259			 * Mark the page clean.  This will allow the page
1260			 * to be freed up by the system.  However, such pages
1261			 * are often reused quickly by malloc()/free()
1262			 * so we do not do anything that would cause
1263			 * a page fault if we can help it.
1264			 *
1265			 * Specifically, we do not try to actually free
1266			 * the page now nor do we try to put it in the
1267			 * cache (which would cause a page fault on reuse).
1268			 *
1269			 * But we do make the page as freeable as we
1270			 * can without actually taking the step of unmapping
1271			 * it.
1272			 */
1273			pmap_clear_modify(m);
1274			m->dirty = 0;
1275			m->act_count = 0;
1276			vm_page_dontneed(m);
1277		}
1278		vm_page_unlock_queues();
1279		vm_page_unlock(m);
1280		if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1281			swap_pager_freespace(tobject, tpindex, 1);
1282unlock_tobject:
1283		if (tobject != object)
1284			VM_OBJECT_UNLOCK(tobject);
1285	}
1286	VM_OBJECT_UNLOCK(object);
1287}
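
/*
 * Editor's illustrative sketch (kept under "#if 0", not compiled): how a
 * caller such as the madvise() path might apply the policies described
 * above to two hypothetical page ranges.  vm_object_madvise() performs its
 * own object locking.
 */
#if 0
static void
vm_object_madvise_example(vm_object_t object)
{

	/* Pages [0, 16): expected to be used soon, activate if resident. */
	vm_object_madvise(object, 0, 16, MADV_WILLNEED);

	/* Pages [16, 32): contents no longer needed, allow cheap reclaim. */
	vm_object_madvise(object, 16, 16, MADV_FREE);
}
#endif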
1288
1289/*
1290 *	vm_object_shadow:
1291 *
1292 *	Create a new object which is backed by the
1293 *	specified existing object range.  The source
1294 *	object reference is deallocated.
1295 *
1296 *	The new object and offset into that object
1297 *	are returned in the source parameters.
1298 */
1299void
1300vm_object_shadow(
1301	vm_object_t *object,	/* IN/OUT */
1302	vm_ooffset_t *offset,	/* IN/OUT */
1303	vm_size_t length)
1304{
1305	vm_object_t source;
1306	vm_object_t result;
1307
1308	source = *object;
1309
1310	/*
1311	 * Don't create the new object if the old object isn't shared.
1312	 */
1313	if (source != NULL) {
1314		VM_OBJECT_LOCK(source);
1315		if (source->ref_count == 1 &&
1316		    source->handle == NULL &&
1317		    (source->type == OBJT_DEFAULT ||
1318		     source->type == OBJT_SWAP)) {
1319			VM_OBJECT_UNLOCK(source);
1320			return;
1321		}
1322		VM_OBJECT_UNLOCK(source);
1323	}
1324
1325	/*
1326	 * Allocate a new object with the given length.
1327	 */
1328	result = vm_object_allocate(OBJT_DEFAULT, length);
1329
1330	/*
1331	 * The new object shadows the source object, adding a reference to it.
1332	 * Our caller changes his reference to point to the new object,
1333	 * removing a reference to the source object.  Net result: no change
1334	 * of reference count.
1335	 *
1336	 * Try to optimize the result object's page color when shadowing
1337	 * in order to maintain page coloring consistency in the combined
1338	 * shadowed object.
1339	 */
1340	result->backing_object = source;
1341	/*
1342	 * Store the offset into the source object, and fix up the offset into
1343	 * the new object.
1344	 */
1345	result->backing_object_offset = *offset;
1346	if (source != NULL) {
1347		VM_OBJECT_LOCK(source);
1348		LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
1349		source->shadow_count++;
1350		source->generation++;
1351#if VM_NRESERVLEVEL > 0
1352		result->flags |= source->flags & OBJ_COLORED;
1353		result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) &
1354		    ((1 << (VM_NFREEORDER - 1)) - 1);
1355#endif
1356		VM_OBJECT_UNLOCK(source);
1357	}
1358
1359
1360	/*
1361	 * Return the new object and offset to the caller.
1362	 */
1363	*offset = 0;
1364	*object = result;
1365}
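
/*
 * Editor's illustrative sketch (kept under "#if 0", not compiled) of the
 * IN/OUT calling convention described above: if a shadow is created, the
 * caller's object pointer and offset are replaced and the original source
 * reference is taken over by the new object.  The calling context is
 * hypothetical.
 */
#if 0
static void
vm_object_shadow_example(vm_object_t source, vm_ooffset_t offset,
    vm_size_t length)
{
	vm_object_t obj;

	obj = source;
	vm_object_shadow(&obj, &offset, length);
	/*
	 * If "source" was shared, "obj" now names a new OBJT_DEFAULT shadow
	 * of it and "offset" is 0; otherwise both are left unchanged.
	 */
}
#endif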
1366
1367/*
1368 *	vm_object_split:
1369 *
1370 * Split the pages in a map entry into a new object.  This affords
1371 * easier removal of unused pages, and keeps object inheritance from
1372 * being a negative impact on memory usage.
1373 */
1374void
1375vm_object_split(vm_map_entry_t entry)
1376{
1377	vm_page_t m, m_next;
1378	vm_object_t orig_object, new_object, source;
1379	vm_pindex_t idx, offidxstart;
1380	vm_size_t size;
1381
1382	orig_object = entry->object.vm_object;
1383	if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
1384		return;
1385	if (orig_object->ref_count <= 1)
1386		return;
1387	VM_OBJECT_UNLOCK(orig_object);
1388
1389	offidxstart = OFF_TO_IDX(entry->offset);
1390	size = atop(entry->end - entry->start);
1391
1392	/*
1393	 * If swap_pager_copy() is later called, it will convert new_object
1394	 * into a swap object.
1395	 */
1396	new_object = vm_object_allocate(OBJT_DEFAULT, size);
1397
1398	/*
1399	 * At this point, the new object is still private, so the order in
1400	 * which the original and new objects are locked does not matter.
1401	 */
1402	VM_OBJECT_LOCK(new_object);
1403	VM_OBJECT_LOCK(orig_object);
1404	source = orig_object->backing_object;
1405	if (source != NULL) {
1406		VM_OBJECT_LOCK(source);
1407		if ((source->flags & OBJ_DEAD) != 0) {
1408			VM_OBJECT_UNLOCK(source);
1409			VM_OBJECT_UNLOCK(orig_object);
1410			VM_OBJECT_UNLOCK(new_object);
1411			vm_object_deallocate(new_object);
1412			VM_OBJECT_LOCK(orig_object);
1413			return;
1414		}
1415		LIST_INSERT_HEAD(&source->shadow_head,
1416				  new_object, shadow_list);
1417		source->shadow_count++;
1418		source->generation++;
1419		vm_object_reference_locked(source);	/* for new_object */
1420		vm_object_clear_flag(source, OBJ_ONEMAPPING);
1421		VM_OBJECT_UNLOCK(source);
1422		new_object->backing_object_offset =
1423			orig_object->backing_object_offset + entry->offset;
1424		new_object->backing_object = source;
1425	}
1426	if (orig_object->uip != NULL) {
1427		new_object->uip = orig_object->uip;
1428		uihold(orig_object->uip);
1429		new_object->charge = ptoa(size);
1430		KASSERT(orig_object->charge >= ptoa(size),
1431		    ("orig_object->charge < 0"));
1432		orig_object->charge -= ptoa(size);
1433	}
1434retry:
1435	if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) {
1436		if (m->pindex < offidxstart) {
1437			m = vm_page_splay(offidxstart, orig_object->root);
1438			if ((orig_object->root = m)->pindex < offidxstart)
1439				m = TAILQ_NEXT(m, listq);
1440		}
1441	}
1442	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
1443	    m = m_next) {
1444		m_next = TAILQ_NEXT(m, listq);
1445
1446		/*
1447		 * We must wait for pending I/O to complete before we can
1448		 * rename the page.
1449		 *
1450		 * We do not have to VM_PROT_NONE the page as mappings should
1451		 * not be changed by this operation.
1452		 */
1453		if ((m->oflags & VPO_BUSY) || m->busy) {
1454			VM_OBJECT_UNLOCK(new_object);
1455			m->oflags |= VPO_WANTED;
1456			msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
1457			VM_OBJECT_LOCK(new_object);
1458			goto retry;
1459		}
1460		vm_page_lock(m);
1461		vm_page_rename(m, new_object, idx);
1462		vm_page_unlock(m);
1463		/* page automatically made dirty by rename and cache handled */
1464		vm_page_busy(m);
1465	}
1466	if (orig_object->type == OBJT_SWAP) {
1467		/*
1468		 * swap_pager_copy() can sleep, in which case the orig_object's
1469		 * and new_object's locks are released and reacquired.
1470		 */
1471		swap_pager_copy(orig_object, new_object, offidxstart, 0);
1472
1473		/*
1474		 * Transfer any cached pages from orig_object to new_object.
1475		 */
1476		if (__predict_false(orig_object->cache != NULL))
1477			vm_page_cache_transfer(orig_object, offidxstart,
1478			    new_object);
1479	}
1480	VM_OBJECT_UNLOCK(orig_object);
1481	TAILQ_FOREACH(m, &new_object->memq, listq)
1482		vm_page_wakeup(m);
1483	VM_OBJECT_UNLOCK(new_object);
1484	entry->object.vm_object = new_object;
1485	entry->offset = 0LL;
1486	vm_object_deallocate(orig_object);
1487	VM_OBJECT_LOCK(new_object);
1488}
1489
1490#define	OBSC_TEST_ALL_SHADOWED	0x0001
1491#define	OBSC_COLLAPSE_NOWAIT	0x0002
1492#define	OBSC_COLLAPSE_WAIT	0x0004
1493
1494static int
1495vm_object_backing_scan(vm_object_t object, int op)
1496{
1497	int r = 1;
1498	vm_page_t p;
1499	vm_object_t backing_object;
1500	vm_pindex_t backing_offset_index;
1501
1502	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1503	VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
1504
1505	backing_object = object->backing_object;
1506	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
1507
1508	/*
1509	 * Initial conditions
1510	 */
1511	if (op & OBSC_TEST_ALL_SHADOWED) {
1512		/*
1513		 * We do not want to have to test for the existence of cache
1514		 * or swap pages in the backing object.  XXX but with the
1515		 * new swapper this would be pretty easy to do.
1516		 *
1517		 * XXX what about anonymous MAP_SHARED memory that hasn't
1518		 * been ZFOD faulted yet?  If we do not test for this, the
1519		 * shadow test may succeed! XXX
1520		 */
1521		if (backing_object->type != OBJT_DEFAULT) {
1522			return (0);
1523		}
1524	}
1525	if (op & OBSC_COLLAPSE_WAIT) {
1526		vm_object_set_flag(backing_object, OBJ_DEAD);
1527	}
1528
1529	/*
1530	 * Our scan
1531	 */
1532	p = TAILQ_FIRST(&backing_object->memq);
1533	while (p) {
1534		vm_page_t next = TAILQ_NEXT(p, listq);
1535		vm_pindex_t new_pindex = p->pindex - backing_offset_index;
1536
1537		if (op & OBSC_TEST_ALL_SHADOWED) {
1538			vm_page_t pp;
1539
1540			/*
1541			 * Ignore pages outside the parent object's range
1542			 * and outside the parent object's mapping of the
1543			 * backing object.
1544			 *
1545			 * note that we do not busy the backing object's
1546			 * page.
1547			 */
1548			if (
1549			    p->pindex < backing_offset_index ||
1550			    new_pindex >= object->size
1551			) {
1552				p = next;
1553				continue;
1554			}
1555
1556			/*
1557			 * See if the parent has the page or if the parent's
1558			 * object pager has the page.  If the parent has the
1559			 * page but the page is not valid, the parent's
1560			 * object pager must have the page.
1561			 *
1562			 * If this fails, the parent does not completely shadow
1563			 * the object and we might as well give up now.
1564			 */
1565
1566			pp = vm_page_lookup(object, new_pindex);
1567			if (
1568			    (pp == NULL || pp->valid == 0) &&
1569			    !vm_pager_has_page(object, new_pindex, NULL, NULL)
1570			) {
1571				r = 0;
1572				break;
1573			}
1574		}
1575
1576		/*
1577		 * Check for busy page
1578		 */
1579		if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
1580			vm_page_t pp;
1581
1582			if (op & OBSC_COLLAPSE_NOWAIT) {
1583				if ((p->oflags & VPO_BUSY) ||
1584				    !p->valid ||
1585				    p->busy) {
1586					p = next;
1587					continue;
1588				}
1589			} else if (op & OBSC_COLLAPSE_WAIT) {
1590				if ((p->oflags & VPO_BUSY) || p->busy) {
1591					VM_OBJECT_UNLOCK(object);
1592					p->oflags |= VPO_WANTED;
1593					msleep(p, VM_OBJECT_MTX(backing_object),
1594					    PDROP | PVM, "vmocol", 0);
1595					VM_OBJECT_LOCK(object);
1596					VM_OBJECT_LOCK(backing_object);
1597					/*
1598					 * If we slept, anything could have
1599					 * happened.  Since the object is
1600					 * marked dead, the backing offset
1601					 * should not have changed so we
1602					 * just restart our scan.
1603					 */
1604					p = TAILQ_FIRST(&backing_object->memq);
1605					continue;
1606				}
1607			}
1608
1609			KASSERT(
1610			    p->object == backing_object,
1611			    ("vm_object_backing_scan: object mismatch")
1612			);
1613
1614			/*
1615			 * Destroy any associated swap
1616			 */
1617			if (backing_object->type == OBJT_SWAP) {
1618				swap_pager_freespace(
1619				    backing_object,
1620				    p->pindex,
1621				    1
1622				);
1623			}
1624
1625			if (
1626			    p->pindex < backing_offset_index ||
1627			    new_pindex >= object->size
1628			) {
1629				/*
1630				 * Page is out of the parent object's range, we
1631				 * can simply destroy it.
1632				 */
1633				vm_page_lock(p);
1634				KASSERT(!pmap_page_is_mapped(p),
1635				    ("freeing mapped page %p", p));
1636				if (p->wire_count == 0)
1637					vm_page_free(p);
1638				else
1639					vm_page_remove(p);
1640				vm_page_unlock(p);
1641				p = next;
1642				continue;
1643			}
1644
1645			pp = vm_page_lookup(object, new_pindex);
1646			if (
1647			    pp != NULL ||
1648			    vm_pager_has_page(object, new_pindex, NULL, NULL)
1649			) {
1650				/*
1651				 * page already exists in parent OR swap exists
1652				 * for this location in the parent.  Destroy
1653				 * the original page from the backing object.
1654				 *
1655				 * Leave the parent's page alone
1656				 */
1657				vm_page_lock(p);
1658				KASSERT(!pmap_page_is_mapped(p),
1659				    ("freeing mapped page %p", p));
1660				if (p->wire_count == 0)
1661					vm_page_free(p);
1662				else
1663					vm_page_remove(p);
1664				vm_page_unlock(p);
1665				p = next;
1666				continue;
1667			}
1668
1669#if VM_NRESERVLEVEL > 0
1670			/*
1671			 * Rename the reservation.
1672			 */
1673			vm_reserv_rename(p, object, backing_object,
1674			    backing_offset_index);
1675#endif
1676
1677			/*
1678			 * Page does not exist in parent, rename the
1679			 * page from the backing object to the main object.
1680			 *
1681			 * If the page was mapped to a process, it can remain
1682			 * mapped through the rename.
1683			 */
1684			vm_page_lock(p);
1685			vm_page_rename(p, object, new_pindex);
1686			vm_page_unlock(p);
1687			/* page automatically made dirty by rename */
1688		}
1689		p = next;
1690	}
1691	return (r);
1692}
1693
1694
1695/*
1696 * this version of collapse allows the operation to occur earlier and
1697 * when paging_in_progress is true for an object...  This is not a complete
1698 * operation, but should plug 99.9% of the rest of the leaks.
1699 */
1700static void
1701vm_object_qcollapse(vm_object_t object)
1702{
1703	vm_object_t backing_object = object->backing_object;
1704
1705	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1706	VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED);
1707
1708	if (backing_object->ref_count != 1)
1709		return;
1710
1711	vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
1712}
1713
1714/*
1715 *	vm_object_collapse:
1716 *
1717 *	Collapse an object with the object backing it.
1718 *	Pages in the backing object are moved into the
1719 *	parent, and the backing object is deallocated.
1720 */
1721void
1722vm_object_collapse(vm_object_t object)
1723{
1724	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1725
1726	while (TRUE) {
1727		vm_object_t backing_object;
1728
1729		/*
1730		 * Verify that the conditions are right for collapse:
1731		 *
1732		 * The object exists and the backing object exists.
1733		 */
1734		if ((backing_object = object->backing_object) == NULL)
1735			break;
1736
1737		/*
1738		 * we check the backing object first, because it is most likely
1739		 * not collapsible.
1740		 */
1741		VM_OBJECT_LOCK(backing_object);
1742		if (backing_object->handle != NULL ||
1743		    (backing_object->type != OBJT_DEFAULT &&
1744		     backing_object->type != OBJT_SWAP) ||
1745		    (backing_object->flags & OBJ_DEAD) ||
1746		    object->handle != NULL ||
1747		    (object->type != OBJT_DEFAULT &&
1748		     object->type != OBJT_SWAP) ||
1749		    (object->flags & OBJ_DEAD)) {
1750			VM_OBJECT_UNLOCK(backing_object);
1751			break;
1752		}
1753
1754		if (
1755		    object->paging_in_progress != 0 ||
1756		    backing_object->paging_in_progress != 0
1757		) {
1758			vm_object_qcollapse(object);
1759			VM_OBJECT_UNLOCK(backing_object);
1760			break;
1761		}
1762		/*
1763		 * We know that we can either collapse the backing object (if
1764		 * the parent is the only reference to it) or (perhaps) have
1765		 * the parent bypass the object if the parent happens to shadow
1766		 * all the resident pages in the entire backing object.
1767		 *
1768		 * This is ignoring pager-backed pages such as swap pages.
1769		 * vm_object_backing_scan fails the shadowing test in this
1770		 * case.
1771		 */
1772		if (backing_object->ref_count == 1) {
1773			/*
1774			 * If there is exactly one reference to the backing
1775			 * object, we can collapse it into the parent.
1776			 */
1777			vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
1778
1779#if VM_NRESERVLEVEL > 0
1780			/*
1781			 * Break any reservations from backing_object.
1782			 */
1783			if (__predict_false(!LIST_EMPTY(&backing_object->rvq)))
1784				vm_reserv_break_all(backing_object);
1785#endif
1786
1787			/*
1788			 * Move the pager from backing_object to object.
1789			 */
1790			if (backing_object->type == OBJT_SWAP) {
1791				/*
1792				 * swap_pager_copy() can sleep, in which case
1793				 * the backing_object's and object's locks are
1794				 * released and reacquired.
1795				 */
1796				swap_pager_copy(
1797				    backing_object,
1798				    object,
1799				    OFF_TO_IDX(object->backing_object_offset), TRUE);
1800
1801				/*
1802				 * Free any cached pages from backing_object.
1803				 */
1804				if (__predict_false(backing_object->cache != NULL))
1805					vm_page_cache_free(backing_object, 0, 0);
1806			}
1807			/*
1808			 * Object now shadows whatever backing_object did.
1809			 * Note that the reference to
1810			 * backing_object->backing_object moves from within
1811			 * backing_object to within object.
1812			 */
1813			LIST_REMOVE(object, shadow_list);
1814			backing_object->shadow_count--;
1815			backing_object->generation++;
1816			if (backing_object->backing_object) {
1817				VM_OBJECT_LOCK(backing_object->backing_object);
1818				LIST_REMOVE(backing_object, shadow_list);
1819				LIST_INSERT_HEAD(
1820				    &backing_object->backing_object->shadow_head,
1821				    object, shadow_list);
1822				/*
1823				 * The shadow_count has not changed.
1824				 */
1825				backing_object->backing_object->generation++;
1826				VM_OBJECT_UNLOCK(backing_object->backing_object);
1827			}
1828			object->backing_object = backing_object->backing_object;
1829			object->backing_object_offset +=
1830			    backing_object->backing_object_offset;
1831
1832			/*
1833			 * Discard backing_object.
1834			 *
			 * Since the backing object now has no pages, no
			 * pager, and no references to other objects, all
			 * that is necessary is to dispose of it.
1838			 */
1839			KASSERT(backing_object->ref_count == 1, (
1840"backing_object %p was somehow re-referenced during collapse!",
1841			    backing_object));
1842			VM_OBJECT_UNLOCK(backing_object);
1843			vm_object_destroy(backing_object);
1844
1845			object_collapses++;
1846		} else {
1847			vm_object_t new_backing_object;
1848
1849			/*
1850			 * If we do not entirely shadow the backing object,
			 * there is nothing we can do, so we give up.
1852			 */
1853			if (object->resident_page_count != object->size &&
1854			    vm_object_backing_scan(object,
1855			    OBSC_TEST_ALL_SHADOWED) == 0) {
1856				VM_OBJECT_UNLOCK(backing_object);
1857				break;
1858			}
1859
1860			/*
1861			 * Make the parent shadow the next object in the
1862			 * chain.  Deallocating backing_object will not remove
1863			 * it, since its reference count is at least 2.
1864			 */
1865			LIST_REMOVE(object, shadow_list);
1866			backing_object->shadow_count--;
1867			backing_object->generation++;
1868
1869			new_backing_object = backing_object->backing_object;
1870			if ((object->backing_object = new_backing_object) != NULL) {
1871				VM_OBJECT_LOCK(new_backing_object);
1872				LIST_INSERT_HEAD(
1873				    &new_backing_object->shadow_head,
1874				    object,
1875				    shadow_list
1876				);
1877				new_backing_object->shadow_count++;
1878				new_backing_object->generation++;
1879				vm_object_reference_locked(new_backing_object);
1880				VM_OBJECT_UNLOCK(new_backing_object);
1881				object->backing_object_offset +=
1882					backing_object->backing_object_offset;
1883			}
1884
1885			/*
1886			 * Drop the reference count on backing_object. Since
1887			 * its ref_count was at least 2, it will not vanish.
1888			 */
1889			backing_object->ref_count--;
1890			VM_OBJECT_UNLOCK(backing_object);
1891			object_bypasses++;
1892		}
1893
1894		/*
1895		 * Try again with this object's new backing object.
1896		 */
1897	}
1898}
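
/*
 * Editor's illustration (not part of vm_object.c): a minimal sketch of
 * how a caller typically drives the collapse above.  The helper name is
 * hypothetical; VM_OBJECT_LOCK/UNLOCK and vm_object_collapse() are the
 * interfaces defined by this file and <vm/vm_object.h>.
 */
static void
example_collapse_chain(vm_object_t object)
{

	VM_OBJECT_LOCK(object);
	/*
	 * vm_object_collapse() loops internally, so one call walks as far
	 * down the backing-object chain as the collapse or bypass
	 * conditions allow.
	 */
	vm_object_collapse(object);
	VM_OBJECT_UNLOCK(object);
}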
1899
1900/*
1901 *	vm_object_page_remove:
1902 *
1903 *	For the given object, either frees or invalidates each of the
1904 *	specified pages.  In general, a page is freed.  However, if a
1905 *	page is wired for any reason other than the existence of a
1906 *	managed, wired mapping, then it may be invalidated but not
1907 *	removed from the object.  Pages are specified by the given
1908 *	range ["start", "end") and Boolean "clean_only".  As a
1909 *	special case, if "end" is zero, then the range extends from
1910 *	"start" to the end of the object.  If "clean_only" is TRUE,
1911 *	then only the non-dirty pages within the specified range are
1912 *	affected.
1913 *
1914 *	In general, this operation should only be performed on objects
1915 *	that contain managed pages.  There are two exceptions.  First,
1916 *	it may be performed on the kernel and kmem objects.  Second,
1917 *	it may be used by msync(..., MS_INVALIDATE) to invalidate
1918 *	device-backed pages.
1919 *
1920 *	The object must be locked.
1921 */
1922void
1923vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1924    boolean_t clean_only)
1925{
1926	vm_page_t p, next;
1927	int wirings;
1928
1929	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1930	if (object->resident_page_count == 0)
1931		goto skipmemq;
1932
1933	/*
1934	 * Since physically-backed objects do not use managed pages, we can't
1935	 * remove pages from the object (we must instead remove the page
1936	 * references, and then destroy the object).
1937	 */
1938	KASSERT(object->type != OBJT_PHYS || object == kernel_object ||
1939	    object == kmem_object,
1940	    ("attempt to remove pages from a physical object"));
1941
1942	vm_object_pip_add(object, 1);
1943again:
1944	if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
1945		if (p->pindex < start) {
1946			p = vm_page_splay(start, object->root);
1947			if ((object->root = p)->pindex < start)
1948				p = TAILQ_NEXT(p, listq);
1949		}
1950	}
1951
1952	/*
	 * Assert: the variable p is either (1) the page with the
	 * least pindex greater than or equal to the parameter "start",
	 * or (2) NULL.
1956	 */
1957	for (;
1958	     p != NULL && (p->pindex < end || end == 0);
1959	     p = next) {
1960		next = TAILQ_NEXT(p, listq);
1961
1962		/*
1963		 * If the page is wired for any reason besides the
1964		 * existence of managed, wired mappings, then it cannot
1965		 * be freed.  For example, fictitious pages, which
1966		 * represent device memory, are inherently wired and
1967		 * cannot be freed.  They can, however, be invalidated
1968		 * if "clean_only" is FALSE.
1969		 */
1970		vm_page_lock(p);
1971		vm_page_lock_queues();
1972		if ((wirings = p->wire_count) != 0 &&
1973		    (wirings = pmap_page_wired_mappings(p)) != p->wire_count) {
1974			/* Fictitious pages do not have managed mappings. */
1975			if ((p->flags & PG_FICTITIOUS) == 0)
1976				pmap_remove_all(p);
1977			/* Account for removal of managed, wired mappings. */
1978			p->wire_count -= wirings;
1979			if (!clean_only) {
1980				p->valid = 0;
1981				vm_page_undirty(p);
1982			}
1983			vm_page_unlock_queues();
1984			vm_page_unlock(p);
1985			continue;
1986		}
1987		if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
1988			goto again;
1989		KASSERT((p->flags & PG_FICTITIOUS) == 0,
1990		    ("vm_object_page_remove: page %p is fictitious", p));
1991		if (clean_only && p->valid) {
1992			pmap_remove_write(p);
1993			if (p->dirty) {
1994				vm_page_unlock_queues();
1995				vm_page_unlock(p);
1996				continue;
1997			}
1998		}
1999		pmap_remove_all(p);
2000		/* Account for removal of managed, wired mappings. */
2001		if (wirings != 0)
2002			p->wire_count -= wirings;
2003		vm_page_free(p);
2004		vm_page_unlock_queues();
2005		vm_page_unlock(p);
2006	}
2007	vm_object_pip_wakeup(object);
2008skipmemq:
2009	if (__predict_false(object->cache != NULL))
2010		vm_page_cache_free(object, start, end);
2011}
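
/*
 * Editor's illustration (not part of vm_object.c): a minimal sketch
 * exercising the range semantics documented above.  The helper name and
 * the page indices are hypothetical; the object is assumed to be a
 * default or swap object containing managed pages.
 */
static void
example_page_remove(vm_object_t object)
{

	VM_OBJECT_LOCK(object);
	/* Free every page in [16, 32); dirty pages are freed as well. */
	vm_object_page_remove(object, 16, 32, FALSE);
	/*
	 * With "end" == 0 the range extends to the end of the object,
	 * and with "clean_only" TRUE dirty pages are left untouched.
	 */
	vm_object_page_remove(object, 64, 0, TRUE);
	VM_OBJECT_UNLOCK(object);
}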
2012
2013/*
2014 *	Populate the specified range of the object with valid pages.  Returns
2015 *	TRUE if the range is successfully populated and FALSE otherwise.
2016 *
2017 *	Note: This function should be optimized to pass a larger array of
2018 *	pages to vm_pager_get_pages() before it is applied to a non-
2019 *	OBJT_DEVICE object.
2020 *
2021 *	The object must be locked.
2022 */
2023boolean_t
2024vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
2025{
2026	vm_page_t m, ma[1];
2027	vm_pindex_t pindex;
2028	int rv;
2029
2030	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2031	for (pindex = start; pindex < end; pindex++) {
2032		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL |
2033		    VM_ALLOC_RETRY);
2034		if (m->valid != VM_PAGE_BITS_ALL) {
2035			ma[0] = m;
2036			rv = vm_pager_get_pages(object, ma, 1, 0);
2037			m = vm_page_lookup(object, pindex);
2038			if (m == NULL)
2039				break;
2040			if (rv != VM_PAGER_OK) {
2041				vm_page_lock(m);
2042				vm_page_free(m);
2043				vm_page_unlock(m);
2044				break;
2045			}
2046		}
2047		/*
2048		 * Keep "m" busy because a subsequent iteration may unlock
2049		 * the object.
2050		 */
2051	}
2052	if (pindex > start) {
2053		m = vm_page_lookup(object, start);
2054		while (m != NULL && m->pindex < pindex) {
2055			vm_page_wakeup(m);
2056			m = TAILQ_NEXT(m, listq);
2057		}
2058	}
2059	return (pindex == end);
2060}
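
/*
 * Editor's illustration (not part of vm_object.c): the return value of
 * vm_object_populate() must be checked, since it stops at the first page
 * the pager cannot supply.  The wrapper name is hypothetical.
 */
static boolean_t
example_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
	boolean_t populated;

	VM_OBJECT_LOCK(object);
	populated = vm_object_populate(object, start, end);
	VM_OBJECT_UNLOCK(object);
	return (populated);
}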
2061
2062/*
2063 *	Routine:	vm_object_coalesce
2064 *	Function:	Coalesces two objects backing up adjoining
2065 *			regions of memory into a single object.
2066 *
 *	Returns TRUE if the objects were combined.
2068 *
2069 *	NOTE:	Only works at the moment if the second object is NULL -
2070 *		if it's not, which object do we lock first?
2071 *
2072 *	Parameters:
2073 *		prev_object	First object to coalesce
2074 *		prev_offset	Offset into prev_object
2075 *		prev_size	Size of reference to prev_object
2076 *		next_size	Size of reference to the second object
2077 *		reserved	Indicator that extension region has
2078 *				swap accounted for
2079 *
2080 *	Conditions:
2081 *	The object must *not* be locked.
2082 */
2083boolean_t
2084vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
2085    vm_size_t prev_size, vm_size_t next_size, boolean_t reserved)
2086{
2087	vm_pindex_t next_pindex;
2088
2089	if (prev_object == NULL)
2090		return (TRUE);
2091	VM_OBJECT_LOCK(prev_object);
2092	if (prev_object->type != OBJT_DEFAULT &&
2093	    prev_object->type != OBJT_SWAP) {
2094		VM_OBJECT_UNLOCK(prev_object);
2095		return (FALSE);
2096	}
2097
2098	/*
2099	 * Try to collapse the object first
2100	 */
2101	vm_object_collapse(prev_object);
2102
2103	/*
	 * Can't coalesce if: more than one reference; paged out; shadows
	 * another object; or has a copy elsewhere (any of which mean that
	 * the pages not mapped to prev_entry may be in use anyway).
2107	 */
2108	if (prev_object->backing_object != NULL) {
2109		VM_OBJECT_UNLOCK(prev_object);
2110		return (FALSE);
2111	}
2112
2113	prev_size >>= PAGE_SHIFT;
2114	next_size >>= PAGE_SHIFT;
2115	next_pindex = OFF_TO_IDX(prev_offset) + prev_size;
2116
2117	if ((prev_object->ref_count > 1) &&
2118	    (prev_object->size != next_pindex)) {
2119		VM_OBJECT_UNLOCK(prev_object);
2120		return (FALSE);
2121	}
2122
2123	/*
2124	 * Account for the charge.
2125	 */
2126	if (prev_object->uip != NULL) {
2127
2128		/*
		 * If prev_object was charged, then this mapping, although
		 * not charged now, may become writable later.  A non-NULL
		 * uip in the object would prevent swap reservation when
		 * write access is enabled, so reserve swap now.  A failed
		 * reservation causes a separate object to be allocated for
		 * the map entry, and swap reservation for that entry is
		 * managed at the appropriate time.
2137		 */
		if (!reserved && !swap_reserve_by_uid(ptoa(next_size),
		    prev_object->uip)) {
			/* Don't leak the object lock on failure. */
			VM_OBJECT_UNLOCK(prev_object);
			return (FALSE);
		}
2142		prev_object->charge += ptoa(next_size);
2143	}
2144
2145	/*
2146	 * Remove any pages that may still be in the object from a previous
2147	 * deallocation.
2148	 */
2149	if (next_pindex < prev_object->size) {
2150		vm_object_page_remove(prev_object,
2151				      next_pindex,
2152				      next_pindex + next_size, FALSE);
2153		if (prev_object->type == OBJT_SWAP)
2154			swap_pager_freespace(prev_object,
2155					     next_pindex, next_size);
2156#if 0
2157		if (prev_object->uip != NULL) {
2158			KASSERT(prev_object->charge >=
2159			    ptoa(prev_object->size - next_pindex),
2160			    ("object %p overcharged 1 %jx %jx", prev_object,
2161				(uintmax_t)next_pindex, (uintmax_t)next_size));
2162			prev_object->charge -= ptoa(prev_object->size -
2163			    next_pindex);
2164		}
2165#endif
2166	}
2167
2168	/*
2169	 * Extend the object if necessary.
2170	 */
2171	if (next_pindex + next_size > prev_object->size)
2172		prev_object->size = next_pindex + next_size;
2173
2174	VM_OBJECT_UNLOCK(prev_object);
2175	return (TRUE);
2176}
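
/*
 * Editor's illustration (not part of vm_object.c): a caller growing a
 * mapping by "grow_size" bytes asks whether prev_object can simply
 * absorb the adjacent region instead of allocating a new object.  The
 * helper name is hypothetical; vm_map_insert() in vm_map.c is the main
 * consumer of this interface.  Passing "reserved" as FALSE means swap
 * for the extension has not yet been accounted for.
 */
static boolean_t
example_try_extend(vm_object_t prev_object, vm_ooffset_t prev_offset,
    vm_size_t prev_size, vm_size_t grow_size)
{

	/* vm_object_coalesce() takes and drops the object lock itself. */
	return (vm_object_coalesce(prev_object, prev_offset, prev_size,
	    grow_size, FALSE));
}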
2177
2178void
2179vm_object_set_writeable_dirty(vm_object_t object)
2180{
2181
2182	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2183	if (object->type != OBJT_VNODE ||
2184	    (object->flags & OBJ_MIGHTBEDIRTY) != 0)
2185		return;
2186	vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
2187}
2188
2189#include "opt_ddb.h"
2190#ifdef DDB
2191#include <sys/kernel.h>
2192
2193#include <sys/cons.h>
2194
2195#include <ddb/ddb.h>
2196
2197static int
2198_vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
2199{
2200	vm_map_t tmpm;
2201	vm_map_entry_t tmpe;
2202	vm_object_t obj;
2203	int entcount;
2204
2205	if (map == 0)
2206		return 0;
2207
2208	if (entry == 0) {
2209		tmpe = map->header.next;
2210		entcount = map->nentries;
2211		while (entcount-- && (tmpe != &map->header)) {
2212			if (_vm_object_in_map(map, object, tmpe)) {
2213				return 1;
2214			}
2215			tmpe = tmpe->next;
2216		}
2217	} else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
2218		tmpm = entry->object.sub_map;
2219		tmpe = tmpm->header.next;
2220		entcount = tmpm->nentries;
2221		while (entcount-- && tmpe != &tmpm->header) {
2222			if (_vm_object_in_map(tmpm, object, tmpe)) {
2223				return 1;
2224			}
2225			tmpe = tmpe->next;
2226		}
2227	} else if ((obj = entry->object.vm_object) != NULL) {
2228		for (; obj; obj = obj->backing_object)
2229			if (obj == object) {
2230				return 1;
2231			}
2232	}
2233	return 0;
2234}
2235
2236static int
2237vm_object_in_map(vm_object_t object)
2238{
2239	struct proc *p;
2240
2241	/* sx_slock(&allproc_lock); */
2242	FOREACH_PROC_IN_SYSTEM(p) {
2243		if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
2244			continue;
2245		if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) {
2246			/* sx_sunlock(&allproc_lock); */
2247			return 1;
2248		}
2249	}
2250	/* sx_sunlock(&allproc_lock); */
2251	if (_vm_object_in_map(kernel_map, object, 0))
2252		return 1;
2253	if (_vm_object_in_map(kmem_map, object, 0))
2254		return 1;
2255	if (_vm_object_in_map(pager_map, object, 0))
2256		return 1;
2257	if (_vm_object_in_map(buffer_map, object, 0))
2258		return 1;
2259	return 0;
2260}
2261
2262DB_SHOW_COMMAND(vmochk, vm_object_check)
2263{
2264	vm_object_t object;
2265
2266	/*
	 * Make sure that internal objects are in a map somewhere
	 * and that none have zero reference counts.
2269	 */
2270	TAILQ_FOREACH(object, &vm_object_list, object_list) {
2271		if (object->handle == NULL &&
2272		    (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
2273			if (object->ref_count == 0) {
2274				db_printf("vmochk: internal obj has zero ref count: %ld\n",
2275					(long)object->size);
2276			}
2277			if (!vm_object_in_map(object)) {
2278				db_printf(
2279			"vmochk: internal obj is not in a map: "
2280			"ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
2281				    object->ref_count, (u_long)object->size,
2282				    (u_long)object->size,
2283				    (void *)object->backing_object);
2284			}
2285		}
2286	}
2287}
2288
2289/*
2290 *	vm_object_print:	[ debug ]
2291 */
2292DB_SHOW_COMMAND(object, vm_object_print_static)
2293{
2294	/* XXX convert args. */
2295	vm_object_t object = (vm_object_t)addr;
2296	boolean_t full = have_addr;
2297
2298	vm_page_t p;
2299
2300	/* XXX count is an (unused) arg.  Avoid shadowing it. */
2301#define	count	was_count
2302
2303	int count;
2304
2305	if (object == NULL)
2306		return;
2307
2308	db_iprintf(
2309	    "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x uip %d charge %jx\n",
2310	    object, (int)object->type, (uintmax_t)object->size,
2311	    object->resident_page_count, object->ref_count, object->flags,
2312	    object->uip ? object->uip->ui_uid : -1, (uintmax_t)object->charge);
2313	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n",
2314	    object->shadow_count,
2315	    object->backing_object ? object->backing_object->ref_count : 0,
2316	    object->backing_object, (uintmax_t)object->backing_object_offset);
2317
2318	if (!full)
2319		return;
2320
2321	db_indent += 2;
2322	count = 0;
2323	TAILQ_FOREACH(p, &object->memq, listq) {
2324		if (count == 0)
2325			db_iprintf("memory:=");
2326		else if (count == 6) {
2327			db_printf("\n");
2328			db_iprintf(" ...");
2329			count = 0;
2330		} else
2331			db_printf(",");
2332		count++;
2333
2334		db_printf("(off=0x%jx,page=0x%jx)",
2335		    (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
2336	}
2337	if (count != 0)
2338		db_printf("\n");
2339	db_indent -= 2;
2340}
2341
2342/* XXX. */
2343#undef count
2344
2345/* XXX need this non-static entry for calling from vm_map_print. */
2346void
2347vm_object_print(
2348        /* db_expr_t */ long addr,
2349	boolean_t have_addr,
2350	/* db_expr_t */ long count,
2351	char *modif)
2352{
2353	vm_object_print_static(addr, have_addr, count, modif);
2354}
2355
2356DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
2357{
2358	vm_object_t object;
2359	vm_pindex_t fidx;
2360	vm_paddr_t pa;
2361	vm_page_t m, prev_m;
2362	int rcount, nl, c;
2363
2364	nl = 0;
2365	TAILQ_FOREACH(object, &vm_object_list, object_list) {
2366		db_printf("new object: %p\n", (void *)object);
2367		if (nl > 18) {
2368			c = cngetc();
2369			if (c != ' ')
2370				return;
2371			nl = 0;
2372		}
2373		nl++;
2374		rcount = 0;
2375		fidx = 0;
2376		pa = -1;
2377		TAILQ_FOREACH(m, &object->memq, listq) {
2378			if (m->pindex > 128)
2379				break;
2380			if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL &&
2381			    prev_m->pindex + 1 != m->pindex) {
2382				if (rcount) {
2383					db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2384						(long)fidx, rcount, (long)pa);
2385					if (nl > 18) {
2386						c = cngetc();
2387						if (c != ' ')
2388							return;
2389						nl = 0;
2390					}
2391					nl++;
2392					rcount = 0;
2393				}
2394			}
2395			if (rcount &&
2396				(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
2397				++rcount;
2398				continue;
2399			}
2400			if (rcount) {
2401				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2402					(long)fidx, rcount, (long)pa);
2403				if (nl > 18) {
2404					c = cngetc();
2405					if (c != ' ')
2406						return;
2407					nl = 0;
2408				}
2409				nl++;
2410			}
2411			fidx = m->pindex;
2412			pa = VM_PAGE_TO_PHYS(m);
2413			rcount = 1;
2414		}
2415		if (rcount) {
2416			db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2417				(long)fidx, rcount, (long)pa);
2418			if (nl > 18) {
2419				c = cngetc();
2420				if (c != ' ')
2421					return;
2422				nl = 0;
2423			}
2424			nl++;
2425		}
2426	}
2427}
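
/*
 * Editor's note (not part of vm_object.c): DB_SHOW_COMMAND(name, fn)
 * hooks "fn" under the debugger's "show" table, so the commands defined
 * above are reached from the db> prompt roughly as follows (the address
 * is only an example):
 *
 *	db> show vmochk
 *	db> show object 0xc1234567
 *	db> show vmopag
 */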
2428#endif /* DDB */
2429