vm_page.c revision 136952
/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
 */

/*
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *			GENERAL RULES ON VM_PAGE MANIPULATION
 *
 *	- a pageq mutex is required when adding or removing a page from a
 *	  page queue (vm_page_queue[]), regardless of other mutexes or the
 *	  busy state of a page.
 *
 *	- a hash chain mutex is required when associating or disassociating
 *	  a page from the VM PAGE CACHE hash table (vm_page_buckets),
 *	  regardless of other mutexes or the busy state of a page.
 *
 *	- either a hash chain mutex OR a busied page is required in order
 *	  to modify the page flags.  A hash chain mutex must be obtained in
 *	  order to busy a page.  A page's flags cannot be modified by a
 *	  hash chain mutex if the page is marked busy.
 *
 *	- The object memq mutex is held when inserting or removing
 *	  pages from an object (vm_page_insert() or vm_page_remove()).  This
 *	  is different from the object's main mutex.
 *
 *	Generally speaking, you have to be aware of side effects when running
 *	vm_page ops.  A vm_page_lookup() will return with the hash chain
 *	locked, whether it was able to lookup the page or not.  vm_page_free(),
 *	vm_page_cache(), vm_page_activate(), and a number of other routines
 *	will release the hash chain mutex for you.  Intermediate manipulation
 *	routines such as vm_page_flag_set() expect the hash chain to be held
 *	on entry and the hash chain will remain held on return.
 *
 *	pageq scanning can only occur with the pageq in question locked.
 *	We have a known bottleneck with the active queue, but the cache
 *	and free queues are actually arrays already.
 */

/*
 *	Resident memory management module.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/vm/vm_page.c 136952 2004-10-25 19:52:44Z alc $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>

/*
 *	Associated with page of user-allocatable memory is a
 *	page structure.
 */

struct mtx vm_page_queue_mtx;
struct mtx vm_page_queue_free_mtx;

vm_page_t vm_page_array = 0;
int vm_page_array_size = 0;
long first_page = 0;
int vm_page_zero_count = 0;

/*
 *	vm_set_page_size:
 *
 *	Sets the page size, perhaps based upon the memory
 *	size.  Must be called before any use of page-size
 *	dependent functions.
 */
void
vm_set_page_size(void)
{
	if (cnt.v_page_size == 0)
		cnt.v_page_size = PAGE_SIZE;
	if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
		panic("vm_set_page_size: page size not a power of two");
}

/*
 *	vm_page_startup:
 *
 *	Initializes the resident memory module.
 *
 *	Allocates memory for the page cells, and
 *	for the object/offset-to-page hash table headers.
 *	Each page cell is initialized and placed on the free list.
 */
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
	vm_offset_t mapped;
	vm_size_t npages;
	vm_paddr_t page_range;
	vm_paddr_t new_end;
	int i;
	vm_paddr_t pa;
	int nblocks;
	vm_paddr_t last_pa;

	/* the biggest memory array is the second group of pages */
	vm_paddr_t end;
	vm_paddr_t biggestsize;
	int biggestone;

	vm_paddr_t total;
	vm_size_t bootpages;

	total = 0;
	biggestsize = 0;
	biggestone = 0;
	nblocks = 0;
	vaddr = round_page(vaddr);

	for (i = 0; phys_avail[i + 1]; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; phys_avail[i + 1]; i += 2) {
		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];

		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
		++nblocks;
		total += size;
	}

	end = phys_avail[biggestone+1];

	/*
	 * Initialize the locks.
	 */
	mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
	    MTX_RECURSE);
	mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
	    MTX_SPIN);

	/*
	 * Initialize the queue headers for the free queue, the active queue
	 * and the inactive queue.
	 */
	vm_pageq_init();

	/*
	 * Allocate memory for use when boot strapping the kernel memory
	 * allocator.
	 */
	bootpages = UMA_BOOT_PAGES * UMA_SLAB_SIZE;
	new_end = end - bootpages;
	new_end = trunc_page(new_end);
	mapped = pmap_map(&vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	bzero((caddr_t) mapped, end - new_end);
	uma_startup((caddr_t)mapped);

	/*
	 * Compute the number of pages of memory that will be available for
	 * use (taking into account the overhead of a page structure per
	 * page).
	 */
	first_page = phys_avail[0] / PAGE_SIZE;
	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
	npages = (total - (page_range * sizeof(struct vm_page)) -
	    (end - new_end)) / PAGE_SIZE;
	end = new_end;

	/*
	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
	 */
	vaddr += PAGE_SIZE;

	/*
	 * Initialize the mem entry structures now, and put them in the free
	 * queue.
	 */
	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
	mapped = pmap_map(&vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	vm_page_array = (vm_page_t) mapped;
	phys_avail[biggestone + 1] = new_end;

	/*
	 * Clear all of the page structures
	 */
	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
	vm_page_array_size = page_range;

	/*
	 * Construct the free queue(s) in descending order (by physical
	 * address) so that the first 16MB of physical memory is allocated
	 * last rather than first.  On large-memory machines, this avoids
	 * the exhaustion of low physical memory before isa_dma_init has run.
	 */
	cnt.v_page_count = 0;
	cnt.v_free_count = 0;
	for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
		pa = phys_avail[i];
		last_pa = phys_avail[i + 1];
		while (pa < last_pa && npages-- > 0) {
			vm_pageq_add_new_page(pa);
			pa += PAGE_SIZE;
		}
	}
	return (vaddr);
}

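/*
 *	vm_page_flag_set:
 *
 *	Set the given bits in the page's flags field.
 *
 *	The page queues lock must be held.
 */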
void
vm_page_flag_set(vm_page_t m, unsigned short bits)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	m->flags |= bits;
}

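/*
 *	vm_page_flag_clear:
 *
 *	Clear the given bits in the page's flags field.
 *
 *	The page queues lock must be held.
 */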
void
vm_page_flag_clear(vm_page_t m, unsigned short bits)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	m->flags &= ~bits;
}

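/*
 *	vm_page_busy:
 *
 *	Set the PG_BUSY flag on the page, marking it busy for other
 *	threads.  The page must not already be busy and the containing
 *	object must be locked.
 */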
void
vm_page_busy(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT((m->flags & PG_BUSY) == 0,
	    ("vm_page_busy: page already busy!!!"));
	vm_page_flag_set(m, PG_BUSY);
}

/*
 *      vm_page_flash:
 *
 *      wakeup anyone waiting for the page.
 */
void
vm_page_flash(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (m->flags & PG_WANTED) {
		vm_page_flag_clear(m, PG_WANTED);
		wakeup(m);
	}
}

/*
 *      vm_page_wakeup:
 *
 *      clear the PG_BUSY flag and wakeup anyone waiting for the
 *      page.
 *
 */
void
vm_page_wakeup(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!"));
	vm_page_flag_clear(m, PG_BUSY);
	vm_page_flash(m);
}

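/*
 *	vm_page_io_start:
 *
 *	Increment the page's busy count, indicating that paging I/O
 *	is in progress on the page.
 *
 *	The object and page queues locks must be held.
 */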
void
vm_page_io_start(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	m->busy++;
}

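/*
 *	vm_page_io_finish:
 *
 *	Decrement the page's busy count and wake up any waiters when
 *	the count reaches zero.
 *
 *	The object and page queues locks must be held.
 */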
void
vm_page_io_finish(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	m->busy--;
	if (m->busy == 0)
		vm_page_flash(m);
}

/*
 * Keep the page from being freed by the page daemon.  This has much
 * the same effect as wiring, but with much lower overhead, and should
 * be used only for *very* temporary holds ("wiring").
 */
void
vm_page_hold(vm_page_t mem)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	mem->hold_count++;
}

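/*
 *	vm_page_unhold:
 *
 *	Release one hold on the page.  If the hold count drops to zero
 *	and the page is sitting on the PQ_HOLD queue, free it.
 *
 *	The page queues lock must be held.
 */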
void
vm_page_unhold(vm_page_t mem)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	--mem->hold_count;
	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
	if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
		vm_page_free_toq(mem);
}

/*
 *	vm_page_free:
 *
 *	Free a page
 *
 *	The clearing of PG_ZERO is a temporary safety until the code can be
 *	reviewed to determine that PG_ZERO is being properly cleared on
 *	write faults or maps.  PG_ZERO was previously cleared in
 *	vm_page_alloc().
 */
void
vm_page_free(vm_page_t m)
{
	vm_page_flag_clear(m, PG_ZERO);
	vm_page_free_toq(m);
	vm_page_zero_idle_wakeup();
}

/*
 *	vm_page_free_zero:
 *
 *	Free a page to the zeroed-pages queue
 */
void
vm_page_free_zero(vm_page_t m)
{
	vm_page_flag_set(m, PG_ZERO);
	vm_page_free_toq(m);
}

/*
 *	vm_page_sleep_if_busy:
 *
 *	Sleep and release the page queues lock if PG_BUSY is set or,
 *	if also_m_busy is TRUE, busy is non-zero.  Returns TRUE if the
 *	thread slept and the page queues lock was released.
 *	Otherwise, retains the page queues lock and returns FALSE.
 */
int
vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg)
{
	vm_object_t object;
	int is_object_locked;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) {
		vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
		/*
		 * It's possible that while we sleep, the page will get
		 * unbusied and freed.  If we are holding the object
		 * lock, we will assume we hold a reference to the object
		 * such that even if m->object changes, we can re-lock
		 * it.
		 *
		 * Remove mtx_owned() after vm_object locking is finished.
		 */
		object = m->object;
		if ((is_object_locked = object != NULL &&
		     mtx_owned(&object->mtx)))
			mtx_unlock(&object->mtx);
		msleep(m, &vm_page_queue_mtx, PDROP | PVM, msg, 0);
		if (is_object_locked)
			mtx_lock(&object->mtx);
		return (TRUE);
	}
	return (FALSE);
}

/*
 *	vm_page_dirty:
 *
 *	make page all dirty
 */
void
vm_page_dirty(vm_page_t m)
{
	KASSERT(m->queue - m->pc != PQ_CACHE,
	    ("vm_page_dirty: page in cache!"));
	KASSERT(m->queue - m->pc != PQ_FREE,
	    ("vm_page_dirty: page is free!"));
	m->dirty = VM_PAGE_BITS_ALL;
}

/*
 *	vm_page_splay:
 *
 *	Implements Sleator and Tarjan's top-down splay algorithm.  Returns
 *	the vm_page containing the given pindex.  If, however, that
 *	pindex is not found in the vm_object, returns a vm_page that is
 *	adjacent to the pindex, coming before or after it.
 */
vm_page_t
vm_page_splay(vm_pindex_t pindex, vm_page_t root)
{
	struct vm_page dummy;
	vm_page_t lefttreemax, righttreemin, y;

	if (root == NULL)
		return (root);
	lefttreemax = righttreemin = &dummy;
	for (;; root = y) {
		if (pindex < root->pindex) {
			if ((y = root->left) == NULL)
				break;
			if (pindex < y->pindex) {
				/* Rotate right. */
				root->left = y->right;
				y->right = root;
				root = y;
				if ((y = root->left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->left = root;
			righttreemin = root;
		} else if (pindex > root->pindex) {
			if ((y = root->right) == NULL)
				break;
			if (pindex > y->pindex) {
				/* Rotate left. */
				root->right = y->left;
				y->left = root;
				root = y;
				if ((y = root->right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->right = root;
			lefttreemax = root;
		} else
			break;
	}
	/* Assemble the new root. */
	lefttreemax->right = root->left;
	righttreemin->left = root->right;
	root->left = dummy.right;
	root->right = dummy.left;
	return (root);
}

/*
 *	vm_page_insert:		[ internal use only ]
 *
 *	Inserts the given mem entry into the object and object list.
 *
 *	The pagetables are not updated but will presumably fault the page
 *	in if necessary, or if a kernel page the caller will at some point
 *	enter the page into the kernel's pmap.  We are not allowed to block
 *	here so we *can't* do this anyway.
 *
 *	The object and page must be locked.
 *	This routine may not block.
 */
void
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t root;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if (m->object != NULL)
		panic("vm_page_insert: page already inserted");

	/*
	 * Record the object/offset pair in this page
	 */
	m->object = object;
	m->pindex = pindex;

	/*
	 * Now link into the object's ordered list of backed pages.
	 */
	root = object->root;
	if (root == NULL) {
		m->left = NULL;
		m->right = NULL;
		TAILQ_INSERT_TAIL(&object->memq, m, listq);
	} else {
		root = vm_page_splay(pindex, root);
		if (pindex < root->pindex) {
			m->left = root->left;
			m->right = root;
			root->left = NULL;
			TAILQ_INSERT_BEFORE(root, m, listq);
		} else if (pindex == root->pindex)
			panic("vm_page_insert: offset already allocated");
		else {
			m->right = root->right;
			m->left = root;
			root->right = NULL;
			TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
		}
	}
	object->root = m;
	object->generation++;

	/*
	 * show that the object has one more resident page.
	 */
	object->resident_page_count++;

	/*
	 * Since we are inserting a new and possibly dirty page,
	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
	 */
	if (m->flags & PG_WRITEABLE)
		vm_object_set_writeable_dirty(object);
}

/*
 *	vm_page_remove:
 *				NOTE: used by device pager as well -wfj
 *
 *	Removes the given mem entry from the object/offset-page
 *	table and the object page list, but does not invalidate/terminate
 *	the backing store.
 *
 *	The object and page must be locked.
 *	The underlying pmap entry (if any) is NOT removed here.
 *	This routine may not block.
 */
void
vm_page_remove(vm_page_t m)
{
	vm_object_t object;
	vm_page_t root;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if (m->object == NULL)
		return;
	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if ((m->flags & PG_BUSY) == 0) {
		panic("vm_page_remove: page not busy");
	}

	/*
	 * Basically destroy the page.
	 */
	vm_page_wakeup(m);

	object = m->object;

	/*
	 * Now remove from the object's list of backed pages.
	 */
	if (m != object->root)
		vm_page_splay(m->pindex, object->root);
	if (m->left == NULL)
		root = m->right;
	else {
		root = vm_page_splay(m->pindex, m->left);
		root->right = m->right;
	}
	object->root = root;
	TAILQ_REMOVE(&object->memq, m, listq);

	/*
	 * And show that the object has one fewer resident page.
	 */
	object->resident_page_count--;
	object->generation++;

	m->object = NULL;
}

/*
 *	vm_page_lookup:
 *
 *	Returns the page associated with the object/offset
 *	pair specified; if none is found, NULL is returned.
 *
 *	The object must be locked.
 *	This routine may not block.
 *	This is a critical path routine
 */
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if ((m = object->root) != NULL && m->pindex != pindex) {
		m = vm_page_splay(pindex, m);
		if ((object->root = m)->pindex != pindex)
			m = NULL;
	}
	return (m);
}

/*
 *	vm_page_rename:
 *
 *	Move the given memory entry from its
 *	current object to the specified target object/offset.
 *
 *	The object must be locked.
 *	This routine may not block.
 *
 *	Note: swap associated with the page must be invalidated by the move.  We
 *	      have to do this for several reasons:  (1) we aren't freeing the
 *	      page, (2) we are dirtying the page, (3) the VM system is probably
 *	      moving the page from object A to B, and will then later move
 *	      the backing store from A to B and we can't have a conflict.
 *
 *	Note: we *always* dirty the page.  It is necessary both for the
 *	      fact that we moved it, and because we may be invalidating
 *	      swap.  If the page is on the cache, we have to deactivate it
 *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
 *	      on the cache.
 */
void
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{

	vm_page_remove(m);
	vm_page_insert(m, new_object, new_pindex);
	if (m->queue - m->pc == PQ_CACHE)
		vm_page_deactivate(m);
	vm_page_dirty(m);
}

/*
 *	vm_page_select_cache:
 *
 *	Find a page on the cache queue with color optimization.  As pages
 *	might be found, but not applicable, they are deactivated.  This
 *	keeps us from using potentially busy cached pages.
 *
 *	This routine may not block.
 */
vm_page_t
vm_page_select_cache(int color)
{
	vm_page_t m;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	while ((m = vm_pageq_find(PQ_CACHE, color, FALSE)) != NULL) {
		if ((m->flags & PG_BUSY) == 0 && m->busy == 0 &&
		    m->hold_count == 0 && (VM_OBJECT_TRYLOCK(m->object) ||
		    VM_OBJECT_LOCKED(m->object))) {
			KASSERT(m->dirty == 0,
			    ("Found dirty cache page %p", m));
			KASSERT(!pmap_page_is_mapped(m),
			    ("Found mapped cache page %p", m));
			KASSERT((m->flags & PG_UNMANAGED) == 0,
			    ("Found unmanaged cache page %p", m));
			KASSERT(m->wire_count == 0,
			    ("Found wired cache page %p", m));
			break;
		}
		vm_page_deactivate(m);
	}
	return (m);
}

/*
 *	vm_page_alloc:
 *
 *	Allocate and return a memory cell associated
 *	with this VM object/offset pair.
 *
 *	page_req classes:
 *	VM_ALLOC_NORMAL		normal process request
 *	VM_ALLOC_SYSTEM		system *really* needs a page
 *	VM_ALLOC_INTERRUPT	interrupt time request
 *	VM_ALLOC_ZERO		zero page
 *
 *	This routine may not block.
 *
 *	Additional special handling is required when called from an
 *	interrupt (VM_ALLOC_INTERRUPT).  We are not allowed to mess with
 *	the page cache in this case.
 */
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
{
	vm_object_t m_object;
	vm_page_t m = NULL;
	int color, flags, page_req;

	page_req = req & VM_ALLOC_CLASS_MASK;

	if ((req & VM_ALLOC_NOOBJ) == 0) {
		KASSERT(object != NULL,
		    ("vm_page_alloc: NULL object."));
		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
		color = (pindex + object->pg_color) & PQ_L2_MASK;
	} else
		color = pindex & PQ_L2_MASK;

	/*
	 * The pager is allowed to eat deeper into the free page list.
	 */
	if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) {
		page_req = VM_ALLOC_SYSTEM;
	}

loop:
	mtx_lock_spin(&vm_page_queue_free_mtx);
	if (cnt.v_free_count > cnt.v_free_reserved ||
	    (page_req == VM_ALLOC_SYSTEM &&
	     cnt.v_cache_count == 0 &&
	     cnt.v_free_count > cnt.v_interrupt_free_min) ||
	    (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)) {
		/*
		 * Allocate from the free queue if the number of free pages
		 * exceeds the minimum for the request class.
		 */
		m = vm_pageq_find(PQ_FREE, color, (req & VM_ALLOC_ZERO) != 0);
	} else if (page_req != VM_ALLOC_INTERRUPT) {
		mtx_unlock_spin(&vm_page_queue_free_mtx);
		/*
		 * Allocatable from cache (non-interrupt only).  On success,
		 * we must free the page and try again, thus ensuring that
		 * cnt.v_*_free_min counters are replenished.
		 */
		vm_page_lock_queues();
		if ((m = vm_page_select_cache(color)) == NULL) {
#if defined(DIAGNOSTIC)
			if (cnt.v_cache_count > 0)
				printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count);
#endif
			vm_page_unlock_queues();
			atomic_add_int(&vm_pageout_deficit, 1);
			pagedaemon_wakeup();
			return (NULL);
		}
		m_object = m->object;
		VM_OBJECT_LOCK_ASSERT(m_object, MA_OWNED);
		vm_page_busy(m);
		vm_page_free(m);
		vm_page_unlock_queues();
		if (m_object != object)
			VM_OBJECT_UNLOCK(m_object);
		goto loop;
	} else {
		/*
		 * Not allocatable from cache from interrupt, give up.
		 */
		mtx_unlock_spin(&vm_page_queue_free_mtx);
		atomic_add_int(&vm_pageout_deficit, 1);
		pagedaemon_wakeup();
		return (NULL);
	}

	/*
	 *  At this point we had better have found a good page.
	 */

	KASSERT(
	    m != NULL,
	    ("vm_page_alloc(): missing page on free queue")
	);

	/*
	 * Remove from free queue
	 */
	vm_pageq_remove_nowakeup(m);

	/*
	 * Initialize structure.  Only the PG_ZERO flag is inherited.
	 */
	flags = PG_BUSY;
	if (m->flags & PG_ZERO) {
		vm_page_zero_count--;
		if (req & VM_ALLOC_ZERO)
			flags = PG_ZERO | PG_BUSY;
	}
	if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ))
		flags &= ~PG_BUSY;
	m->flags = flags;
	if (req & VM_ALLOC_WIRED) {
		atomic_add_int(&cnt.v_wire_count, 1);
		m->wire_count = 1;
	} else
		m->wire_count = 0;
	m->hold_count = 0;
	m->act_count = 0;
	m->busy = 0;
	m->valid = 0;
	KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
	mtx_unlock_spin(&vm_page_queue_free_mtx);

	if ((req & VM_ALLOC_NOOBJ) == 0)
		vm_page_insert(m, object, pindex);
	else
		m->pindex = pindex;

	/*
	 * Don't wakeup too often - wakeup the pageout daemon when
	 * we would be nearly out of memory.
	 */
	if (vm_paging_needed())
		pagedaemon_wakeup();

	return (m);
}

/*
 *	vm_wait:	(also see VM_WAIT macro)
 *
 *	Block until free pages are available for allocation
 *	- Called in various places before memory allocations.
 */
void
vm_wait(void)
{

	vm_page_lock_queues();
	if (curproc == pageproc) {
		vm_pageout_pages_needed = 1;
		msleep(&vm_pageout_pages_needed, &vm_page_queue_mtx,
		    PDROP | PSWP, "VMWait", 0);
	} else {
		if (!vm_pages_needed) {
			vm_pages_needed = 1;
			wakeup(&vm_pages_needed);
		}
		msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PVM,
		    "vmwait", 0);
	}
}

/*
 *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
 *
 *	Block until free pages are available for allocation
 *	- Called only in vm_fault so that processes page faulting
 *	  can be easily tracked.
 *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
 *	  processes will be able to grab memory first.  Do not change
 *	  this balance without careful testing first.
 */
void
vm_waitpfault(void)
{

	vm_page_lock_queues();
	if (!vm_pages_needed) {
		vm_pages_needed = 1;
		wakeup(&vm_pages_needed);
	}
	msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PUSER,
	    "pfault", 0);
}

/*
 *	vm_page_activate:
 *
 *	Put the specified page on the active list (if appropriate).
 *	Ensure that act_count is at least ACT_INIT but do not otherwise
 *	mess with it.
 *
 *	The page queues must be locked.
 *	This routine may not block.
 */
void
vm_page_activate(vm_page_t m)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if (m->queue != PQ_ACTIVE) {
		if ((m->queue - m->pc) == PQ_CACHE)
			cnt.v_reactivated++;
		vm_pageq_remove(m);
		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
			if (m->act_count < ACT_INIT)
				m->act_count = ACT_INIT;
			vm_pageq_enqueue(PQ_ACTIVE, m);
		}
	} else {
		if (m->act_count < ACT_INIT)
			m->act_count = ACT_INIT;
	}
}

/*
 *	vm_page_free_wakeup:
 *
 *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
 *	routine is called when a page has been added to the cache or free
 *	queues.
 *
 *	The page queues must be locked.
 *	This routine may not block.
 */
static __inline void
vm_page_free_wakeup(void)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	/*
	 * if pageout daemon needs pages, then tell it that there are
	 * some free.
	 */
	if (vm_pageout_pages_needed &&
	    cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
		wakeup(&vm_pageout_pages_needed);
		vm_pageout_pages_needed = 0;
	}
	/*
	 * Wake up processes that are waiting on memory if we hit a
	 * high water mark, and wake up the scheduler process if we
	 * have lots of memory; that process will swap processes in.
	 */
	if (vm_pages_needed && !vm_page_count_min()) {
		vm_pages_needed = 0;
		wakeup(&cnt.v_free_count);
	}
}

/*
 *	vm_page_free_toq:
 *
 *	Returns the given page to the PQ_FREE list,
 *	disassociating it with any VM object.
 *
 *	Object and page must be locked prior to entry.
 *	This routine may not block.
 */

void
vm_page_free_toq(vm_page_t m)
{
	struct vpgqueues *pq;
	vm_object_t object = m->object;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	cnt.v_tfree++;

	if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
		printf(
		"vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
		    (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
		    m->hold_count);
		if ((m->queue - m->pc) == PQ_FREE)
			panic("vm_page_free: freeing free page");
		else
			panic("vm_page_free: freeing busy page");
	}

	/*
	 * unqueue, then remove page.  Note that we cannot destroy
	 * the page here because we do not want to call the pager's
	 * callback routine until after we've put the page on the
	 * appropriate free queue.
	 */
	vm_pageq_remove_nowakeup(m);
	vm_page_remove(m);

	/*
	 * If fictitious remove object association and
	 * return, otherwise delay object association removal.
	 */
	if ((m->flags & PG_FICTITIOUS) != 0) {
		return;
	}

	m->valid = 0;
	vm_page_undirty(m);

	if (m->wire_count != 0) {
		if (m->wire_count > 1) {
			panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx",
				m->wire_count, (long)m->pindex);
		}
		panic("vm_page_free: freeing wired page");
	}

	/*
	 * If we've exhausted the object's resident pages we want to free
	 * it up.
	 */
	if (object &&
	    (object->type == OBJT_VNODE) &&
	    ((object->flags & OBJ_DEAD) == 0)
	) {
		struct vnode *vp = (struct vnode *)object->handle;

		if (vp) {
			VI_LOCK(vp);
			if (VSHOULDFREE(vp))
				vfree(vp);
			VI_UNLOCK(vp);
		}
	}

	/*
	 * Clear the UNMANAGED flag when freeing an unmanaged page.
	 */
	if (m->flags & PG_UNMANAGED) {
		m->flags &= ~PG_UNMANAGED;
	}

	if (m->hold_count != 0) {
		m->flags &= ~PG_ZERO;
		m->queue = PQ_HOLD;
	} else
		m->queue = PQ_FREE + m->pc;
	pq = &vm_page_queues[m->queue];
	mtx_lock_spin(&vm_page_queue_free_mtx);
	pq->lcnt++;
	++(*pq->cnt);

	/*
	 * Put zeroed pages at the end (where we look for zeroed pages
	 * first) and non-zeroed pages at the head.
	 */
	if (m->flags & PG_ZERO) {
		TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
		++vm_page_zero_count;
	} else {
		TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
	}
	mtx_unlock_spin(&vm_page_queue_free_mtx);
	vm_page_free_wakeup();
}

/*
 *	vm_page_unmanage:
 *
 * 	Prevent PV management from being done on the page.  The page is
 *	removed from the paging queues as if it were wired, and as a
 *	consequence of no longer being managed the pageout daemon will not
 *	touch it (since there is no way to locate the pte mappings for the
 *	page).  madvise() calls that mess with the pmap will also no longer
 *	operate on the page.
 *
 *	Beyond that the page is still reasonably 'normal'.  Freeing the page
 *	will clear the flag.
 *
 *	This routine is used by OBJT_PHYS objects - objects using unswappable
 *	physical memory as backing store rather than swap-backed memory and
 *	will eventually be extended to support 4MB unmanaged physical
 *	mappings.
 */
void
vm_page_unmanage(vm_page_t m)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if ((m->flags & PG_UNMANAGED) == 0) {
		if (m->wire_count == 0)
			vm_pageq_remove(m);
	}
	vm_page_flag_set(m, PG_UNMANAGED);
}

/*
 *	vm_page_wire:
 *
 *	Mark this page as wired down by yet
 *	another map, removing it from paging queues
 *	as necessary.
 *
 *	The page queues must be locked.
 *	This routine may not block.
 */
void
vm_page_wire(vm_page_t m)
{

	/*
	 * Only bump the wire statistics if the page is not already wired,
	 * and only unqueue the page if it is on some queue (if it is unmanaged
	 * it is already off the queues).
	 */
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if (m->flags & PG_FICTITIOUS)
		return;
	if (m->wire_count == 0) {
		if ((m->flags & PG_UNMANAGED) == 0)
			vm_pageq_remove(m);
		atomic_add_int(&cnt.v_wire_count, 1);
	}
	m->wire_count++;
	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
}

/*
 *	vm_page_unwire:
 *
 *	Release one wiring of this page, potentially
 *	enabling it to be paged again.
 *
 *	Many pages placed on the inactive queue should actually go
 *	into the cache, but it is difficult to figure out which.  What
 *	we do instead, if the inactive target is well met, is to put
 *	clean pages at the head of the inactive queue instead of the tail.
 *	This will cause them to be moved to the cache more quickly and
 *	if not actively re-referenced, freed more quickly.  If we just
 *	stick these pages at the end of the inactive queue, heavy filesystem
 *	meta-data accesses can cause an unnecessary paging load on memory bound
 *	processes.  This optimization causes one-time-use metadata to be
 *	reused more quickly.
 *
 *	BUT, if we are in a low-memory situation we have no choice but to
 *	put clean pages on the cache queue.
 *
 *	A number of routines use vm_page_unwire() to guarantee that the page
 *	will go into either the inactive or active queues, and will NEVER
 *	be placed in the cache - for example, just after dirtying a page.
 *	dirty pages in the cache are not allowed.
 *
 *	The page queues must be locked.
 *	This routine may not block.
 */
void
vm_page_unwire(vm_page_t m, int activate)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if (m->flags & PG_FICTITIOUS)
		return;
	if (m->wire_count > 0) {
		m->wire_count--;
		if (m->wire_count == 0) {
			atomic_subtract_int(&cnt.v_wire_count, 1);
			if (m->flags & PG_UNMANAGED) {
				;
			} else if (activate)
				vm_pageq_enqueue(PQ_ACTIVE, m);
			else {
				vm_page_flag_clear(m, PG_WINATCFLS);
				vm_pageq_enqueue(PQ_INACTIVE, m);
			}
		}
	} else {
		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
	}
}


/*
 * Move the specified page to the inactive queue.  If the page has
 * any associated swap, the swap is deallocated.
 *
 * Normally athead is 0 resulting in LRU operation.  athead is set
 * to 1 if we want this page to be 'as if it were placed in the cache',
 * except without unmapping it from the process address space.
 *
 * This routine may not block.
 */
static __inline void
_vm_page_deactivate(vm_page_t m, int athead)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);

	/*
	 * Ignore if already inactive.
	 */
	if (m->queue == PQ_INACTIVE)
		return;
	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
		if ((m->queue - m->pc) == PQ_CACHE)
			cnt.v_reactivated++;
		vm_page_flag_clear(m, PG_WINATCFLS);
		vm_pageq_remove(m);
		if (athead)
			TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
		else
			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
		m->queue = PQ_INACTIVE;
		vm_page_queues[PQ_INACTIVE].lcnt++;
		cnt.v_inactive_count++;
	}
}

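/*
 *	vm_page_deactivate:
 *
 *	Move the specified page to the tail of the inactive queue
 *	(normal LRU operation).
 *
 *	The page queues must be locked.
 */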
void
vm_page_deactivate(vm_page_t m)
{
	_vm_page_deactivate(m, 0);
}

/*
 * vm_page_try_to_cache:
 *
 * Returns 0 on failure, 1 on success
 */
int
vm_page_try_to_cache(vm_page_t m)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
		return (0);
	}
	pmap_remove_all(m);
	if (m->dirty)
		return (0);
	vm_page_cache(m);
	return (1);
}

/*
 * vm_page_try_to_free()
 *
 *	Attempt to free the page.  If we cannot free it, we do nothing.
 *	1 is returned on success, 0 on failure.
 */
int
vm_page_try_to_free(vm_page_t m)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if (m->object != NULL)
		VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
		return (0);
	}
	pmap_remove_all(m);
	if (m->dirty)
		return (0);
	vm_page_busy(m);
	vm_page_free(m);
	return (1);
}

/*
 * vm_page_cache
 *
 * Put the specified page onto the page cache queue (if appropriate).
 *
 * This routine may not block.
 */
void
vm_page_cache(vm_page_t m)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
	    m->hold_count || m->wire_count) {
		printf("vm_page_cache: attempting to cache busy page\n");
		return;
	}
	if ((m->queue - m->pc) == PQ_CACHE)
		return;

	/*
	 * Remove all pmaps and indicate that the page is not
	 * writeable or mapped.
	 */
	pmap_remove_all(m);
	if (m->dirty != 0) {
		panic("vm_page_cache: caching a dirty page, pindex: %ld",
			(long)m->pindex);
	}
	vm_pageq_remove_nowakeup(m);
	vm_pageq_enqueue(PQ_CACHE + m->pc, m);
	vm_page_free_wakeup();
}

/*
 * vm_page_dontneed
 *
 *	Cache, deactivate, or do nothing as appropriate.  This routine
 *	is typically used by madvise() MADV_DONTNEED.
 *
 *	Generally speaking we want to move the page into the cache so
 *	it gets reused quickly.  However, this can result in a silly syndrome
 *	due to the page recycling too quickly.  Small objects will not be
 *	fully cached.  On the other hand, if we move the page to the inactive
 *	queue we wind up with a problem whereby very large objects
 *	unnecessarily blow away our inactive and cache queues.
 *
 *	The solution is to move the pages based on a fixed weighting.  We
 *	either leave them alone, deactivate them, or move them to the cache,
 *	where moving them to the cache has the highest weighting.
 *	By forcing some pages into other queues we eventually force the
 *	system to balance the queues, potentially recovering other unrelated
 *	space from active.  The idea is to not force this to happen too
 *	often.
 */
void
vm_page_dontneed(vm_page_t m)
{
	static int dnweight;
	int dnw;
	int head;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	dnw = ++dnweight;

	/*
	 * occasionally leave the page alone
	 */
	if ((dnw & 0x01F0) == 0 ||
	    m->queue == PQ_INACTIVE ||
	    m->queue - m->pc == PQ_CACHE
	) {
		if (m->act_count >= ACT_INIT)
			--m->act_count;
		return;
	}

	if (m->dirty == 0 && pmap_is_modified(m))
		vm_page_dirty(m);

	if (m->dirty || (dnw & 0x0070) == 0) {
		/*
		 * Deactivate the page 3 times out of 32.
		 */
		head = 0;
	} else {
		/*
		 * Cache the page 28 times out of every 32.  Note that
		 * the page is deactivated instead of cached, but placed
		 * at the head of the queue instead of the tail.
		 */
		head = 1;
	}
	_vm_page_deactivate(m, head);
}

/*
 * Grab a page, waiting until we are woken up due to the page
 * changing state.  We keep waiting as long as the page remains
 * busy in the object.  If the page doesn't exist, first allocate it
 * and then conditionally zero it.
 *
 * This routine may block.
 */
vm_page_t
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
{
	vm_page_t m;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
retrylookup:
	if ((m = vm_page_lookup(object, pindex)) != NULL) {
		vm_page_lock_queues();
		if (m->busy || (m->flags & PG_BUSY)) {
			vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
			VM_OBJECT_UNLOCK(object);
			msleep(m, &vm_page_queue_mtx, PDROP | PVM, "pgrbwt", 0);
			VM_OBJECT_LOCK(object);
			if ((allocflags & VM_ALLOC_RETRY) == 0)
				return (NULL);
			goto retrylookup;
		} else {
			if (allocflags & VM_ALLOC_WIRED)
				vm_page_wire(m);
			if ((allocflags & VM_ALLOC_NOBUSY) == 0)
				vm_page_busy(m);
			vm_page_unlock_queues();
			return (m);
		}
	}
	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
	if (m == NULL) {
		VM_OBJECT_UNLOCK(object);
		VM_WAIT;
		VM_OBJECT_LOCK(object);
		if ((allocflags & VM_ALLOC_RETRY) == 0)
			return (NULL);
		goto retrylookup;
	}
	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);
	return (m);
}

/*
 * Mapping function for valid bits or for dirty bits in
 * a page.  May not block.
 *
 * Inputs are required to range within a page.
 */
__inline int
vm_page_bits(int base, int size)
{
	int first_bit;
	int last_bit;

	KASSERT(
	    base + size <= PAGE_SIZE,
	    ("vm_page_bits: illegal base/size %d/%d", base, size)
	);

	if (size == 0)		/* handle degenerate case */
		return (0);

	first_bit = base >> DEV_BSHIFT;
	last_bit = (base + size - 1) >> DEV_BSHIFT;

	return ((2 << last_bit) - (1 << first_bit));
}

/*
 *	vm_page_set_validclean:
 *
 *	Sets portions of a page valid and clean.  The arguments are expected
 *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 *	of any partial chunks touched by the range.  The invalid portion of
 *	such chunks will be zero'd.
 *
 *	This routine may not block.
 *
 *	(base + size) must be less than or equal to PAGE_SIZE.
 */
void
vm_page_set_validclean(vm_page_t m, int base, int size)
{
	int pagebits;
	int frag;
	int endoff;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (size == 0)	/* handle degenerate case */
		return;

	/*
	 * If the base is not DEV_BSIZE aligned and the valid
	 * bit is clear, we have to zero out a portion of the
	 * first block.
	 */
	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
		pmap_zero_page_area(m, frag, base - frag);

	/*
	 * If the ending offset is not DEV_BSIZE aligned and the
	 * valid bit is clear, we have to zero out a portion of
	 * the last block.
	 */
	endoff = base + size;
	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
		pmap_zero_page_area(m, endoff,
		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));

	/*
	 * Set valid, clear dirty bits.  If validating the entire
	 * page we can safely clear the pmap modify bit.  We also
	 * use this opportunity to clear the PG_NOSYNC flag.  If a process
	 * takes a write fault on a MAP_NOSYNC memory area the flag will
	 * be set again.
	 *
	 * We set valid bits inclusive of any overlap, but we can only
	 * clear dirty bits for DEV_BSIZE chunks that are fully within
	 * the range.
	 */
	pagebits = vm_page_bits(base, size);
	m->valid |= pagebits;
#if 0	/* NOT YET */
	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
		frag = DEV_BSIZE - frag;
		base += frag;
		size -= frag;
		if (size < 0)
			size = 0;
	}
	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
#endif
	m->dirty &= ~pagebits;
	if (base == 0 && size == PAGE_SIZE) {
		pmap_clear_modify(m);
		vm_page_flag_clear(m, PG_NOSYNC);
	}
}

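/*
 *	vm_page_clear_dirty:
 *
 *	Clear the dirty bits covering the byte range [base, base + size)
 *	within the page.
 *
 *	The page queues lock must be held.
 */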
void
vm_page_clear_dirty(vm_page_t m, int base, int size)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	m->dirty &= ~vm_page_bits(base, size);
}

/*
 *	vm_page_set_invalid:
 *
 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
 *	valid and dirty bits for the affected areas are cleared.
 *
 *	May not block.
 */
void
vm_page_set_invalid(vm_page_t m, int base, int size)
{
	int bits;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	bits = vm_page_bits(base, size);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	m->valid &= ~bits;
	m->dirty &= ~bits;
	m->object->generation++;
}

/*
 * vm_page_zero_invalid()
 *
 *	The kernel assumes that the invalid portions of a page contain
 *	garbage, but such pages can be mapped into memory by user code.
 *	When this occurs, we must zero out the non-valid portions of the
 *	page so user code sees what it expects.
 *
 *	Pages are most often semi-valid when the end of a file is mapped
 *	into memory and the file's size is not page aligned.
 */
void
vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
{
	int b;
	int i;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	/*
	 * Scan the valid bits looking for invalid sections that
	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas ( where the
	 * valid bit may be set ) have already been zeroed by
	 * vm_page_set_validclean().
	 */
	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
		if (i == (PAGE_SIZE / DEV_BSIZE) ||
		    (m->valid & (1 << i))
		) {
			if (i > b) {
				pmap_zero_page_area(m,
				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
			}
			b = i + 1;
		}
	}

	/*
	 * setvalid is TRUE when we can safely set the zero'd areas
	 * as being valid.  We can do this if there are no cache consistency
	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
	 */
	if (setvalid)
		m->valid = VM_PAGE_BITS_ALL;
}

/*
 *	vm_page_is_valid:
 *
 *	Is (partial) page valid?  Note that the case where size == 0
 *	will return FALSE in the degenerate case where the page is
 *	entirely invalid, and TRUE otherwise.
 *
 *	May not block.
 */
int
vm_page_is_valid(vm_page_t m, int base, int size)
{
	int bits = vm_page_bits(base, size);

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (m->valid && ((m->valid & bits) == bits))
		return 1;
	else
		return 0;
}

/*
 * update dirty bits from pmap/mmu.  May not block.
 */
void
vm_page_test_dirty(vm_page_t m)
{
	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
		vm_page_dirty(m);
	}
}

int so_zerocp_fullpage = 0;

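/*
 *	vm_page_cowfault:
 *
 *	Resolve a write fault on a page that still has outstanding
 *	copy-on-write references (m->cow != 0).  A replacement page is
 *	allocated at the same object/pindex and, unless so_zerocp_fullpage
 *	is set, the original page's contents are copied into it.  If the
 *	COW count has already dropped to zero we raced with completion and
 *	simply put the original page back.
 *
 *	The object and page queues locks must be held.
 */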
void
vm_page_cowfault(vm_page_t m)
{
	vm_page_t mnew;
	vm_object_t object;
	vm_pindex_t pindex;

	object = m->object;
	pindex = m->pindex;

 retry_alloc:
	vm_page_busy(m);
	vm_page_remove(m);
	mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
	if (mnew == NULL) {
		vm_page_insert(m, object, pindex);
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(object);
		VM_WAIT;
		VM_OBJECT_LOCK(object);
		vm_page_lock_queues();
		goto retry_alloc;
	}

	if (m->cow == 0) {
		/*
		 * check to see if we raced with an xmit complete when
		 * waiting to allocate a page.  If so, put things back
		 * the way they were
		 */
		vm_page_free(mnew);
		vm_page_insert(m, object, pindex);
	} else { /* clear COW & copy page */
		if (!so_zerocp_fullpage)
			pmap_copy_page(m, mnew);
		mnew->valid = VM_PAGE_BITS_ALL;
		vm_page_dirty(mnew);
		vm_page_flag_clear(mnew, PG_BUSY);
	}
}

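/*
 *	vm_page_cowclear:
 *
 *	Drop one copy-on-write reference from the page.  Write permission
 *	is restored lazily by vm_fault(); the page itself is freed later
 *	by sf_buf_free().
 */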
void
vm_page_cowclear(vm_page_t m)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if (m->cow) {
		m->cow--;
		/*
		 * let vm_fault add back write permission  lazily
		 */
	}
	/*
	 *  sf_buf_free() will free the page, so we needn't do it here
	 */
}

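/*
 *	vm_page_cowsetup:
 *
 *	Add a copy-on-write reference to the page and write-protect all
 *	of its mappings so that a later write triggers vm_page_cowfault().
 *
 *	The page queues lock must be held.
 */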
void
vm_page_cowsetup(vm_page_t m)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	m->cow++;
	pmap_page_protect(m, VM_PROT_READ);
}

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

DB_SHOW_COMMAND(page, vm_page_print_page_info)
{
	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
}

DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
	int i;
	db_printf("PQ_FREE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_CACHE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
		vm_page_queues[PQ_ACTIVE].lcnt,
		vm_page_queues[PQ_INACTIVE].lcnt);
}
#endif /* DDB */